Merge pull request #1273 from troosan/various_html_parsing_fixes

Various html parsing fixes, fixes for #1252 and #1254
This commit is contained in:
troosan 2018-02-09 22:50:18 +01:00 committed by GitHub
commit 3f40c5e408
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 146 additions and 12 deletions

View File

@ -8,10 +8,14 @@ v0.15.0 (?? ??? 2018)
### Added
- Parsing of "align" HTML attribute - @troosan #1231
- Parse formatting inside HTML lists - @troosan @samimussbach #1239 #945 #1215 #508
- Parsing of CSS `direction` instruction, HTML `lang` attribute, formatting inside table cell - @troosan #1273 #1252 #1254
- Add support for Track changes @Cip @troosan #354 #1262
### Fixed
- fix reading of docx default style - @troosan #1238
- fix the size unit when parsing html images - @troosan #1254
- fixed HTML parsing of nested lists - @troosan #1265
- Save PNG alpha information when using remote images. @samsullivan #779

View File

@ -7,11 +7,13 @@ $phpWord = new \PhpOffice\PhpWord\PhpWord();
$section = $phpWord->addSection();
$html = '<h1>Adding element via HTML</h1>';
$html .= '<p>Some well formed HTML snippet needs to be used</p>';
$html .= '<p>Some well-formed HTML snippet needs to be used</p>';
$html .= '<p>With for example <strong>some<sup>1</sup> <em>inline</em> formatting</strong><sub>1</sub></p>';
$html .= '<p>A link to <a href="http://phpword.readthedocs.io/">Read the docs</a></p>';
$html .= '<p lang="he-IL" style="text-align: right; direction: rtl">היי, זה פסקה מימין לשמאל</p>';
$html .= '<p style="margin-top: 240pt;">Unordered (bulleted) list:</p>';
$html .= '<ul><li>Item 1</li><li>Item 2</li><ul><li>Item 2.1</li><li>Item 2.1</li></ul></ul>';
@ -29,10 +31,12 @@ $html .= '<ol>
<ol>
<li>List 2 item 1</li>
<li>List 2 item 2</li>
<li>
<ol>
<li>sub list 1</li>
<li>sub list 2</li>
</ol>
</li>
<li>List 2 item 3</li>
<ol>
<li>sub list 1, restarts with a</li>
@ -65,10 +69,20 @@ $html .= '<table align="center" style="width: 50%; border: 6px #0000FF double;">
</thead>
<tbody>
<tr><td style="border-style: dotted;">1</td><td colspan="2">2</td></tr>
<tr><td>4</td><td>5</td><td>6</td></tr>
<tr><td>This is <b>bold</b> text</td><td></td><td>6</td></tr>
</tbody>
</table>';
$html .= '<p style="margin-top: 240pt;">Table inside another table:</p>';
$html .= '<table align="center" style="width: 80%; border: 6px #0000FF double;">
<tr><td>
<table style="width: 100%; border: 4px #FF0000 dotted;">
<tr><td>column 1</td><td>column 2</td></tr>
</table>
</td></tr>
<tr><td style="text-align: center;">Cell in parent table</td></tr>
</table>';
\PhpOffice\PhpWord\Shared\Html::addHtml($section, $html, false, false);
// Save file

View File

@ -93,6 +93,13 @@ abstract class AbstractElement
*/
private $nestedLevel = 0;
/**
* A reference to the parent
*
* @var \PhpOffice\PhpWord\Element\AbstractElement
*/
private $parent;
/**
* changed element info
*
@ -328,6 +335,11 @@ abstract class AbstractElement
$this->commentRangeEnd->setEndElement($this);
}
/**
 * Get the parent element
 *
 * The parent is the container that was handed to setParentContainer().
 * NOTE(review): presumably null until that method has been called, since the
 * property has no initial value - confirm no other code path assigns it.
 *
 * @return \PhpOffice\PhpWord\Element\AbstractElement|null
 */
public function getParent()
{
return $this->parent;
}
/**
* Set parent container
*
@ -338,6 +350,7 @@ abstract class AbstractElement
public function setParentContainer(AbstractElement $container)
{
$this->parentContainer = substr(get_class($container), strrpos(get_class($container), '\\') + 1);
$this->parent = $container;
// Set nested level
$this->nestedLevel = $container->getNestedLevel();

View File

@ -31,6 +31,7 @@ use PhpOffice\PhpWord\SimpleType\NumberFormat;
class Html
{
private static $listIndex = 0;
private static $xpath;
/**
* Add HTML parts.
@ -65,6 +66,7 @@ class Html
$dom = new \DOMDocument();
$dom->preserveWhiteSpace = $preserveWhiteSpace;
$dom->loadXML($html);
self::$xpath = new \DOMXpath($dom);
$node = $dom->getElementsByTagName('body');
self::parseNode($node->item(0), $element);
@ -89,6 +91,10 @@ class Html
break;
case 'align':
$styles['alignment'] = self::mapAlign($attribute->value);
break;
case 'lang':
$styles['lang'] = $attribute->value;
break;
}
}
}
@ -333,7 +339,7 @@ class Html
* @param \DOMNode $node
* @param \PhpOffice\PhpWord\Element\Table $element
* @param array &$styles
* @return \PhpOffice\PhpWord\Element\Cell $element
* @return \PhpOffice\PhpWord\Element\Cell|\PhpOffice\PhpWord\Element\TextRun $element
*/
private static function parseCell($node, $element, &$styles)
{
@ -343,8 +349,29 @@ class Html
if (!empty($colspan)) {
$cellStyles['gridSpan'] = $colspan - 0;
}
$cell = $element->addCell(null, $cellStyles);
return $element->addCell(null, $cellStyles);
if (self::shouldAddTextRun($node)) {
return $cell->addTextRun(self::parseInlineStyle($node, $styles['paragraph']));
}
return $cell;
}
/**
 * Decides whether the content of $node can be collected in a TextRun
 *
 * A TextRun is ruled out as soon as $node contains an HTML element that cannot
 * be added to a TextRun: a table at any depth, or a p, ul or ol as direct child.
 *
 * @param \DOMNode $node
 * @return bool Returns false if the node contains an HTML element that cannot be added to TextRun, true otherwise
 */
private static function shouldAddTextRun(\DOMNode $node)
{
    // './/table' searches all descendants, './p', './ul' and './ol' only the direct children of $node
    $containsBlockElement = self::$xpath->query('.//table|./p|./ul|./ol', $node)->length > 0;

    return !$containsBlockElement;
}
/**
@ -375,7 +402,7 @@ class Html
*/
private static function parseList($node, $element, &$styles, &$data)
{
$isOrderedList = $node->nodeName == 'ol';
$isOrderedList = $node->nodeName === 'ol';
if (isset($data['listdepth'])) {
$data['listdepth']++;
} else {
@ -383,6 +410,9 @@ class Html
$styles['list'] = 'listStyle_' . self::$listIndex++;
$element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
}
if ($node->parentNode->nodeName === 'li') {
return $element->getParent();
}
}
private static function getListStyle($isOrderedList)
@ -469,6 +499,9 @@ class Html
case 'text-align':
$styles['alignment'] = self::mapAlign($cValue);
break;
case 'direction':
$styles['rtl'] = $cValue === 'rtl';
break;
case 'font-size':
$styles['size'] = Converter::cssToPoint($cValue);
break;
@ -556,10 +589,12 @@ class Html
case 'width':
$width = $attribute->value;
$style['width'] = $width;
$style['unit'] = \PhpOffice\PhpWord\Style\Image::UNIT_PX;
break;
case 'height':
$height = $attribute->value;
$style['height'] = $height;
$style['unit'] = \PhpOffice\PhpWord\Style\Image::UNIT_PX;
break;
case 'style':
$styleattr = explode(';', $attribute->value);

View File

@ -90,6 +90,10 @@ class Font extends AbstractStyle
$xmlWriter->writeAttributeIf($language->getLatin() !== null, 'w:val', $language->getLatin());
$xmlWriter->writeAttributeIf($language->getEastAsia() !== null, 'w:eastAsia', $language->getEastAsia());
$xmlWriter->writeAttributeIf($language->getBidirectional() !== null, 'w:bidi', $language->getBidirectional());
//if bidi is not set but we are writing RTL, write the latin language in the bidi tag
if ($style->isRTL() && $language->getBidirectional() === null && $language->getLatin() !== null) {
$xmlWriter->writeAttribute('w:bidi', $language->getLatin());
}
$xmlWriter->endElement();
}

View File

@ -150,6 +150,33 @@ class HtmlTest extends \PHPUnit\Framework\TestCase
$this->assertEquals('15', $doc->getElementAttribute('/w:document/w:body/w:p[2]/w:r/w:rPr/w:sz', 'w:val'));
}
/**
 * Test that the CSS direction instruction produces a right-to-left run
 */
public function testParseTextDirection()
{
    $rtlXpath = '/w:document/w:body/w:p/w:r/w:rPr/w:rtl';

    $word = new \PhpOffice\PhpWord\PhpWord();
    Html::addHtml($word->addSection(), '<span style="direction: rtl">test</span>');

    $output = TestHelperDOCX::getDocument($word, 'Word2007');
    $this->assertTrue($output->elementExists($rtlXpath));
}
/**
 * Test that the HTML lang attribute is written as the language of the run
 */
public function testParseLang()
{
    $langXpath = '/w:document/w:body/w:p/w:r/w:rPr/w:lang';

    $word = new \PhpOffice\PhpWord\PhpWord();
    Html::addHtml($word->addSection(), '<span lang="fr-BE">test</span>');

    $output = TestHelperDOCX::getDocument($word, 'Word2007');
    $this->assertTrue($output->elementExists($langXpath));
    $this->assertEquals('fr-BE', $output->getElementAttribute($langXpath, 'w:val'));
}
/**
* Test font-family style
*/
@ -199,7 +226,7 @@ class HtmlTest extends \PHPUnit\Framework\TestCase
</thead>
<tbody>
<tr><td style="border-style: dotted;">1</td><td colspan="2">2</td></tr>
<tr><td>4</td><td>5</td><td>6</td></tr>
<tr><td>This is <b>bold</b> text</td><td>5</td><td><p>6</p></td></tr>
</tbody>
</table>';
Html::addHtml($section, $html);
@ -272,6 +299,43 @@ class HtmlTest extends \PHPUnit\Framework\TestCase
$this->assertNotEquals($firstListnumId, $secondListnumId);
}
/**
 * Tests parsing of nested ol/li
 *
 * Two separate ordered lists are parsed, the second one holding a sub list
 * inside one of its items; the two top-level lists must get different numbering ids.
 * NOTE(review): no assertion here looks at the items of the nested sub list itself.
 */
public function testOrderedNestedListNumbering()
{
$phpWord = new \PhpOffice\PhpWord\PhpWord();
$section = $phpWord->addSection();
// Fixture: list 1, a plain paragraph, then list 2 whose second <li> wraps a nested <ol>
$html = '<ol>
<li>List 1 item 1</li>
<li>List 1 item 2</li>
</ol>
<p>Some Text</p>
<ol>
<li>List 2 item 1</li>
<li>
<ol>
<li>sub list 1</li>
<li>sub list 2</li>
</ol>
</li>
</ol>';
Html::addHtml($section, $html, false, false);
$doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');
// The body must contain a paragraph with a numbering id and a paragraph with text
$this->assertTrue($doc->elementExists('/w:document/w:body/w:p/w:pPr/w:numPr/w:numId'));
$this->assertTrue($doc->elementExists('/w:document/w:body/w:p/w:r/w:t'));
// Paragraph 1 is the first item of list 1, paragraph 4 the first item of list 2
$this->assertEquals('List 1 item 1', $doc->getElement('/w:document/w:body/w:p[1]/w:r/w:t')->nodeValue);
$this->assertEquals('List 2 item 1', $doc->getElement('/w:document/w:body/w:p[4]/w:r/w:t')->nodeValue);
// Each top-level list must reference its own numbering definition
$firstListnumId = $doc->getElementAttribute('/w:document/w:body/w:p[1]/w:pPr/w:numPr/w:numId', 'w:val');
$secondListnumId = $doc->getElementAttribute('/w:document/w:body/w:p[4]/w:pPr/w:numPr/w:numId', 'w:val');
$this->assertNotEquals($firstListnumId, $secondListnumId);
}
/**
* Tests parsing of ul/li
*/
@ -336,8 +400,8 @@ class HtmlTest extends \PHPUnit\Framework\TestCase
$baseXpath = '/w:document/w:body/w:p/w:r';
$this->assertTrue($doc->elementExists($baseXpath . '/w:pict/v:shape'));
$this->assertStringMatchesFormat('%Swidth:150pt%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
$this->assertStringMatchesFormat('%Sheight:200pt%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
$this->assertStringMatchesFormat('%Swidth:150px%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
$this->assertStringMatchesFormat('%Sheight:200px%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
$this->assertStringMatchesFormat('%Smso-position-horizontal:right%S', $doc->getElementAttribute($baseXpath . '[1]/w:pict/v:shape', 'style'));
$this->assertStringMatchesFormat('%Smso-position-horizontal:left%S', $doc->getElementAttribute($baseXpath . '[2]/w:pict/v:shape', 'style'));
}