Merge pull request #1902 from lubosdz/develop

Enhancements to addHTML parser
This commit is contained in:
troosan 2021-02-07 22:53:53 +01:00 committed by GitHub
commit 9e322dd6e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 465 additions and 16 deletions

View File

@ -62,10 +62,10 @@ class Html
// Preprocess: remove all line ends, decode HTML entity,
// fix ampersand and angle brackets and add body tag for HTML fragments
$html = str_replace(array("\n", "\r"), '', $html);
$html = str_replace(array('<', '>', '&'), array('_lt_', '_gt_', '_amp_'), $html);
$html = str_replace(array('<', '>', '&', '"'), array('_lt_', '_gt_', '_amp_', '_quot_'), $html);
$html = html_entity_decode($html, ENT_QUOTES, 'UTF-8');
$html = str_replace('&', '&', $html);
$html = str_replace(array('_lt_', '_gt_', '_amp_'), array('<', '>', '&'), $html);
$html = str_replace(array('_lt_', '_gt_', '_amp_', '_quot_'), array('<', '>', '&', '"'), $html);
if (false === $fullHTML) {
$html = '<body>' . $html . '</body>';
@ -100,15 +100,43 @@ class Html
$attributes = $node->attributes; // get all the attributes(eg: id, class)
foreach ($attributes as $attribute) {
switch ($attribute->name) {
$val = $attribute->value;
switch (strtolower($attribute->name)) {
case 'style':
$styles = self::parseStyle($attribute, $styles);
break;
case 'align':
$styles['alignment'] = self::mapAlign($attribute->value);
$styles['alignment'] = self::mapAlign(trim($val));
break;
case 'lang':
$styles['lang'] = $attribute->value;
$styles['lang'] = $val;
break;
case 'width':
// tables, cells
if (false !== strpos($val, '%')) {
// e.g. <table width="100%"> or <td width="50%">
$styles['width'] = intval($val) * 50;
$styles['unit'] = \PhpOffice\PhpWord\SimpleType\TblWidth::PERCENT;
} else {
// e.g. <table width="250> where "250" = 250px (always pixels)
$styles['width'] = Converter::pixelToTwip($val);
$styles['unit'] = \PhpOffice\PhpWord\SimpleType\TblWidth::TWIP;
}
break;
case 'cellspacing':
// tables e.g. <table cellspacing="2">, where "2" = 2px (always pixels)
$val = intval($val).'px';
$styles['cellSpacing'] = Converter::cssToTwip($val);
break;
case 'bgcolor':
// tables, rows, cells e.g. <tr bgColor="#FF0000">
$styles['bgColor'] = trim($val, '# ');
break;
case 'valign':
// cells e.g. <td valign="middle">
if (preg_match('#(?:top|bottom|middle|baseline)#i', $val, $matches)) {
$styles['valign'] = self::mapAlignVertical($matches[0]);
}
break;
}
}
@ -165,6 +193,7 @@ class Html
'img' => array('Image', $node, $element, $styles, null, null, null),
'br' => array('LineBreak', null, $element, $styles, null, null, null),
'a' => array('Link', $node, $element, $styles, null, null, null),
'hr' => array('HorizRule', $node, $element, $styles, null, null, null),
);
$newElement = null;
@ -365,7 +394,11 @@ class Html
if (!empty($colspan)) {
$cellStyles['gridSpan'] = $colspan - 0;
}
$cell = $element->addCell(null, $cellStyles);
// set cell width to control column widths
$width = isset($cellStyles['width']) ? $cellStyles['width'] : null;
unset($cellStyles['width']); // would not apply
$cell = $element->addCell($width, $cellStyles);
if (self::shouldAddTextRun($node)) {
return $cell->addTextRun(self::parseInlineStyle($node, $styles['paragraph']));
@ -424,7 +457,32 @@ class Html
} else {
$data['listdepth'] = 0;
$styles['list'] = 'listStyle_' . self::$listIndex++;
$element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
$style = $element->getPhpWord()->addNumberingStyle($styles['list'], self::getListStyle($isOrderedList));
// extract attributes start & type e.g. <ol type="A" start="3">
$start = 0;
$type = '';
foreach ($node->attributes as $attribute) {
switch ($attribute->name) {
case 'start':
$start = (int) $attribute->value;
break;
case 'type':
$type = $attribute->value;
break;
}
}
$levels = $style->getLevels();
/** @var \PhpOffice\PhpWord\Style\NumberingLevel */
$level = $levels[0];
if ($start > 0) {
$level->setStart($start);
}
$type = $type ? self::mapListType($type) : null;
if ($type) {
$level->setFormat($type);
}
}
if ($node->parentNode->nodeName === 'li') {
return $element->getParent();
@ -506,7 +564,8 @@ class Html
foreach ($properties as $property) {
list($cKey, $cValue) = array_pad(explode(':', $property, 2), 2, null);
$cValue = trim($cValue);
switch (trim($cKey)) {
$cKey = strtolower(trim($cKey));
switch ($cKey) {
case 'text-decoration':
switch ($cValue) {
case 'underline':
@ -579,11 +638,18 @@ class Html
}
$styles['italic'] = $tValue;
break;
case 'margin':
$cValue = Converter::cssToTwip($cValue);
$styles['spaceBefore'] = $cValue;
$styles['spaceAfter'] = $cValue;
break;
case 'margin-top':
$styles['spaceBefore'] = Converter::cssToPoint($cValue);
// BC change: up to ver. 0.17.0 incorrectly converted to points - Converter::cssToPoint($cValue)
$styles['spaceBefore'] = Converter::cssToTwip($cValue);
break;
case 'margin-bottom':
$styles['spaceAfter'] = Converter::cssToPoint($cValue);
// BC change: up to ver. 0.17.0 incorrectly converted to points - Converter::cssToPoint($cValue)
$styles['spaceAfter'] = Converter::cssToTwip($cValue);
break;
case 'border-color':
self::mapBorderColor($styles, $cValue);
@ -607,10 +673,37 @@ class Html
}
break;
case 'border':
if (preg_match('/([0-9]+[^0-9]*)\s+(\#[a-fA-F0-9]+)\s+([a-z]+)/', $cValue, $matches)) {
$styles['borderSize'] = Converter::cssToPoint($matches[1]);
$styles['borderColor'] = trim($matches[2], '#');
$styles['borderStyle'] = self::mapBorderStyle($matches[3]);
case 'border-top':
case 'border-bottom':
case 'border-right':
case 'border-left':
// must have exact order [width color style], e.g. "1px #0011CC solid" or "2pt green solid"
// Word does not accept shortened hex colors e.g. #CCC, only full e.g. #CCCCCC
if (preg_match('/([0-9]+[^0-9]*)\s+(\#[a-fA-F0-9]+|[a-zA-Z]+)\s+([a-z]+)/', $cValue, $matches)) {
if (false !== strpos($cKey, '-')) {
$which = explode('-', $cKey)[1];
$which = ucfirst($which); // e.g. bottom -> Bottom
} else {
$which = '';
}
// Note - border width normalization:
// Width of border in Word is calculated differently than HTML borders, usually showing up too bold.
// Smallest 1px (or 1pt) appears in Word like 2-3px/pt in HTML once converted to twips.
// Therefore we need to normalize converted twip value to cca 1/2 of value.
// This may be adjusted, if better ratio or formula found.
// BC change: up to ver. 0.17.0 was $size converted to points - Converter::cssToPoint($size)
$size = Converter::cssToTwip($matches[1]);
$size = intval($size / 2);
// valid variants may be e.g. borderSize, borderTopSize, borderLeftColor, etc ..
$styles["border{$which}Size"] = $size; // twips
$styles["border{$which}Color"] = trim($matches[2], '#');
$styles["border{$which}Style"] = self::mapBorderStyle($matches[3]);
}
break;
case 'vertical-align':
// https://developer.mozilla.org/en-US/docs/Web/CSS/vertical-align
if (preg_match('#(?:top|bottom|middle|sub|baseline)#i', $cValue, $matches)) {
$styles['valign'] = self::mapAlignVertical($matches[0]);
}
break;
}
@ -655,14 +748,14 @@ class Html
case 'float':
if (trim($v) == 'right') {
$style['hPos'] = \PhpOffice\PhpWord\Style\Image::POS_RIGHT;
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_PAGE;
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_MARGIN; // inner section area
$style['pos'] = \PhpOffice\PhpWord\Style\Image::POS_RELATIVE;
$style['wrap'] = \PhpOffice\PhpWord\Style\Image::WRAP_TIGHT;
$style['overlap'] = true;
}
if (trim($v) == 'left') {
$style['hPos'] = \PhpOffice\PhpWord\Style\Image::POS_LEFT;
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_PAGE;
$style['hPosRelTo'] = \PhpOffice\PhpWord\Style\Image::POS_RELTO_MARGIN; // inner section area
$style['pos'] = \PhpOffice\PhpWord\Style\Image::POS_RELATIVE;
$style['wrap'] = \PhpOffice\PhpWord\Style\Image::WRAP_TIGHT;
$style['overlap'] = true;
@ -777,6 +870,58 @@ class Html
}
}
/**
 * Transforms a HTML/CSS vertical alignment into a Word vertical alignment value
 *
 * Accepts values of the HTML "valign" attribute as well as the CSS "vertical-align" property.
 * Matching is case-insensitive.
 *
 * @param string $alignment e.g. "top", "middle", "bottom", "baseline", "sub", "text-top"
 * @return string "top", "center" or "bottom"; empty string when the value is not recognised
 */
protected static function mapAlignVertical($alignment)
{
    $alignment = strtolower($alignment);
    switch ($alignment) {
        case 'top':
        case 'bottom':
            // identical keyword on both sides
            return $alignment;
        case 'middle':
            return 'center';
        case 'sub':
            return 'bottom';
        case 'text-top':
        case 'baseline':
            // "baseline" must not be passed through verbatim: it is a HTML/CSS-only keyword,
            // so it is mapped to the closest Word equivalent instead
            return 'top';
        default:
            // @discuss - which one should apply:
            // - Word uses default vert. alignment: top
            // - all browsers use default vert. alignment: middle
            // Returning empty string means attribute wont be set so use Word default (top).
            return '';
    }
}
/**
 * Translate the "type" attribute of an ordered list into a numbering format
 *
 * @param string $cssListType value of the attribute, e.g. "1", "a", "A", "i" or "I"
 * @return string one of the NumberFormat constants; decimal for "1" and for any other value
 */
protected static function mapListType($cssListType)
{
    $nonDecimalFormats = array(
        'a' => NumberFormat::LOWER_LETTER, // a, b, c, ..
        'A' => NumberFormat::UPPER_LETTER, // A, B, C, ..
        'i' => NumberFormat::LOWER_ROMAN, // i, ii, iii, iv, ..
        'I' => NumberFormat::UPPER_ROMAN, // I, II, III, IV, ..
    );

    foreach ($nonDecimalFormats as $htmlType => $wordFormat) {
        // loose comparison on purpose - same matching rule a switch statement applies to its cases
        if ($cssListType == $htmlType) {
            return $wordFormat;
        }
    }

    // "1" and everything unknown
    return NumberFormat::DECIMAL; // 1, 2, 3, ..
}
/**
* Parse line break
*
@ -812,4 +957,38 @@ class Html
return $element->addLink($target, $node->textContent, $styles['font'], $styles['paragraph']);
}
/**
 * Render horizontal rule
 *
 * The rule is emitted as an empty text paragraph with a bottom border.
 * Note: Word rule is not the same as HTML's <hr> since it does not support width and thus neither alignment
 *
 * @param \DOMNode $node the <hr> node, its inline style is parsed for overrides
 * @param \PhpOffice\PhpWord\Element\AbstractContainer $element container receiving the paragraph
 */
protected static function parseHorizRule($node, $element)
{
    // Some properties may be controlled, e.g. <hr style="border-bottom: 3px #DDDDDD solid; margin-bottom: 0;">
    $styles = self::parseInlineStyle($node);

    // Fallbacks for the border when the parsed style carries no usable value.
    // NOTE(review): keys 'line-height' and 'color' are read exactly as spelled here -
    // confirm the style parser emits them under these names.
    $ruleSize = 1;
    if (!empty($styles['line-height'])) {
        $ruleSize = $styles['line-height'];
    }
    $ruleColor = '000000';
    if (!empty($styles['color'])) {
        $ruleColor = $styles['color'];
    }

    $fontDefaults = array('size' => 3);
    $paragraphDefaults = array(
        'lineHeight'        => 0.25, // multiply default line height - e.g. 1, 1.5 etc
        'spacing'           => 0, // twip
        'spaceBefore'       => 120, // twip, 240/2 (default line height)
        'spaceAfter'        => 120, // twip
        'borderBottomSize'  => $ruleSize,
        'borderBottomColor' => $ruleColor,
        'borderBottomStyle' => 'single', // same as "solid"
    );

    // Array union: keys already present in the parsed styles win (and keep their position),
    // the defaults only fill in what is missing.
    $fontStyle = $styles + $fontDefaults;
    $paragraphStyle = $styles + $paragraphDefaults;

    // <hr> is implemented as an empty paragraph - extending 100% inside the section
    $element->addText('', $fontStyle, $paragraphStyle);

    // Notes: <hr/> cannot be:
    // - table - throws error "cannot be inside textruns", e.g. lists
    // - line - that is a shape, has different behaviour
    // - repeated text, e.g. underline "_", because of unpredictable line wrapping
}
}

View File

@ -632,4 +632,274 @@ class HtmlTest extends AbstractWebServerEmbeddedTest
$this->assertTrue($doc->elementExists('/w:document/w:body/w:p/w:r/w:rPr/w:spacing'));
$this->assertEquals(150 * 15, $doc->getElement('/w:document/w:body/w:p/w:r/w:rPr/w:spacing')->getAttribute('w:val'));
}
/**
 * Widths declared on tables and cells (HTML attribute or CSS property) must reach the document,
 * which also allows for controlling column width
 */
public function testParseTableAndCellWidth()
{
    $phpWord = new \PhpOffice\PhpWord\PhpWord();
    $section = $phpWord->addSection([
        'orientation' => \PhpOffice\PhpWord\Style\Section::ORIENTATION_LANDSCAPE,
    ]);

    // borders & backgrounds are here just for better visual comparison
    $html = <<<HTML
<table style="border: 1px #000000 solid; width: 100%;">
<tr>
<td style="width: 25%; border: 1px #000000 solid; text-align: center;">25%</td>
<td>
<table width="400" style="border: 1px #000000 solid; background-color: #EEEEEE;">
<tr>
<th colspan="3" style="border: 1px #000000 solid;">400px</th>
</tr>
<tr>
<th>T2.R2.C1</th>
<th style="width: 50pt; border: 1px #FF0000 dashed; background-color: #FFFFFF">50pt</th>
<th>T2.R2.C3</th>
</tr>
<tr>
<th width="300" colspan="2" style="border: 1px #000000 solid;">300px</th>
<th style="border: 1px #000000 solid;">T2.R3.C3</th>
</tr>
</table>
</td>
</tr>
</table>
HTML;

    Html::addHtml($section, $html);
    $doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');

    // every entry: xpath of the element, expected "w:w", expected "w:type"
    $expectedWidths = array(
        // outer table grid
        array('/w:document/w:body/w:tbl/w:tblGrid/w:gridCol', 25 * 50, 'dxa'),
        // <td style="width: 25%; ...
        array('/w:document/w:body/w:tbl/w:tr/w:tc/w:tcPr/w:tcW', 25 * 50, 'pct'),
        // <table width="400" .. 400px = 6000 twips (400 / 96 * 1440)
        array('/w:document/w:body/w:tbl/w:tr/w:tc/w:tbl/w:tr/w:tc/w:tcPr/w:tcW', 6000, 'dxa'),
        // <th style="width: 50pt; .. 50pt = 1000 twips (50 / 72 * 1440)
        array('/w:document/w:body/w:tbl/w:tr/w:tc/w:tbl/w:tr[2]/w:tc[2]/w:tcPr/w:tcW', 1000, 'dxa'),
        // <th width="300" .. 300px = 4500 twips (300 / 96 * 1440)
        array('/w:document/w:body/w:tbl/w:tr/w:tc/w:tbl/w:tr[3]/w:tc/w:tcPr/w:tcW', 4500, 'dxa'),
    );

    foreach ($expectedWidths as $expectation) {
        $xpath = $expectation[0];
        $this->assertTrue($doc->elementExists($xpath));
        $this->assertEquals($expectation[1], $doc->getElement($xpath)->getAttribute('w:w'));
        $this->assertEquals($expectation[2], $doc->getElement($xpath)->getAttribute('w:type'));
    }
}
/**
 * The "cellspacing" attribute of a table and the "bgcolor" attribute of tables / rows must be parsed
 */
public function testParseCellspacingRowBgColor()
{
    $phpWord = new \PhpOffice\PhpWord\PhpWord();
    $section = $phpWord->addSection([
        'orientation' => \PhpOffice\PhpWord\Style\Section::ORIENTATION_LANDSCAPE,
    ]);

    // attribute names deliberately use mixed case (bgColor vs. bgcolor)
    $html = <<<HTML
<table cellspacing="3" bgColor="lightgreen" width="50%" align="center">
<tr>
<td>A</td>
<td>B</td>
</tr>
<tr bgcolor="#FF0000">
<td>C</td>
<td>D</td>
</tr>
</table>
HTML;

    Html::addHtml($section, $html);
    $doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');

    // cellspacing="3" is expected as 3 * 15 in "dxa" units
    $spacingPath = '/w:document/w:body/w:tbl/w:tblPr/w:tblCellSpacing';
    $this->assertTrue($doc->elementExists($spacingPath));
    $this->assertEquals(3 * 15, $doc->getElement($spacingPath)->getAttribute('w:w'));
    $this->assertEquals('dxa', $doc->getElement($spacingPath)->getAttribute('w:type'));

    // first cell of each row: row 1 carries the table colour, row 2 its own colour without "#"
    $expectedFills = array(
        1 => 'lightgreen',
        2 => 'FF0000',
    );
    foreach ($expectedFills as $rowNumber => $fill) {
        $shadingPath = sprintf('/w:document/w:body/w:tbl/w:tr[%d]/w:tc[1]/w:tcPr/w:shd', $rowNumber);
        $this->assertTrue($doc->elementExists($shadingPath));
        $this->assertEquals($fill, $doc->getElement($shadingPath)->getAttribute('w:fill'));
    }
}
/**
 * A <hr> must be rendered as a paragraph with a bottom border, with and without inline style
 */
public function testParseHorizRule()
{
    $phpWord = new \PhpOffice\PhpWord\PhpWord();
    $section = $phpWord->addSection();

    // one rule relying on defaults, one overriding margins and border
    $html = <<<HTML
<p>Simple default rule:</p>
<hr/>
<p>Custom style rule:</p>
<hr style="margin-top: 30px; margin-bottom: 0; border-bottom: 5px lightblue solid;" />
<p>END</p>
HTML;

    Html::addHtml($section, $html);
    $doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');

    // xpath => expected attribute values of that element
    $expectedAttributes = array(
        // default rule
        '/w:document/w:body/w:p[2]/w:pPr/w:pBdr/w:bottom' => array(
            'w:val'   => 'single', // solid
            'w:sz'    => '1',
            'w:color' => '000000', // black
        ),
        // custom style rule
        '/w:document/w:body/w:p[4]/w:pPr/w:pBdr/w:bottom' => array(
            'w:val'   => 'single',
            'w:sz'    => intval(5 * 15 / 2),
            'w:color' => 'lightblue',
        ),
        // margins of the custom style rule
        '/w:document/w:body/w:p[4]/w:pPr/w:spacing' => array(
            'w:before' => 450,
            'w:after'  => 0,
            'w:line'   => 240,
        ),
    );

    foreach ($expectedAttributes as $xpath => $attributes) {
        $this->assertTrue($doc->elementExists($xpath));
        foreach ($attributes as $attributeName => $expectedValue) {
            $this->assertEquals($expectedValue, $doc->getElement($xpath)->getAttribute($attributeName));
        }
    }
}
/**
 * The "start" and "type" attributes of <ol> must control first number and numbering format
 */
public function testParseOrderedList()
{
    $phpWord = new \PhpOffice\PhpWord\PhpWord();
    $section = $phpWord->addSection();

    // three lists: defaults, upper letters starting at E, lower roman starting at iii
    $html = <<<HTML
<ol>
<li>standard ordered list line 1</li>
<li>standard ordered list line 2</li>
</ol>
<ol start="5" type="A">
<li>ordered list alphabetical, <span style="background-color: #EEEEEE; color: #FF0000;">line 5 => E</span></li>
<li>ordered list alphabetical, <span style="background-color: #EEEEEE; color: #FF0000;">line 6 => F</span></li>
</ol>
<ol start="3" type="i">
<li>ordered list roman lower, line <b>3 => iii</b></li>
<li>ordered list roman lower, line <b>4 => iv</b></li>
</ol>
HTML;

    Html::addHtml($section, $html);
    $doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');

    // compare numbering file
    $xmlFile = 'word/numbering.xml';

    // position of the numbering definition => expected start value and number format
    $expectedLists = array(
        1 => array('1', 'decimal'),     // default - decimal start = 1
        2 => array('5', 'upperLetter'), // second list - start = 5, type A = upperLetter
        3 => array('3', 'lowerRoman'),  // third list - start = 3, type i = lowerRoman
    );

    foreach ($expectedLists as $position => $expected) {
        $firstLevel = sprintf('/w:numbering/w:abstractNum[%d]/w:lvl[1]', $position);

        $startPath = $firstLevel . '/w:start';
        $this->assertTrue($doc->elementExists($startPath, $xmlFile));
        $this->assertEquals($expected[0], $doc->getElement($startPath, $xmlFile)->getAttribute('w:val'));

        $formatPath = $firstLevel . '/w:numFmt';
        $this->assertTrue($doc->elementExists($formatPath, $xmlFile));
        $this->assertEquals($expected[1], $doc->getElement($formatPath, $xmlFile)->getAttribute('w:val'));
    }
}
/**
 * Parse vertical alignment of table cells, given as CSS "vertical-align" or as "valign" attribute
 */
public function testParseVerticalAlign()
{
    $phpWord = new \PhpOffice\PhpWord\PhpWord();
    $section = $phpWord->addSection();

    // borders & backgrounds are here just for better visual comparison,
    // the last cell only stretches the row so that the alignment becomes visible
    $html = <<<HTML
<table width="100%">
<tr>
<td width="20%" style="border: 1px #666666 solid;">default</td>
<td width="20%" style="vertical-align: top; border: 1px #666666 solid;">top</td>
<td width="20%" style="vertical-align: middle; border: 1px #666666 solid;">middle</td>
<td width="20%" valign="bottom" style="border: 1px #666666 solid;">bottom</td>
<td bgcolor="#DDDDDD"><br/><br/><br/><br/><br/><br/><br/></td>
</tr>
</table>
HTML;

    Html::addHtml($section, $html);
    $doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');

    $cellAlignPath = '/w:document/w:body/w:tbl/w:tr/w:tc[%d]/w:tcPr/w:vAlign';

    // cell without any alignment must not get the element at all
    $this->assertFalse($doc->elementExists(sprintf($cellAlignPath, 1)));

    // cell position => expected alignment
    $expectedAlignments = array(
        2 => 'top',
        3 => 'center',
        4 => 'bottom',
    );
    foreach ($expectedAlignments as $cellNumber => $alignment) {
        $xpath = sprintf($cellAlignPath, $cellNumber);
        $this->assertTrue($doc->elementExists($xpath));
        $this->assertEquals($alignment, $doc->getElement($xpath)->getAttribute('w:val'));
    }
}
/**
 * Fix bug - don't decode double quotes inside double quoted string
 *
 * Passes as soon as the document can be generated from the HTML below.
 */
public function testDontDecodeAlreadyEncodedDoubleQuotes()
{
    $phpWord = new \PhpOffice\PhpWord\PhpWord();
    $section = $phpWord->addSection();

    // attribute value containing already encoded double quotes (&quot;)
    $html = <<<HTML
<div style="font-family: Arial, &quot;Helvetice Neue&quot;">This would crash if inline quotes also decoded at loading XML into DOMDocument!</div>
HTML;

    Html::addHtml($section, $html);

    $doc = TestHelperDOCX::getDocument($phpWord, 'Word2007');
    $documentWasBuilt = is_object($doc);
    $this->assertTrue($documentWasBuilt);
}
}