Html Reader Not Handling non-ASCII Data Correctly (#2943)

* Html Reader Not Handling non-ASCII Data Correctly

Fix #2942. Code was changed by #2894 because PHP 8.2 deprecates the way it was being done. See the linked issue for more details. DOMDocument::loadHTML assumes ISO-8859-1 in the absence of a charset attribute or equivalent, and there is no way to override that assumption. Sigh. The suggested replacements are each unsuitable in one way or another. I think this will work with minimal disruption (replace ampersand, less-than, and greater-than with entities representing illegal characters, then use htmlentities, then restore ampersand, less-than, and greater-than).

* Better Implementation

Use a regular expression to replace each non-ASCII character with a numeric character reference. Less kludgey, less reliant on the vagaries of the PHP maintainers.

* Additional Tests

Test non-ASCII outside of cell contents: sheet title, image alt attribute.

* Apply Same Change in Second Location

Forgot to change loadFromString.

* Additional Test

Confirm escaped ampersand is handled correctly.
This commit is contained in:
oleibman 2022-07-16 22:08:44 -07:00 committed by GitHub
parent db57af0c7f
commit 5de82981d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 86 additions and 5 deletions

View File

@ -201,7 +201,7 @@ class Html extends BaseReader
/** /**
* Loads Spreadsheet from file. * Loads Spreadsheet from file.
*/ */
protected function loadSpreadsheetFromFile(string $filename): Spreadsheet public function loadSpreadsheetFromFile(string $filename): Spreadsheet
{ {
// Create new Spreadsheet // Create new Spreadsheet
$spreadsheet = new Spreadsheet(); $spreadsheet = new Spreadsheet();
@ -651,7 +651,13 @@ class Html extends BaseReader
// Reload the HTML file into the DOM object // Reload the HTML file into the DOM object
try { try {
$convert = $this->securityScanner->scanFile($filename); $convert = $this->securityScanner->scanFile($filename);
$loaded = $dom->loadHTML($convert); $lowend = "\u{80}";
$highend = "\u{10ffff}";
$regexp = "/[$lowend-$highend]/u";
/** @var callable */
$callback = [self::class, 'replaceNonAscii'];
$convert = preg_replace_callback($regexp, $callback, $convert);
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
} catch (Throwable $e) { } catch (Throwable $e) {
$loaded = false; $loaded = false;
} }
@ -662,6 +668,11 @@ class Html extends BaseReader
return $this->loadDocument($dom, $spreadsheet); return $this->loadDocument($dom, $spreadsheet);
} }
/**
 * preg_replace_callback handler that turns one matched character into a
 * decimal HTML numeric character reference (for example "é" becomes "&#233;").
 *
 * @param array $matches match array; element 0 holds the matched UTF-8 character
 *
 * @return string the reference in the form "&#<code point>;"
 */
private static function replaceNonAscii(array $matches): string
{
    $codePoint = mb_ord($matches[0], 'UTF-8');

    return "&#{$codePoint};";
}
/** /**
* Spreadsheet from content. * Spreadsheet from content.
* *
@ -674,7 +685,13 @@ class Html extends BaseReader
// Reload the HTML file into the DOM object // Reload the HTML file into the DOM object
try { try {
$convert = $this->securityScanner->scan($content); $convert = $this->securityScanner->scan($content);
$loaded = $dom->loadHTML($convert); $lowend = "\u{80}";
$highend = "\u{10ffff}";
$regexp = "/[$lowend-$highend]/u";
/** @var callable */
$callback = [self::class, 'replaceNonAscii'];
$convert = preg_replace_callback($regexp, $callback, $convert);
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
} catch (Throwable $e) { } catch (Throwable $e) {
$loaded = false; $loaded = false;
} }

View File

@ -13,7 +13,7 @@ class HtmlImageTest extends TestCase
$html = '<table> $html = '<table>
<tr> <tr>
<td><img src="' . $imagePath . '" alt="test image"></td> <td><img src="' . $imagePath . '" alt="test image voilà"></td>
</tr> </tr>
</table>'; </table>';
$filename = HtmlHelper::createHtml($html); $filename = HtmlHelper::createHtml($html);
@ -24,7 +24,7 @@ class HtmlImageTest extends TestCase
$drawing = $firstSheet->getDrawingCollection()[0]; $drawing = $firstSheet->getDrawingCollection()[0];
self::assertEquals($imagePath, $drawing->getPath()); self::assertEquals($imagePath, $drawing->getPath());
self::assertEquals('A1', $drawing->getCoordinates()); self::assertEquals('A1', $drawing->getCoordinates());
self::assertEquals('test image', $drawing->getName()); self::assertEquals('test image voilà', $drawing->getName());
self::assertEquals('100', $drawing->getWidth()); self::assertEquals('100', $drawing->getWidth());
self::assertEquals('100', $drawing->getHeight()); self::assertEquals('100', $drawing->getHeight());
} }

View File

@ -0,0 +1,36 @@
<?php
namespace PhpOffice\PhpSpreadsheetTests\Reader\Html;
use PhpOffice\PhpSpreadsheet\Reader\Html;
use PHPUnit\Framework\TestCase;
/**
 * Regression tests for issue 2942: the Html reader must keep non-ASCII
 * (UTF-8) text intact when the markup does not declare a charset.
 */
class Issue2942Test extends TestCase
{
    /**
     * Non-ASCII cell text supplied as an in-memory HTML string is read back unchanged.
     */
    public function testLoadFromString(): void
    {
        $markup = '<table><tbody><tr><td>éàâèî</td></tr></tbody></table>';
        $worksheet = (new Html())->loadFromString($markup)->getActiveSheet();

        self::assertSame('éàâèî', $worksheet->getCell('A1')->getValue());
    }

    /**
     * The utf8chars.html fixture is read back unchanged: sheet title first,
     * then every populated cell in the same order as the fixture rows.
     */
    public function testLoadFromFile(): void
    {
        $htmlReader = new Html();
        $worksheet = $htmlReader
            ->loadSpreadsheetFromFile('tests/data/Reader/HTML/utf8chars.html')
            ->getActiveSheet();

        self::assertSame('Test Utf-8 characters voilà', $worksheet->getTitle());

        // Cell coordinate => text expected after loading.
        $expectedByCell = [
            'A1' => 'éàâèî',
            'B1' => 'αβγδε',
            'A2' => '𐐁𐐂𐐃 & だけち', // fixture stores the ampersand as &amp;
            'B2' => 'אבגדה',
            'C2' => '𪔀𪔁𪔂',
            'A3' => '᠐᠑᠒',
            'B3' => 'അആ',
            'C3' => 'กขฃ',
            'D3' => '✀✐✠',
        ];
        foreach ($expectedByCell as $coordinate => $expectedValue) {
            self::assertSame($expectedValue, $worksheet->getCell($coordinate)->getValue());
        }
    }
}

View File

@ -0,0 +1,28 @@
<!DOCTYPE html>
<html>
<head>
<!-- Deliberately omit any charset declaration: this fixture checks that UTF-8 content is still read correctly when the document does not identify its encoding. -->
<title>Test Utf-8 characters voilà</title>
</head>
<body>
<table>
<tbody>
<tr>
<td>éàâèî</td><!-- Latin1 -->
<td>αβγδε</td><!-- Greek -->
</tr>
<tr>
<td>𐐁𐐂𐐃 &amp; だけち</td><!-- Deseret (not in BMP) and Hiragana, separated by an escaped ampersand -->
<td>אבגדה</td><!-- Hebrew -->
<td>𪔀𪔁𪔂</td><!-- CJK Unified Ideographs Extension B (not in BMP) -->
</tr>
<tr>
<td>᠐᠑᠒</td><!-- Mongolian -->
<td>അആ</td><!-- Malayalam -->
<td>กขฃ</td><!-- Thai -->
<td>✀✐✠</td><!-- Dingbats -->
</tr>
</tbody>
</table>
</body>
</html>