From 5de82981d8583c7a8503ae41b16978f72eb4e82c Mon Sep 17 00:00:00 2001 From: oleibman <10341515+oleibman@users.noreply.github.com> Date: Sat, 16 Jul 2022 22:08:44 -0700 Subject: [PATCH] Html Reader Not Handling non-ASCII Data Correctly (#2943) * Html Reader Not Handling non-ASCII Data Correctly Fix #2942. Code was changed by #2894 because PHP8.2 will deprecate how it was being done. See linked issue for more details. Dom loadhtml assumes ISO-8859-1 in the absence of a charset attribute or equivalent, and there is no way to override that assumption. Sigh. The suggested replacements are unsuitable in one way or another. I think this will work with minimal disruption (replace ampersand, less than, and greater than with entities representing illegal characters, then use htmlentities, then restore ampersand, less than, and greater than). * Better Implementation Use regexp to escape non-ASCII. Less kludgey, less reliant on the vagaries of the PHP maintainers. * Additional Tests Test non-ASCII outside of cell contents: sheet title, image alt attribute. * Apply Same Change in Second Location Forgot to change loadFromString. * Additional Test Confirm escaped ampersand is handled correctly. --- src/PhpSpreadsheet/Reader/Html.php | 23 ++++++++++-- .../Reader/Html/HtmlImageTest.php | 4 +-- .../Reader/Html/Issue2942Test.php | 36 +++++++++++++++++++ tests/data/Reader/HTML/utf8chars.html | 28 +++++++++++++++ 4 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php create mode 100644 tests/data/Reader/HTML/utf8chars.html diff --git a/src/PhpSpreadsheet/Reader/Html.php b/src/PhpSpreadsheet/Reader/Html.php index 3d859e15..76f128e0 100644 --- a/src/PhpSpreadsheet/Reader/Html.php +++ b/src/PhpSpreadsheet/Reader/Html.php @@ -201,7 +201,7 @@ class Html extends BaseReader /** * Loads Spreadsheet from file. */ - protected function loadSpreadsheetFromFile(string $filename): Spreadsheet + public function loadSpreadsheetFromFile(string $filename): Spreadsheet { // Create new Spreadsheet $spreadsheet = new Spreadsheet(); @@ -651,7 +651,13 @@ class Html extends BaseReader // Reload the HTML file into the DOM object try { $convert = $this->securityScanner->scanFile($filename); - $loaded = $dom->loadHTML($convert); + $lowend = "\u{80}"; + $highend = "\u{10ffff}"; + $regexp = "/[$lowend-$highend]/u"; + /** @var callable */ + $callback = [self::class, 'replaceNonAscii']; + $convert = preg_replace_callback($regexp, $callback, $convert); + $loaded = ($convert === null) ? false : $dom->loadHTML($convert); } catch (Throwable $e) { $loaded = false; } @@ -662,6 +668,11 @@ class Html extends BaseReader return $this->loadDocument($dom, $spreadsheet); } + private static function replaceNonAscii(array $matches): string + { + return '&#' . mb_ord($matches[0], 'UTF-8') . ';'; + } + /** * Spreadsheet from content. * @@ -674,7 +685,13 @@ class Html extends BaseReader // Reload the HTML file into the DOM object try { $convert = $this->securityScanner->scan($content); - $loaded = $dom->loadHTML($convert); + $lowend = "\u{80}"; + $highend = "\u{10ffff}"; + $regexp = "/[$lowend-$highend]/u"; + /** @var callable */ + $callback = [self::class, 'replaceNonAscii']; + $convert = preg_replace_callback($regexp, $callback, $convert); + $loaded = ($convert === null) ? false : $dom->loadHTML($convert); } catch (Throwable $e) { $loaded = false; } diff --git a/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php b/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php index cf4157e3..fe0117ca 100644 --- a/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php +++ b/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php @@ -13,7 +13,7 @@ class HtmlImageTest extends TestCase $html = ' - +
test imagetest image voilà
'; $filename = HtmlHelper::createHtml($html); @@ -24,7 +24,7 @@ class HtmlImageTest extends TestCase $drawing = $firstSheet->getDrawingCollection()[0]; self::assertEquals($imagePath, $drawing->getPath()); self::assertEquals('A1', $drawing->getCoordinates()); - self::assertEquals('test image', $drawing->getName()); + self::assertEquals('test image voilà', $drawing->getName()); self::assertEquals('100', $drawing->getWidth()); self::assertEquals('100', $drawing->getHeight()); } diff --git a/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php b/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php new file mode 100644 index 00000000..3a41805c --- /dev/null +++ b/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php @@ -0,0 +1,36 @@ +éàâèî'; + $reader = new Html(); + $spreadsheet = $reader->loadFromString($content); + $sheet = $spreadsheet->getActiveSheet(); + self::assertSame('éàâèî', $sheet->getCell('A1')->getValue()); + } + + public function testLoadFromFile(): void + { + $file = 'tests/data/Reader/HTML/utf8chars.html'; + $reader = new Html(); + $spreadsheet = $reader->loadSpreadsheetFromFile($file); + $sheet = $spreadsheet->getActiveSheet(); + self::assertSame('Test Utf-8 characters voilà', $sheet->getTitle()); + self::assertSame('éàâèî', $sheet->getCell('A1')->getValue()); + self::assertSame('αβγδε', $sheet->getCell('B1')->getValue()); + self::assertSame('𐐁𐐂𐐃 & だけち', $sheet->getCell('A2')->getValue()); + self::assertSame('אבגדה', $sheet->getCell('B2')->getValue()); + self::assertSame('𪔀𪔁𪔂', $sheet->getCell('C2')->getValue()); + self::assertSame('᠐᠑᠒', $sheet->getCell('A3')->getValue()); + self::assertSame('അആ', $sheet->getCell('B3')->getValue()); + self::assertSame('กขฃ', $sheet->getCell('C3')->getValue()); + self::assertSame('✀✐✠', $sheet->getCell('D3')->getValue()); + } +} diff --git a/tests/data/Reader/HTML/utf8chars.html b/tests/data/Reader/HTML/utf8chars.html new file mode 100644 index 00000000..8d58c798 --- /dev/null +++ b/tests/data/Reader/HTML/utf8chars.html @@ -0,0 +1,28 @@ + + + + +Test Utf-8 characters voilà + + + + + + + + + + + + + + + + + + + + +
éàâèîαβγδε
𐐁𐐂𐐃 & だけちאבגדה𪔀𪔁𪔂
᠐᠑᠒അആกขฃ✀✐✠
+ +