diff --git a/src/PhpSpreadsheet/Reader/Html.php b/src/PhpSpreadsheet/Reader/Html.php index 3d859e15..76f128e0 100644 --- a/src/PhpSpreadsheet/Reader/Html.php +++ b/src/PhpSpreadsheet/Reader/Html.php @@ -201,7 +201,7 @@ class Html extends BaseReader /** * Loads Spreadsheet from file. */ - protected function loadSpreadsheetFromFile(string $filename): Spreadsheet + public function loadSpreadsheetFromFile(string $filename): Spreadsheet { // Create new Spreadsheet $spreadsheet = new Spreadsheet(); @@ -651,7 +651,13 @@ class Html extends BaseReader // Reload the HTML file into the DOM object try { $convert = $this->securityScanner->scanFile($filename); - $loaded = $dom->loadHTML($convert); + $lowend = "\u{80}"; + $highend = "\u{10ffff}"; + $regexp = "/[$lowend-$highend]/u"; + /** @var callable */ + $callback = [self::class, 'replaceNonAscii']; + $convert = preg_replace_callback($regexp, $callback, $convert); + $loaded = ($convert === null) ? false : $dom->loadHTML($convert); } catch (Throwable $e) { $loaded = false; } @@ -662,6 +668,11 @@ class Html extends BaseReader return $this->loadDocument($dom, $spreadsheet); } + private static function replaceNonAscii(array $matches): string + { + return '&#' . mb_ord($matches[0], 'UTF-8') . ';'; + } + /** * Spreadsheet from content. * @@ -674,7 +685,13 @@ class Html extends BaseReader // Reload the HTML file into the DOM object try { $convert = $this->securityScanner->scan($content); - $loaded = $dom->loadHTML($convert); + $lowend = "\u{80}"; + $highend = "\u{10ffff}"; + $regexp = "/[$lowend-$highend]/u"; + /** @var callable */ + $callback = [self::class, 'replaceNonAscii']; + $convert = preg_replace_callback($regexp, $callback, $convert); + $loaded = ($convert === null) ? false : $dom->loadHTML($convert); } catch (Throwable $e) { $loaded = false; } diff --git a/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php b/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php index cf4157e3..fe0117ca 100644 --- a/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php +++ b/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php @@ -13,7 +13,7 @@ class HtmlImageTest extends TestCase $html = ' - +
test imagetest image voilà
'; $filename = HtmlHelper::createHtml($html); @@ -24,7 +24,7 @@ class HtmlImageTest extends TestCase $drawing = $firstSheet->getDrawingCollection()[0]; self::assertEquals($imagePath, $drawing->getPath()); self::assertEquals('A1', $drawing->getCoordinates()); - self::assertEquals('test image', $drawing->getName()); + self::assertEquals('test image voilà', $drawing->getName()); self::assertEquals('100', $drawing->getWidth()); self::assertEquals('100', $drawing->getHeight()); } diff --git a/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php b/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php new file mode 100644 index 00000000..3a41805c --- /dev/null +++ b/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php @@ -0,0 +1,36 @@ +éàâèî'; + $reader = new Html(); + $spreadsheet = $reader->loadFromString($content); + $sheet = $spreadsheet->getActiveSheet(); + self::assertSame('éàâèî', $sheet->getCell('A1')->getValue()); + } + + public function testLoadFromFile(): void + { + $file = 'tests/data/Reader/HTML/utf8chars.html'; + $reader = new Html(); + $spreadsheet = $reader->loadSpreadsheetFromFile($file); + $sheet = $spreadsheet->getActiveSheet(); + self::assertSame('Test Utf-8 characters voilà', $sheet->getTitle()); + self::assertSame('éàâèî', $sheet->getCell('A1')->getValue()); + self::assertSame('αβγδε', $sheet->getCell('B1')->getValue()); + self::assertSame('𐐁𐐂𐐃 & だけち', $sheet->getCell('A2')->getValue()); + self::assertSame('אבגדה', $sheet->getCell('B2')->getValue()); + self::assertSame('𪔀𪔁𪔂', $sheet->getCell('C2')->getValue()); + self::assertSame('᠐᠑᠒', $sheet->getCell('A3')->getValue()); + self::assertSame('അആ', $sheet->getCell('B3')->getValue()); + self::assertSame('กขฃ', $sheet->getCell('C3')->getValue()); + self::assertSame('✀✐✠', $sheet->getCell('D3')->getValue()); + } +} diff --git a/tests/data/Reader/HTML/utf8chars.html b/tests/data/Reader/HTML/utf8chars.html new file mode 100644 index 00000000..8d58c798 --- /dev/null +++ b/tests/data/Reader/HTML/utf8chars.html @@ -0,0 +1,28 @@ + + + + +Test Utf-8 characters voilà + + + + + + + + + + + + + + + + + + + + +
éàâèîαβγδε
𐐁𐐂𐐃 & だけちאבגדה𪔀𪔁𪔂
᠐᠑᠒അആกขฃ✀✐✠
+ +