diff --git a/src/PhpSpreadsheet/Reader/Html.php b/src/PhpSpreadsheet/Reader/Html.php index 3d859e15..76f128e0 100644 --- a/src/PhpSpreadsheet/Reader/Html.php +++ b/src/PhpSpreadsheet/Reader/Html.php @@ -201,7 +201,7 @@ class Html extends BaseReader /** * Loads Spreadsheet from file. */ - protected function loadSpreadsheetFromFile(string $filename): Spreadsheet + public function loadSpreadsheetFromFile(string $filename): Spreadsheet { // Create new Spreadsheet $spreadsheet = new Spreadsheet(); @@ -651,7 +651,13 @@ class Html extends BaseReader // Reload the HTML file into the DOM object try { $convert = $this->securityScanner->scanFile($filename); - $loaded = $dom->loadHTML($convert); + $lowend = "\u{80}"; + $highend = "\u{10ffff}"; + $regexp = "/[$lowend-$highend]/u"; + /** @var callable */ + $callback = [self::class, 'replaceNonAscii']; + $convert = preg_replace_callback($regexp, $callback, $convert); + $loaded = ($convert === null) ? false : $dom->loadHTML($convert); } catch (Throwable $e) { $loaded = false; } @@ -662,6 +668,11 @@ class Html extends BaseReader return $this->loadDocument($dom, $spreadsheet); } + private static function replaceNonAscii(array $matches): string + { + return '&#' . mb_ord($matches[0], 'UTF-8') . ';'; + } + /** * Spreadsheet from content. * @@ -674,7 +685,13 @@ class Html extends BaseReader // Reload the HTML file into the DOM object try { $convert = $this->securityScanner->scan($content); - $loaded = $dom->loadHTML($convert); + $lowend = "\u{80}"; + $highend = "\u{10ffff}"; + $regexp = "/[$lowend-$highend]/u"; + /** @var callable */ + $callback = [self::class, 'replaceNonAscii']; + $convert = preg_replace_callback($regexp, $callback, $convert); + $loaded = ($convert === null) ? 
false : $dom->loadHTML($convert); } catch (Throwable $e) { $loaded = false; } diff --git a/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php b/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php index cf4157e3..fe0117ca 100644 --- a/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php +++ b/tests/PhpSpreadsheetTests/Reader/Html/HtmlImageTest.php @@ -13,7 +13,7 @@ class HtmlImageTest extends TestCase $html = '
| éàâèî | +αβγδε | +||
| 𐐁𐐂𐐃 & だけち | +אבגדה | +𪔀𪔁𪔂 | +|
| ᠐᠑᠒ | +അആ | +กขฃ | +✀✐✠ | +