Html Reader Not Handling non-ASCII Data Correctly (#2943)
* Html Reader Not Handling non-ASCII Data Correctly. Fix #2942. Code was changed by #2894 because PHP 8.2 will deprecate how it was being done. See linked issue for more details. DOMDocument::loadHTML assumes ISO-8859-1 in the absence of a charset attribute or equivalent, and there is no way to override that assumption. Sigh. The suggested replacements are unsuitable in one way or another. I think this will work with minimal disruption (replace ampersand, less than, and greater than with entities representing illegal characters, then use htmlentities, then restore ampersand, less than, and greater than). * Better Implementation. Use regexp to escape non-ASCII. Less kludgy, less reliant on the vagaries of the PHP maintainers. * Additional Tests. Test non-ASCII outside of cell contents: sheet title, image alt attribute. * Apply Same Change in Second Location. Forgot to change loadFromString. * Additional Test. Confirm escaped ampersand is handled correctly.
This commit is contained in:
parent
db57af0c7f
commit
5de82981d8
|
|
@ -201,7 +201,7 @@ class Html extends BaseReader
|
|||
/**
|
||||
* Loads Spreadsheet from file.
|
||||
*/
|
||||
protected function loadSpreadsheetFromFile(string $filename): Spreadsheet
|
||||
public function loadSpreadsheetFromFile(string $filename): Spreadsheet
|
||||
{
|
||||
// Create new Spreadsheet
|
||||
$spreadsheet = new Spreadsheet();
|
||||
|
|
@ -651,7 +651,13 @@ class Html extends BaseReader
|
|||
// Reload the HTML file into the DOM object
|
||||
try {
|
||||
$convert = $this->securityScanner->scanFile($filename);
|
||||
$loaded = $dom->loadHTML($convert);
|
||||
$lowend = "\u{80}";
|
||||
$highend = "\u{10ffff}";
|
||||
$regexp = "/[$lowend-$highend]/u";
|
||||
/** @var callable */
|
||||
$callback = [self::class, 'replaceNonAscii'];
|
||||
$convert = preg_replace_callback($regexp, $callback, $convert);
|
||||
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
|
||||
} catch (Throwable $e) {
|
||||
$loaded = false;
|
||||
}
|
||||
|
|
@ -662,6 +668,11 @@ class Html extends BaseReader
|
|||
return $this->loadDocument($dom, $spreadsheet);
|
||||
}
|
||||
|
||||
/**
 * Regex-replacement callback: rewrites one matched character as a decimal
 * HTML numeric character reference (for example "é" becomes "&#233;").
 *
 * @param array $matches match data handed over by preg_replace_callback;
 *                       only element 0 (the full match) is read
 *
 * @return string the reference text, in the form "&#<code point>;"
 */
private static function replaceNonAscii(array $matches): string
{
    // Element 0 holds the text matched by the whole pattern.
    $codePoint = mb_ord($matches[0], 'UTF-8');

    return "&#{$codePoint};";
}
|
||||
|
||||
/**
|
||||
* Spreadsheet from content.
|
||||
*
|
||||
|
|
@ -674,7 +685,13 @@ class Html extends BaseReader
|
|||
// Reload the HTML file into the DOM object
|
||||
try {
|
||||
$convert = $this->securityScanner->scan($content);
|
||||
$loaded = $dom->loadHTML($convert);
|
||||
$lowend = "\u{80}";
|
||||
$highend = "\u{10ffff}";
|
||||
$regexp = "/[$lowend-$highend]/u";
|
||||
/** @var callable */
|
||||
$callback = [self::class, 'replaceNonAscii'];
|
||||
$convert = preg_replace_callback($regexp, $callback, $convert);
|
||||
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
|
||||
} catch (Throwable $e) {
|
||||
$loaded = false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ class HtmlImageTest extends TestCase
|
|||
|
||||
$html = '<table>
|
||||
<tr>
|
||||
<td><img src="' . $imagePath . '" alt="test image"></td>
|
||||
<td><img src="' . $imagePath . '" alt="test image voilà"></td>
|
||||
</tr>
|
||||
</table>';
|
||||
$filename = HtmlHelper::createHtml($html);
|
||||
|
|
@ -24,7 +24,7 @@ class HtmlImageTest extends TestCase
|
|||
$drawing = $firstSheet->getDrawingCollection()[0];
|
||||
self::assertEquals($imagePath, $drawing->getPath());
|
||||
self::assertEquals('A1', $drawing->getCoordinates());
|
||||
self::assertEquals('test image', $drawing->getName());
|
||||
self::assertEquals('test image voilà', $drawing->getName());
|
||||
self::assertEquals('100', $drawing->getWidth());
|
||||
self::assertEquals('100', $drawing->getHeight());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,36 @@
|
|||
<?php
|
||||
|
||||
namespace PhpOffice\PhpSpreadsheetTests\Reader\Html;
|
||||
|
||||
use PhpOffice\PhpSpreadsheet\Reader\Html;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
/**
 * Regression tests for issue 2942: non-ASCII text read by the Html reader
 * must come back exactly as it was written in the HTML input.
 */
class Issue2942Test extends TestCase
{
    /**
     * Accented Latin text passed in as an HTML string ends up unchanged in A1.
     */
    public function testLoadFromString(): void
    {
        $html = '<table><tbody><tr><td>éàâèî</td></tr></tbody></table>';
        $htmlReader = new Html();
        $worksheet = $htmlReader->loadFromString($html)->getActiveSheet();

        self::assertSame('éàâèî', $worksheet->getCell('A1')->getValue());
    }

    /**
     * The sheet title and every cell read from the fixture file keep their
     * original characters, including characters outside the BMP.
     */
    public function testLoadFromFile(): void
    {
        $fixturePath = 'tests/data/Reader/HTML/utf8chars.html';
        $htmlReader = new Html();
        $worksheet = $htmlReader->loadSpreadsheetFromFile($fixturePath)->getActiveSheet();

        self::assertSame('Test Utf-8 characters voilà', $worksheet->getTitle());

        // Cell coordinate => text expected in that cell, checked in this order.
        $expectedByCell = [
            'A1' => 'éàâèî',
            'B1' => 'αβγδε',
            'A2' => '𐐁𐐂𐐃 & だけち',
            'B2' => 'אבגדה',
            'C2' => '𪔀𪔁𪔂',
            'A3' => '᠐᠑᠒',
            'B3' => 'അആ',
            'C3' => 'กขฃ',
            'D3' => '✀✐✠',
        ];
        foreach ($expectedByCell as $coordinate => $expectedText) {
            self::assertSame($expectedText, $worksheet->getCell($coordinate)->getValue());
        }
    }
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<!-- deliberately do not identify charset for this test -->
|
||||
<title>Test Utf-8 characters voilà</title>
|
||||
</head>
|
||||
<body>
|
||||
<table>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>éàâèî</td><!-- Latin1 -->
|
||||
<td>αβγδε</td><!-- Greek -->
|
||||
</tr>
|
||||
<tr>
|
||||
<td>𐐁𐐂𐐃 & だけち</td><!-- Deseret (not in BMP) and Hiragana -->
|
||||
<td>אבגדה</td><!-- Hebrew -->
|
||||
<td>𪔀𪔁𪔂</td><!-- CJK Unified Ideographs Extension B (not in BMP) -->
|
||||
</tr>
|
||||
<tr>
|
||||
<td>᠐᠑᠒</td><!-- Mongolian -->
|
||||
<td>അആ</td><!-- Malayalam -->
|
||||
<td>กขฃ</td><!-- Thai -->
|
||||
<td>✀✐✠</td><!-- Dingbats -->
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
Loading…
Reference in New Issue