Php/iconv Should Not Treat FFFE/FFFF as Valid (#2910)
Fix #2897. We have been relying on iconv/mb_convert_encoding to detect invalid UTF-8, but all techniques designed to validate UTF-8 seem to accept FFFE and FFFF. This PR explicitly converts those characters to FFFD (Unicode substitution character) before validating the rest of the string. It also substitutes one or more FFFD when it detects invalid UTF-8 character sequences. A comment in the code being change stated that it doesn't handle surrogates. It is right not to do so. The only case where we should see surrogates is reading UTF-16. Additional tests are added to an existing test reading a UTF-16 Csv to demonstrate that surrogates are handled correctly, and that FFFE/FFFF are handled reasonably.
This commit is contained in:
parent
f90adcf28e
commit
c3f53854b6
|
|
@ -2785,36 +2785,6 @@ parameters:
|
||||||
count: 1
|
count: 1
|
||||||
path: src/PhpSpreadsheet/Shared/OLERead.php
|
path: src/PhpSpreadsheet/Shared/OLERead.php
|
||||||
|
|
||||||
-
|
|
||||||
message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:formatNumber\\(\\) should return string but returns array\\|string\\.$#"
|
|
||||||
count: 1
|
|
||||||
path: src/PhpSpreadsheet/Shared/StringHelper.php
|
|
||||||
|
|
||||||
-
|
|
||||||
message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:sanitizeUTF8\\(\\) should return string but returns string\\|false\\.$#"
|
|
||||||
count: 1
|
|
||||||
path: src/PhpSpreadsheet/Shared/StringHelper.php
|
|
||||||
|
|
||||||
-
|
|
||||||
message: "#^Parameter \\#1 \\$string of function strlen expects string, float given\\.$#"
|
|
||||||
count: 1
|
|
||||||
path: src/PhpSpreadsheet/Shared/StringHelper.php
|
|
||||||
|
|
||||||
-
|
|
||||||
message: "#^Parameter \\#3 \\$subject of function str_replace expects array\\|string, float given\\.$#"
|
|
||||||
count: 1
|
|
||||||
path: src/PhpSpreadsheet/Shared/StringHelper.php
|
|
||||||
|
|
||||||
-
|
|
||||||
message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$decimalSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#"
|
|
||||||
count: 1
|
|
||||||
path: src/PhpSpreadsheet/Shared/StringHelper.php
|
|
||||||
|
|
||||||
-
|
|
||||||
message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$thousandsSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#"
|
|
||||||
count: 1
|
|
||||||
path: src/PhpSpreadsheet/Shared/StringHelper.php
|
|
||||||
|
|
||||||
-
|
-
|
||||||
message: "#^Static method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\TimeZone\\:\\:validateTimeZone\\(\\) is unused\\.$#"
|
message: "#^Static method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\TimeZone\\:\\:validateTimeZone\\(\\) is unused\\.$#"
|
||||||
count: 1
|
count: 1
|
||||||
|
|
|
||||||
|
|
@ -3,13 +3,14 @@
|
||||||
namespace PhpOffice\PhpSpreadsheet\Shared;
|
namespace PhpOffice\PhpSpreadsheet\Shared;
|
||||||
|
|
||||||
use PhpOffice\PhpSpreadsheet\Calculation\Calculation;
|
use PhpOffice\PhpSpreadsheet\Calculation\Calculation;
|
||||||
|
use UConverter;
|
||||||
|
|
||||||
class StringHelper
|
class StringHelper
|
||||||
{
|
{
|
||||||
/** Constants */
|
/** Constants */
|
||||||
/** Regular Expressions */
|
/** Regular Expressions */
|
||||||
// Fraction
|
// Fraction
|
||||||
const STRING_REGEXP_FRACTION = '(-?)(\d+)\s+(\d+\/\d+)';
|
const STRING_REGEXP_FRACTION = '~^\s*(-?)((\d*)\s+)?(\d+\/\d+)\s*$~';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Control characters array.
|
* Control characters array.
|
||||||
|
|
@ -28,14 +29,14 @@ class StringHelper
|
||||||
/**
|
/**
|
||||||
* Decimal separator.
|
* Decimal separator.
|
||||||
*
|
*
|
||||||
* @var string
|
* @var ?string
|
||||||
*/
|
*/
|
||||||
private static $decimalSeparator;
|
private static $decimalSeparator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Thousands separator.
|
* Thousands separator.
|
||||||
*
|
*
|
||||||
* @var string
|
* @var ?string
|
||||||
*/
|
*/
|
||||||
private static $thousandsSeparator;
|
private static $thousandsSeparator;
|
||||||
|
|
||||||
|
|
@ -328,19 +329,31 @@ class StringHelper
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Try to sanitize UTF8, stripping invalid byte sequences. Not perfect. Does not surrogate characters.
|
* Try to sanitize UTF8, replacing invalid sequences with Unicode substitution characters.
|
||||||
*/
|
*/
|
||||||
public static function sanitizeUTF8(string $textValue): string
|
public static function sanitizeUTF8(string $textValue): string
|
||||||
{
|
{
|
||||||
|
$textValue = str_replace(["\xef\xbf\xbe", "\xef\xbf\xbf"], "\xef\xbf\xbd", $textValue);
|
||||||
|
if (class_exists(UConverter::class)) {
|
||||||
|
$returnValue = UConverter::transcode($textValue, 'UTF-8', 'UTF-8');
|
||||||
|
if ($returnValue !== false) {
|
||||||
|
return $returnValue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// @codeCoverageIgnoreStart
|
||||||
|
// I don't think any of the code below should ever be executed.
|
||||||
if (self::getIsIconvEnabled()) {
|
if (self::getIsIconvEnabled()) {
|
||||||
$textValue = @iconv('UTF-8', 'UTF-8', $textValue);
|
$returnValue = @iconv('UTF-8', 'UTF-8', $textValue);
|
||||||
|
if ($returnValue !== false) {
|
||||||
return $textValue;
|
return $returnValue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$textValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8');
|
// Phpstan does not think this can return false.
|
||||||
|
$returnValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8');
|
||||||
|
|
||||||
return $textValue;
|
return $returnValue;
|
||||||
|
// @codeCoverageIgnoreEnd
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -348,19 +361,19 @@ class StringHelper
|
||||||
*/
|
*/
|
||||||
public static function isUTF8(string $textValue): bool
|
public static function isUTF8(string $textValue): bool
|
||||||
{
|
{
|
||||||
return $textValue === '' || preg_match('/^./su', $textValue) === 1;
|
return $textValue === self::sanitizeUTF8($textValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Formats a numeric value as a string for output in various output writers forcing
|
* Formats a numeric value as a string for output in various output writers forcing
|
||||||
* point as decimal separator in case locale is other than English.
|
* point as decimal separator in case locale is other than English.
|
||||||
*
|
*
|
||||||
* @param mixed $numericValue
|
* @param float|int|string $numericValue
|
||||||
*/
|
*/
|
||||||
public static function formatNumber($numericValue): string
|
public static function formatNumber($numericValue): string
|
||||||
{
|
{
|
||||||
if (is_float($numericValue)) {
|
if (is_float($numericValue)) {
|
||||||
return str_replace(',', '.', $numericValue);
|
return str_replace(',', '.', (string) $numericValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
return (string) $numericValue;
|
return (string) $numericValue;
|
||||||
|
|
@ -537,9 +550,10 @@ class StringHelper
|
||||||
*/
|
*/
|
||||||
public static function convertToNumberIfFraction(string &$operand): bool
|
public static function convertToNumberIfFraction(string &$operand): bool
|
||||||
{
|
{
|
||||||
if (preg_match('/^' . self::STRING_REGEXP_FRACTION . '$/i', $operand, $match)) {
|
if (preg_match(self::STRING_REGEXP_FRACTION, $operand, $match)) {
|
||||||
$sign = ($match[1] == '-') ? '-' : '+';
|
$sign = ($match[1] == '-') ? '-' : '+';
|
||||||
$fractionFormula = '=' . $sign . $match[2] . $sign . $match[3];
|
$wholePart = ($match[3] === '') ? '' : ($sign . $match[3]);
|
||||||
|
$fractionFormula = '=' . $wholePart . $sign . $match[4];
|
||||||
$operand = Calculation::getInstance()->_calculateFormulaValue($fractionFormula);
|
$operand = Calculation::getInstance()->_calculateFormulaValue($fractionFormula);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -686,6 +700,6 @@ class StringHelper
|
||||||
}
|
}
|
||||||
$v = (float) $textValue;
|
$v = (float) $textValue;
|
||||||
|
|
||||||
return (is_numeric(substr($textValue, 0, strlen($v)))) ? $v : $textValue;
|
return (is_numeric(substr($textValue, 0, strlen((string) $v)))) ? $v : $textValue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,25 @@ class CsvEncodingTest extends TestCase
|
||||||
self::assertEquals('sixième', $sheet->getCell('C2')->getValue());
|
self::assertEquals('sixième', $sheet->getCell('C2')->getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function testSurrogate(): void
|
||||||
|
{
|
||||||
|
// Surrogates should occur only in UTF-16, and should
|
||||||
|
// be properly converted to UTF8 when read.
|
||||||
|
// FFFE/FFFF are illegal, and should be converted to
|
||||||
|
// substitution character when read.
|
||||||
|
// Excel does not handle any of the cells in row 3 well.
|
||||||
|
// LibreOffice handles A3 fine, and discards B3/C3,
|
||||||
|
// which is a reasonable action.
|
||||||
|
$filename = 'tests/data/Reader/CSV/premiere.utf16le.csv';
|
||||||
|
$reader = new Csv();
|
||||||
|
$reader->setInputEncoding(Csv::guessEncoding($filename));
|
||||||
|
$spreadsheet = $reader->load($filename);
|
||||||
|
$sheet = $spreadsheet->getActiveSheet();
|
||||||
|
self::assertEquals('𐐀', $sheet->getCell('A3')->getValue());
|
||||||
|
self::assertEquals('<27>', $sheet->getCell('B3')->getValue());
|
||||||
|
self::assertEquals('<27>', $sheet->getCell('C3')->getValue());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @dataProvider providerGuessEncoding
|
* @dataProvider providerGuessEncoding
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace PhpOffice\PhpSpreadsheetTests\Shared;
|
||||||
|
|
||||||
|
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
|
||||||
|
use PhpOffice\PhpSpreadsheet\Spreadsheet;
|
||||||
|
use PHPUnit\Framework\TestCase;
|
||||||
|
|
||||||
|
class StringHelperInvalidCharTest extends TestCase
|
||||||
|
{
|
||||||
|
public function testInvalidChar(): void
|
||||||
|
{
|
||||||
|
$spreadsheet = new Spreadsheet();
|
||||||
|
$sheet = $spreadsheet->getActiveSheet();
|
||||||
|
$substitution = '<27>';
|
||||||
|
$array = [
|
||||||
|
['Normal string', 'Hello', 'Hello'],
|
||||||
|
['integer', 2, 2],
|
||||||
|
['float', 2.1, 2.1],
|
||||||
|
['boolean true', true, true],
|
||||||
|
['illegal FFFE/FFFF', "H\xef\xbf\xbe\xef\xbf\xbfello", "H{$substitution}{$substitution}ello"],
|
||||||
|
['illegal character', "H\xef\x00\x00ello", "H{$substitution}\x00\x00ello"],
|
||||||
|
['overlong character', "H\xc0\xa0ello", "H{$substitution}{$substitution}ello"],
|
||||||
|
['Osmanya as single character', "H\xf0\x90\x90\x80ello", 'H𐐀ello'],
|
||||||
|
['Osmanya as surrogate pair (x)', "\xed\xa0\x81\xed\xb0\x80", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
|
||||||
|
['Osmanya as surrogate pair (u)', "\u{d801}\u{dc00}", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
|
||||||
|
['Half surrogate pair (u)', "\u{d801}", "{$substitution}{$substitution}{$substitution}"],
|
||||||
|
['Control character', "\u{7}", "\u{7}"],
|
||||||
|
];
|
||||||
|
|
||||||
|
$sheet->fromArray($array);
|
||||||
|
$row = 0;
|
||||||
|
foreach ($array as $value) {
|
||||||
|
self::assertSame($value[1] === $value[2], StringHelper::isUTF8((string) $value[1]));
|
||||||
|
++$row;
|
||||||
|
$expected = $value[2];
|
||||||
|
self::assertSame(
|
||||||
|
$expected,
|
||||||
|
$sheet->getCell("B$row")->getValue(),
|
||||||
|
$sheet->getCell("A$row")->getValue()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -119,4 +119,33 @@ class StringHelperTest extends TestCase
|
||||||
|
|
||||||
self::assertEquals($expectedResult, $result);
|
self::assertEquals($expectedResult, $result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @dataProvider providerFractions
|
||||||
|
*/
|
||||||
|
public function testFraction(string $expected, string $value): void
|
||||||
|
{
|
||||||
|
$originalValue = $value;
|
||||||
|
$result = StringHelper::convertToNumberIfFraction($value);
|
||||||
|
if ($result === false) {
|
||||||
|
self::assertSame($expected, $originalValue);
|
||||||
|
self::assertSame($expected, $value);
|
||||||
|
} else {
|
||||||
|
self::assertSame($expected, (string) $value);
|
||||||
|
self::assertNotEquals($value, $originalValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public function providerFractions(): array
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
'non-fraction' => ['1', '1'],
|
||||||
|
'common fraction' => ['1.5', '1 1/2'],
|
||||||
|
'fraction between -1 and 0' => ['-0.5', '-1/2'],
|
||||||
|
'fraction between -1 and 0 with space' => ['-0.5', ' - 1/2'],
|
||||||
|
'fraction between 0 and 1' => ['0.75', '3/4 '],
|
||||||
|
'fraction between 0 and 1 with space' => ['0.75', ' 3/4'],
|
||||||
|
'improper fraction' => ['1.75', '7/4'],
|
||||||
|
];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Binary file not shown.
Loading…
Reference in New Issue