From c3f53854b61f71f0d80ea449cbc3245514fa200d Mon Sep 17 00:00:00 2001 From: oleibman <10341515+oleibman@users.noreply.github.com> Date: Sat, 2 Jul 2022 08:53:39 -0700 Subject: [PATCH] Php/iconv Should Not Treat FFFE/FFFF as Valid (#2910) Fix #2897. We have been relying on iconv/mb_convert_encoding to detect invalid UTF-8, but all techniques designed to validate UTF-8 seem to accept FFFE and FFFF. This PR explicitly converts those characters to FFFD (Unicode substitution character) before validating the rest of the string. It also substitutes one or more FFFD when it detects invalid UTF-8 character sequences. A comment in the code being change stated that it doesn't handle surrogates. It is right not to do so. The only case where we should see surrogates is reading UTF-16. Additional tests are added to an existing test reading a UTF-16 Csv to demonstrate that surrogates are handled correctly, and that FFFE/FFFF are handled reasonably. --- phpstan-baseline.neon | 30 ------------ src/PhpSpreadsheet/Shared/StringHelper.php | 44 ++++++++++++------ .../Reader/Csv/CsvEncodingTest.php | 19 ++++++++ .../Shared/StringHelperInvalidCharTest.php | 44 ++++++++++++++++++ .../Shared/StringHelperTest.php | 29 ++++++++++++ tests/data/Reader/CSV/premiere.utf16le.csv | Bin 112 -> 128 bytes 6 files changed, 121 insertions(+), 45 deletions(-) create mode 100644 tests/PhpSpreadsheetTests/Shared/StringHelperInvalidCharTest.php diff --git a/phpstan-baseline.neon b/phpstan-baseline.neon index 14160edf..04bdf273 100644 --- a/phpstan-baseline.neon +++ b/phpstan-baseline.neon @@ -2785,36 +2785,6 @@ parameters: count: 1 path: src/PhpSpreadsheet/Shared/OLERead.php - - - message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:formatNumber\\(\\) should return string but returns array\\|string\\.$#" - count: 1 - path: src/PhpSpreadsheet/Shared/StringHelper.php - - - - message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:sanitizeUTF8\\(\\) should return string but returns string\\|false\\.$#" - count: 1 - path: src/PhpSpreadsheet/Shared/StringHelper.php - - - - message: "#^Parameter \\#1 \\$string of function strlen expects string, float given\\.$#" - count: 1 - path: src/PhpSpreadsheet/Shared/StringHelper.php - - - - message: "#^Parameter \\#3 \\$subject of function str_replace expects array\\|string, float given\\.$#" - count: 1 - path: src/PhpSpreadsheet/Shared/StringHelper.php - - - - message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$decimalSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#" - count: 1 - path: src/PhpSpreadsheet/Shared/StringHelper.php - - - - message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$thousandsSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#" - count: 1 - path: src/PhpSpreadsheet/Shared/StringHelper.php - - message: "#^Static method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\TimeZone\\:\\:validateTimeZone\\(\\) is unused\\.$#" count: 1 diff --git a/src/PhpSpreadsheet/Shared/StringHelper.php b/src/PhpSpreadsheet/Shared/StringHelper.php index 2ccb2424..c16de9ce 100644 --- a/src/PhpSpreadsheet/Shared/StringHelper.php +++ b/src/PhpSpreadsheet/Shared/StringHelper.php @@ -3,13 +3,14 @@ namespace PhpOffice\PhpSpreadsheet\Shared; use PhpOffice\PhpSpreadsheet\Calculation\Calculation; +use UConverter; class StringHelper { /** Constants */ /** Regular Expressions */ // Fraction - const STRING_REGEXP_FRACTION = '(-?)(\d+)\s+(\d+\/\d+)'; + const STRING_REGEXP_FRACTION = '~^\s*(-?)((\d*)\s+)?(\d+\/\d+)\s*$~'; /** * Control characters array. @@ -28,14 +29,14 @@ class StringHelper /** * Decimal separator. * - * @var string + * @var ?string */ private static $decimalSeparator; /** * Thousands separator. * - * @var string + * @var ?string */ private static $thousandsSeparator; @@ -328,19 +329,31 @@ class StringHelper } /** - * Try to sanitize UTF8, stripping invalid byte sequences. Not perfect. Does not surrogate characters. + * Try to sanitize UTF8, replacing invalid sequences with Unicode substitution characters. */ public static function sanitizeUTF8(string $textValue): string { + $textValue = str_replace(["\xef\xbf\xbe", "\xef\xbf\xbf"], "\xef\xbf\xbd", $textValue); + if (class_exists(UConverter::class)) { + $returnValue = UConverter::transcode($textValue, 'UTF-8', 'UTF-8'); + if ($returnValue !== false) { + return $returnValue; + } + } + // @codeCoverageIgnoreStart + // I don't think any of the code below should ever be executed. if (self::getIsIconvEnabled()) { - $textValue = @iconv('UTF-8', 'UTF-8', $textValue); - - return $textValue; + $returnValue = @iconv('UTF-8', 'UTF-8', $textValue); + if ($returnValue !== false) { + return $returnValue; + } } - $textValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8'); + // Phpstan does not think this can return false. + $returnValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8'); - return $textValue; + return $returnValue; + // @codeCoverageIgnoreEnd } /** @@ -348,19 +361,19 @@ class StringHelper */ public static function isUTF8(string $textValue): bool { - return $textValue === '' || preg_match('/^./su', $textValue) === 1; + return $textValue === self::sanitizeUTF8($textValue); } /** * Formats a numeric value as a string for output in various output writers forcing * point as decimal separator in case locale is other than English. * - * @param mixed $numericValue + * @param float|int|string $numericValue */ public static function formatNumber($numericValue): string { if (is_float($numericValue)) { - return str_replace(',', '.', $numericValue); + return str_replace(',', '.', (string) $numericValue); } return (string) $numericValue; @@ -537,9 +550,10 @@ class StringHelper */ public static function convertToNumberIfFraction(string &$operand): bool { - if (preg_match('/^' . self::STRING_REGEXP_FRACTION . '$/i', $operand, $match)) { + if (preg_match(self::STRING_REGEXP_FRACTION, $operand, $match)) { $sign = ($match[1] == '-') ? '-' : '+'; - $fractionFormula = '=' . $sign . $match[2] . $sign . $match[3]; + $wholePart = ($match[3] === '') ? '' : ($sign . $match[3]); + $fractionFormula = '=' . $wholePart . $sign . $match[4]; $operand = Calculation::getInstance()->_calculateFormulaValue($fractionFormula); return true; @@ -686,6 +700,6 @@ class StringHelper } $v = (float) $textValue; - return (is_numeric(substr($textValue, 0, strlen($v)))) ? $v : $textValue; + return (is_numeric(substr($textValue, 0, strlen((string) $v)))) ? $v : $textValue; } } diff --git a/tests/PhpSpreadsheetTests/Reader/Csv/CsvEncodingTest.php b/tests/PhpSpreadsheetTests/Reader/Csv/CsvEncodingTest.php index 448d3d1e..0cd79853 100644 --- a/tests/PhpSpreadsheetTests/Reader/Csv/CsvEncodingTest.php +++ b/tests/PhpSpreadsheetTests/Reader/Csv/CsvEncodingTest.php @@ -66,6 +66,25 @@ class CsvEncodingTest extends TestCase self::assertEquals('sixième', $sheet->getCell('C2')->getValue()); } + public function testSurrogate(): void + { + // Surrogates should occur only in UTF-16, and should + // be properly converted to UTF8 when read. + // FFFE/FFFF are illegal, and should be converted to + // substitution character when read. + // Excel does not handle any of the cells in row 3 well. + // LibreOffice handles A3 fine, and discards B3/C3, + // which is a reasonable action. + $filename = 'tests/data/Reader/CSV/premiere.utf16le.csv'; + $reader = new Csv(); + $reader->setInputEncoding(Csv::guessEncoding($filename)); + $spreadsheet = $reader->load($filename); + $sheet = $spreadsheet->getActiveSheet(); + self::assertEquals('𐐀', $sheet->getCell('A3')->getValue()); + self::assertEquals('�', $sheet->getCell('B3')->getValue()); + self::assertEquals('�', $sheet->getCell('C3')->getValue()); + } + /** * @dataProvider providerGuessEncoding */ diff --git a/tests/PhpSpreadsheetTests/Shared/StringHelperInvalidCharTest.php b/tests/PhpSpreadsheetTests/Shared/StringHelperInvalidCharTest.php new file mode 100644 index 00000000..54679499 --- /dev/null +++ b/tests/PhpSpreadsheetTests/Shared/StringHelperInvalidCharTest.php @@ -0,0 +1,44 @@ +getActiveSheet(); + $substitution = '�'; + $array = [ + ['Normal string', 'Hello', 'Hello'], + ['integer', 2, 2], + ['float', 2.1, 2.1], + ['boolean true', true, true], + ['illegal FFFE/FFFF', "H\xef\xbf\xbe\xef\xbf\xbfello", "H{$substitution}{$substitution}ello"], + ['illegal character', "H\xef\x00\x00ello", "H{$substitution}\x00\x00ello"], + ['overlong character', "H\xc0\xa0ello", "H{$substitution}{$substitution}ello"], + ['Osmanya as single character', "H\xf0\x90\x90\x80ello", 'H𐐀ello'], + ['Osmanya as surrogate pair (x)', "\xed\xa0\x81\xed\xb0\x80", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"], + ['Osmanya as surrogate pair (u)', "\u{d801}\u{dc00}", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"], + ['Half surrogate pair (u)', "\u{d801}", "{$substitution}{$substitution}{$substitution}"], + ['Control character', "\u{7}", "\u{7}"], + ]; + + $sheet->fromArray($array); + $row = 0; + foreach ($array as $value) { + self::assertSame($value[1] === $value[2], StringHelper::isUTF8((string) $value[1])); + ++$row; + $expected = $value[2]; + self::assertSame( + $expected, + $sheet->getCell("B$row")->getValue(), + $sheet->getCell("A$row")->getValue() + ); + } + } +} diff --git a/tests/PhpSpreadsheetTests/Shared/StringHelperTest.php b/tests/PhpSpreadsheetTests/Shared/StringHelperTest.php index bcbcd5d2..85a92613 100644 --- a/tests/PhpSpreadsheetTests/Shared/StringHelperTest.php +++ b/tests/PhpSpreadsheetTests/Shared/StringHelperTest.php @@ -119,4 +119,33 @@ class StringHelperTest extends TestCase self::assertEquals($expectedResult, $result); } + + /** + * @dataProvider providerFractions + */ + public function testFraction(string $expected, string $value): void + { + $originalValue = $value; + $result = StringHelper::convertToNumberIfFraction($value); + if ($result === false) { + self::assertSame($expected, $originalValue); + self::assertSame($expected, $value); + } else { + self::assertSame($expected, (string) $value); + self::assertNotEquals($value, $originalValue); + } + } + + public function providerFractions(): array + { + return [ + 'non-fraction' => ['1', '1'], + 'common fraction' => ['1.5', '1 1/2'], + 'fraction between -1 and 0' => ['-0.5', '-1/2'], + 'fraction between -1 and 0 with space' => ['-0.5', ' - 1/2'], + 'fraction between 0 and 1' => ['0.75', '3/4 '], + 'fraction between 0 and 1 with space' => ['0.75', ' 3/4'], + 'improper fraction' => ['1.75', '7/4'], + ]; + } } diff --git a/tests/data/Reader/CSV/premiere.utf16le.csv b/tests/data/Reader/CSV/premiere.utf16le.csv index a5bb1ff12e771e8628bf3c52930311bffbb3ca94..4ca3c3e4d36cc53c72741b69e4b31a94dfd18b91 100644 GIT binary patch delta 22 ecmXSDV4P4Oz<7h