Php/iconv Should Not Treat FFFE/FFFF as Valid (#2910)

Fix #2897. We have been relying on iconv/mb_convert_encoding to detect invalid UTF-8, but all techniques designed to validate UTF-8 seem to accept FFFE and FFFF. This PR explicitly converts those characters to FFFD (Unicode substitution character) before validating the rest of the string. It also substitutes one or more FFFD when it detects invalid UTF-8 character sequences.

A comment in the code being changed stated that it doesn't handle surrogates. It is right not to do so. The only case where we should see surrogates is reading UTF-16. Additional tests are added to an existing test reading a UTF-16 Csv to demonstrate that surrogates are handled correctly, and that FFFE/FFFF are handled reasonably.
This commit is contained in:
oleibman 2022-07-02 08:53:39 -07:00 committed by GitHub
parent f90adcf28e
commit c3f53854b6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 121 additions and 45 deletions

View File

@ -2785,36 +2785,6 @@ parameters:
count: 1 count: 1
path: src/PhpSpreadsheet/Shared/OLERead.php path: src/PhpSpreadsheet/Shared/OLERead.php
-
message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:formatNumber\\(\\) should return string but returns array\\|string\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php
-
message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:sanitizeUTF8\\(\\) should return string but returns string\\|false\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php
-
message: "#^Parameter \\#1 \\$string of function strlen expects string, float given\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php
-
message: "#^Parameter \\#3 \\$subject of function str_replace expects array\\|string, float given\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php
-
message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$decimalSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php
-
message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$thousandsSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#"
count: 1
path: src/PhpSpreadsheet/Shared/StringHelper.php
- -
message: "#^Static method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\TimeZone\\:\\:validateTimeZone\\(\\) is unused\\.$#" message: "#^Static method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\TimeZone\\:\\:validateTimeZone\\(\\) is unused\\.$#"
count: 1 count: 1

View File

@ -3,13 +3,14 @@
namespace PhpOffice\PhpSpreadsheet\Shared; namespace PhpOffice\PhpSpreadsheet\Shared;
use PhpOffice\PhpSpreadsheet\Calculation\Calculation; use PhpOffice\PhpSpreadsheet\Calculation\Calculation;
use UConverter;
class StringHelper class StringHelper
{ {
/** Constants */ /** Constants */
/** Regular Expressions */ /** Regular Expressions */
// Fraction // Fraction
const STRING_REGEXP_FRACTION = '(-?)(\d+)\s+(\d+\/\d+)'; const STRING_REGEXP_FRACTION = '~^\s*(-?)((\d*)\s+)?(\d+\/\d+)\s*$~';
/** /**
* Control characters array. * Control characters array.
@ -28,14 +29,14 @@ class StringHelper
/** /**
* Decimal separator. * Decimal separator.
* *
* @var string * @var ?string
*/ */
private static $decimalSeparator; private static $decimalSeparator;
/** /**
* Thousands separator. * Thousands separator.
* *
* @var string * @var ?string
*/ */
private static $thousandsSeparator; private static $thousandsSeparator;
@ -328,19 +329,31 @@ class StringHelper
} }
/** /**
* Try to sanitize UTF8, stripping invalid byte sequences. Not perfect. Does not surrogate characters. * Try to sanitize UTF8, replacing invalid sequences with Unicode substitution characters.
*/ */
public static function sanitizeUTF8(string $textValue): string public static function sanitizeUTF8(string $textValue): string
{ {
$textValue = str_replace(["\xef\xbf\xbe", "\xef\xbf\xbf"], "\xef\xbf\xbd", $textValue);
if (class_exists(UConverter::class)) {
$returnValue = UConverter::transcode($textValue, 'UTF-8', 'UTF-8');
if ($returnValue !== false) {
return $returnValue;
}
}
// @codeCoverageIgnoreStart
// I don't think any of the code below should ever be executed.
if (self::getIsIconvEnabled()) { if (self::getIsIconvEnabled()) {
$textValue = @iconv('UTF-8', 'UTF-8', $textValue); $returnValue = @iconv('UTF-8', 'UTF-8', $textValue);
if ($returnValue !== false) {
return $textValue; return $returnValue;
}
} }
$textValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8'); // Phpstan does not think this can return false.
$returnValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8');
return $textValue; return $returnValue;
// @codeCoverageIgnoreEnd
} }
/** /**
@ -348,19 +361,19 @@ class StringHelper
*/ */
public static function isUTF8(string $textValue): bool public static function isUTF8(string $textValue): bool
{ {
return $textValue === '' || preg_match('/^./su', $textValue) === 1; return $textValue === self::sanitizeUTF8($textValue);
} }
/** /**
* Formats a numeric value as a string for output in various output writers forcing * Formats a numeric value as a string for output in various output writers forcing
* point as decimal separator in case locale is other than English. * point as decimal separator in case locale is other than English.
* *
* @param mixed $numericValue * @param float|int|string $numericValue
*/ */
public static function formatNumber($numericValue): string public static function formatNumber($numericValue): string
{ {
if (is_float($numericValue)) { if (is_float($numericValue)) {
return str_replace(',', '.', $numericValue); return str_replace(',', '.', (string) $numericValue);
} }
return (string) $numericValue; return (string) $numericValue;
@ -537,9 +550,10 @@ class StringHelper
*/ */
public static function convertToNumberIfFraction(string &$operand): bool public static function convertToNumberIfFraction(string &$operand): bool
{ {
if (preg_match('/^' . self::STRING_REGEXP_FRACTION . '$/i', $operand, $match)) { if (preg_match(self::STRING_REGEXP_FRACTION, $operand, $match)) {
$sign = ($match[1] == '-') ? '-' : '+'; $sign = ($match[1] == '-') ? '-' : '+';
$fractionFormula = '=' . $sign . $match[2] . $sign . $match[3]; $wholePart = ($match[3] === '') ? '' : ($sign . $match[3]);
$fractionFormula = '=' . $wholePart . $sign . $match[4];
$operand = Calculation::getInstance()->_calculateFormulaValue($fractionFormula); $operand = Calculation::getInstance()->_calculateFormulaValue($fractionFormula);
return true; return true;
@ -686,6 +700,6 @@ class StringHelper
} }
$v = (float) $textValue; $v = (float) $textValue;
return (is_numeric(substr($textValue, 0, strlen($v)))) ? $v : $textValue; return (is_numeric(substr($textValue, 0, strlen((string) $v)))) ? $v : $textValue;
} }
} }

View File

@ -66,6 +66,25 @@ class CsvEncodingTest extends TestCase
self::assertEquals('sixième', $sheet->getCell('C2')->getValue()); self::assertEquals('sixième', $sheet->getCell('C2')->getValue());
} }
/**
 * Loads the UTF-16LE CSV fixture and checks the values read back from row 3:
 * A3 must come back as a single supplementary-plane character, while B3 and C3
 * must come back as the Unicode substitution character U+FFFD.
 */
public function testSurrogate(): void
{
    // Surrogates should occur only in UTF-16, and should
    // be properly converted to UTF8 when read.
    // FFFE/FFFF are illegal, and should be converted to
    // substitution character when read.
    // Excel does not handle any of the cells in row 3 well.
    // LibreOffice handles A3 fine, and discards B3/C3,
    // which is a reasonable action.
    $filename = 'tests/data/Reader/CSV/premiere.utf16le.csv';
    $reader = new Csv();
    $reader->setInputEncoding(Csv::guessEncoding($filename));
    $spreadsheet = $reader->load($filename);
    $sheet = $spreadsheet->getActiveSheet();

    // U+FFFD is spelled as an escape sequence so that the expectation cannot be
    // mangled by editors or tools that fail to render the character itself.
    $substitution = "\u{FFFD}";

    self::assertEquals('𐐀', $sheet->getCell('A3')->getValue());
    self::assertEquals($substitution, $sheet->getCell('B3')->getValue());
    self::assertEquals($substitution, $sheet->getCell('C3')->getValue());
}
/** /**
* @dataProvider providerGuessEncoding * @dataProvider providerGuessEncoding
*/ */

View File

@ -0,0 +1,44 @@
<?php
namespace PhpOffice\PhpSpreadsheetTests\Shared;
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
use PhpOffice\PhpSpreadsheet\Spreadsheet;
use PHPUnit\Framework\TestCase;
/**
 * Checks how invalid UTF-8 input is treated, both by StringHelper::isUTF8()
 * and when the same values are stored in and read back from worksheet cells.
 */
class StringHelperInvalidCharTest extends TestCase
{
    /**
     * Each entry of the data table is [label, input value, expected cell value].
     *
     * For every entry the test asserts that:
     *  - StringHelper::isUTF8() reports true exactly when input and expected
     *    value are identical (i.e. nothing had to be substituted);
     *  - the cell in column B of the matching row holds the expected value,
     *    using the label stored in column A as the failure message.
     */
    public function testInvalidChar(): void
    {
        $spreadsheet = new Spreadsheet();
        $sheet = $spreadsheet->getActiveSheet();

        // U+FFFD, written as an escape sequence so the expectation cannot be
        // corrupted by tools that are unable to render the character itself.
        $substitution = "\u{FFFD}";

        // NOTE(review): the "Osmanya" labels are kept as originally written;
        // the code point used (U+10400) appears to belong to the Deseret block.
        $array = [
            ['Normal string', 'Hello', 'Hello'],
            ['integer', 2, 2],
            ['float', 2.1, 2.1],
            ['boolean true', true, true],
            ['illegal FFFE/FFFF', "H\xef\xbf\xbe\xef\xbf\xbfello", "H{$substitution}{$substitution}ello"],
            ['illegal character', "H\xef\x00\x00ello", "H{$substitution}\x00\x00ello"],
            ['overlong character', "H\xc0\xa0ello", "H{$substitution}{$substitution}ello"],
            ['Osmanya as single character', "H\xf0\x90\x90\x80ello", 'H𐐀ello'],
            ['Osmanya as surrogate pair (x)', "\xed\xa0\x81\xed\xb0\x80", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
            ['Osmanya as surrogate pair (u)', "\u{d801}\u{dc00}", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
            ['Half surrogate pair (u)', "\u{d801}", "{$substitution}{$substitution}{$substitution}"],
            ['Control character', "\u{7}", "\u{7}"],
        ];
        $sheet->fromArray($array);

        // Worksheet rows are 1-based, so the counter is advanced before it is used.
        $row = 0;
        foreach ($array as $value) {
            self::assertSame($value[1] === $value[2], StringHelper::isUTF8((string) $value[1]));
            ++$row;
            $expected = $value[2];
            self::assertSame(
                $expected,
                $sheet->getCell("B$row")->getValue(),
                $sheet->getCell("A$row")->getValue()
            );
        }
    }
}

View File

@ -119,4 +119,33 @@ class StringHelperTest extends TestCase
self::assertEquals($expectedResult, $result); self::assertEquals($expectedResult, $result);
} }
/**
 * Feeds one text value to StringHelper::convertToNumberIfFraction() and checks
 * both the reported outcome and the (by-reference) converted operand.
 *
 * @dataProvider providerFractions
 */
public function testFraction(string $expected, string $value): void
{
    // Keep a copy: the helper receives $value by reference and may overwrite it.
    $untouched = $value;

    if (StringHelper::convertToNumberIfFraction($value)) {
        // Converted: the operand now holds the result and differs from the input.
        self::assertSame($expected, (string) $value);
        self::assertNotEquals($value, $untouched);

        return;
    }

    // Not converted: the operand must have been left exactly as supplied.
    self::assertSame($expected, $untouched);
    self::assertSame($expected, $value);
}
/**
 * Data sets for testFraction(), keyed by description.
 *
 * Each data set is [expected string result, input text].
 */
public function providerFractions(): array
{
    $dataSets = [];

    // Input that is not a fraction.
    $dataSets['non-fraction'] = ['1', '1'];

    // Whole part followed by a fraction.
    $dataSets['common fraction'] = ['1.5', '1 1/2'];

    // Bare fractions, with and without surrounding or embedded whitespace.
    $dataSets['fraction between -1 and 0'] = ['-0.5', '-1/2'];
    $dataSets['fraction between -1 and 0 with space'] = ['-0.5', ' - 1/2'];
    $dataSets['fraction between 0 and 1'] = ['0.75', '3/4 '];
    $dataSets['fraction between 0 and 1 with space'] = ['0.75', ' 3/4'];
    $dataSets['improper fraction'] = ['1.75', '7/4'];

    return $dataSets;
}
} }