Csv reader refactor infer delimiter (#1948)

* Refactor delimiter inference for CSV file reading into a separate class
This commit is contained in:
Mark Baker 2021-03-24 20:05:32 +01:00 committed by GitHub
parent 07ad800755
commit b7f93754d6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 150 additions and 97 deletions

View File

@ -4,6 +4,7 @@ namespace PhpOffice\PhpSpreadsheet\Reader;
use InvalidArgumentException; use InvalidArgumentException;
use PhpOffice\PhpSpreadsheet\Cell\Coordinate; use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
use PhpOffice\PhpSpreadsheet\Reader\Csv\Delimiter;
use PhpOffice\PhpSpreadsheet\Shared\StringHelper; use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
use PhpOffice\PhpSpreadsheet\Spreadsheet; use PhpOffice\PhpSpreadsheet\Spreadsheet;
@ -138,118 +139,26 @@ class Csv extends BaseReader
return; return;
} }
$potentialDelimiters = [',', ';', "\t", '|', ':', ' ', '~']; $inferenceEngine = new Delimiter($this->fileHandle, $this->escapeCharacter, $this->enclosure);
$counts = [];
foreach ($potentialDelimiters as $delimiter) {
$counts[$delimiter] = [];
}
// Count how many times each of the potential delimiters appears in each line
$numberLines = 0;
while (($line = $this->getNextLine()) !== false && (++$numberLines < 1000)) {
$countLine = [];
for ($i = strlen($line) - 1; $i >= 0; --$i) {
$char = $line[$i];
if (isset($counts[$char])) {
if (!isset($countLine[$char])) {
$countLine[$char] = 0;
}
++$countLine[$char];
}
}
foreach ($potentialDelimiters as $delimiter) {
$counts[$delimiter][] = $countLine[$delimiter]
?? 0;
}
}
// If number of lines is 0, nothing to infer : fall back to the default // If number of lines is 0, nothing to infer : fall back to the default
if ($numberLines === 0) { if ($inferenceEngine->linesCounted() === 0) {
$this->delimiter = reset($potentialDelimiters); $this->delimiter = $inferenceEngine->getDefaultDelimiter();
$this->skipBOM(); $this->skipBOM();
return; return;
} }
// Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently) $this->delimiter = $inferenceEngine->infer();
$meanSquareDeviations = [];
$middleIdx = floor(($numberLines - 1) / 2);
foreach ($potentialDelimiters as $delimiter) {
$series = $counts[$delimiter];
sort($series);
$median = ($numberLines % 2)
? $series[$middleIdx]
: ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;
if ($median === 0) {
continue;
}
$meanSquareDeviations[$delimiter] = array_reduce(
$series,
function ($sum, $value) use ($median) {
return $sum + ($value - $median) ** 2;
}
) / count($series);
}
// ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
$min = INF;
foreach ($potentialDelimiters as $delimiter) {
if (!isset($meanSquareDeviations[$delimiter])) {
continue;
}
if ($meanSquareDeviations[$delimiter] < $min) {
$min = $meanSquareDeviations[$delimiter];
$this->delimiter = $delimiter;
}
}
// If no delimiter could be detected, fall back to the default // If no delimiter could be detected, fall back to the default
if ($this->delimiter === null) { if ($this->delimiter === null) {
$this->delimiter = reset($potentialDelimiters); $this->delimiter = $inferenceEngine->getDefaultDelimiter();
} }
$this->skipBOM(); $this->skipBOM();
} }
/**
* Get the next full line from the file.
*
* @return false|string
*/
private function getNextLine()
{
$line = '';
$enclosure = ($this->escapeCharacter === '' ? ''
: ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
. preg_quote($this->enclosure, '/');
do {
// Get the next line in the file
$newLine = fgets($this->fileHandle);
// Return false if there is no next line
if ($newLine === false) {
return false;
}
// Add the new line to the line passed in
$line = $line . $newLine;
// Drop everything that is enclosed to avoid counting false positives in enclosures
$line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/Us', '', $line);
// See if we have any enclosures left in the line
// if we still have an enclosure then we need to read the next line as well
} while (preg_match('/(' . $enclosure . ')/', $line) > 0);
return $line;
}
/** /**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns). * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
* *

View File

@ -0,0 +1,144 @@
<?php
namespace PhpOffice\PhpSpreadsheet\Reader\Csv;
class Delimiter
{
protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];
protected $fileHandle;
protected $escapeCharacter;
protected $enclosure;
protected $counts = [];
protected $numberLines = 0;
protected $delimiter;
public function __construct($fileHandle, $escapeCharacter, $enclosure)
{
$this->fileHandle = $fileHandle;
$this->escapeCharacter = $escapeCharacter;
$this->enclosure = $enclosure;
$this->countPotentialDelimiters();
}
public function getDefaultDelimiter(): string
{
return self::POTENTIAL_DELIMETERS[0];
}
public function linesCounted(): int
{
return $this->numberLines;
}
protected function countPotentialDelimiters(): void
{
$this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
$delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);
// Count how many times each of the potential delimiters appears in each line
$this->numberLines = 0;
while (($line = $this->getNextLine()) !== false && (++$this->numberLines < 1000)) {
$this->countDelimiterValues($line, $delimiterKeys);
}
}
protected function countDelimiterValues(string $line, array $delimiterKeys): void
{
$splitString = str_split($line, 1);
if (!is_array($splitString)) {
return;
}
$distribution = array_count_values($splitString);
$countLine = array_intersect_key($distribution, $delimiterKeys);
foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
$this->counts[$delimiter][] = $countLine[$delimiter] ?? 0;
}
}
public function infer(): ?string
{
// Calculate the mean square deviations for each delimiter
// (ignoring delimiters that haven't been found consistently)
$meanSquareDeviations = [];
$middleIdx = floor(($this->numberLines - 1) / 2);
foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
$series = $this->counts[$delimiter];
sort($series);
$median = ($this->numberLines % 2)
? $series[$middleIdx]
: ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;
if ($median === 0) {
continue;
}
$meanSquareDeviations[$delimiter] = array_reduce(
$series,
function ($sum, $value) use ($median) {
return $sum + ($value - $median) ** 2;
}
) / count($series);
}
// ... and pick the delimiter with the smallest mean square deviation
// (in case of ties, the order in potentialDelimiters is respected)
$min = INF;
foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
if (!isset($meanSquareDeviations[$delimiter])) {
continue;
}
if ($meanSquareDeviations[$delimiter] < $min) {
$min = $meanSquareDeviations[$delimiter];
$this->delimiter = $delimiter;
}
}
return $this->delimiter;
}
/**
* Get the next full line from the file.
*
* @return false|string
*/
public function getNextLine()
{
$line = '';
$enclosure = ($this->escapeCharacter === '' ? ''
: ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
. preg_quote($this->enclosure, '/');
do {
// Get the next line in the file
$newLine = fgets($this->fileHandle);
// Return false if there is no next line
if ($newLine === false) {
return false;
}
// Add the new line to the line passed in
$line = $line . $newLine;
// Drop everything that is enclosed to avoid counting false positives in enclosures
$line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/Us', '', $line);
// See if we have any enclosures left in the line
// if we still have an enclosure then we need to read the next line as well
} while (preg_match('/(' . $enclosure . ')/', $line) > 0);
return $line;
}
}