Csv reader refactor infer delimiter (#1948)
* Refactor delimiter inference for CSV file reading into a separate class
This commit is contained in:
parent
07ad800755
commit
b7f93754d6
|
|
@ -4,6 +4,7 @@ namespace PhpOffice\PhpSpreadsheet\Reader;
|
||||||
|
|
||||||
use InvalidArgumentException;
|
use InvalidArgumentException;
|
||||||
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
|
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
|
||||||
|
use PhpOffice\PhpSpreadsheet\Reader\Csv\Delimiter;
|
||||||
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
|
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
|
||||||
use PhpOffice\PhpSpreadsheet\Spreadsheet;
|
use PhpOffice\PhpSpreadsheet\Spreadsheet;
|
||||||
|
|
||||||
|
|
@ -138,118 +139,26 @@ class Csv extends BaseReader
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
$potentialDelimiters = [',', ';', "\t", '|', ':', ' ', '~'];
|
$inferenceEngine = new Delimiter($this->fileHandle, $this->escapeCharacter, $this->enclosure);
|
||||||
$counts = [];
|
|
||||||
foreach ($potentialDelimiters as $delimiter) {
|
|
||||||
$counts[$delimiter] = [];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Count how many times each of the potential delimiters appears in each line
|
|
||||||
$numberLines = 0;
|
|
||||||
while (($line = $this->getNextLine()) !== false && (++$numberLines < 1000)) {
|
|
||||||
$countLine = [];
|
|
||||||
for ($i = strlen($line) - 1; $i >= 0; --$i) {
|
|
||||||
$char = $line[$i];
|
|
||||||
if (isset($counts[$char])) {
|
|
||||||
if (!isset($countLine[$char])) {
|
|
||||||
$countLine[$char] = 0;
|
|
||||||
}
|
|
||||||
++$countLine[$char];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
foreach ($potentialDelimiters as $delimiter) {
|
|
||||||
$counts[$delimiter][] = $countLine[$delimiter]
|
|
||||||
?? 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If number of lines is 0, nothing to infer : fall back to the default
|
// If number of lines is 0, nothing to infer : fall back to the default
|
||||||
if ($numberLines === 0) {
|
if ($inferenceEngine->linesCounted() === 0) {
|
||||||
$this->delimiter = reset($potentialDelimiters);
|
$this->delimiter = $inferenceEngine->getDefaultDelimiter();
|
||||||
$this->skipBOM();
|
$this->skipBOM();
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
|
$this->delimiter = $inferenceEngine->infer();
|
||||||
$meanSquareDeviations = [];
|
|
||||||
$middleIdx = floor(($numberLines - 1) / 2);
|
|
||||||
|
|
||||||
foreach ($potentialDelimiters as $delimiter) {
|
|
||||||
$series = $counts[$delimiter];
|
|
||||||
sort($series);
|
|
||||||
|
|
||||||
$median = ($numberLines % 2)
|
|
||||||
? $series[$middleIdx]
|
|
||||||
: ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;
|
|
||||||
|
|
||||||
if ($median === 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$meanSquareDeviations[$delimiter] = array_reduce(
|
|
||||||
$series,
|
|
||||||
function ($sum, $value) use ($median) {
|
|
||||||
return $sum + ($value - $median) ** 2;
|
|
||||||
}
|
|
||||||
) / count($series);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
|
|
||||||
$min = INF;
|
|
||||||
foreach ($potentialDelimiters as $delimiter) {
|
|
||||||
if (!isset($meanSquareDeviations[$delimiter])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($meanSquareDeviations[$delimiter] < $min) {
|
|
||||||
$min = $meanSquareDeviations[$delimiter];
|
|
||||||
$this->delimiter = $delimiter;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If no delimiter could be detected, fall back to the default
|
// If no delimiter could be detected, fall back to the default
|
||||||
if ($this->delimiter === null) {
|
if ($this->delimiter === null) {
|
||||||
$this->delimiter = reset($potentialDelimiters);
|
$this->delimiter = $inferenceEngine->getDefaultDelimiter();
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->skipBOM();
|
$this->skipBOM();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Get the next full (logical) line from the file.
 *
 * Physical lines are read with fgets() and concatenated until the
 * accumulated text no longer contains an enclosure character once every
 * complete enclosed section has been removed. Enclosed sections are removed
 * so that delimiter characters inside them are not counted by the caller.
 *
 * @return false|string the logical line with enclosed sections removed, or
 *                      false when the end of the file is reached before a
 *                      logical line could be completed
 */
private function getNextLine()
{
    // An enclosure only counts when it is not preceded by the escape character.
    $enclosure = ($this->escapeCharacter === '' ? ''
        : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
        . preg_quote($this->enclosure, '/');

    // Both patterns are loop-invariant, so build them once.
    $enclosedSectionPattern = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
    $anyEnclosurePattern = '/(' . $enclosure . ')/';

    $line = '';
    do {
        // Get the next line in the file
        $newLine = fgets($this->fileHandle);

        // Return false if there is no next line
        if ($newLine === false) {
            return false;
        }

        // Add the new physical line to what has been accumulated so far
        $line .= $newLine;

        // Drop everything that is enclosed to avoid counting false positives in enclosures
        $stripped = preg_replace($enclosedSectionPattern, '', $line);
        if ($stripped === null) {
            // preg_replace() signals a PCRE failure with null; return the text
            // unstripped so the documented false|string contract still holds.
            return $line;
        }
        $line = $stripped;

        // If an enclosure is still present it has not been closed yet,
        // so the next physical line belongs to this logical line as well.
    } while (preg_match($anyEnclosurePattern, $line) > 0);

    return $line;
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
|
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,144 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace PhpOffice\PhpSpreadsheet\Reader\Csv;
|
||||||
|
|
||||||
|
/**
 * Infers the field delimiter of a CSV stream.
 *
 * On construction the stream is sampled (from its current position) and, for
 * every sampled logical line, the number of occurrences of each candidate
 * delimiter outside of enclosures is recorded. infer() then picks the
 * candidate whose per-line count is the most consistent.
 */
class Delimiter
{
    /**
     * Candidate delimiters, in order of preference; the first entry is the
     * default and earlier entries win ties.
     *
     * The constant name is misspelled ("DELIMETERS") but is kept unchanged
     * because it is protected and subclasses may already reference it.
     */
    protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];

    /**
     * Maximum number of logical lines that are sampled from the stream.
     */
    protected const MAX_SAMPLED_LINES = 1000;

    /**
     * @var resource stream the lines are read from with fgets()
     */
    protected $fileHandle;

    /**
     * @var string escape character; an enclosure preceded by it is ignored ('' disables this)
     */
    protected $escapeCharacter;

    /**
     * @var string enclosure character
     */
    protected $enclosure;

    /**
     * @var array<string, int[]> per candidate delimiter: its occurrence count in each sampled line
     */
    protected $counts = [];

    /**
     * @var int number of logical lines that were sampled
     */
    protected $numberLines = 0;

    /**
     * @var null|string delimiter chosen by the last call to infer(), if any
     */
    protected $delimiter;

    /**
     * Stores the stream settings and immediately samples the stream.
     *
     * @param resource $fileHandle stream to sample; it is read with fgets()
     * @param string $escapeCharacter escape character ('' for none)
     * @param string $enclosure enclosure character
     */
    public function __construct($fileHandle, $escapeCharacter, $enclosure)
    {
        $this->fileHandle = $fileHandle;
        $this->escapeCharacter = $escapeCharacter;
        $this->enclosure = $enclosure;

        $this->countPotentialDelimiters();
    }

    /**
     * Returns the delimiter to fall back on when nothing can be inferred.
     */
    public function getDefaultDelimiter(): string
    {
        return self::POTENTIAL_DELIMETERS[0];
    }

    /**
     * Returns how many logical lines were sampled from the stream.
     */
    public function linesCounted(): int
    {
        return $this->numberLines;
    }

    /**
     * Samples up to MAX_SAMPLED_LINES logical lines and records the candidate
     * delimiter counts of each of them.
     */
    protected function countPotentialDelimiters(): void
    {
        $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
        $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

        // Count how many times each of the potential delimiters appears in each line.
        // The limit is checked BEFORE reading, so every line that increments
        // numberLines is also recorded in counts (the two must stay in step
        // for the median calculation in infer()).
        $this->numberLines = 0;
        while ($this->numberLines < self::MAX_SAMPLED_LINES) {
            $line = $this->getNextLine();
            if ($line === false) {
                break;
            }

            ++$this->numberLines;
            $this->countDelimiterValues($line, $delimiterKeys);
        }
    }

    /**
     * Appends, for every candidate delimiter, its number of occurrences in
     * the given line to the corresponding series in $this->counts.
     *
     * @param string $line logical line with enclosed sections already removed
     * @param array<string, int> $delimiterKeys candidate delimiters as array keys
     */
    protected function countDelimiterValues(string $line, array $delimiterKeys): void
    {
        // Always record one value per delimiter, even for a line without any
        // characters, so that every series has exactly one entry per line.
        $characters = str_split($line, 1) ?: [];
        $countLine = array_intersect_key(array_count_values($characters), $delimiterKeys);

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $this->counts[$delimiter][] = $countLine[$delimiter] ?? 0;
        }
    }

    /**
     * Picks the candidate delimiter whose per-line count deviates the least
     * from its median.
     *
     * Candidates whose median count is zero are ignored. On ties the order
     * of POTENTIAL_DELIMETERS decides.
     *
     * @return null|string the inferred delimiter, or the previously stored
     *                     value (null initially) when no candidate qualifies
     */
    public function infer(): ?string
    {
        // Calculate the mean square deviations for each delimiter
        //     (ignoring delimiters that haven't been found consistently)
        $meanSquareDeviations = [];

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $series = $this->counts[$delimiter] ?? [];
            $sampleSize = count($series);
            if ($sampleSize === 0) {
                // Nothing was sampled: there is no median to compute.
                continue;
            }

            sort($series);

            $middleIdx = intdiv($sampleSize - 1, 2);
            $median = ($sampleSize % 2 === 1)
                ? $series[$middleIdx]
                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

            // The median may be an int or a float depending on the branch above.
            if ((float) $median === 0.0) {
                continue;
            }

            $squaredDeviationSum = array_reduce(
                $series,
                static function ($sum, $value) use ($median) {
                    return $sum + ($value - $median) ** 2;
                },
                0
            );
            $meanSquareDeviations[$delimiter] = $squaredDeviationSum / $sampleSize;
        }

        // ... and pick the delimiter with the smallest mean square deviation
        //         (in case of ties, the order in POTENTIAL_DELIMETERS is respected)
        $min = INF;
        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            if (!isset($meanSquareDeviations[$delimiter])) {
                continue;
            }

            if ($meanSquareDeviations[$delimiter] < $min) {
                $min = $meanSquareDeviations[$delimiter];
                $this->delimiter = $delimiter;
            }
        }

        return $this->delimiter;
    }

    /**
     * Get the next full (logical) line from the file.
     *
     * Physical lines are read with fgets() and concatenated until the
     * accumulated text no longer contains an enclosure character once every
     * complete enclosed section has been removed.
     *
     * @return false|string the logical line with enclosed sections removed,
     *                      or false when the end of the stream is reached
     *                      before a logical line could be completed
     */
    public function getNextLine()
    {
        // An enclosure only counts when it is not preceded by the escape character.
        $enclosure = ($this->escapeCharacter === '' ? ''
            : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
            . preg_quote($this->enclosure, '/');

        // Both patterns are loop-invariant, so build them once.
        $enclosedSectionPattern = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
        $anyEnclosurePattern = '/(' . $enclosure . ')/';

        $line = '';
        do {
            // Get the next line in the file
            $newLine = fgets($this->fileHandle);

            // Return false if there is no next line
            if ($newLine === false) {
                return false;
            }

            // Add the new physical line to what has been accumulated so far
            $line .= $newLine;

            // Drop everything that is enclosed to avoid counting false positives in enclosures
            $stripped = preg_replace($enclosedSectionPattern, '', $line);
            if ($stripped === null) {
                // preg_replace() signals a PCRE failure with null; return the
                // text unstripped so callers still receive a string.
                return $line;
            }
            $line = $stripped;

            // If an enclosure is still present it has not been closed yet,
            // so the next physical line belongs to this logical line as well.
        } while (preg_match($anyEnclosurePattern, $line) > 0);

        return $line;
    }
}
|
||||||
Loading…
Reference in New Issue