diff --git a/docs/topics/file-formats.md b/docs/topics/file-formats.md index 6f8783e6..7318b136 100644 --- a/docs/topics/file-formats.md +++ b/docs/topics/file-formats.md @@ -80,7 +80,8 @@ semi-colon (`;`) are used as separators instead of a comma, although other symbols can be used. Because CSV is a text-only format, it doesn't support any data formatting options. -"CSV" is not a single, well-defined format (although see RFC 4180 for +"CSV" is not a single, well-defined format (although see +[RFC 4180](https://www.rfc-editor.org/rfc/rfc4180.html) for one definition that is commonly used). Rather, in practice the term "CSV" refers to any file that: @@ -117,5 +118,5 @@ Wide Web Consortium (W3C). However, in 2000, HTML also became an international standard (ISO/IEC 15445:2000). HTML 4.01 was published in late 1999, with further errata published through 2001. In 2004 development began on HTML5 in the Web Hypertext Application Technology -Working Group (WHATWG), which became a joint deliverable with the W3C in -2008. +Working Group (WHATWG), which became a joint deliverable with the W3C in 2008. + diff --git a/docs/topics/reading-files.md b/docs/topics/reading-files.md index 38428166..76705281 100644 --- a/docs/topics/reading-files.md +++ b/docs/topics/reading-files.md @@ -44,6 +44,22 @@ practise), it will reject the Xls loader that it would normally use for a .xls file; and test the file using the other loaders until it finds the appropriate loader, and then use that to read the file. +If you know that this is an `xls` file, but don't know whether it is a +genuine BIFF-format Excel file or HTML markup with an xls extension, you can +limit the loader to check only those two possibilities by passing in an +array of Readers to test against. 
+ +```php +$inputFileName = './sampleData/example1.xls'; +$testAgainstFormats = [ + \PhpOffice\PhpSpreadsheet\IOFactory::READER_XLS, + \PhpOffice\PhpSpreadsheet\IOFactory::READER_HTML, +]; + +/** Load $inputFileName to a Spreadsheet Object **/ +$spreadsheet = \PhpOffice\PhpSpreadsheet\IOFactory::load($inputFileName, 0, $testAgainstFormats); +``` + While easy to implement in your code, and you don't need to worry about the file type; this isn't the most efficient method to load a file; and it lacks the flexibility to configure the loader in any way before @@ -118,6 +134,34 @@ $spreadsheet = $reader->load($inputFileName); See `samples/Reader/04_Simple_file_reader_using_the_IOFactory_to_identify_a_reader_to_use.php` for a working example of this code. +As with the IOFactory `load()` method, you can also pass an array of formats +for the `identify()` method to check against if you know that the file will only +be in one of a subset of the possible formats that PhpSpreadsheet supports. + +```php +$inputFileName = './sampleData/example1.xls'; +$testAgainstFormats = [ + \PhpOffice\PhpSpreadsheet\IOFactory::READER_XLS, + \PhpOffice\PhpSpreadsheet\IOFactory::READER_HTML, +]; + +/** Identify the type of $inputFileName **/ +$inputFileType = \PhpOffice\PhpSpreadsheet\IOFactory::identify($inputFileName, $testAgainstFormats); +``` + +You can also use this to confirm that a file is what it claims to be: + +```php +$inputFileName = './sampleData/example1.xls'; + +try { + /** Verify that $inputFileName really is an Xls file **/ + $inputFileType = \PhpOffice\PhpSpreadsheet\IOFactory::identify($inputFileName, [\PhpOffice\PhpSpreadsheet\IOFactory::READER_XLS]); +} catch (\PhpOffice\PhpSpreadsheet\Reader\Exception $e) { + // File isn't actually an Xls file, even though it has an xls extension +} +``` + ## Spreadsheet Reader Options Once you have created a reader object for the workbook that you want to @@ -146,7 +190,7 @@ $spreadsheet = $reader->load($inputFileName); See 
`samples/Reader/05_Simple_file_reader_using_the_read_data_only_option.php` for a working example of this code. -It is important to note that Workbooks (and PhpSpreadsheet) store dates +It is important to note that most Workbooks (and PhpSpreadsheet) store dates and times as simple numeric values: they can only be distinguished from other numeric values by the format mask that is applied to that cell. When setting read data only to true, PhpSpreadsheet doesn't read the @@ -162,8 +206,8 @@ Reading Only Data from a Spreadsheet File applies to Readers: Reader | Y/N |Reader | Y/N |Reader | Y/N | ----------|:---:|--------|:---:|--------------|:---:| -Xlsx | YES | Xls | YES | Xml | YES | -Ods | YES | SYLK | NO | Gnumeric | YES | +Xlsx | YES | Xls | YES | Xml | YES | +Ods | YES | SYLK | NO | Gnumeric | YES | CSV | NO | HTML | NO ### Reading Only Named WorkSheets from a File @@ -233,8 +277,8 @@ Reading Only Named WorkSheets from a File applies to Readers: Reader | Y/N |Reader | Y/N |Reader | Y/N | ----------|:---:|--------|:---:|--------------|:---:| -Xlsx | YES | Xls | YES | Xml | YES | -Ods | YES | SYLK | NO | Gnumeric | YES | +Xlsx | YES | Xls | YES | Xml | YES | +Ods | YES | SYLK | NO | Gnumeric | YES | CSV | NO | HTML | NO ### Reading Only Specific Columns and Rows from a File (Read Filters) @@ -381,7 +425,7 @@ Using Read Filters applies to: Reader | Y/N |Reader | Y/N |Reader | Y/N | ----------|:---:|--------|:---:|--------------|:---:| -Xlsx | YES | Xls | YES | Xml | YES | +Xlsx | YES | Xls | YES | Xml | YES | Ods | YES | SYLK | NO | Gnumeric | YES | CSV | YES | HTML | NO | | | @@ -439,7 +483,7 @@ Combining Multiple Files into a Single Spreadsheet Object applies to: Reader | Y/N |Reader | Y/N |Reader | Y/N | ----------|:---:|--------|:---:|--------------|:---:| -Xlsx | NO | Xls | NO | Xml | NO | +Xlsx | NO | Xls | NO | Xml | NO | Ods | NO | SYLK | YES | Gnumeric | NO | CSV | YES | HTML | NO @@ -516,7 +560,7 @@ Splitting a single loaded file across multiple worksheets 
applies to: Reader | Y/N |Reader | Y/N |Reader | Y/N | ----------|:---:|--------|:---:|--------------|:---:| -Xlsx | NO | Xls | NO | Xml | NO | +Xlsx | NO | Xls | NO | Xml | NO | Ods | NO | SYLK | NO | Gnumeric | NO | CSV | YES | HTML | NO @@ -556,7 +600,7 @@ Setting CSV delimiter applies to: Reader | Y/N |Reader | Y/N |Reader | Y/N | ----------|:---:|--------|:---:|--------------|:---:| -Xlsx | NO | Xls | NO | Xml | NO | +Xlsx | NO | Xls | NO | Xml | NO | Ods | NO | SYLK | NO | Gnumeric | NO | CSV | YES | HTML | NO @@ -594,7 +638,7 @@ Applies to: Reader | Y/N |Reader | Y/N |Reader | Y/N | ----------|:---:|--------|:---:|--------------|:---:| -Xlsx | NO | Xls | NO | Xml | NO | +Xlsx | NO | Xls | NO | Xml | NO | Ods | NO | SYLK | NO | Gnumeric | NO | CSV | YES | HTML | NO @@ -646,7 +690,7 @@ Loading using a Value Binder applies to: Reader | Y/N |Reader | Y/N |Reader | Y/N ----------|:---:|--------|:---:|--------------|:---: -Xlsx | NO | Xls | NO | Xml | NO +Xlsx | NO | Xls | NO | Xml | NO Ods | NO | SYLK | NO | Gnumeric | NO CSV | YES | HTML | YES diff --git a/src/PhpSpreadsheet/IOFactory.php b/src/PhpSpreadsheet/IOFactory.php index 91613cb4..e437a220 100644 --- a/src/PhpSpreadsheet/IOFactory.php +++ b/src/PhpSpreadsheet/IOFactory.php @@ -14,23 +14,39 @@ use PhpOffice\PhpSpreadsheet\Writer\IWriter; */ abstract class IOFactory { + public const READER_XLSX = 'Xlsx'; + public const READER_XLS = 'Xls'; + public const READER_XML = 'Xml'; + public const READER_ODS = 'Ods'; + public const READER_SYLK = 'Slk'; + public const READER_SLK = 'Slk'; + public const READER_GNUMERIC = 'Gnumeric'; + public const READER_HTML = 'Html'; + public const READER_CSV = 'Csv'; + + public const WRITER_XLSX = 'Xlsx'; + public const WRITER_XLS = 'Xls'; + public const WRITER_ODS = 'Ods'; + public const WRITER_CSV = 'Csv'; + public const WRITER_HTML = 'Html'; + private static $readers = [ - 'Xlsx' => Reader\Xlsx::class, - 'Xls' => Reader\Xls::class, - 'Xml' => Reader\Xml::class, - 'Ods' => 
Reader\Ods::class, - 'Slk' => Reader\Slk::class, - 'Gnumeric' => Reader\Gnumeric::class, - 'Html' => Reader\Html::class, - 'Csv' => Reader\Csv::class, + self::READER_XLSX => Reader\Xlsx::class, + self::READER_XLS => Reader\Xls::class, + self::READER_XML => Reader\Xml::class, + self::READER_ODS => Reader\Ods::class, + self::READER_SLK => Reader\Slk::class, + self::READER_GNUMERIC => Reader\Gnumeric::class, + self::READER_HTML => Reader\Html::class, + self::READER_CSV => Reader\Csv::class, ]; private static $writers = [ - 'Xls' => Writer\Xls::class, - 'Xlsx' => Writer\Xlsx::class, - 'Ods' => Writer\Ods::class, - 'Csv' => Writer\Csv::class, - 'Html' => Writer\Html::class, + self::WRITER_XLS => Writer\Xls::class, + self::WRITER_XLSX => Writer\Xlsx::class, + self::WRITER_ODS => Writer\Ods::class, + self::WRITER_CSV => Writer\Csv::class, + self::WRITER_HTML => Writer\Html::class, 'Tcpdf' => Writer\Pdf\Tcpdf::class, 'Dompdf' => Writer\Pdf\Dompdf::class, 'Mpdf' => Writer\Pdf\Mpdf::class, @@ -70,10 +86,18 @@ abstract class IOFactory * Loads Spreadsheet from file using automatic Reader\IReader resolution. * * @param string $filename The name of the spreadsheet file + * @param int $flags the optional second parameter flags may be used to identify specific elements + * that should be loaded, but which won't be loaded by default, using these values: + * IReader::LOAD_WITH_CHARTS - Include any charts that are defined in the loaded file + * @param string[] $readers An array of Readers to use to identify the file type. By default, load() will try + * all possible Readers until it finds a match; but this allows you to pass in a + * list of Readers so it will only try the subset that you specify here. + * Values in this list can be any of the constant values defined in the set + * IOFactory::READER_*. 
*/ - public static function load(string $filename, int $flags = 0): Spreadsheet + public static function load(string $filename, int $flags = 0, ?array $readers = null): Spreadsheet { - $reader = self::createReaderForFile($filename); + $reader = self::createReaderForFile($filename, $readers); return $reader->load($filename, $flags); } @@ -81,9 +105,9 @@ abstract class IOFactory /** * Identify file type using automatic IReader resolution. */ - public static function identify(string $filename): string + public static function identify(string $filename, ?array $readers = null): string { - $reader = self::createReaderForFile($filename); + $reader = self::createReaderForFile($filename, $readers); $className = get_class($reader); $classType = explode('\\', $className); unset($reader); @@ -93,14 +117,32 @@ abstract class IOFactory /** * Create Reader\IReader for file using automatic IReader resolution. + * + * @param string[] $readers An array of Readers to use to identify the file type. By default, load() will try + * all possible Readers until it finds a match; but this allows you to pass in a + * list of Readers so it will only try the subset that you specify here. + * Values in this list can be any of the constant values defined in the set + * IOFactory::READER_*. 
*/ - public static function createReaderForFile(string $filename): IReader + public static function createReaderForFile(string $filename, ?array $readers = null): IReader { File::assertFile($filename); + $testReaders = self::$readers; + if ($readers !== null) { + $readers = array_map('strtoupper', $readers); + $testReaders = array_filter( + self::$readers, + function (string $readerType) use ($readers) { + return in_array(strtoupper($readerType), $readers, true); + }, + ARRAY_FILTER_USE_KEY + ); + } + // First, lucky guess by inspecting file extension $guessedReader = self::getReaderTypeFromExtension($filename); - if ($guessedReader !== null) { + if (($guessedReader !== null) && array_key_exists($guessedReader, $testReaders)) { $reader = self::createReader($guessedReader); // Let's see if we are lucky @@ -110,11 +152,11 @@ abstract class IOFactory } // If we reach here then "lucky guess" didn't give any result - // Try walking through all the options in self::$autoResolveClasses - foreach (self::$readers as $type => $class) { + // Try walking through all the options in self::$readers (or the selected subset) + foreach ($testReaders as $readerType => $class) { // Ignore our original guess, we know that won't work - if ($type !== $guessedReader) { - $reader = self::createReader($type); + if ($readerType !== $guessedReader) { + $reader = self::createReader($readerType); if ($reader->canRead($filename)) { return $reader; } diff --git a/src/PhpSpreadsheet/Reader/BaseReader.php b/src/PhpSpreadsheet/Reader/BaseReader.php index c215e65b..a137e78c 100644 --- a/src/PhpSpreadsheet/Reader/BaseReader.php +++ b/src/PhpSpreadsheet/Reader/BaseReader.php @@ -153,6 +153,10 @@ abstract class BaseReader implements IReader /** * Loads Spreadsheet from file. 
+ * + * @param int $flags the optional second parameter flags may be used to identify specific elements + * that should be loaded, but which won't be loaded by default, using these values: + * IReader::LOAD_WITH_CHARTS - Include any charts that are defined in the loaded file */ public function load(string $filename, int $flags = 0): Spreadsheet { diff --git a/tests/PhpSpreadsheetTests/IOFactoryTest.php b/tests/PhpSpreadsheetTests/IOFactoryTest.php index b516a9f4..19722dc7 100644 --- a/tests/PhpSpreadsheetTests/IOFactoryTest.php +++ b/tests/PhpSpreadsheetTests/IOFactoryTest.php @@ -108,6 +108,22 @@ class IOFactoryTest extends TestCase ]; } + public function testFormatAsExpected(): void + { + $fileName = 'samples/templates/30template.xls'; + + $actual = IOFactory::identify($fileName, [IOFactory::READER_XLS]); + self::assertSame('Xls', $actual); + } + + public function testFormatNotAsExpectedThrowsException(): void + { + $fileName = 'samples/templates/30template.xls'; + + $this->expectException(ReaderException::class); + IOFactory::identify($fileName, [IOFactory::READER_ODS]); + } + public function testIdentifyNonExistingFileThrowException(): void { $this->expectException(ReaderException::class);