diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php index 1bd7e76..51bf497 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php @@ -28,9 +28,13 @@ class CachingStrategyFactory * @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored * @return CachingStrategyInterface The best caching strategy */ - public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) + public static function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) { - // TODO add in-memory strategy - return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); + // TODO: take available memory into account + if ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE) { + return new InMemoryStrategy($sharedStringsUniqueCount); + } else { + return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } } } diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php new file mode 100644 index 0000000..41b41be --- /dev/null +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php @@ -0,0 +1,82 @@ +inMemoryCache = new \SplFixedArray($sharedStringsUniqueCount); + $this->isCacheClosed = false; + } + + /** + * Adds the given string to the cache. + * + * @param string $sharedString The string to be added to the cache + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return void + */ + public function addStringForIndex($sharedString, $sharedStringIndex) + { + if (!$this->isCacheClosed) { + $this->inMemoryCache->offsetSet($sharedStringIndex, $sharedString); + } + } + + /** + * Closes the cache after the last shared string was added. + * This prevents any additional string from being added to the cache. + * + * @return void + */ + public function closeCache() + { + $this->isCacheClosed = true; + } + + /** + * Returns the string located at the given index from the cache. + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The shared string at the given index + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index + */ + public function getStringAtIndex($sharedStringIndex) + { + try { + return $this->inMemoryCache->offsetGet($sharedStringIndex); + } catch (\RuntimeException $e) { + throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); + } + } + + /** + * Destroys the cache, freeing memory and removing any created artifacts + * + * @return void + */ + public function clearCache() + { + unset($this->inMemoryCache); + $this->isCacheClosed = false; + } +} diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php index 0653e1b..0ced513 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php @@ -3,8 +3,6 @@ namespace Box\Spout\Reader\Helper\XLSX; use Box\Spout\Common\Exception\IOException; -use Box\Spout\Common\Helper\FileSystemHelper; -use Box\Spout\Reader\Exception\SharedStringNotFoundException; use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory; use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface; @@ -81,7 +79,7 @@ class SharedStringsHelper $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sharedStringsFilePath = $this->getSharedStringsFilePath(); - if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) { + if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) { throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".'); } @@ -93,7 +91,7 @@ class SharedStringsHelper } while ($xmlReader->name === 'si') { - $node = new \SimpleXMLElement($xmlReader->readOuterXml()); + $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader); $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); // removes nodes that should not be read, like the pronunciation of the Kanji characters @@ -138,10 +136,30 @@ class SharedStringsHelper * * @param \XMLReader $xmlReader XMLReader instance * @return int Number of unique shared strings in the sharedStrings.xml file + * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read */ protected function getSharedStringsUniqueCount($xmlReader) { + // Use internal errors to avoid displaying lots of warning messages in case of invalid file + // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks + libxml_clear_errors(); + libxml_use_internal_errors(true); + $xmlReader->next('sst'); + + // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE) + while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) { + $xmlReader->read(); + } + + $readError = libxml_get_last_error(); + if ($readError !== false) { + throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readError->message}]"); + } + + // reset the setting to display XML warnings/errors + libxml_use_internal_errors(false); + return intval($xmlReader->getAttribute('uniqueCount')); } @@ -153,8 +171,37 @@ class SharedStringsHelper */ protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) { - $factory = new CachingStrategyFactory(); - return $factory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + return CachingStrategyFactory::getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + } + + /** + * Returns a SimpleXMLElement node from the current node in the given XMLReader instance. + * This is to simplify the parsing of the subtree. + * + * @param \XMLReader $xmlReader + * @return \SimpleXMLElement + * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read + */ + protected function getSimpleXmlElementNodeFromXMLReader($xmlReader) + { + // Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node. + // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks + libxml_clear_errors(); + libxml_use_internal_errors(true); + + $node = null; + try { + $node = new \SimpleXMLElement($xmlReader->readOuterXml()); + } catch (\Exception $exception) { + $error = libxml_get_last_error(); + libxml_use_internal_errors(false); + + throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($error->message) . '].'); + } + + libxml_use_internal_errors(false); + + return $node; } /** diff --git a/src/Spout/Reader/XLSX.php b/src/Spout/Reader/XLSX.php index 5471f85..83bee9d 100644 --- a/src/Spout/Reader/XLSX.php +++ b/src/Spout/Reader/XLSX.php @@ -158,7 +158,7 @@ class XLSX extends AbstractReader $worksheetDataXMLFilePath = $worksheet->getDataXmlFilePath(); $worksheetDataFilePath = 'zip://' . $this->filePath . '#' . $worksheetDataXMLFilePath; - if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) { + if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NONET) === false) { throw new IOException('Could not open "' . $worksheetDataXMLFilePath . '".'); } } diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php index a89017d..386e93a 100644 --- a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php @@ -2,6 +2,9 @@ namespace Box\Spout\Reader\Helper\XLSX; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\FileBasedStrategy; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\InMemoryStrategy; use Box\Spout\TestUsingResource; /** @@ -33,47 +36,6 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $this->sharedStringsHelper->cleanup(); } - /** - * @return void - */ - public function testExtractSharedStringsShouldCreateTempFileWithSharedStrings() - { - $this->sharedStringsHelper->extractSharedStrings(); - - $cachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy'); - $tempFolder = \ReflectionHelper::getValueOnObject($cachingStrategy, 'tempFolder'); - - $filesInTempFolder = $this->getFilesInFolder($tempFolder); - $this->assertEquals(1, count($filesInTempFolder), 'One temp file should have been created in the temp folder.'); - - $tempFileContents = file_get_contents($filesInTempFolder[0]); - $tempFileContentsPerLine = explode(PHP_EOL, $tempFileContents); - - $this->assertEquals('s1--A1', $tempFileContentsPerLine[0]); - $this->assertEquals('s1--E5', $tempFileContentsPerLine[24]); - } - - /** - * Returns all files that are in the given folder. - * It does not include "." and ".." and is not recursive. - * - * @param string $folderPath - * @return array - */ - private function getFilesInFolder($folderPath) - { - $files = []; - $directoryIterator = new \DirectoryIterator($folderPath); - - foreach ($directoryIterator as $fileInfo) { - if ($fileInfo->isFile()) { - $files[] = $fileInfo->getPathname(); - } - } - - return $files; - } - /** * @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException * @return void @@ -96,6 +58,9 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $sharedString = $this->sharedStringsHelper->getStringAtIndex(24); $this->assertEquals('s1--E5', $sharedString); + + $usedCachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy'); + $this->assertTrue($usedCachingStrategy instanceof InMemoryStrategy); } /** @@ -116,4 +81,26 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $sharedStringsHelper->cleanup(); } + + /** + * @return void + */ + public function testGetStringAtIndexWithFileBasedStrategy() + { + $resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx'); + $sharedStringsHelper = new SharedStringsHelper($resourcePath); + + $sharedStringsHelper->extractSharedStrings(); + + $sharedString = $sharedStringsHelper->getStringAtIndex(0); + $this->assertEquals('str', $sharedString); + + $sharedString = $sharedStringsHelper->getStringAtIndex(CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 1); + $this->assertEquals('str', $sharedString); + + $usedCachingStrategy = \ReflectionHelper::getValueOnObject($sharedStringsHelper, 'cachingStrategy'); + $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy); + + $sharedStringsHelper->cleanup(); + } } diff --git a/tests/Spout/Reader/XLSXTest.php b/tests/Spout/Reader/XLSXTest.php index 5856887..cd24d5a 100644 --- a/tests/Spout/Reader/XLSXTest.php +++ b/tests/Spout/Reader/XLSXTest.php @@ -2,6 +2,7 @@ namespace Box\Spout\Reader; +use Box\Spout\Common\Exception\IOException; use Box\Spout\Common\Type; use Box\Spout\TestUsingResource; @@ -243,18 +244,39 @@ class XLSXTest extends \PHPUnit_Framework_TestCase $this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.'); } + /** + * @return array + */ + public function dataProviderForTestReadShouldBeProtectedAgainstAttacks() + { + return [ + ['attack_billion_laughs.xlsx'], + ['attack_quadratic_blowup.xlsx'], + ]; + } + + /** + * @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks + * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) + * + * @param string $fileName * @return void */ - public function testReadShouldBeProtectedAgainstBillionLaughsAttack() + public function testReadShouldBeProtectedAgainstAttacks($fileName) { - $allRows = $this->getAllRowsForFile('billion_laughs_test_file.xlsx'); + $startTime = microtime(true); - $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB - $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); + try { + $this->getAllRowsForFile($fileName); + $this->fail('An exception should have been thrown'); + } catch (IOException $exception) { + $duration = microtime(true) - $startTime; + $this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.'); - $expectedFirstRow = ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1']; - $this->assertEquals($expectedFirstRow, $allRows[0], 'Entities should be ignored when reading XML files.'); + $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB + $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); + } } /** diff --git a/tests/resources/xlsx/billion_laughs_test_file.xlsx b/tests/resources/xlsx/attack_billion_laughs.xlsx similarity index 53% rename from tests/resources/xlsx/billion_laughs_test_file.xlsx rename to tests/resources/xlsx/attack_billion_laughs.xlsx index ac7aae3..d6cdc75 100644 Binary files a/tests/resources/xlsx/billion_laughs_test_file.xlsx and b/tests/resources/xlsx/attack_billion_laughs.xlsx differ diff --git a/tests/resources/xlsx/attack_quadratic_blowup.xlsx b/tests/resources/xlsx/attack_quadratic_blowup.xlsx new file mode 100644 index 0000000..a317c18 Binary files /dev/null and b/tests/resources/xlsx/attack_quadratic_blowup.xlsx differ diff --git a/tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx b/tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx new file mode 100644 index 0000000..f4d93f3 Binary files /dev/null and b/tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx differ