diff --git a/src/Spout/Common/Helper/GlobalFunctionsHelper.php b/src/Spout/Common/Helper/GlobalFunctionsHelper.php index eb02c3f..feeb782 100644 --- a/src/Spout/Common/Helper/GlobalFunctionsHelper.php +++ b/src/Spout/Common/Helper/GlobalFunctionsHelper.php @@ -106,6 +106,19 @@ class GlobalFunctionsHelper return fputcsv($handle, $fields, $delimiter, $enclosure); } + /** + * Wrapper around global function fwrite() + * @see fwrite() + * + * @param resource $handle + * @param string $string + * @return int + */ + public function fwrite($handle, $string) + { + return fwrite($handle, $string); + } + /** * Wrapper around global function fclose() * @see fclose() diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php new file mode 100644 index 0000000..642647a --- /dev/null +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php @@ -0,0 +1,154 @@ + 20 * 600 ≈ 12KB + */ + const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12; + + /** + * To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files + * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory + * and the string will be quickly retrieved. + * The performance bottleneck is not when creating these temporary files, but rather when loading their content. + * Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works + * best when the indexes of the shared strings are sorted in the sheet data. + * 10,000 was chosen because it creates small files that are fast to be loaded in memory. 
+ */ + const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; + + /** @var CachingStrategyFactory|null Singleton instance */ + protected static $instance = null; + + /** + * Private constructor for singleton + */ + private function __construct() + { + } + + /** + * Returns the singleton instance of the factory + * + * @return CachingStrategyFactory + */ + public static function getInstance() + { + if (self::$instance === null) { + self::$instance = new CachingStrategyFactory(); + } + + return self::$instance; + } + + /** + * Returns the best caching strategy, given the number of unique shared strings + * and the amount of memory available. + * + * @param int $sharedStringsUniqueCount Number of unique shared strings + * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored + * @return CachingStrategyInterface The best caching strategy + */ + public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) + { + if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) { + return new InMemoryStrategy($sharedStringsUniqueCount); + } else { + return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } + } + + /** + * Returns whether it is safe to use in-memory caching, given the number of unique shared strings + * and the amount of memory available. 
+ * + * @param int $sharedStringsUniqueCount Number of unique shared strings + * @return bool + */ + protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount) + { + $memoryAvailable = $this->getMemoryLimitInKB(); + + if ($memoryAvailable === -1) { + // if the memory limit cannot be determined or is set to unlimited, don't trust it and play safe + return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } else { + $memoryNeeded = $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB; + return ($memoryAvailable > $memoryNeeded); + } + } + + /** + * Returns the PHP "memory_limit" in Kilobytes + * + * @return float Memory limit in KB, or -1 when it is unlimited or cannot be parsed + */ + protected function getMemoryLimitInKB() + { + $memoryLimitFormatted = $this->getMemoryLimitFromIni(); + $memoryLimitFormatted = strtolower(trim($memoryLimitFormatted)); + + // No memory limit + if ($memoryLimitFormatted === '-1') { + return -1; + } + + if (preg_match('/(\d+)([bkmgt])b?/', $memoryLimitFormatted, $matches)) { + $amount = intval($matches[1]); + $unit = $matches[2]; + + switch ($unit) { + case 'b': return ($amount / 1024); + case 'k': return $amount; + case 'm': return ($amount * 1024); + case 'g': return ($amount * 1024 * 1024); + case 't': return ($amount * 1024 * 1024 * 1024); + } + } + + return -1; + } + + /** + * Returns the formatted "memory_limit" value + * + * @return string + */ + protected function getMemoryLimitFromIni() + { + return ini_get('memory_limit'); + } +} diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyInterface.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyInterface.php new file mode 100644 index 0000000..4334d86 --- /dev/null +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyInterface.php @@ -0,0 +1,44 @@ +fileSystemHelper = new FileSystemHelper($rootTempFolder); + $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings')); + + 
$this->maxNumStringsPerTempFile = $maxNumStringsPerTempFile; + + $this->globalFunctionsHelper = new GlobalFunctionsHelper(); + $this->tempFilePointer = null; + } + + /** + * Adds the given string to the cache. + * + * @param string $sharedString The string to be added to the cache + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return void + */ + public function addStringForIndex($sharedString, $sharedStringIndex) + { + $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); + + if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) { + if ($this->tempFilePointer) { + $this->globalFunctionsHelper->fclose($this->tempFilePointer); + } + $this->tempFilePointer = $this->globalFunctionsHelper->fopen($tempFilePath, 'w'); + } + + // The shared string retrieval logic expects each cell data to be on one line only + // Encoding the line feed character allows to preserve this assumption + $lineFeedEncodedSharedString = $this->escapeLineFeed($sharedString); + + $this->globalFunctionsHelper->fwrite($this->tempFilePointer, $lineFeedEncodedSharedString . PHP_EOL); + } + + /** + * Returns the path for the temp file that should contain the string for the given index + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The temp file path for the given index + */ + protected function getSharedStringTempFilePath($sharedStringIndex) + { + $numTempFile = intval($sharedStringIndex / $this->maxNumStringsPerTempFile); + return $this->tempFolder . '/sharedstrings' . $numTempFile; + } + + /** + * Closes the cache after the last shared string was added. + * This prevents any additional string from being added to the cache. 
+ * + * @return void + */ + public function closeCache() + { + // close pointer to the last temp file that was written + if ($this->tempFilePointer) { + $this->globalFunctionsHelper->fclose($this->tempFilePointer); + } + } + + + /** + * Returns the string located at the given index from the cache. + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The shared string at the given index + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index + */ + public function getStringAtIndex($sharedStringIndex) + { + $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); + $indexInFile = $sharedStringIndex % $this->maxNumStringsPerTempFile; + + if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) { + throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex"); + } + + if ($this->inMemoryTempFilePath !== $tempFilePath) { + // free memory + unset($this->inMemoryTempFileContents); + + $this->inMemoryTempFileContents = explode(PHP_EOL, $this->globalFunctionsHelper->file_get_contents($tempFilePath)); + $this->inMemoryTempFilePath = $tempFilePath; + } + + $sharedString = null; + if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) { + $escapedSharedString = $this->inMemoryTempFileContents[$indexInFile]; + $sharedString = $this->unescapeLineFeed($escapedSharedString); + } + + if ($sharedString === null) { + throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); + } + + return rtrim($sharedString, PHP_EOL); + } + + /** + * Escapes the line feed characters (\n) + * + * @param string $unescapedString + * @return string + */ + private function escapeLineFeed($unescapedString) + { + return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString); + } + + /** + * Unescapes the line feed characters (\n) + * + * 
@param string $escapedString + * @return string + */ + private function unescapeLineFeed($escapedString) + { + return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString); + } + + /** + * Destroys the cache, freeing memory and removing any created artifacts + * + * @return void + */ + public function clearCache() + { + if ($this->tempFolder) { + $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder); + } + } +} diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php new file mode 100644 index 0000000..41b41be --- /dev/null +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php @@ -0,0 +1,82 @@ +inMemoryCache = new \SplFixedArray($sharedStringsUniqueCount); + $this->isCacheClosed = false; + } + + /** + * Adds the given string to the cache. + * + * @param string $sharedString The string to be added to the cache + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return void + */ + public function addStringForIndex($sharedString, $sharedStringIndex) + { + if (!$this->isCacheClosed) { + $this->inMemoryCache->offsetSet($sharedStringIndex, $sharedString); + } + } + + /** + * Closes the cache after the last shared string was added. + * This prevents any additional string from being added to the cache. + * + * @return void + */ + public function closeCache() + { + $this->isCacheClosed = true; + } + + /** + * Returns the string located at the given index from the cache. 
+ * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The shared string at the given index + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index + */ + public function getStringAtIndex($sharedStringIndex) + { + try { + return $this->inMemoryCache->offsetGet($sharedStringIndex); + } catch (\RuntimeException $e) { + throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); + } + } + + /** + * Destroys the cache, freeing memory and removing any created artifacts + * + * @return void + */ + public function clearCache() + { + unset($this->inMemoryCache); + $this->isCacheClosed = false; + } +} diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php index 1a73fbb..0f6d21d 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php @@ -3,8 +3,8 @@ namespace Box\Spout\Reader\Helper\XLSX; use Box\Spout\Common\Exception\IOException; -use Box\Spout\Common\Helper\FileSystemHelper; -use Box\Spout\Reader\Exception\SharedStringNotFoundException; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface; /** * Class SharedStringsHelper @@ -20,43 +20,14 @@ class SharedStringsHelper /** Main namespace for the sharedStrings.xml file */ const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; - /** - * To avoid running out of memory when extracting the shared strings, they will be saved to temporary files - * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory - * and the string will be quickly retrieved. 
- * The performance bottleneck is not when creating these temporary files, but rather when loading their content. - * Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works - * best when the indexes of the shared strings are sorted in the sheet data. - * 10,000 was chosen because it creates small files that are fast to be loaded in memory. - */ - const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; - - /** Value to use to escape the line feed character ("\n") */ - const ESCAPED_LINE_FEED_CHARACTER = '_x000A_'; - /** @var string Path of the XLSX file being read */ protected $filePath; /** @var string Temporary folder where the temporary files to store shared strings will be stored */ protected $tempFolder; - /** @var \Box\Spout\Writer\Helper\XLSX\FileSystemHelper Helper to perform file system operations */ - protected $fileSystemHelper; - - /** @var resource Pointer to the last temp file a shared string was written to */ - protected $tempFilePointer; - - /** - * @var string Path of the temporary file whose contents is currently stored in memory - * @see MAX_NUM_STRINGS_PER_TEMP_FILE - */ - protected $inMemoryTempFilePath; - - /** - * @var string Contents of the temporary file that was last read - * @see MAX_NUM_STRINGS_PER_TEMP_FILE - */ - protected $inMemoryTempFileContents; + /** @var CachingStrategyInterface The best caching strategy for storing shared strings */ + protected $cachingStrategy; /** * @param string $filePath Path of the XLSX file being read @@ -65,10 +36,7 @@ class SharedStringsHelper public function __construct($filePath, $tempFolder = null) { $this->filePath = $filePath; - - $rootTempFolder = ($tempFolder) ?: sys_get_temp_dir(); - $this->fileSystemHelper = new FileSystemHelper($rootTempFolder); - $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings')); + $this->tempFolder = $tempFolder; } /** @@ -108,20 +76,22 @@ class SharedStringsHelper { $xmlReader = new 
\XMLReader(); $sharedStringIndex = 0; - $this->tempFilePointer = null; $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sharedStringsFilePath = $this->getSharedStringsFilePath(); - if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) { + if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) { throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".'); } + $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); + $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); + while ($xmlReader->read() && $xmlReader->name !== 'si') { // do nothing until a 'si' tag is reached } while ($xmlReader->name === 'si') { - $node = new \SimpleXMLElement($xmlReader->readOuterXml()); + $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader); $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); // removes nodes that should not be read, like the pronunciation of the Kanji characters @@ -140,12 +110,7 @@ class SharedStringsHelper } $unescapedTextValue = $escaper->unescape($textValue); - - // The shared string retrieval logic expects each cell data to be on one line only - // Encoding the line feed character allows to preserve this assumption - $lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue); - - $this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex); + $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex); $sharedStringIndex++; @@ -153,10 +118,7 @@ class SharedStringsHelper $xmlReader->next('si'); } - // close pointer to the last temp file that was written - if ($this->tempFilePointer) { - fclose($this->tempFilePointer); - } + $this->cachingStrategy->closeCache(); $xmlReader->close(); } @@ -169,6 +131,80 @@ class SharedStringsHelper return 'zip://' . $this->filePath . '#' . 
self::SHARED_STRINGS_XML_FILE_PATH; } + /** + * Returns the shared strings unique count, as specified by the "uniqueCount" attribute of the "sst" tag. + * + * @param \XMLReader $xmlReader XMLReader instance + * @return int Number of unique shared strings in the sharedStrings.xml file + * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read + */ + protected function getSharedStringsUniqueCount($xmlReader) + { + // Use internal errors to avoid displaying lots of warning messages in case of invalid file + // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attack + libxml_clear_errors(); + libxml_use_internal_errors(true); + + $xmlReader->next('sst'); + + // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE) + while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) { + $xmlReader->read(); + } + + $readError = libxml_get_last_error(); + if ($readError !== false) { + throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readError->message}]"); + } + + // reset the setting to display XML warnings/errors + libxml_use_internal_errors(false); + + return intval($xmlReader->getAttribute('uniqueCount')); + } + + /** + * Returns the best shared strings caching strategy. + * + * @param int $sharedStringsUniqueCount + * @return CachingStrategyInterface + */ + protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) + { + return CachingStrategyFactory::getInstance() + ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + } + + /** + * Returns a SimpleXMLElement node from the current node in the given XMLReader instance. + * This is to simplify the parsing of the subtree. 
+ * + * @param \XMLReader $xmlReader + * @return \SimpleXMLElement + * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read + */ + protected function getSimpleXmlElementNodeFromXMLReader($xmlReader) + { + // Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node. + // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attack + libxml_clear_errors(); + libxml_use_internal_errors(true); + + $node = null; + try { + $node = new \SimpleXMLElement($xmlReader->readOuterXml()); + } catch (\Exception $exception) { + $error = libxml_get_last_error(); + libxml_use_internal_errors(false); + + throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($error->message) . '].'); + } + + libxml_use_internal_errors(false); + + return $node; + } + /** * Removes nodes that should not be read, like the pronunciation of the Kanji characters. * By keeping them, their text content would be added to the read string. @@ -219,42 +255,7 @@ class SharedStringsHelper } /** - * Writes the given string to its associated temp file. - * A new temporary file is created when the previous one has reached its max capacity. - * - * @param string $sharedString Shared string to write to the temp file - * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file - * @return void - */ - protected function writeSharedStringToTempFile($sharedString, $sharedStringIndex) - { - $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); - - if (!file_exists($tempFilePath)) { - if ($this->tempFilePointer) { - fclose($this->tempFilePointer); - } - $this->tempFilePointer = fopen($tempFilePath, 'w'); - } - - fwrite($this->tempFilePointer, $sharedString . 
PHP_EOL); - } - - /** - * Returns the path for the temp file that should contain the string for the given index - * - * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file - * @return string The temp file path for the given index - */ - protected function getSharedStringTempFilePath($sharedStringIndex) - { - $numTempFile = intval($sharedStringIndex / self::MAX_NUM_STRINGS_PER_TEMP_FILE); - return $this->tempFolder . '/sharedstrings' . $numTempFile; - } - - /** - * Returns the shared string at the given index. - * Because the strings have been split into different files, it looks for the value in the correct file. + * Returns the shared string at the given index, using the previously chosen caching strategy. * * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file * @return string The shared string at the given index @@ -262,63 +263,18 @@ class SharedStringsHelper */ public function getStringAtIndex($sharedStringIndex) { - $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); - $indexInFile = $sharedStringIndex % self::MAX_NUM_STRINGS_PER_TEMP_FILE; - - if (!file_exists($tempFilePath)) { - throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex"); - } - - if ($this->inMemoryTempFilePath !== $tempFilePath) { - // free memory - unset($this->inMemoryTempFileContents); - - $this->inMemoryTempFileContents = explode(PHP_EOL, file_get_contents($tempFilePath)); - $this->inMemoryTempFilePath = $tempFilePath; - } - - $sharedString = null; - if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) { - $escapedSharedString = $this->inMemoryTempFileContents[$indexInFile]; - $sharedString = $this->unescapeLineFeed($escapedSharedString); - } - - if ($sharedString === null) { - throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); - } - - return rtrim($sharedString, PHP_EOL); 
+ return $this->cachingStrategy->getStringAtIndex($sharedStringIndex); } /** - * Escapes the line feed character (\n) - * - * @param string $unescapedString - * @return string - */ - private function escapeLineFeed($unescapedString) - { - return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString); - } - - /** - * Unescapes the line feed character (\n) - * - * @param string $escapedString - * @return string - */ - private function unescapeLineFeed($escapedString) - { - return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString); - } - - /** - * Deletes the created temporary folder and all its contents + * Destroys the cache, freeing memory and removing any created artifacts * * @return void */ public function cleanup() { - $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder); + if ($this->cachingStrategy) { + $this->cachingStrategy->clearCache(); + } } } diff --git a/src/Spout/Reader/XLSX.php b/src/Spout/Reader/XLSX.php index 5865d20..7f176fe 100644 --- a/src/Spout/Reader/XLSX.php +++ b/src/Spout/Reader/XLSX.php @@ -166,7 +166,7 @@ class XLSX extends AbstractReader $worksheetDataXMLFilePath = $worksheet->getDataXmlFilePath(); $worksheetDataFilePath = 'zip://' . $this->filePath . '#' . $worksheetDataXMLFilePath; - if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) { + if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NONET) === false) { throw new IOException('Could not open "' . $worksheetDataXMLFilePath . 
'".'); } } diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php new file mode 100644 index 0000000..18b1c74 --- /dev/null +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php @@ -0,0 +1,99 @@ +getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitInKB']) + ->getMock(); + + $factoryStub->method('getMemoryLimitInKB')->willReturn($memoryLimitInKB); + + \ReflectionHelper::setStaticValue('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory', 'instance', $factoryStub); + + $strategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null); + + $fullExpectedStrategyClassName = 'Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\\' . $expectedStrategyClassName; + $this->assertEquals($fullExpectedStrategyClassName, get_class($strategy)); + + $strategy->clearCache(); + \ReflectionHelper::reset(); + } + + /** + * @return array + */ + public function dataProviderForTestGetMemoryLimitInKB() + { + return [ + ['-1', -1], + ['invalid', -1], + ['1024B', 1], + ['128K', 128], + ['256KB', 256], + ['512M', 512 * 1024], + ['2MB', 2 * 1024], + ['1G', 1 * 1024 * 1024], + ['10GB', 10 * 1024 * 1024], + ['2T', 2 * 1024 * 1024 * 1024], + ['5TB', 5 * 1024 * 1024 * 1024], + ]; + } + + /** + * @dataProvider dataProviderForTestGetMemoryLimitInKB + * + * @param string $memoryLimitFormatted + * @param float $expectedMemoryLimitInKB + * @return void + */ + public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB) + { + /** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */ + $factoryStub = $this + ->getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + 
->setMethods(['getMemoryLimitFromIni']) + ->getMock(); + + $factoryStub->method('getMemoryLimitFromIni')->willReturn($memoryLimitFormatted); + + $memoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB'); + + $this->assertEquals($expectedMemoryLimitInKB, $memoryLimitInKB); + } +} diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php index c868863..82631bc 100644 --- a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php @@ -2,6 +2,9 @@ namespace Box\Spout\Reader\Helper\XLSX; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\FileBasedStrategy; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\InMemoryStrategy; use Box\Spout\TestUsingResource; /** @@ -33,46 +36,6 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $this->sharedStringsHelper->cleanup(); } - /** - * @return void - */ - public function testExtractSharedStringsShouldCreateTempFileWithSharedStrings() - { - $this->sharedStringsHelper->extractSharedStrings(); - - $tempFolder = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'tempFolder'); - - $filesInTempFolder = $this->getFilesInFolder($tempFolder); - $this->assertEquals(1, count($filesInTempFolder), 'One temp file should have been created in the temp folder.'); - - $tempFileContents = file_get_contents($filesInTempFolder[0]); - $tempFileContentsPerLine = explode(PHP_EOL, $tempFileContents); - - $this->assertEquals('s1--A1', $tempFileContentsPerLine[0]); - $this->assertEquals('s1--E5', $tempFileContentsPerLine[24]); - } - - /** - * Returns all files that are in the given folder. - * It does not include "." and ".." and is not recursive. 
- * - * @param string $folderPath - * @return array - */ - private function getFilesInFolder($folderPath) - { - $files = []; - $directoryIterator = new \DirectoryIterator($folderPath); - - foreach ($directoryIterator as $fileInfo) { - if ($fileInfo->isFile()) { - $files[] = $fileInfo->getPathname(); - } - } - - return $files; - } - /** * @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException * @return void @@ -95,6 +58,9 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $sharedString = $this->sharedStringsHelper->getStringAtIndex(24); $this->assertEquals('s1--E5', $sharedString); + + $usedCachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy'); + $this->assertTrue($usedCachingStrategy instanceof InMemoryStrategy); } /** @@ -115,4 +81,32 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $sharedStringsHelper->cleanup(); } + + /** + * @return void + */ + public function testGetStringAtIndexWithFileBasedStrategy() + { + // force the file-based strategy by setting no memory limit + $originalMemoryLimit = ini_get('memory_limit'); + ini_set('memory_limit', '-1'); + + $resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx'); + $sharedStringsHelper = new SharedStringsHelper($resourcePath); + + $sharedStringsHelper->extractSharedStrings(); + + $sharedString = $sharedStringsHelper->getStringAtIndex(0); + $this->assertEquals('str', $sharedString); + + $sharedString = $sharedStringsHelper->getStringAtIndex(CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 1); + $this->assertEquals('str', $sharedString); + + $usedCachingStrategy = \ReflectionHelper::getValueOnObject($sharedStringsHelper, 'cachingStrategy'); + $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy); + + $sharedStringsHelper->cleanup(); + + ini_set('memory_limit', $originalMemoryLimit); + } } diff --git a/tests/Spout/Reader/XLSXTest.php 
b/tests/Spout/Reader/XLSXTest.php index f1e3d99..531bdb8 100644 --- a/tests/Spout/Reader/XLSXTest.php +++ b/tests/Spout/Reader/XLSXTest.php @@ -2,6 +2,7 @@ namespace Box\Spout\Reader; +use Box\Spout\Common\Exception\IOException; use Box\Spout\Common\Type; use Box\Spout\TestUsingResource; @@ -245,18 +246,39 @@ class XLSXTest extends \PHPUnit_Framework_TestCase $this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.'); } + /** + * @return array + */ + public function dataProviderForTestReadShouldBeProtectedAgainstAttacks() + { + return [ + ['attack_billion_laughs.xlsx'], + ['attack_quadratic_blowup.xlsx'], + ]; + } + + /** + * @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks + * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) + * + * @param string $fileName * @return void */ - public function testReadShouldBeProtectedAgainstBillionLaughsAttack() + public function testReadShouldBeProtectedAgainstAttacks($fileName) { - $allRows = $this->getAllRowsForFile('billion_laughs_test_file.xlsx'); + $startTime = microtime(true); - $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB - $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); + try { + $this->getAllRowsForFile($fileName); + $this->fail('An exception should have been thrown'); + } catch (IOException $exception) { + $duration = microtime(true) - $startTime; + $this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.'); - $expectedFirstRow = ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1']; - $this->assertEquals($expectedFirstRow, $allRows[0], 'Entities should be ignored when reading XML files.'); + $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB + $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded 
and therefore consume all the memory.'); + } } /** diff --git a/tests/Spout/ReflectionHelper.php b/tests/Spout/ReflectionHelper.php index 3fb78e4..df02de8 100644 --- a/tests/Spout/ReflectionHelper.php +++ b/tests/Spout/ReflectionHelper.php @@ -89,4 +89,27 @@ class ReflectionHelper return $value; } + + /** + * Invoke the given public or protected method on the given object. + * + * @param object $object + * @param string $methodName + * @param *mixed|null $params + * + * @return mixed|null + */ + public static function callMethodOnObject($object, $methodName) + { + $params = func_get_args(); + array_shift($params); // object + array_shift($params); // methodName + + $className = get_class($object); + $class = new ReflectionClass($className); + $method = $class->getMethod($methodName); + $method->setAccessible(true); + + return $method->invokeArgs($object, $params); + } } diff --git a/tests/resources/xlsx/billion_laughs_test_file.xlsx b/tests/resources/xlsx/attack_billion_laughs.xlsx similarity index 53% rename from tests/resources/xlsx/billion_laughs_test_file.xlsx rename to tests/resources/xlsx/attack_billion_laughs.xlsx index ac7aae3..d6cdc75 100644 Binary files a/tests/resources/xlsx/billion_laughs_test_file.xlsx and b/tests/resources/xlsx/attack_billion_laughs.xlsx differ diff --git a/tests/resources/xlsx/attack_quadratic_blowup.xlsx b/tests/resources/xlsx/attack_quadratic_blowup.xlsx new file mode 100644 index 0000000..a317c18 Binary files /dev/null and b/tests/resources/xlsx/attack_quadratic_blowup.xlsx differ diff --git a/tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx b/tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx new file mode 100644 index 0000000..f4d93f3 Binary files /dev/null and b/tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx differ