From 2dcb86aae93f4ee12be390dfdcc06fac3824f038 Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Sat, 11 Jul 2015 14:12:09 -0700 Subject: [PATCH 1/3] Move shared strings caching strategy into its own component This will help implementing different caching strategies: - file based - in-memory --- .../Common/Helper/GlobalFunctionsHelper.php | 13 ++ .../CachingStrategyFactory.php | 36 ++++ .../CachingStrategyInterface.php | 44 ++++ .../FileBasedStrategy.php | 188 ++++++++++++++++++ .../Helper/XLSX/SharedStringsHelper.php | 172 ++++------------ .../Helper/XLSX/SharedStringsHelperTest.php | 3 +- 6 files changed, 323 insertions(+), 133 deletions(-) create mode 100644 src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php create mode 100644 src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyInterface.php create mode 100644 src/Spout/Reader/Helper/XLSX/SharedStringsCaching/FileBasedStrategy.php diff --git a/src/Spout/Common/Helper/GlobalFunctionsHelper.php b/src/Spout/Common/Helper/GlobalFunctionsHelper.php index eb02c3f..feeb782 100644 --- a/src/Spout/Common/Helper/GlobalFunctionsHelper.php +++ b/src/Spout/Common/Helper/GlobalFunctionsHelper.php @@ -106,6 +106,19 @@ class GlobalFunctionsHelper return fputcsv($handle, $fields, $delimiter, $enclosure); } + /** + * Wrapper around global function fwrite() + * @see fwrite() + * + * @param resource $handle + * @param string $string + * @return int + */ + public function fwrite($handle, $string) + { + return fwrite($handle, $string); + } + /** * Wrapper around global function fclose() * @see fclose() diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php new file mode 100644 index 0000000..1bd7e76 --- /dev/null +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php @@ -0,0 +1,36 @@ +fileSystemHelper = new FileSystemHelper($rootTempFolder); + $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings')); + + $this->maxNumStringsPerTempFile = $maxNumStringsPerTempFile; + + $this->globalFunctionsHelper = new GlobalFunctionsHelper(); + $this->tempFilePointer = null; + } + + /** + * Adds the given string to the cache. + * + * @param string $sharedString The string to be added to the cache + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return void + */ + public function addStringForIndex($sharedString, $sharedStringIndex) + { + $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); + + if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) { + if ($this->tempFilePointer) { + $this->globalFunctionsHelper->fclose($this->tempFilePointer); + } + $this->tempFilePointer = $this->globalFunctionsHelper->fopen($tempFilePath, 'w'); + } + + // The shared string retrieval logic expects each cell data to be on one line only + // Encoding the line feed character allows to preserve this assumption + $lineFeedEncodedSharedString = $this->escapeLineFeed($sharedString); + + $this->globalFunctionsHelper->fwrite($this->tempFilePointer, $lineFeedEncodedSharedString . PHP_EOL); + } + + /** + * Returns the path for the temp file that should contain the string for the given index + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The temp file path for the given index + */ + protected function getSharedStringTempFilePath($sharedStringIndex) + { + $numTempFile = intval($sharedStringIndex / $this->maxNumStringsPerTempFile); + return $this->tempFolder . '/sharedstrings' . $numTempFile; + } + + /** + * Closes the cache after the last shared string was added. + * This prevents any additional string from being added to the cache. + * + * @return void + */ + public function closeCache() + { + // close pointer to the last temp file that was written + if ($this->tempFilePointer) { + $this->globalFunctionsHelper->fclose($this->tempFilePointer); + } + } + + + /** + * Returns the string located at the given index from the cache. + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The shared string at the given index + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index + */ + public function getStringAtIndex($sharedStringIndex) + { + $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); + $indexInFile = $sharedStringIndex % $this->maxNumStringsPerTempFile; + + if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) { + throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex"); + } + + if ($this->inMemoryTempFilePath !== $tempFilePath) { + // free memory + unset($this->inMemoryTempFileContents); + + $this->inMemoryTempFileContents = explode(PHP_EOL, $this->globalFunctionsHelper->file_get_contents($tempFilePath)); + $this->inMemoryTempFilePath = $tempFilePath; + } + + $sharedString = null; + if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) { + $escapedSharedString = $this->inMemoryTempFileContents[$indexInFile]; + $sharedString = $this->unescapeLineFeed($escapedSharedString); + } + + if ($sharedString === null) { + throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); + } + + return rtrim($sharedString, PHP_EOL); + } + + /** + * Escapes the line feed characters (\n) + * + * @param string $unescapedString + * @return string + */ + private function escapeLineFeed($unescapedString) + { + return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString); + } + + /** + * Unescapes the line feed characters (\n) + * + * @param string $escapedString + * @return string + */ + private function unescapeLineFeed($escapedString) + { + return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString); + } + + /** + * Destroys the cache, freeing memory and removing any created artifacts + * + * @return void + */ + public function clearCache() + { + if ($this->tempFolder) { + $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder); + } + } +} diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php index 1a73fbb..0653e1b 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php @@ -5,6 +5,8 @@ namespace Box\Spout\Reader\Helper\XLSX; use Box\Spout\Common\Exception\IOException; use Box\Spout\Common\Helper\FileSystemHelper; use Box\Spout\Reader\Exception\SharedStringNotFoundException; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface; /** * Class SharedStringsHelper @@ -20,43 +22,14 @@ class SharedStringsHelper /** Main namespace for the sharedStrings.xml file */ const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; - /** - * To avoid running out of memory when extracting the shared strings, they will be saved to temporary files - * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory - * and the string will be quickly retrieved. - * The performance bottleneck is not when creating these temporary files, but rather when loading their content. - * Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works - * best when the indexes of the shared strings are sorted in the sheet data. - * 10,000 was chosen because it creates small files that are fast to be loaded in memory. - */ - const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; - - /** Value to use to escape the line feed character ("\n") */ - const ESCAPED_LINE_FEED_CHARACTER = '_x000A_'; - /** @var string Path of the XLSX file being read */ protected $filePath; /** @var string Temporary folder where the temporary files to store shared strings will be stored */ protected $tempFolder; - /** @var \Box\Spout\Writer\Helper\XLSX\FileSystemHelper Helper to perform file system operations */ - protected $fileSystemHelper; - - /** @var resource Pointer to the last temp file a shared string was written to */ - protected $tempFilePointer; - - /** - * @var string Path of the temporary file whose contents is currently stored in memory - * @see MAX_NUM_STRINGS_PER_TEMP_FILE - */ - protected $inMemoryTempFilePath; - - /** - * @var string Contents of the temporary file that was last read - * @see MAX_NUM_STRINGS_PER_TEMP_FILE - */ - protected $inMemoryTempFileContents; + /** @var CachingStrategyInterface The best caching strategy for storing shared strings */ + protected $cachingStrategy; /** * @param string $filePath Path of the XLSX file being read @@ -65,10 +38,7 @@ class SharedStringsHelper public function __construct($filePath, $tempFolder = null) { $this->filePath = $filePath; - - $rootTempFolder = ($tempFolder) ?: sys_get_temp_dir(); - $this->fileSystemHelper = new FileSystemHelper($rootTempFolder); - $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings')); + $this->tempFolder = $tempFolder; } /** @@ -108,7 +78,6 @@ class SharedStringsHelper { $xmlReader = new \XMLReader(); $sharedStringIndex = 0; - $this->tempFilePointer = null; $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sharedStringsFilePath = $this->getSharedStringsFilePath(); @@ -116,6 +85,9 @@ class SharedStringsHelper throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".'); } + $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); + $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); + while ($xmlReader->read() && $xmlReader->name !== 'si') { // do nothing until a 'si' tag is reached } @@ -140,12 +112,7 @@ class SharedStringsHelper } $unescapedTextValue = $escaper->unescape($textValue); - - // The shared string retrieval logic expects each cell data to be on one line only - // Encoding the line feed character allows to preserve this assumption - $lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue); - - $this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex); + $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex); $sharedStringIndex++; @@ -153,10 +120,7 @@ class SharedStringsHelper $xmlReader->next('si'); } - // close pointer to the last temp file that was written - if ($this->tempFilePointer) { - fclose($this->tempFilePointer); - } + $this->cachingStrategy->closeCache(); $xmlReader->close(); } @@ -169,6 +133,30 @@ class SharedStringsHelper return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH; } + /** + * Returns the shared strings unique count, as specified in tag. + * + * @param \XMLReader $xmlReader XMLReader instance + * @return int Number of unique shared strings in the sharedStrings.xml file + */ + protected function getSharedStringsUniqueCount($xmlReader) + { + $xmlReader->next('sst'); + return intval($xmlReader->getAttribute('uniqueCount')); + } + + /** + * Returns the best shared strings caching strategy. + * + * @param int $sharedStringsUniqueCount + * @return CachingStrategyInterface + */ + protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) + { + $factory = new CachingStrategyFactory(); + return $factory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + } + /** * Removes nodes that should not be read, like the pronunciation of the Kanji characters. * By keeping them, their text content would be added to the read string. @@ -219,42 +207,7 @@ class SharedStringsHelper } /** - * Writes the given string to its associated temp file. - * A new temporary file is created when the previous one has reached its max capacity. - * - * @param string $sharedString Shared string to write to the temp file - * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file - * @return void - */ - protected function writeSharedStringToTempFile($sharedString, $sharedStringIndex) - { - $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); - - if (!file_exists($tempFilePath)) { - if ($this->tempFilePointer) { - fclose($this->tempFilePointer); - } - $this->tempFilePointer = fopen($tempFilePath, 'w'); - } - - fwrite($this->tempFilePointer, $sharedString . PHP_EOL); - } - - /** - * Returns the path for the temp file that should contain the string for the given index - * - * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file - * @return string The temp file path for the given index - */ - protected function getSharedStringTempFilePath($sharedStringIndex) - { - $numTempFile = intval($sharedStringIndex / self::MAX_NUM_STRINGS_PER_TEMP_FILE); - return $this->tempFolder . '/sharedstrings' . $numTempFile; - } - - /** - * Returns the shared string at the given index. - * Because the strings have been split into different files, it looks for the value in the correct file. + * Returns the shared string at the given index, using the previously chosen caching strategy. * * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file * @return string The shared string at the given index @@ -262,63 +215,18 @@ class SharedStringsHelper */ public function getStringAtIndex($sharedStringIndex) { - $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); - $indexInFile = $sharedStringIndex % self::MAX_NUM_STRINGS_PER_TEMP_FILE; - - if (!file_exists($tempFilePath)) { - throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex"); - } - - if ($this->inMemoryTempFilePath !== $tempFilePath) { - // free memory - unset($this->inMemoryTempFileContents); - - $this->inMemoryTempFileContents = explode(PHP_EOL, file_get_contents($tempFilePath)); - $this->inMemoryTempFilePath = $tempFilePath; - } - - $sharedString = null; - if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) { - $escapedSharedString = $this->inMemoryTempFileContents[$indexInFile]; - $sharedString = $this->unescapeLineFeed($escapedSharedString); - } - - if ($sharedString === null) { - throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); - } - - return rtrim($sharedString, PHP_EOL); + return $this->cachingStrategy->getStringAtIndex($sharedStringIndex); } /** - * Escapes the line feed character (\n) - * - * @param string $unescapedString - * @return string - */ - private function escapeLineFeed($unescapedString) - { - return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString); - } - - /** - * Unescapes the line feed character (\n) - * - * @param string $escapedString - * @return string - */ - private function unescapeLineFeed($escapedString) - { - return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString); - } - - /** - * Deletes the created temporary folder and all its contents + * Destroys the cache, freeing memory and removing any created artifacts * * @return void */ public function cleanup() { - $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder); + if ($this->cachingStrategy) { + $this->cachingStrategy->clearCache(); + } } } diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php index c868863..a89017d 100644 --- a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php @@ -40,7 +40,8 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase { $this->sharedStringsHelper->extractSharedStrings(); - $tempFolder = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'tempFolder'); + $cachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy'); + $tempFolder = \ReflectionHelper::getValueOnObject($cachingStrategy, 'tempFolder'); $filesInTempFolder = $this->getFilesInFolder($tempFolder); $this->assertEquals(1, count($filesInTempFolder), 'One temp file should have been created in the temp folder.'); From 334f7087daa466143586cfd8e97dba937c43a3cf Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Mon, 13 Jul 2015 00:29:50 -0700 Subject: [PATCH 2/3] Add in-memory caching strategy for shared strings In-memory implementation using SplFixedArray Updated code and tests to support errors when reading XML nodes (useful when reading XML files used for attacks) Removed LIBXML_NOENT option (which DOES substitute entities...) Added test for Quadratic Blowup attack --- .../CachingStrategyFactory.php | 10 ++- .../SharedStringsCaching/InMemoryStrategy.php | 82 ++++++++++++++++++ .../Helper/XLSX/SharedStringsHelper.php | 59 +++++++++++-- src/Spout/Reader/XLSX.php | 2 +- .../Helper/XLSX/SharedStringsHelperTest.php | 69 ++++++--------- tests/Spout/Reader/XLSXTest.php | 34 ++++++-- ...t_file.xlsx => attack_billion_laughs.xlsx} | Bin 4065 -> 4051 bytes .../xlsx/attack_quadratic_blowup.xlsx | Bin 0 -> 4017 bytes .../sheet_with_lots_of_shared_strings.xlsx | Bin 0 -> 45183 bytes 9 files changed, 199 insertions(+), 57 deletions(-) create mode 100644 src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php rename tests/resources/xlsx/{billion_laughs_test_file.xlsx => attack_billion_laughs.xlsx} (53%) create mode 100644 tests/resources/xlsx/attack_quadratic_blowup.xlsx create mode 100644 tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php index 1bd7e76..51bf497 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php @@ -28,9 +28,13 @@ class CachingStrategyFactory * @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored * @return CachingStrategyInterface The best caching strategy */ - public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) + public static function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) { - // TODO add in-memory strategy - return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); + // TODO: take available memory into account + if ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE) { + return new InMemoryStrategy($sharedStringsUniqueCount); + } else { + return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } } } diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php new file mode 100644 index 0000000..41b41be --- /dev/null +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/InMemoryStrategy.php @@ -0,0 +1,82 @@ +inMemoryCache = new \SplFixedArray($sharedStringsUniqueCount); + $this->isCacheClosed = false; + } + + /** + * Adds the given string to the cache. + * + * @param string $sharedString The string to be added to the cache + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return void + */ + public function addStringForIndex($sharedString, $sharedStringIndex) + { + if (!$this->isCacheClosed) { + $this->inMemoryCache->offsetSet($sharedStringIndex, $sharedString); + } + } + + /** + * Closes the cache after the last shared string was added. + * This prevents any additional string from being added to the cache. + * + * @return void + */ + public function closeCache() + { + $this->isCacheClosed = true; + } + + /** + * Returns the string located at the given index from the cache. + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The shared string at the given index + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index + */ + public function getStringAtIndex($sharedStringIndex) + { + try { + return $this->inMemoryCache->offsetGet($sharedStringIndex); + } catch (\RuntimeException $e) { + throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); + } + } + + /** + * Destroys the cache, freeing memory and removing any created artifacts + * + * @return void + */ + public function clearCache() + { + unset($this->inMemoryCache); + $this->isCacheClosed = false; + } +} diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php index 0653e1b..0ced513 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php @@ -3,8 +3,6 @@ namespace Box\Spout\Reader\Helper\XLSX; use Box\Spout\Common\Exception\IOException; -use Box\Spout\Common\Helper\FileSystemHelper; -use Box\Spout\Reader\Exception\SharedStringNotFoundException; use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory; use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface; @@ -81,7 +79,7 @@ class SharedStringsHelper $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sharedStringsFilePath = $this->getSharedStringsFilePath(); - if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) { + if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) { throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".'); } @@ -93,7 +91,7 @@ class SharedStringsHelper } while ($xmlReader->name === 'si') { - $node = new \SimpleXMLElement($xmlReader->readOuterXml()); + $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader); $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); // removes nodes that should not be read, like the pronunciation of the Kanji characters @@ -138,10 +136,30 @@ class SharedStringsHelper * * @param \XMLReader $xmlReader XMLReader instance * @return int Number of unique shared strings in the sharedStrings.xml file + * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read */ protected function getSharedStringsUniqueCount($xmlReader) { + // Use internal errors to avoid displaying lots of warning messages in case of invalid file + // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks + libxml_clear_errors(); + libxml_use_internal_errors(true); + $xmlReader->next('sst'); + + // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE) + while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) { + $xmlReader->read(); + } + + $readError = libxml_get_last_error(); + if ($readError !== false) { + throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readError->message}]"); + } + + // reset the setting to display XML warnings/errors + libxml_use_internal_errors(false); + return intval($xmlReader->getAttribute('uniqueCount')); } @@ -153,8 +171,37 @@ class SharedStringsHelper */ protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) { - $factory = new CachingStrategyFactory(); - return $factory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + return CachingStrategyFactory::getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + } + + /** + * Returns a SimpleXMLElement node from the current node in the given XMLReader instance. + * This is to simplify the parsing of the subtree. + * + * @param \XMLReader $xmlReader + * @return \SimpleXMLElement + * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read + */ + protected function getSimpleXmlElementNodeFromXMLReader($xmlReader) + { + // Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node. + // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks + libxml_clear_errors(); + libxml_use_internal_errors(true); + + $node = null; + try { + $node = new \SimpleXMLElement($xmlReader->readOuterXml()); + } catch (\Exception $exception) { + $error = libxml_get_last_error(); + libxml_use_internal_errors(false); + + throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($error->message) . '].'); + } + + libxml_use_internal_errors(false); + + return $node; } /** diff --git a/src/Spout/Reader/XLSX.php b/src/Spout/Reader/XLSX.php index 5471f85..83bee9d 100644 --- a/src/Spout/Reader/XLSX.php +++ b/src/Spout/Reader/XLSX.php @@ -158,7 +158,7 @@ class XLSX extends AbstractReader $worksheetDataXMLFilePath = $worksheet->getDataXmlFilePath(); $worksheetDataFilePath = 'zip://' . $this->filePath . '#' . $worksheetDataXMLFilePath; - if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) { + if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NONET) === false) { throw new IOException('Could not open "' . $worksheetDataXMLFilePath . '".'); } } diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php index a89017d..386e93a 100644 --- a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php @@ -2,6 +2,9 @@ namespace Box\Spout\Reader\Helper\XLSX; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\FileBasedStrategy; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\InMemoryStrategy; use Box\Spout\TestUsingResource; /** @@ -33,47 +36,6 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $this->sharedStringsHelper->cleanup(); } - /** - * @return void - */ - public function testExtractSharedStringsShouldCreateTempFileWithSharedStrings() - { - $this->sharedStringsHelper->extractSharedStrings(); - - $cachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy'); - $tempFolder = \ReflectionHelper::getValueOnObject($cachingStrategy, 'tempFolder'); - - $filesInTempFolder = $this->getFilesInFolder($tempFolder); - $this->assertEquals(1, count($filesInTempFolder), 'One temp file should have been created in the temp folder.'); - - $tempFileContents = file_get_contents($filesInTempFolder[0]); - $tempFileContentsPerLine = explode(PHP_EOL, $tempFileContents); - - $this->assertEquals('s1--A1', $tempFileContentsPerLine[0]); - $this->assertEquals('s1--E5', $tempFileContentsPerLine[24]); - } - - /** - * Returns all files that are in the given folder. - * It does not include "." and ".." and is not recursive. - * - * @param string $folderPath - * @return array - */ - private function getFilesInFolder($folderPath) - { - $files = []; - $directoryIterator = new \DirectoryIterator($folderPath); - - foreach ($directoryIterator as $fileInfo) { - if ($fileInfo->isFile()) { - $files[] = $fileInfo->getPathname(); - } - } - - return $files; - } - /** * @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException * @return void @@ -96,6 +58,9 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $sharedString = $this->sharedStringsHelper->getStringAtIndex(24); $this->assertEquals('s1--E5', $sharedString); + + $usedCachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy'); + $this->assertTrue($usedCachingStrategy instanceof InMemoryStrategy); } /** @@ -116,4 +81,26 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $sharedStringsHelper->cleanup(); } + + /** + * @return void + */ + public function testGetStringAtIndexWithFileBasedStrategy() + { + $resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx'); + $sharedStringsHelper = new SharedStringsHelper($resourcePath); + + $sharedStringsHelper->extractSharedStrings(); + + $sharedString = $sharedStringsHelper->getStringAtIndex(0); + $this->assertEquals('str', $sharedString); + + $sharedString = $sharedStringsHelper->getStringAtIndex(CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 1); + $this->assertEquals('str', $sharedString); + + $usedCachingStrategy = \ReflectionHelper::getValueOnObject($sharedStringsHelper, 'cachingStrategy'); + $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy); + + $sharedStringsHelper->cleanup(); + } } diff --git a/tests/Spout/Reader/XLSXTest.php b/tests/Spout/Reader/XLSXTest.php index 5856887..cd24d5a 100644 --- a/tests/Spout/Reader/XLSXTest.php +++ b/tests/Spout/Reader/XLSXTest.php @@ -2,6 +2,7 @@ namespace Box\Spout\Reader; +use Box\Spout\Common\Exception\IOException; use Box\Spout\Common\Type; use Box\Spout\TestUsingResource; @@ -243,18 +244,39 @@ class XLSXTest extends \PHPUnit_Framework_TestCase $this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.'); } + /** + * @return array + */ + public function dataProviderForTestReadShouldBeProtectedAgainstAttacks() + { + return [ + ['attack_billion_laughs.xlsx'], + ['attack_quadratic_blowup.xlsx'], + ]; + } + + /** + * @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks + * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) + * + * @param string $fileName * @return void */ - public function testReadShouldBeProtectedAgainstBillionLaughsAttack() + public function testReadShouldBeProtectedAgainstAttacks($fileName) { - $allRows = $this->getAllRowsForFile('billion_laughs_test_file.xlsx'); + $startTime = microtime(true); - $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB - $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); + try { + $this->getAllRowsForFile($fileName); + $this->fail('An exception should have been thrown'); + } catch (IOException $exception) { + $duration = microtime(true) - $startTime; + $this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.'); - $expectedFirstRow = ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1']; - $this->assertEquals($expectedFirstRow, $allRows[0], 'Entities should be ignored when reading XML files.'); + $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB + $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); + } } /** diff --git a/tests/resources/xlsx/billion_laughs_test_file.xlsx b/tests/resources/xlsx/attack_billion_laughs.xlsx similarity index 53% rename from tests/resources/xlsx/billion_laughs_test_file.xlsx rename to tests/resources/xlsx/attack_billion_laughs.xlsx index ac7aae32485346f35675fdbecfaf4035633724dc..d6cdc751e479e322236eb87ac26bb9b7e0b5980f 100644 GIT binary patch delta 603 zcmaDTe_4J*J|oMv%GYk2ix~}|tmRC>OklCe|5#)plJcyEjJ#Wb%AQTw#uCBEz>qgN zmQA+a*ZZ)6M8o^$zwJ8g`O(+%?zVecXGJH9e+}Swwwa=v*J?J?SMAq(-;<7pznV@y z>a&byNjaV1t?3m@D_Y}Y~t{v{#4GF_yVnkYgw-J`{k!qD}G&k zdG_slJ-;r#blqmYyso5*`*!yV`MLKleG>zzNUipM<-UCTw!NZXlkZo*)32#NfA+jm zNRNNi{PmoHzlHO)O(&-7^;Axbjn4dhL$=eZe2>)%;pe+@JuK$xc|>1jTD*4;hwNj6 zLobwGUFSZ(FM_Ww_V4oZs%w3USWCWrD)gK*yS>cT{;(c)Ud^UFnpBAWpT@FB4o z7&N#T7%Fn~%kzt}i!)MFON#XaycwAo7({@oI2bn1D|S^`Eo$==m;$D9GB8M>=na{? MfnS>~p9dre09V`a761SM delta 576 zcmcaC|4@EIJ|m0m%3{~e#f%0})^es`Ca~D#e=M>PNqJU7M&9kqid{AJ1E;1jGB7kv zj%Aar_xC<*AkqH*&^~7|$1fX7-(Jg|wQcb-L6-k-0{ES6rs(Fin$7i9`}NlMtYhR8 z*2#zUpVUrjH=p3r-M`D}{hq0*%1Lkd-6aY)=9w+MaqNfYe#tY2lU51LUX;bs|51H< z#l$aui|W34e7-tKQB$<)-P{E$&dYteUDLX1nNtM2pU>C&zPs6<_P+m9;USF-ovTuovXo8z78_yt*(EsmVHgiDbA0t~MNy?UzikdP+3zctB z_MNhfrDV^?STe}}%EyTQ|3B9?>wE6!yyt$-bI$#m8q)3J0D(XZAfx-1I`ob+LogZ; zC}tN3v=8|Ayq24*H`>+P+C0D=?PVqH@8VpRtYJu$;rK8W(U<*I!bO4QP>U?XBu zmXUQ1!wy59++lJ8zUDEaRqnzfqx2_qqFDNWoSG~+bG_tclE&x@WO!*h9G^2U9T67* z=a1-?ds_p}y>~28ilZv1T|?xnyrv;R-|u>FXa)N#c8SOi3@74@Yx-hU-9l;dgu|nP zaA|+_kA42#zPMoW%h2giEBCL~(~R)F>W4o=TQA8#zK?fdWKi#X`NYKzinZRFeiJ;; zQmhn=-Dfa+xmkRi9%0(^Lj9rdh+*{ffvAfD!ZSjOuc|t~zjJws4p&XQ^sQO+`A`nc z4KRLJJXaNuBTfc2;4WuIIpQk5q*gn@Z~8CEAW#8fLKon5)98V3)}Cl*FX)EP`bPL^ z2^I_k*KBD)Af`>iLd8`|vqJPyluuW^)}baG}Nk ztLbVZCX(TlQLirG#kp_ytIjl1D!Y?D7+s4J;%A~aag{%*zfbz2^R*srVUIJY+s z<#z+RbVf{TfWd;1_Hw>ST>u(BcSIlGP@(HVFHv3a(Gvz^ZYHKIE71@ZUGGGNOVZIZ zv@X+*9r5rPSk}og180fad59lkUMHFR}&^IMk5^f7=Avx(>{%KuLA6P zcq)*N^F)MR8ikU_krG2HFC3S8dH4O1=1VI3iZ$P`9wmN?<9GaCsGs_2c_ibTPzt^> zCC-FI+rWsh*&A*spePXa*eZS`C)xIN7qrueR$RvWZuu^plkPM$-$D`j(%;TCvzaD+ zo_pYNVIue$7lsGxEG8=5U^2bb`lWWZ-TbzPC9P?XDZ5EPU>GAe?>HIek*o+SNcg%kJDD#_Ij-BS<5CkI_4}mkv#aF=NV%{A*J`6_*jw-h#r|(-4 zD2Y9mr8>IDa)iZy$_M!tZ`pfhRI&wo(pGhh&p_>OPjm1CWY*J)VDJX344{WVgL^)j z2~hybQdI#2TAM8ZX?ew#&_?Oi7QBF8`yx80BjuMY1xq_$MaaDA!E!9b2e_Z@76y<0 z>sG7e(x^w-WsFHo&jAQ*;(an z#!4@9t7k4E7wu)wmAgN82wn)UZMcm{wNFivua}d0bJ5=Bjt#N_R$Js;UiUYQSd{KG zA2iM#J9>o5^V+mNXOK6n1n@irRNY=6sWqOx1Dw&n2=*~Eaw8sOE;H%6qTXXk5Xf~h zK+dr{4g0jj73|T&uPwnyCb7*M%ro6L09EFzn& z8Jy@d0YwO>Am&A%w0%6ImST+OGrM%NTUZ1-JR9}8{NaUVG}y@uT~(Q^uxD`2_cITe_><(Os!}kbTEEr;`|HQ(|oGN+*MPg>!9;b)GGNSuG#>4{K)p z9KO;Vo?gRi3JY>?GE#0#A{X{Fuqi1Ub|MuZ4_=jo4EHnZjdSvl?+ct$lfVX@M^c_J zWp?1R$V%{Mg2e1ThHDt+oP}ITtG-#v!YiIR${a`M%7vB94zWInS?rUE9{_HT0X(y; zbG-FTC*U-UiR#Qc3;I1aUr%P_Fh$IfxaJ{K5DB4xIbZl!>@DRj20wYJpUl zeX9zZ)T`6rRuTS;zF3`|n*3KMLPndP%`kJBke3+(iRssM{Hn$*fNlGNUBbniyRw#x za}++ayTe(RR)~4GI)bBgEAIJhJAwCdX~u^fkWx}~O4>_EIQB!ia@7`j#iW!%yHQs= zyCQ^|vzH?p?Y(v47G`?s9M0!d83MSu2~1_49lpOHvcD$pO>4I<8IH<6$vNehnXnfu z`PFK7U)+dg6Se5-P|aa!FNzjRWtWOMo!&gz-B{6z4Xk%zWZ|~SBF8E69q=IM<3FA) zer;|9FRNt}A&9pToHK1Hi6Io32ikrCg>{ncA5(v(+#wx)cx3QsS(*cGfobMRmtde2 zTJLQ}CaXvBk2}5gxEwRiZ2P7Pd?+edRyk;l?6*MnPN|np;#R;lEqqH5JT_t&>n1KB zs6YdKu9C%)7RHT4y^l6}A9ob`l_=VuAO#%+K8%jK)d7E|AxV*diyu1RjH>f@|31Ir z!7JtU)fjJhr&Y--dpP%H#4yFo<&vH6*}SLZhG6HcR8o?6J%;g z1GJ+5{x|{D#}@m;!v%GLot3yIm8pgXYHni^{~mO^bf#irHZi|O@&5|8rZK5@8^~&l z;`}|{cFjqJz26G^SAkN4uJ@PiGLDMQ2L#Dx&<#;Xh3_P2O~6p?AK(%0mz>$S+ns>l zlVscd3-D)Bw+kF<9L24$t*vnz$A6Xv)OB=${r!_rpk};I*!H1ET>%f?HUhUpxA$@? zlpUY~2)_09soi@OHmRI3 zq_i4_v=4PUqg9=0IW6b>pXYhcdwREH&Sk>P|N7<{*QKZTeU|%oZ@>F~UP~-{^z5&s zr1Y7Rarc^eFZ6?Y&*-M4l-@&0Ndt_ows2>AI8;Uxx3U@W zdw-bLa?twW>B(Hy8L#pfRh@P z1)C?#;%dcxcKGuX_XiG7vidSFYq@9j+2|mc>Ea*jwM?Q##Vse*>zV^>MeVQ z1)sce?EW_oC#dhOYwNGuyroim&V@H4Sih9aSz!|VIJVE0W4Z@yOG{?;w+p$tmc39c z5(gjMXxz5Y#4&@_%fR}o?H6B;w$8s=i&lQTpVQVA6vI|E7ah{LHED-WrF&gx{iZKeHS+lAK`Lv_&Mb&l^;B> z?b=}Y)#_h)hQlv=g!w#}u+#g|aP6aq^s4zgpD*n8$EF7FD*kqpIODNjtPXb@{)7F! zAJgwoX$>DcYoprS-;~FtBu|b?`8Ks!x8U~GLHcLL&d+>a5q93ENj0ea)Mx3NpKV{Q zY~3q-?dE4*OU}+){>I{)`^K4#XA-}%U3pFN$Fd;@whZ#``OKwp@{tw1eHDH1vtz2T z$9sU52C&Q_Ko$BD54GN%;{;E2F=UqYo(Y41LJYWI)5GAw0hjNGO)++H95Ii(xb4W) zr%z+IyGcrRe}C6ICc&`oxSj)#`|{lKp~LRa$bZxC;^(vaZYufwN?5;?-;!De&dr%@ zu(Hr)%&&vB3RYa(Jv76(w{`sFwJAL{l|y!%2C*J?UL?dgm$!F_h2K{=W|C)t?({Aq1-nV7`k>9;p^@cU~zi!qunbzBf z!}+Q8o^`8R`7evc2Kf4UYo`TY)V`P7?YnDZk{uhCv3L2XFTMGM`=+8|YElx{>$2@} zPmSXyRxxMGr)0mWXg#R@&4OLyztY@1-~}&KvOV|ILT%NJug2^%w%OG>*k*mZe@>+a zE^BDOLLxi>bm^VO*H4>kim!Su6)0P|I@EO@O78`uvYy)mxk==t{rszbpMO{ z(j2C*`7W-PVa}Qvv9EdrX`R{A{>{EY8{Mym{a2$orsP7X_(;*$)^VIREc4{Ju$?{N*=urU!3d%J#NWEFkG00pnU}wL z(EWhM&)v?yhz%JM^^mo@P~+wO0p9k9r%ZPLv)IhZ&u!mFBoI)T~ZWOk!np^0aW zwRm)7TXDmUQNzFeG2Qxq0+ia40AjK%kV)?50m^mjWCHQ=QO)V{|<>%e5 zVafgrBQCqH`dy{|erYw!hvP8j>}lJV*Y&q9Fio2iH87}-Yd!w=*q#U1IsP&*`tr@H z71@TFqCKK&x5(K4+&YoiWZ%};B+rB7fb^>Q$cHLmRhYk;xrdPbGS>c|QXP|RFzn9+j z+p|^c&HnT{8hCVwPX&ZmbSM_MEqQym>bw;!|P}^7r)_2L^^>Y z{&-%i$?s}`;xGqEnS;228`u$6+AeCAv}E$yYjY+1U`cDHj+rDPu%#?g45k#dM+LUJ z^F%_P_)TbOM@L}0T1Te2q@uAbO%xU=DmL$Ul_wGXz!NVM9yS+0FmHbbww`1z7E4+~ z9mH=#C4T9V0U|+Z2OrE3DXEVXH-c>y@>-t@%mdnVxS}NRD=&#CQj!)}V;$Ju?BHK= z+dp8gq%}!W&l5J~rEw)Zi8ycyPok45XtU^SvHWW4=#NnJ4z}z5)N* z4FB2^%WEnQtZB>TMP!z?j0xolm$Cw@{<}t8GJln%g4fX_IWXXDgO_;*-#zd|W~pRM zsM7iC%39)XL6MT}cl)dJznyu)vfEOY@|;z@r+4%SH#helJ3Qm!_hUl4pI`IE{@Vk$ zid60dh|Pb-9!kXCi2J_K{O1ng9_%3w_J(V>;K1(%0!c1q+LrAn!c%*TiGA4<`$bQkbN7Y%8(WJD zu6;t9h8%pgwauq;M&6JScCPz9Z|FXa+q=NGPwM_V9VWgJ;EB%1Q}#M8DGE6Eb2NCO z^KmdMi*25}E)Sa!d)$Tl)1V;*I;K~8jo@+h zD%twr@fyvAsr^N*K9hW3BwbjLR|x*Jfj>hpE^zI`bv)>odhBi5u^t5%RbnmGH-(OP z()86?{%ienDbqq~hRlt39grWrYC~btn2N_Xwd#4S+jdFmJ8DK0#vO9=)l6L?I{2TG zox=Un5%B98yZ*=dRr`nC#?=$MJRv<=<2Zg^PRD!`-#6WBo+=ky>>X>Vxv4Gi)T*oq->hv_jiU-0 z&7WlL=m?$?V<)l5h8r$QOw&pzS$Cdzifr@pAR~54`W&#+8f<|k*1<%N+`z*rvJr3) zD1;o-iaU+pXhRA;zUhS*W8t-Xz9vYrZ6p$K2#ivMAQf8fX1A!nr+F5gFQIIp2Ja{ls7!hd3p#l_9 z*RuXdmA_sAz`GN+h&p}olxh+db>~kTm(pR&im1~QR|`px*1QICETh6^p6HMjF(1GN z7HiZg&E3(V(gI-XD@L&SJOskYm*z&)F`SCj&P8w3FiH-oi?sJUG}2Q zDz+-}(8-!sz4cB>;{28QFIcr;mNGr-XmE&7))SW9j{rNQ5N!h(M;WbfW|!1g%S;k_UntoUk%vL2tii8cHLWLlnbkiq z$OC5sz7ab^P2ktAU|5Qc`Pr$3jcU%>J37i%#Q^W^A<)3!l&RmR9-Ly{1idcND2-dX zy?t)|eXh3wE=1m$n zaXX68CEHS8sZpN1 z-5yMz^NghaVX7wK$=V~!Z4SNe|3ug8kUICsaQmUNlBezV-pac?V{Lz7WWE=UG4TSpClnCdWLw@uP+6P+f2q+2*r`IPXzS-1Yg$LvZ!+O&*7a?R{y3UBe?JnCY(l%AEP{qpCe+)jnz)ZT;m{f7Q~GsIIQ_;; zFr)V$M@{t|6S=2vty!71l09x?(NLG1QCqeJ{&B)o|8-p7hpr>{=gm`BJG{Kvu^EoB&6&M;;}8mm1rW&Wx7 zJGOAJzTh+IzEwi+hn<uW3LxlIzO@(5tSW2$b(HUlhmK!?Jp-u&djeySQ!TH8*S2U7(#`>B*R?9FJtxtcBJ&rU6U|DzBfMoy&Nw8>dKICheK!>IBsR0d^CWf)@&*;yo zGK>vj7#qznHi==(j5G#eIa%=f8#G9SCF)}Qo#&pJbXcYp(_vv4i#DV`$DU3fV;RxM zY8l3&XV9Nx8D!o2hh?rQ9TtYM=y~+#*z@UQtOfM3T86Rch4kmx z-;l;!M{+AvU+9C;DpUQki+vwijyxPXZ;*M1X8zgn4Z`+>j+NtWd6io_7BAtw&{1x@ zC;s}f$`bo*XCHpq+%Go&wA=Rjj|0bDRr#eS!} z%#I& z@ni4UX9qlYHMpKFh#Qnnmdr3&-_FTu|Dyd&{Cz7*Qw^!1HC$w=-DR6+jA)Y#43pFIEPk$5AkS@^n$+wt`BbH$`4Y+EzqL_7LzZc6Cwp zNO_$OwkeZ8*gqJK_DE?02Mr}0w1+|Vi6lYnn4~o?aIH{_owBU<+_UrG?xSRL1NVqd~%k`R3V8^oKVe&r8An%jd zlf<*J!!ueJ-|}ouA@T zHiTvp?WMhHz`j_zy>zSIUXBnTpJ!y$+a3`Xe5ri2=9%CS8`*>`ig%TlT` zcGYCSTm-+^J0n#f8U5ty0nG{Vc3@r1;(^9tD*HnfU1crT6tXB9#||%JKxK4Zd?r=c z`w|(1ofB6>RnA8TqUmN;>$xR~pxDq>4yU|cnMb#k!zs}Ww&KgD+saA0t=Q0Q#hz{} zT{ao8m9O%vauDn?@q@F~9!q)sD6i?19jgjBHB7yrd9$@z8H8Pco+O)qRRDttG`srJ zP)Xx4prRC?0=x}LUyn>)8%Yi=U^_+K3rh&_ZRcJza%?)d2RtdWeBhhGaN>rhok-!y z=czqThrpT+ff*fwHq7#f{-Mhz%L9SBI$u08Dz%Z-e9eBd=xV6#y~>NHxUFx@Ghg_f zIz2EL{-WeXq;jKc$EqcDRk<*dKleLjLyqK5?q-&Y4yBDIx}pZQS*EcL$vbhTM2@bz>IpjOX^Qgsv_d zEKNEbh=@KG)ptlI0in}<|F8r~h^^>ztBf4<6LOot?$TMIFloVz!`C3YCdSl#e*{p) zPTwY7If~#lkYfWYQyni@8NN#jkznOzRA&qpriM)*7AvZ^2VxmZ^^!m=M2o^9fxDw4 z0^cUbl&)L3YsCV>3I+&}3mp(UCO|?N0HH;2_x< zfUhKF`T!7Z20%vA0r6$B5V$)!AnLD zerxz{e4O8NYssqkikNm2gL?t!SMlO;mmOeicL;a-t7v+cmDn~ z-W9PTL^IyW>@}AK|>wsNtb_9 z3CDI#9fwy*$|7q|J$tQdOJ=C6tqw&|VlUgs;2iEUagxZ7^(NIN0OJIcQG+{}jIcq( zIKgBk^vPQGcV-XGwr2q>SOVS|#bKbV9%lH^o`KNseMu+u(kDp-c$-BvoyrV#}!W{jrG%xEVpP&Cs?VzCE} zBzoo2Nunv8ady>YC`aa-m*Tf%?cXnh-9i;++pDhKlq9|0__#hs>W;>E^qcWu76I8xzmw_IvjF776~M zZ0AHusoz?-%F)(@7(bChg#b#9DinAfA322`<$Z3gPs!}KDk~`Aw0Alw1HlM4HRwaa z-Jens1zcwhC3b@xGKh3I5WiqGfOHb98oWT)ds1~Uwh@}tCJ7xKpQG1A;w z-G?R!R(>DCEzKc6U^M`T1ny22KxBx(S^^D_PhW>%5Q_{Dusce`2Y^7*OatV72)A_Q zE~33CZc#Wn4hY;G9S~ScpaDY5Q9gPdfB?P1d!>MoI5}=1aCdYIfwcr0AfLVt z!5|hpS0g~_olDfgCG;Zh&MSOb(LPEtZnbb#z`23VI*P}4Q*9oJd2odmc#f%c&8FyQKAJ3}wZ3jkLSr4xU9hRG`0(-Zh?vtdh{fzt|(82JYp3QTCW_+Q|Let z_B-`qJkT7xzmAE&4Az2(j2AN@jDV8~*GF!=BftCCU3^5dOql3AG!-v}O@-Jbf2XM^ zO6KnoBREXJ<-pt|`h(yzP-Gbljf#57sI5Ag;t-H$`t-ec27ZrRm4M+Yil zaHUJ$l|0bL_r?Pnh@nW&YyjIu>5HsdjEv|cmE^ZVSgASRE<8QWov_f7^QZNrlK;?m^q1JZN_YX~w(Nc0Lvgv9Z2KtAAg zf%g{+mb3ssNCgNPAS6mD1%%WWm#+K)X}T1Uzp5eNfRGCiI3OgBj|1`nuM51tSY&{Z z3J@|tNR$!>gxD9Cu1xd(z`8yzS(X99Xdxtyj{`!CL`($&sVcyC@$?RbKu}rytLp;q zFBTafqymHt5I}foE(=n8(&adC!|4Okbg5YWs)m3ALM}ky7DD3qI3OSJx_}gr_YlkW z8+LSUADE1IXAr^%ZuI*R zCPNMFHY`z>C5ecBC4n5(7xcaDTR%BdBAzZqMADM^EDQLliRQ+)-&(l+XJr+MG`z5`Pp$z%?8uK2I(1{locuGZUYO z9;}UisZgGi4*+3c^8k>48JmZI z(Ahi$hk*Rc*gOD)fz1O8`8Tn7r?y|U%-eD+1)Aeh`~;R0 zN=4shdENU;7-7CAIOaPYf#6Dg%dL_(0fI0;t@81r>FHkJj~>VVd72Nu_aar!@jXT5 z!wrqVP*Y`Db^Y#a^wVPa3%?NpJO6O6dq0zgE}P}_9q)Zct9-Od;^SFt&Qa^~@aJhp zLS_8tFs*V8Bb&x;(cbsLZ^2LK7iJ+IHadbF&C|peO~5W*E4$-+SghKxEBwaUqh;Om z1~daACwk&`xIOvC8;56S!!KBW1UYc!l%R^dWib=l(d)mCSs z&`!}_M02drWcFxV*bec@NT+i^(Nx0Nmm^v_%agUKg^!Z+j4gfB{`3x6a+k|ca=G7^$qHrcN5n6pQ7 z$-6~kWg81`6#{L>%^q5%R%juE^h$SB6NLgTI&DA#gEl7)O9BQb(`ICFzXIXm`Y6@X zpd@x0n`Urc44Uu=+am4(dKlS&r7wb|=RnKA5K-YoZ=QV*lRZ8L6C!OjhqT7y47Be zd^FL^HXL7eiK;NxR%fWnCY_A9y)K(9CH%(SD+wLxW0&nncM#d;*xWlMJjvzz@pFGK zb9U0ZIp=4DY)2LD2C$#nsDW3t+tnTTs zP#h$wvI^|(mQABXN3fG3TnnU8G~DY=P#Pic86E{d;oIGPKrjfM^egPjEnr<3;c(M| zwh7japsEa5mlpNFx=<(a9YQS!>so=ZN^-_PC@`16cZk9i5DHQ6p|=M@iK6PRbaS}) zR5(||cCNi+>4Yz8oBfv4nR5`$$VFn~b}MKUnJyEHK1x>PW_Xk957T_qG83}W1tS;j3o z7^Vzhq|w1>eo1#z+6-WeFUki>&>pcw@4+?@_joV5n(f7Zt)8{Auu0SYHe65^>2OSs|a{$e!zl$FrkEFpUknsDU{G_5S8_e z5N08ZObDMeo^({eA~_k!43kmgIGC)EKA8?-vR&aI)>WNENuG>`A6-`bn{TS!U5^;5 zfz&=_$W23Qi>Nvsl;p`k-7Xp%Qf;J+RV*MEN&v<2dXtk)wK*hbB1VwnNVkZe=Us>h zNaY8ZGBli{r?$TI)V2=GlkufdWdxgy81lMoGPQD$3fk9E*&skkzxHXwc&e?-lOHYC zZVRntV0oSbTG*E(Xdx}w7F}c@G^}C~Eu=Hs$aaiujF48f5F#OTMCQ{G(V-*a$G}{= zYBCHFXS5Hj{DzzP0HCPFgeZ?9wH*0*BoO>L@~iS}i5jfs*^-x{wgk6``z;jvQeH!= z!zGXEiO(0OKNLnx@a zCxJg2M%>D>zwTnDV091~+&r#q7gzu+n&{CJ5Muu6F)cThs`#x)f@ zju2@A88?ZIqf8QzRtr#tAXVf*9v<|75@jK6KT9m^gUY2G+D=MOfXT_B?bI5(9NKPz z8zDFkG6_>a`H2<*^Aq=6j2X)UJ~VX-GnS@)5e)h8N2cx2+6$$WANN@@h^Qqj0EQLS zuDh;W@Hd2lg8>C9b5A%JFj`Z=Acj%tx^@g;(Bk)prR_Kv#5REp3|KC}Q(rGOV=%C0D~3)J}hm=!GPg}W*OAR!G|pu z{0*VtU_gP=z<|t{3I>rGOV>4J0OKRmb{q^yy{TZJaskCMK47`vZwMtN+zas62(LHV zus-c-BrCY%-Se@+R$a*6j|cB*AEl!rVu3$%a;~*b=lYS_AUXE_;imRCNe$%WYr#TF z!&?h<%3muh5Xy4~j8~-EZk_x)u|W+%@dY%I(vAW}-#UY+MZ^-kHDrXKSH6x|_y=Ke zHDj;*93jjiMhF`3q@!Y!QDZxpj2g#57r%r)nTSyHNA9%*Yl6ut3MfA`u6=4p@1HXh zy*yg~Tn>MWgI=7-b^#q{=rdV*ZK2v6k~5_$VIyS z#_yy?L3#2^p~gHpL@$Zd?|uiPv4hnCQm~a%9f&4X2i}1Q)(9sR7lY|!!;FD!eB`b= zOwOCls6AU~=65W;YKVd5UGk&1#^u-&tu-#smZ&v*dAb0%h!Yj8gg6XZox1db2{3z% zxKq3UtSf%VRyYhK@7JV9{;4g8#9y%^0G({4bwT9RLUt-DzZ3t;mg%bz z?^d+*uCc7*7Ae><$92f6Nslq#iUWjy2VTHMh5V$}>C`KBR}hmYma-utUgZ=vjxx!S zfbha91gUBQ@^CFmlm+>45Vg`Shqkjw`$LW8G-_yvv>mUl03R$(!G=-8J*4duJ4^n^ zB!+=^ZxSl)DzGU%pVE|8u%RtoH-f6>VBHT$+mW@0VANX{EpUj4O=(nh0$_aHF2908 zk%2+HzJP-P8LBM%l7c}BqtY76r>E^W7?66?z#ulIaWJR}AFzy1Pupc+K!M_gQ5hJ- zrnD3cNZVyurWA}%Pup=Y*i2v$o6=6=X2GraVl-%vWp9A6G0=hk=Kc3P$-8Xsd| z?QrMDZ8;Ra4{qG)+uy`35G1qKYVe*(jxqV06n z4)yrXuo3(#b50rH5;%S{4)oF62m6KJK=;UDyf*}2t2u!CadqI2HTZ}sSeQY+nO=?E zOeaJqJx&SutV}@uv?-{v?BNL?@+GpI%Cc}uiU11xpw+IxO*B`eGVq;xu8L;#~ z`X&>&sZU>eXy(*{>$>mkp^N@yJ@h2hNyM);06Rg3?wg6e$&)D@-U%nfPYBdM;uCaG zGHIh+a7Bs*4i-c7VDJoba?@t|7vs!VOHE-@90@4t{Vh%eP?0@DKRloZ(Y=lnHh)^aqOpt@2S65c((M%-!Z>0E?))CN;bEMiou1WMu)ooZKoK#r+$e))(CHU(WJ>s znF|}7T31F(pNN@@R`VPADAKqTAPaFvRcdOWWndn&SI)xS?1- zWbYl7;hGsrSkM;>iG}!gm9Tsi1%&j~I6NC_cT+q$Cfmsg%!0J$XgS?x!aO7ZdNF3&2yzs4;0x|y`|tn+Kam6X901!bqkg^y$XPh)Wata-piJWafLDc$ zcGBGd?0!6(p#(rIDoE2vZ$fbcnTa(hAAnR)7WYteIZo+ELvIe&{pWtc-3;zC5S!&P zh=~2@56`ltx%GR0P7lE#CLuT&q<*v%j8A_RoQdtiGAIOt*esWUL2Dk=J%yzQSt(L# z8K1uNfLjJEAJ8m=)Q^^e@o{^~k?@~k0`7tupu#B)ZyXF_vs?xSEezH@g{2277@xlM zAOnNRGD!VsDH#9AFSz}1Fo?}^85oe=6Z0w%Ghpd~3dW}|J-8Zib+Q+yNUjG2-2XPm z>)r`r#CSgVx8jUSrI+CX&P__lZ3{@xTgQ7WSSLAF_aGot?P0)UK>|-CG#58yI*4n% z0!788;^*#x!n(lrXMvJn$uFg1-V)1h-IbL3D=8^`rexf`X5I_^px!gODJiA*P*TzW zqvZcqTe!15o!FiZ%YC*wdDyYMx43mqpoAtMzR-*KgOV$l)`8>X<}tqW9chD8yWQhGKuZJcYY2F@FX`>GKxnc>*Sm9^$gk7)44xmMgemyp=<2Y?t=E-qkJA05%jLpEW8XW|8G8ODZt@Cy8w<%wu8Hjs$7sRThmq+)J zz3j2c$;p$Bb7{<;r#X>IO0K6qQyN5i_gmV#Xo`v0_LY(1_EGAiqyZ3GaRa{ke=&VR AmH+?% literal 0 HcmV?d00001 From 494c506d5672ee4e5b84450dcde385368fdfc1a2 Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Tue, 14 Jul 2015 01:12:52 -0700 Subject: [PATCH 3/3] Add logic to automatically select the best caching strategy Based on the number of unique shared strings as well as the available memory amount, one strategy will be chosen over the other. The algorithm is based on empirical data and super safe so it may need to be tuned. --- .../CachingStrategyFactory.php | 122 +++++++++++++++++- .../Helper/XLSX/SharedStringsHelper.php | 3 +- .../CachingStrategyFactoryTest.php | 99 ++++++++++++++ .../Helper/XLSX/SharedStringsHelperTest.php | 6 + tests/Spout/ReflectionHelper.php | 23 ++++ 5 files changed, 248 insertions(+), 5 deletions(-) create mode 100644 tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php index 51bf497..642647a 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php @@ -10,7 +10,37 @@ namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching; class CachingStrategyFactory { /** - * To avoid running out of memory when extracting the shared strings, they will be saved to temporary files + * The memory amount needed to store a string was obtained empirically from this data: + * + * ------------------------------------ + * | Number of chars⁺ | Memory needed | + * ------------------------------------ + * | 3,000 | 1 MB | + * | 15,000 | 2 MB | + * | 30,000 | 5 MB | + * | 75,000 | 11 MB | + * | 150,000 | 21 MB | + * | 300,000 | 43 MB | + * | 750,000 | 105 MB | + * | 1,500,000 | 210 MB | + * | 2,250,000 | 315 MB | + * | 3,000,000 | 420 MB | + * | 4,500,000 | 630 MB | + * ------------------------------------ + * + * ⁺ All characters were 1 byte long + * + * This gives a linear graph where each 1-byte character requires about 150 bytes to be stored. + * Given that some characters can take up to 4 bytes, we need 600 bytes per character to be safe. + * Also, there is on average about 20 characters per cell (this is entirely empirical data...). + * + * This means that in order to store one shared string in memory, the memory amount needed is: + * => 20 * 600 ≈ 12KB + */ + const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12; + + /** + * To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory * and the string will be quickly retrieved. * The performance bottleneck is not when creating these temporary files, but rather when loading their content. @@ -20,6 +50,30 @@ class CachingStrategyFactory */ const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; + /** @var CachingStrategyFactory|null Singleton instance */ + protected static $instance = null; + + /** + * Private constructor for singleton + */ + private function __construct() + { + } + + /** + * Returns the singleton instance of the factory + * + * @return CachingStrategyFactory + */ + public static function getInstance() + { + if (self::$instance === null) { + self::$instance = new CachingStrategyFactory(); + } + + return self::$instance; + } + /** * Returns the best caching strategy, given the number of unique shared strings * and the amount of memory available. @@ -28,13 +82,73 @@ class CachingStrategyFactory * @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored * @return CachingStrategyInterface The best caching strategy */ - public static function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) + public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) { - // TODO: take available memory into account - if ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE) { + if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) { return new InMemoryStrategy($sharedStringsUniqueCount); } else { return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); } } + + /** + * Returns whether it is safe to use in-memory caching, given the number of unique shared strings + * and the amount of memory available. + * + * @param int $sharedStringsUniqueCount Number of unique shared strings + * @return bool + */ + protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount) + { + $memoryAvailable = $this->getMemoryLimitInKB(); + + if ($memoryAvailable === -1) { + // if cannot get memory limit or if memory limit set as unlimited, don't trust and play safe + return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } else { + $memoryNeeded = $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB; + return ($memoryAvailable > $memoryNeeded); + } + } + + /** + * Returns the PHP "memory_limit" in Kilobytes + * + * @return float + */ + protected function getMemoryLimitInKB() + { + $memoryLimitFormatted = $this->getMemoryLimitFromIni(); + $memoryLimitFormatted = strtolower(trim($memoryLimitFormatted)); + + // No memory limit + if ($memoryLimitFormatted === '-1') { + return -1; + } + + if (preg_match('/(\d+)([bkmgt])b?/', $memoryLimitFormatted, $matches)) { + $amount = intval($matches[1]); + $unit = $matches[2]; + + switch ($unit) { + case 'b': return ($amount / 1024); + case 'k': return $amount; + case 'm': return ($amount * 1024); + case 'g': return ($amount * 1024 * 1024); + case 't': return ($amount * 1024 * 1024 * 1024); + } + } + + return -1; + } + + /** + * Returns the formatted "memory_limit" value + * + * @return string + */ + protected function getMemoryLimitFromIni() + { + return ini_get('memory_limit'); + } } diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php index 0ced513..0f6d21d 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php @@ -171,7 +171,8 @@ class SharedStringsHelper */ protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) { - return CachingStrategyFactory::getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + return CachingStrategyFactory::getInstance() + ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); } /** diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php new file mode 100644 index 0000000..18b1c74 --- /dev/null +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php @@ -0,0 +1,99 @@ +getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitInKB']) + ->getMock(); + + $factoryStub->method('getMemoryLimitInKB')->willReturn($memoryLimitInKB); + + \ReflectionHelper::setStaticValue('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory', 'instance', $factoryStub); + + $strategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null); + + $fullExpectedStrategyClassName = 'Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\\' . $expectedStrategyClassName; + $this->assertEquals($fullExpectedStrategyClassName, get_class($strategy)); + + $strategy->clearCache(); + \ReflectionHelper::reset(); + } + + /** + * @return array + */ + public function dataProviderForTestGetMemoryLimitInKB() + { + return [ + ['-1', -1], + ['invalid', -1], + ['1024B', 1], + ['128K', 128], + ['256KB', 256], + ['512M', 512 * 1024], + ['2MB', 2 * 1024], + ['1G', 1 * 1024 * 1024], + ['10GB', 10 * 1024 * 1024], + ['2T', 2 * 1024 * 1024 * 1024], + ['5TB', 5 * 1024 * 1024 * 1024], + ]; + } + + /** + * @dataProvider dataProviderForTestGetMemoryLimitInKB + * + * @param string $memoryLimitFormatted + * @param float $expectedMemoryLimitInKB + * @return void + */ + public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB) + { + /** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */ + $factoryStub = $this + ->getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitFromIni']) + ->getMock(); + + $factoryStub->method('getMemoryLimitFromIni')->willReturn($memoryLimitFormatted); + + $memoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB'); + + $this->assertEquals($expectedMemoryLimitInKB, $memoryLimitInKB); + } +} diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php index 386e93a..82631bc 100644 --- a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php @@ -87,6 +87,10 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase */ public function testGetStringAtIndexWithFileBasedStrategy() { + // force the file-based strategy by setting no memory limit + $originalMemoryLimit = ini_get('memory_limit'); + ini_set('memory_limit', '-1'); + $resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx'); $sharedStringsHelper = new SharedStringsHelper($resourcePath); @@ -102,5 +106,7 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy); $sharedStringsHelper->cleanup(); + + ini_set('memory_limit', $originalMemoryLimit); } } diff --git a/tests/Spout/ReflectionHelper.php b/tests/Spout/ReflectionHelper.php index 3fb78e4..df02de8 100644 --- a/tests/Spout/ReflectionHelper.php +++ b/tests/Spout/ReflectionHelper.php @@ -89,4 +89,27 @@ class ReflectionHelper return $value; } + + /** + * Invoke a the given public or protected method on the given object. + * + * @param object $object + * @param string $methodName + * @param *mixed|null $params + * + * @return mixed|null + */ + public static function callMethodOnObject($object, $methodName) + { + $params = func_get_args(); + array_shift($params); // object + array_shift($params); // methodName + + $className = get_class($object); + $class = new ReflectionClass($className); + $method = $class->getMethod($methodName); + $method->setAccessible(true); + + return $method->invokeArgs($object, $params); + } }