diff --git a/src/Spout/Common/Helper/GlobalFunctionsHelper.php b/src/Spout/Common/Helper/GlobalFunctionsHelper.php index eb02c3f..feeb782 100644 --- a/src/Spout/Common/Helper/GlobalFunctionsHelper.php +++ b/src/Spout/Common/Helper/GlobalFunctionsHelper.php @@ -106,6 +106,19 @@ class GlobalFunctionsHelper return fputcsv($handle, $fields, $delimiter, $enclosure); } + /** + * Wrapper around global function fwrite() + * @see fwrite() + * + * @param resource $handle + * @param string $string + * @return int + */ + public function fwrite($handle, $string) + { + return fwrite($handle, $string); + } + /** * Wrapper around global function fclose() * @see fclose() diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php new file mode 100644 index 0000000..1bd7e76 --- /dev/null +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php @@ -0,0 +1,36 @@ +fileSystemHelper = new FileSystemHelper($rootTempFolder); + $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings')); + + $this->maxNumStringsPerTempFile = $maxNumStringsPerTempFile; + + $this->globalFunctionsHelper = new GlobalFunctionsHelper(); + $this->tempFilePointer = null; + } + + /** + * Adds the given string to the cache. 
+ * + * @param string $sharedString The string to be added to the cache + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return void + */ + public function addStringForIndex($sharedString, $sharedStringIndex) + { + $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); + + if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) { + if ($this->tempFilePointer) { + $this->globalFunctionsHelper->fclose($this->tempFilePointer); + } + $this->tempFilePointer = $this->globalFunctionsHelper->fopen($tempFilePath, 'w'); + } + + // The shared string retrieval logic expects each shared string to be on one line only + // Encoding the line feed character preserves this assumption + $lineFeedEncodedSharedString = $this->escapeLineFeed($sharedString); + + $this->globalFunctionsHelper->fwrite($this->tempFilePointer, $lineFeedEncodedSharedString . PHP_EOL); + } + + /** + * Returns the path for the temp file that should contain the string for the given index + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The temp file path for the given index + */ + protected function getSharedStringTempFilePath($sharedStringIndex) + { + $numTempFile = intval($sharedStringIndex / $this->maxNumStringsPerTempFile); + return $this->tempFolder . '/sharedstrings' . $numTempFile; + } + + /** + * Closes the cache after the last shared string was added. + * This prevents any additional string from being added to the cache. + * + * @return void + */ + public function closeCache() + { + // close pointer to the last temp file that was written + if ($this->tempFilePointer) { + $this->globalFunctionsHelper->fclose($this->tempFilePointer); + } + } + + + /** + * Returns the string located at the given index from the cache. 
+ * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The shared string at the given index + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index + */ + public function getStringAtIndex($sharedStringIndex) + { + $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); + $indexInFile = $sharedStringIndex % $this->maxNumStringsPerTempFile; + + if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) { + throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex"); + } + + if ($this->inMemoryTempFilePath !== $tempFilePath) { + // free memory + unset($this->inMemoryTempFileContents); + + $this->inMemoryTempFileContents = explode(PHP_EOL, $this->globalFunctionsHelper->file_get_contents($tempFilePath)); + $this->inMemoryTempFilePath = $tempFilePath; + } + + $sharedString = null; + if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) { + $escapedSharedString = $this->inMemoryTempFileContents[$indexInFile]; + $sharedString = $this->unescapeLineFeed($escapedSharedString); + } + + if ($sharedString === null) { + throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); + } + + return rtrim($sharedString, PHP_EOL); + } + + /** + * Escapes the line feed characters (\n) + * + * @param string $unescapedString + * @return string + */ + private function escapeLineFeed($unescapedString) + { + return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString); + } + + /** + * Unescapes the line feed characters (\n) + * + * @param string $escapedString + * @return string + */ + private function unescapeLineFeed($escapedString) + { + return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString); + } + + /** + * Destroys the cache, freeing memory and removing any created artifacts + * + * @return void + 
*/ + public function clearCache() + { + if ($this->tempFolder) { + $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder); + } + } +} diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php index 1a73fbb..0653e1b 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php @@ -5,6 +5,8 @@ namespace Box\Spout\Reader\Helper\XLSX; use Box\Spout\Common\Exception\IOException; use Box\Spout\Common\Helper\FileSystemHelper; use Box\Spout\Reader\Exception\SharedStringNotFoundException; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory; +use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface; /** * Class SharedStringsHelper @@ -20,43 +22,14 @@ class SharedStringsHelper /** Main namespace for the sharedStrings.xml file */ const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; - /** - * To avoid running out of memory when extracting the shared strings, they will be saved to temporary files - * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory - * and the string will be quickly retrieved. - * The performance bottleneck is not when creating these temporary files, but rather when loading their content. - * Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works - * best when the indexes of the shared strings are sorted in the sheet data. - * 10,000 was chosen because it creates small files that are fast to be loaded in memory. 
- */ - const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; - - /** Value to use to escape the line feed character ("\n") */ - const ESCAPED_LINE_FEED_CHARACTER = '_x000A_'; - /** @var string Path of the XLSX file being read */ protected $filePath; /** @var string Temporary folder where the temporary files to store shared strings will be stored */ protected $tempFolder; - /** @var \Box\Spout\Writer\Helper\XLSX\FileSystemHelper Helper to perform file system operations */ - protected $fileSystemHelper; - - /** @var resource Pointer to the last temp file a shared string was written to */ - protected $tempFilePointer; - - /** - * @var string Path of the temporary file whose contents is currently stored in memory - * @see MAX_NUM_STRINGS_PER_TEMP_FILE - */ - protected $inMemoryTempFilePath; - - /** - * @var string Contents of the temporary file that was last read - * @see MAX_NUM_STRINGS_PER_TEMP_FILE - */ - protected $inMemoryTempFileContents; + /** @var CachingStrategyInterface The best caching strategy for storing shared strings */ + protected $cachingStrategy; /** * @param string $filePath Path of the XLSX file being read @@ -65,10 +38,7 @@ class SharedStringsHelper public function __construct($filePath, $tempFolder = null) { $this->filePath = $filePath; - - $rootTempFolder = ($tempFolder) ?: sys_get_temp_dir(); - $this->fileSystemHelper = new FileSystemHelper($rootTempFolder); - $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings')); + $this->tempFolder = $tempFolder; } /** @@ -108,7 +78,6 @@ class SharedStringsHelper { $xmlReader = new \XMLReader(); $sharedStringIndex = 0; - $this->tempFilePointer = null; $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sharedStringsFilePath = $this->getSharedStringsFilePath(); @@ -116,6 +85,9 @@ class SharedStringsHelper throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . 
'".'); } + $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); + $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); + while ($xmlReader->read() && $xmlReader->name !== 'si') { // do nothing until a 'si' tag is reached } @@ -140,12 +112,7 @@ class SharedStringsHelper } $unescapedTextValue = $escaper->unescape($textValue); - - // The shared string retrieval logic expects each cell data to be on one line only - // Encoding the line feed character allows to preserve this assumption - $lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue); - - $this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex); + $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex); $sharedStringIndex++; @@ -153,10 +120,7 @@ class SharedStringsHelper $xmlReader->next('si'); } - // close pointer to the last temp file that was written - if ($this->tempFilePointer) { - fclose($this->tempFilePointer); - } + $this->cachingStrategy->closeCache(); $xmlReader->close(); } @@ -169,6 +133,30 @@ class SharedStringsHelper return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH; } + /** + * Returns the shared strings unique count, as specified in the <sst> tag. + * + * @param \XMLReader $xmlReader XMLReader instance + * @return int Number of unique shared strings in the sharedStrings.xml file + */ + protected function getSharedStringsUniqueCount($xmlReader) + { + $xmlReader->next('sst'); + return intval($xmlReader->getAttribute('uniqueCount')); + } + + /** + * Returns the best shared strings caching strategy. 
+ * + * @param int $sharedStringsUniqueCount + * @return CachingStrategyInterface + */ + protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) + { + $factory = new CachingStrategyFactory(); + return $factory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + } + /** * Removes nodes that should not be read, like the pronunciation of the Kanji characters. * By keeping them, their text content would be added to the read string. @@ -219,42 +207,7 @@ class SharedStringsHelper } /** - * Writes the given string to its associated temp file. - * A new temporary file is created when the previous one has reached its max capacity. - * - * @param string $sharedString Shared string to write to the temp file - * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file - * @return void - */ - protected function writeSharedStringToTempFile($sharedString, $sharedStringIndex) - { - $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); - - if (!file_exists($tempFilePath)) { - if ($this->tempFilePointer) { - fclose($this->tempFilePointer); - } - $this->tempFilePointer = fopen($tempFilePath, 'w'); - } - - fwrite($this->tempFilePointer, $sharedString . PHP_EOL); - } - - /** - * Returns the path for the temp file that should contain the string for the given index - * - * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file - * @return string The temp file path for the given index - */ - protected function getSharedStringTempFilePath($sharedStringIndex) - { - $numTempFile = intval($sharedStringIndex / self::MAX_NUM_STRINGS_PER_TEMP_FILE); - return $this->tempFolder . '/sharedstrings' . $numTempFile; - } - - /** - * Returns the shared string at the given index. - * Because the strings have been split into different files, it looks for the value in the correct file. + * Returns the shared string at the given index, using the previously chosen caching strategy. 
* * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file * @return string The shared string at the given index @@ -262,63 +215,18 @@ class SharedStringsHelper */ public function getStringAtIndex($sharedStringIndex) { - $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); - $indexInFile = $sharedStringIndex % self::MAX_NUM_STRINGS_PER_TEMP_FILE; - - if (!file_exists($tempFilePath)) { - throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex"); - } - - if ($this->inMemoryTempFilePath !== $tempFilePath) { - // free memory - unset($this->inMemoryTempFileContents); - - $this->inMemoryTempFileContents = explode(PHP_EOL, file_get_contents($tempFilePath)); - $this->inMemoryTempFilePath = $tempFilePath; - } - - $sharedString = null; - if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) { - $escapedSharedString = $this->inMemoryTempFileContents[$indexInFile]; - $sharedString = $this->unescapeLineFeed($escapedSharedString); - } - - if ($sharedString === null) { - throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); - } - - return rtrim($sharedString, PHP_EOL); + return $this->cachingStrategy->getStringAtIndex($sharedStringIndex); } /** - * Escapes the line feed character (\n) - * - * @param string $unescapedString - * @return string - */ - private function escapeLineFeed($unescapedString) - { - return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString); - } - - /** - * Unescapes the line feed character (\n) - * - * @param string $escapedString - * @return string - */ - private function unescapeLineFeed($escapedString) - { - return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString); - } - - /** - * Deletes the created temporary folder and all its contents + * Destroys the cache, freeing memory and removing any created artifacts * * @return void */ public function 
cleanup() { - $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder); + if ($this->cachingStrategy) { + $this->cachingStrategy->clearCache(); + } } } diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php index c868863..a89017d 100644 --- a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php @@ -40,7 +40,8 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase { $this->sharedStringsHelper->extractSharedStrings(); - $tempFolder = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'tempFolder'); + $cachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy'); + $tempFolder = \ReflectionHelper::getValueOnObject($cachingStrategy, 'tempFolder'); $filesInTempFolder = $this->getFilesInFolder($tempFolder); $this->assertEquals(1, count($filesInTempFolder), 'One temp file should have been created in the temp folder.');