Move shared strings caching strategy into its own component

This will help implement different caching strategies:
- file based
- in-memory
This commit is contained in:
Adrien Loison 2015-07-11 14:12:09 -07:00
parent cf239905d7
commit 2dcb86aae9
6 changed files with 323 additions and 133 deletions

View File

@ -106,6 +106,19 @@ class GlobalFunctionsHelper
return fputcsv($handle, $fields, $delimiter, $enclosure);
}
/**
 * Thin pass-through to PHP's global fwrite(), exposed as a method so that
 * callers go through this helper object instead of the global function.
 * @see fwrite()
 *
 * @param resource $handle Stream resource to write to
 * @param string $string Data to write to the stream
 * @return int|false Whatever the global fwrite() returns (per the PHP manual:
 *                   the number of bytes written, or false on failure)
 */
public function fwrite($handle, $string)
{
    $writeResult = fwrite($handle, $string);

    return $writeResult;
}
/**
* Wrapper around global function fclose()
* @see fclose()

View File

@ -0,0 +1,36 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
/**
 * Class CachingStrategyFactory
 *
 * Single place where the caching strategy used to store shared strings is chosen.
 *
 * @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
 */
class CachingStrategyFactory
{
    /**
     * Maximum number of shared strings written to a single temporary file.
     *
     * To avoid running out of memory when extracting the shared strings, they are saved to temporary files
     * instead of being kept in memory. When a string is accessed, the contents of the corresponding file are
     * loaded in memory and the string is retrieved from there.
     * The performance bottleneck is not the creation of these temporary files, but the loading of their contents.
     * Because the contents of the last loaded file stay in memory until another file needs to be loaded, this
     * works best when the indexes of the shared strings are sorted in the sheet data.
     * 10,000 was chosen because it creates small files that are fast to load in memory.
     */
    const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;

    /**
     * Returns the caching strategy to use for the shared strings.
     *
     * Only the file-based strategy exists for now, so it is always the one returned;
     * $sharedStringsUniqueCount is accepted but not used yet.
     *
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @return CachingStrategyInterface The best caching strategy
     */
    public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null)
    {
        // TODO add in-memory strategy
        $maxNumStringsPerTempFile = self::MAX_NUM_STRINGS_PER_TEMP_FILE;
        $fileBasedStrategy = new FileBasedStrategy($tempFolder, $maxNumStringsPerTempFile);

        return $fileBasedStrategy;
    }
}

View File

@ -0,0 +1,44 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
/**
 * Interface CachingStrategyInterface
 *
 * Contract for the components that store the shared strings of an XLSX file while it is being read.
 * The methods are meant to be used in this order: addStringForIndex() for every shared string,
 * closeCache() once the last string was added, getStringAtIndex() to read the strings back and
 * finally clearCache() to release everything the cache holds.
 *
 * @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
 */
interface CachingStrategyInterface
{
/**
 * Adds the given string to the cache.
 *
 * @param string $sharedString The string to be added to the cache
 * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 * @return void
 */
public function addStringForIndex($sharedString, $sharedStringIndex);
/**
 * Closes the cache after the last shared string was added.
 * This prevents any additional string from being added to the cache.
 *
 * @return void
 */
public function closeCache();
/**
 * Returns the string located at the given index from the cache.
 *
 * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 * @return string The shared string at the given index
 * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
 */
public function getStringAtIndex($sharedStringIndex);
/**
 * Destroys the cache, freeing memory and removing any created artifacts
 * (for instance the temporary files created by a file-based implementation).
 *
 * @return void
 */
public function clearCache();
}

View File

@ -0,0 +1,188 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
use Box\Spout\Common\Helper\FileSystemHelper;
use Box\Spout\Common\Helper\GlobalFunctionsHelper;
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
/**
 * Class FileBasedStrategy
 *
 * This class implements the file-based caching strategy for shared strings.
 * Shared strings are stored in small files (with a max number of strings per file), one string per line.
 * Only the contents of the file that was read last are kept in memory.
 * This strategy is slower than an in-memory strategy but is used to avoid out of memory crashes.
 *
 * @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
 */
class FileBasedStrategy implements CachingStrategyInterface
{
    /** Value to use to escape the line feed character ("\n") */
    const ESCAPED_LINE_FEED_CHARACTER = '_x000A_';

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var \Box\Spout\Common\Helper\FileSystemHelper Helper to perform file system operations */
    protected $fileSystemHelper;

    /** @var string Folder, created by the constructor, that contains the temporary files */
    protected $tempFolder;

    /**
     * @var int Maximum number of strings that can be stored in one temp file
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $maxNumStringsPerTempFile;

    /** @var resource|null Pointer to the last temp file a shared string was written to (null when none is open) */
    protected $tempFilePointer;

    /**
     * @var string|null Path of the temporary file whose contents is currently stored in memory
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $inMemoryTempFilePath;

    /**
     * @var string[]|null Escaped shared strings of the temporary file that was last read, one entry per string
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $inMemoryTempFileContents;

    /**
     * Creates a uniquely named sub-folder that will receive the temporary files.
     *
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored.
     *                                Falls back to the system temporary folder when empty.
     * @param int $maxNumStringsPerTempFile Maximum number of strings that can be stored in one temp file
     */
    public function __construct($tempFolder, $maxNumStringsPerTempFile)
    {
        $rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
        $this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
        $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));

        $this->maxNumStringsPerTempFile = $maxNumStringsPerTempFile;

        $this->globalFunctionsHelper = new GlobalFunctionsHelper();
        $this->tempFilePointer = null;
    }

    /**
     * Adds the given string to the cache.
     * A new temporary file is started as soon as the file associated with the index does not exist yet.
     *
     * @param string $sharedString The string to be added to the cache
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return void
     */
    public function addStringForIndex($sharedString, $sharedStringIndex)
    {
        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);

        if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) {
            // The previous file is complete: release its handle before starting the next file
            if ($this->tempFilePointer) {
                $this->globalFunctionsHelper->fclose($this->tempFilePointer);
            }
            $this->tempFilePointer = $this->globalFunctionsHelper->fopen($tempFilePath, 'w');
        }

        // The shared string retrieval logic expects each cell data to be on one line only
        // Encoding the line feed character allows to preserve this assumption
        $lineFeedEncodedSharedString = $this->escapeLineFeed($sharedString);

        $this->globalFunctionsHelper->fwrite($this->tempFilePointer, $lineFeedEncodedSharedString . PHP_EOL);
    }

    /**
     * Returns the path for the temp file that should contain the string for the given index
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The temp file path for the given index
     */
    protected function getSharedStringTempFilePath($sharedStringIndex)
    {
        $numTempFile = intval($sharedStringIndex / $this->maxNumStringsPerTempFile);
        return $this->tempFolder . '/sharedstrings' . $numTempFile;
    }

    /**
     * Closes the cache after the last shared string was added.
     * This prevents any additional string from being added to the cache.
     * Calling it again once the cache is closed does nothing.
     *
     * @return void
     */
    public function closeCache()
    {
        // close pointer to the last temp file that was written
        if ($this->tempFilePointer) {
            $this->globalFunctionsHelper->fclose($this->tempFilePointer);
            // forget the handle so that it can never be closed a second time
            $this->tempFilePointer = null;
        }
    }

    /**
     * Returns the string located at the given index from the cache.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
        $indexInFile = $sharedStringIndex % $this->maxNumStringsPerTempFile;

        if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) {
            throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex");
        }

        if ($this->inMemoryTempFilePath !== $tempFilePath) {
            $this->loadTempFileInMemory($tempFilePath);
        }

        if (!array_key_exists($indexInFile, $this->inMemoryTempFileContents)) {
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        $sharedString = $this->unescapeLineFeed($this->inMemoryTempFileContents[$indexInFile]);

        // NOTE(review): this also strips line feeds that legitimately end the original string;
        // kept as is to preserve the existing behavior - confirm whether it is intended.
        return rtrim($sharedString, PHP_EOL);
    }

    /**
     * Replaces the file contents kept in memory with the contents of the given temp file.
     *
     * @param string $tempFilePath Path of the temp file to load
     * @return void
     */
    private function loadTempFileInMemory($tempFilePath)
    {
        // Drop the previous contents first, so that two files are never held in memory at the same time
        $this->inMemoryTempFileContents = null;
        $this->inMemoryTempFilePath = null;

        $fileContents = $this->globalFunctionsHelper->file_get_contents($tempFilePath);

        if (!is_string($fileContents)) {
            // NOTE(review): assumes the helper passes through the "false" that file_get_contents() returns
            // on failure. Nothing is cached in that case, so the next access tries to read the file again.
            $this->inMemoryTempFileContents = array();
            return;
        }

        $lines = explode(PHP_EOL, $fileContents);

        // Every string is written followed by PHP_EOL, so explode() always produces one extra, empty,
        // last element. It is not a shared string: without removing it, asking for the index right
        // after the last string would return '' instead of being reported as not found.
        if (end($lines) === '') {
            array_pop($lines);
        }

        $this->inMemoryTempFileContents = $lines;
        $this->inMemoryTempFilePath = $tempFilePath;
    }

    /**
     * Escapes the line feed characters (\n)
     *
     * @param string $unescapedString
     * @return string
     */
    private function escapeLineFeed($unescapedString)
    {
        return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString);
    }

    /**
     * Unescapes the line feed characters (\n)
     *
     * @param string $escapedString
     * @return string
     */
    private function unescapeLineFeed($escapedString)
    {
        return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts
     *
     * @return void
     */
    public function clearCache()
    {
        // Make sure no handle stays open on a file that is about to be deleted
        $this->closeCache();

        // Release the contents of the last file that was read
        $this->inMemoryTempFileContents = null;
        $this->inMemoryTempFilePath = null;

        if ($this->tempFolder) {
            $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder);
        }
    }
}

View File

@ -5,6 +5,8 @@ namespace Box\Spout\Reader\Helper\XLSX;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Helper\FileSystemHelper;
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface;
/**
* Class SharedStringsHelper
@ -20,43 +22,14 @@ class SharedStringsHelper
/** Main namespace for the sharedStrings.xml file */
const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
/**
* To avoid running out of memory when extracting the shared strings, they will be saved to temporary files
* instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory
* and the string will be quickly retrieved.
* The performance bottleneck is not when creating these temporary files, but rather when loading their content.
* Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works
* best when the indexes of the shared strings are sorted in the sheet data.
* 10,000 was chosen because it creates small files that are fast to be loaded in memory.
*/
const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;
/** Value to use to escape the line feed character ("\n") */
const ESCAPED_LINE_FEED_CHARACTER = '_x000A_';
/** @var string Path of the XLSX file being read */
protected $filePath;
/** @var string Temporary folder where the temporary files to store shared strings will be stored */
protected $tempFolder;
/** @var \Box\Spout\Writer\Helper\XLSX\FileSystemHelper Helper to perform file system operations */
protected $fileSystemHelper;
/** @var resource Pointer to the last temp file a shared string was written to */
protected $tempFilePointer;
/**
* @var string Path of the temporary file whose contents is currently stored in memory
* @see MAX_NUM_STRINGS_PER_TEMP_FILE
*/
protected $inMemoryTempFilePath;
/**
* @var string Contents of the temporary file that was last read
* @see MAX_NUM_STRINGS_PER_TEMP_FILE
*/
protected $inMemoryTempFileContents;
/** @var CachingStrategyInterface The best caching strategy for storing shared strings */
protected $cachingStrategy;
/**
* @param string $filePath Path of the XLSX file being read
@ -65,10 +38,7 @@ class SharedStringsHelper
public function __construct($filePath, $tempFolder = null)
{
$this->filePath = $filePath;
$rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
$this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
$this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));
$this->tempFolder = $tempFolder;
}
/**
@ -108,7 +78,6 @@ class SharedStringsHelper
{
$xmlReader = new \XMLReader();
$sharedStringIndex = 0;
$this->tempFilePointer = null;
$escaper = new \Box\Spout\Common\Escaper\XLSX();
$sharedStringsFilePath = $this->getSharedStringsFilePath();
@ -116,6 +85,9 @@ class SharedStringsHelper
throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
}
$sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
$this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
while ($xmlReader->read() && $xmlReader->name !== 'si') {
// do nothing until a 'si' tag is reached
}
@ -140,12 +112,7 @@ class SharedStringsHelper
}
$unescapedTextValue = $escaper->unescape($textValue);
// The shared string retrieval logic expects each cell data to be on one line only
// Encoding the line feed character allows to preserve this assumption
$lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue);
$this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex);
$this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
$sharedStringIndex++;
@ -153,10 +120,7 @@ class SharedStringsHelper
$xmlReader->next('si');
}
// close pointer to the last temp file that was written
if ($this->tempFilePointer) {
fclose($this->tempFilePointer);
}
$this->cachingStrategy->closeCache();
$xmlReader->close();
}
@ -169,6 +133,30 @@ class SharedStringsHelper
return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
}
/**
 * Reads the number of unique shared strings from the "uniqueCount" attribute of the <sst> tag.
 * The reader is first advanced with next('sst'); the attribute is then read from the node it
 * is positioned on and converted to an integer.
 *
 * @param \XMLReader $xmlReader XMLReader instance
 * @return int Number of unique shared strings in the sharedStrings.xml file
 */
protected function getSharedStringsUniqueCount($xmlReader)
{
    $xmlReader->next('sst');
    $uniqueCountAttribute = $xmlReader->getAttribute('uniqueCount');

    return (int) $uniqueCountAttribute;
}
/**
 * Asks a CachingStrategyFactory for the caching strategy to use, passing it the number
 * of unique shared strings and the temporary folder configured on this helper.
 *
 * @param int $sharedStringsUniqueCount
 * @return CachingStrategyInterface
 */
protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
{
    $strategyFactory = new CachingStrategyFactory();
    $bestCachingStrategy = $strategyFactory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);

    return $bestCachingStrategy;
}
/**
* Removes nodes that should not be read, like the pronunciation of the Kanji characters.
* By keeping them, their text content would be added to the read string.
@ -219,42 +207,7 @@ class SharedStringsHelper
}
/**
* Writes the given string to its associated temp file.
* A new temporary file is created when the previous one has reached its max capacity.
*
* @param string $sharedString Shared string to write to the temp file
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
* @return void
*/
protected function writeSharedStringToTempFile($sharedString, $sharedStringIndex)
{
$tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
if (!file_exists($tempFilePath)) {
if ($this->tempFilePointer) {
fclose($this->tempFilePointer);
}
$this->tempFilePointer = fopen($tempFilePath, 'w');
}
fwrite($this->tempFilePointer, $sharedString . PHP_EOL);
}
/**
* Returns the path for the temp file that should contain the string for the given index
*
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
* @return string The temp file path for the given index
*/
protected function getSharedStringTempFilePath($sharedStringIndex)
{
$numTempFile = intval($sharedStringIndex / self::MAX_NUM_STRINGS_PER_TEMP_FILE);
return $this->tempFolder . '/sharedstrings' . $numTempFile;
}
/**
* Returns the shared string at the given index.
* Because the strings have been split into different files, it looks for the value in the correct file.
* Returns the shared string at the given index, using the previously chosen caching strategy.
*
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
* @return string The shared string at the given index
@ -262,63 +215,18 @@ class SharedStringsHelper
*/
public function getStringAtIndex($sharedStringIndex)
{
$tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
$indexInFile = $sharedStringIndex % self::MAX_NUM_STRINGS_PER_TEMP_FILE;
if (!file_exists($tempFilePath)) {
throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex");
}
if ($this->inMemoryTempFilePath !== $tempFilePath) {
// free memory
unset($this->inMemoryTempFileContents);
$this->inMemoryTempFileContents = explode(PHP_EOL, file_get_contents($tempFilePath));
$this->inMemoryTempFilePath = $tempFilePath;
}
$sharedString = null;
if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) {
$escapedSharedString = $this->inMemoryTempFileContents[$indexInFile];
$sharedString = $this->unescapeLineFeed($escapedSharedString);
}
if ($sharedString === null) {
throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
}
return rtrim($sharedString, PHP_EOL);
return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
}
/**
* Escapes the line feed character (\n)
*
* @param string $unescapedString
* @return string
*/
private function escapeLineFeed($unescapedString)
{
return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString);
}
/**
* Unescapes the line feed character (\n)
*
* @param string $escapedString
* @return string
*/
private function unescapeLineFeed($escapedString)
{
return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString);
}
/**
* Deletes the created temporary folder and all its contents
* Destroys the cache, freeing memory and removing any created artifacts
*
* @return void
*/
public function cleanup()
{
$this->fileSystemHelper->deleteFolderRecursively($this->tempFolder);
if ($this->cachingStrategy) {
$this->cachingStrategy->clearCache();
}
}
}

View File

@ -40,7 +40,8 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
{
$this->sharedStringsHelper->extractSharedStrings();
$tempFolder = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'tempFolder');
$cachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy');
$tempFolder = \ReflectionHelper::getValueOnObject($cachingStrategy, 'tempFolder');
$filesInTempFolder = $this->getFilesInFolder($tempFolder);
$this->assertEquals(1, count($filesInTempFolder), 'One temp file should have been created in the temp folder.');