Merge pull request #67 from box/caching_strategies

Caching strategies
This commit is contained in:
Adrien Loison 2015-07-14 10:58:37 -07:00
commit 6ae79b63b3
14 changed files with 758 additions and 183 deletions

View File

@ -106,6 +106,19 @@ class GlobalFunctionsHelper
return fputcsv($handle, $fields, $delimiter, $enclosure); return fputcsv($handle, $fields, $delimiter, $enclosure);
} }
/**
* Wrapper around global function fwrite()
* @see fwrite()
*
* @param resource $handle
* @param string $string
* @return int
*/
public function fwrite($handle, $string)
{
    // Plain pass-through to the global fwrite(); its result is handed back untouched.
    $fwriteResult = fwrite($handle, $string);

    return $fwriteResult;
}
/** /**
* Wrapper around global function fclose() * Wrapper around global function fclose()
* @see fclose() * @see fclose()

View File

@ -0,0 +1,154 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
/**
* Class CachingStrategyFactory
*
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
*/
class CachingStrategyFactory
{
    /**
     * The memory amount needed to store a string was obtained empirically from this data:
     *
     *        ------------------------------------
     *        | Number of chars⁺ | Memory needed |
     *        ------------------------------------
     *        |           3,000  |         1 MB  |
     *        |          15,000  |         2 MB  |
     *        |          30,000  |         5 MB  |
     *        |          75,000  |        11 MB  |
     *        |         150,000  |        21 MB  |
     *        |         300,000  |        43 MB  |
     *        |         750,000  |       105 MB  |
     *        |       1,500,000  |       210 MB  |
     *        |       2,250,000  |       315 MB  |
     *        |       3,000,000  |       420 MB  |
     *        |       4,500,000  |       630 MB  |
     *        ------------------------------------
     *
     *        ⁺ All characters were 1 byte long
     *
     * This gives a linear graph where each 1-byte character requires about 150 bytes to be stored.
     * Given that some characters can take up to 4 bytes, we need 600 bytes per character to be safe.
     * Also, there are on average about 20 characters per cell (this is entirely empirical data...).
     *
     * This means that in order to store one shared string in memory, the memory amount needed is:
     *   => 20 * 600 bytes ≈ 12KB
     */
    const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12;

    /**
     * To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files
     * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory
     * and the string will be quickly retrieved.
     * The performance bottleneck is not when creating these temporary files, but rather when loading their content.
     * Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works
     * best when the indexes of the shared strings are sorted in the sheet data.
     * 10,000 was chosen because it creates small files that are fast to be loaded in memory.
     */
    const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;

    /** @var CachingStrategyFactory|null Singleton instance */
    protected static $instance = null;

    /**
     * Private constructor for singleton
     */
    private function __construct()
    {
    }

    /**
     * Returns the singleton instance of the factory, creating it on first use.
     *
     * @return CachingStrategyFactory
     */
    public static function getInstance()
    {
        if (self::$instance === null) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Returns the best caching strategy, given the number of unique shared strings
     * and the amount of memory available.
     *
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @return CachingStrategyInterface The best caching strategy
     */
    public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null)
    {
        if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) {
            return new InMemoryStrategy($sharedStringsUniqueCount);
        }

        return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE);
    }

    /**
     * Returns whether it is safe to use in-memory caching, given the number of unique shared strings
     * and the amount of memory available.
     *
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     * @return bool
     */
    protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)
    {
        $memoryAvailable = $this->getMemoryLimitInKB();

        if ($memoryAvailable === -1) {
            // if cannot get memory limit or if memory limit set as unlimited, don't trust and play safe
            return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE);
        }

        $memoryNeeded = $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB;

        return ($memoryAvailable > $memoryNeeded);
    }

    /**
     * Returns the PHP "memory_limit" in Kilobytes.
     *
     * Accepted formats (case insensitive): a plain number of bytes ("134217728"), or a number
     * followed by a unit B, K, M, G or T, optionally suffixed with "B" ("128K", "256KB", "2MB"...).
     *
     * @return int|float Memory limit in KB, or -1 if there is no limit or if the value cannot be understood
     */
    protected function getMemoryLimitInKB()
    {
        $memoryLimitFormatted = strtolower(trim((string) $this->getMemoryLimitFromIni()));

        // No memory limit
        if ($memoryLimitFormatted === '-1') {
            return -1;
        }

        // The pattern is anchored so that values like "1.5G" are rejected instead of being read as "5G".
        // The unit is optional because "memory_limit" can be expressed as a plain number of bytes.
        if (!preg_match('/^(\d+)\s*([bkmgt])?b?$/', $memoryLimitFormatted, $matches)) {
            return -1;
        }

        $amount = intval($matches[1]);
        // when the optional unit group did not match, it is absent from $matches
        $unit = isset($matches[2]) ? $matches[2] : 'b';

        switch ($unit) {
            case 'k': return $amount;
            case 'm': return ($amount * 1024);
            case 'g': return ($amount * 1024 * 1024);
            case 't': return ($amount * 1024 * 1024 * 1024);
            default: return ($amount / 1024); // bytes
        }
    }

    /**
     * Returns the formatted "memory_limit" value
     *
     * @return string
     */
    protected function getMemoryLimitFromIni()
    {
        return ini_get('memory_limit');
    }
}

View File

@ -0,0 +1,44 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
/**
* Interface CachingStrategyInterface
*
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
*/
interface CachingStrategyInterface
{
// Contract for a cache of shared strings keyed by their index in sharedStrings.xml.
// SharedStringsHelper drives it in this order: addStringForIndex() for every extracted
// string, closeCache() once extraction is over, getStringAtIndex() while reading,
// and finally clearCache() on cleanup.

/**
 * Adds the given string to the cache, under the given index.
 *
 * @param string $sharedString The string to be added to the cache
 * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 * @return void
 */
public function addStringForIndex($sharedString, $sharedStringIndex);
/**
 * Closes the cache after the last shared string was added.
 * This prevents any additional string from being added to the cache.
 *
 * @return void
 */
public function closeCache();
/**
 * Returns the string located at the given index from the cache.
 *
 * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 * @return string The shared string at the given index
 * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
 */
public function getStringAtIndex($sharedStringIndex);
/**
 * Destroys the cache, freeing memory and removing any created artifacts
 * (for instance temporary files, for strategies that create some).
 *
 * @return void
 */
public function clearCache();
}

View File

@ -0,0 +1,188 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
use Box\Spout\Common\Helper\FileSystemHelper;
use Box\Spout\Common\Helper\GlobalFunctionsHelper;
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
/**
* Class FileBasedStrategy
*
* This class implements the file-based caching strategy for shared strings.
* Shared strings are stored in small files (with a max number of strings per file).
* This strategy is slower than an in-memory strategy but is used to avoid out of memory crashes.
*
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
*/
class FileBasedStrategy implements CachingStrategyInterface
{
    /** Value to use to escape the line feed character ("\n") */
    const ESCAPED_LINE_FEED_CHARACTER = '_x000A_';

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var \Box\Spout\Common\Helper\FileSystemHelper Helper to perform file system operations */
    protected $fileSystemHelper;

    /** @var string Folder, created by the constructor, in which the temporary files are written */
    protected $tempFolder;

    /**
     * @var int Maximum number of strings that can be stored in one temp file
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $maxNumStringsPerTempFile;

    /** @var resource|null Pointer to the last temp file a shared string was written to */
    protected $tempFilePointer;

    /** @var bool Whether the cache has been closed (no more strings can be added) */
    protected $isCacheClosed = false;

    /**
     * @var string|null Path of the temporary file whose contents is currently stored in memory
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $inMemoryTempFilePath;

    /**
     * @var string[]|null Escaped shared strings (one per entry) of the temporary file that was last read
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $inMemoryTempFileContents;

    /**
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @param int $maxNumStringsPerTempFile Maximum number of strings that can be stored in one temp file
     */
    public function __construct($tempFolder, $maxNumStringsPerTempFile)
    {
        $rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
        $this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
        $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));

        $this->maxNumStringsPerTempFile = $maxNumStringsPerTempFile;

        $this->globalFunctionsHelper = new GlobalFunctionsHelper();
        $this->tempFilePointer = null;
        $this->isCacheClosed = false;
    }

    /**
     * Adds the given string to the cache.
     * The string is appended to the temp file associated with its index; a new temp file
     * is started when the file for that index does not exist yet.
     * Nothing is added once the cache has been closed.
     *
     * @param string $sharedString The string to be added to the cache
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return void
     */
    public function addStringForIndex($sharedString, $sharedStringIndex)
    {
        if ($this->isCacheClosed) {
            return;
        }

        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);

        if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) {
            if ($this->tempFilePointer) {
                $this->globalFunctionsHelper->fclose($this->tempFilePointer);
            }
            $this->tempFilePointer = $this->globalFunctionsHelper->fopen($tempFilePath, 'w');
        }

        // The shared string retrieval logic expects each cell data to be on one line only
        // Encoding the line feed character allows to preserve this assumption
        $lineFeedEncodedSharedString = $this->escapeLineFeed($sharedString);

        $this->globalFunctionsHelper->fwrite($this->tempFilePointer, $lineFeedEncodedSharedString . PHP_EOL);
    }

    /**
     * Returns the path for the temp file that should contain the string for the given index
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The temp file path for the given index
     */
    protected function getSharedStringTempFilePath($sharedStringIndex)
    {
        $numTempFile = intval($sharedStringIndex / $this->maxNumStringsPerTempFile);
        return $this->tempFolder . '/sharedstrings' . $numTempFile;
    }

    /**
     * Closes the cache after the last shared string was added.
     * This prevents any additional string from being added to the cache.
     *
     * @return void
     */
    public function closeCache()
    {
        // close pointer to the last temp file that was written
        if ($this->tempFilePointer) {
            $this->globalFunctionsHelper->fclose($this->tempFilePointer);
            // forget the handle so that it can never be closed twice or written to once closed
            $this->tempFilePointer = null;
        }

        $this->isCacheClosed = true;
    }

    /**
     * Returns the string located at the given index from the cache.
     * The temp file holding the string is loaded in memory and kept there until
     * a string stored in another temp file is requested.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
        $indexInFile = $sharedStringIndex % $this->maxNumStringsPerTempFile;

        if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) {
            throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex");
        }

        if ($this->inMemoryTempFilePath !== $tempFilePath) {
            // free memory before loading the contents of the other file
            $this->inMemoryTempFileContents = null;
            $this->inMemoryTempFileContents = $this->readSharedStringsFromTempFile($tempFilePath);
            $this->inMemoryTempFilePath = $tempFilePath;
        }

        if (!array_key_exists($indexInFile, $this->inMemoryTempFileContents)) {
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        // No trimming here: the line terminator was already removed when splitting the file contents,
        // so any line feed left after unescaping is part of the original shared string.
        return $this->unescapeLineFeed($this->inMemoryTempFileContents[$indexInFile]);
    }

    /**
     * Reads the given temp file and returns the escaped shared strings it contains, one per entry.
     *
     * @param string $tempFilePath Path of the temp file to read
     * @return string[] Escaped shared strings, in the order they were written
     */
    private function readSharedStringsFromTempFile($tempFilePath)
    {
        $escapedSharedStrings = explode(PHP_EOL, $this->globalFunctionsHelper->file_get_contents($tempFilePath));

        // Every written string is followed by PHP_EOL, so splitting leaves one empty element
        // after the last terminator. It does not correspond to any shared string.
        if (end($escapedSharedStrings) === '') {
            array_pop($escapedSharedStrings);
        }

        return $escapedSharedStrings;
    }

    /**
     * Escapes the line feed characters (\n)
     *
     * @param string $unescapedString
     * @return string
     */
    private function escapeLineFeed($unescapedString)
    {
        return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString);
    }

    /**
     * Unescapes the line feed characters (\n)
     *
     * @param string $escapedString
     * @return string
     */
    private function unescapeLineFeed($escapedString)
    {
        return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts
     *
     * @return void
     */
    public function clearCache()
    {
        // make sure no file handle is left open on a file that is about to be deleted
        $this->closeCache();

        $this->inMemoryTempFileContents = null;
        $this->inMemoryTempFilePath = null;

        if ($this->tempFolder) {
            $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder);
        }
    }
}

View File

@ -0,0 +1,82 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
/**
* Class InMemoryStrategy
*
* This class implements the in-memory caching strategy for shared strings.
* This strategy is used when the number of unique strings is low, compared to the memory available.
*
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
*/
class InMemoryStrategy implements CachingStrategyInterface
{
    /** @var \SplFixedArray Array used to cache the shared strings */
    protected $inMemoryCache;

    /** @var bool Whether the cache has been closed */
    protected $isCacheClosed;

    /**
     * @param int $sharedStringsUniqueCount Number of unique shared strings (used as the initial cache size)
     */
    public function __construct($sharedStringsUniqueCount)
    {
        // The count is only a hint: the cache grows if more strings than announced are added
        $this->inMemoryCache = new \SplFixedArray(max(0, intval($sharedStringsUniqueCount)));
        $this->isCacheClosed = false;
    }

    /**
     * Adds the given string to the cache.
     * Nothing is added once the cache has been closed.
     *
     * @param string $sharedString The string to be added to the cache
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return void
     */
    public function addStringForIndex($sharedString, $sharedStringIndex)
    {
        if ($this->isCacheClosed) {
            return;
        }

        // The announced number of unique strings can be missing (read as 0) or too low.
        // Instead of failing on an out of range index, make room for the string.
        // The size is at least doubled to avoid resizing the array for every added string.
        $currentSize = $this->inMemoryCache->getSize();
        if ($sharedStringIndex >= $currentSize) {
            $this->inMemoryCache->setSize(max($sharedStringIndex + 1, $currentSize * 2));
        }

        $this->inMemoryCache->offsetSet($sharedStringIndex, $sharedString);
    }

    /**
     * Closes the cache after the last shared string was added.
     * This prevents any additional string from being added to the cache.
     *
     * @return void
     */
    public function closeCache()
    {
        $this->isCacheClosed = true;
    }

    /**
     * Returns the string located at the given index from the cache.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        try {
            $sharedString = $this->inMemoryCache->offsetGet($sharedStringIndex);
        } catch (\RuntimeException $e) {
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        // Slots that were never filled hold null: no string was stored for this index
        if ($sharedString === null) {
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        return $sharedString;
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts
     *
     * @return void
     */
    public function clearCache()
    {
        // Swap in an empty array rather than unsetting the property, so that the object stays
        // usable: lookups report "not found" instead of failing on a missing property.
        $this->inMemoryCache = new \SplFixedArray(0);
        $this->isCacheClosed = false;
    }
}

View File

@ -3,8 +3,8 @@
namespace Box\Spout\Reader\Helper\XLSX; namespace Box\Spout\Reader\Helper\XLSX;
use Box\Spout\Common\Exception\IOException; use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Helper\FileSystemHelper; use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\Exception\SharedStringNotFoundException; use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface;
/** /**
* Class SharedStringsHelper * Class SharedStringsHelper
@ -20,43 +20,14 @@ class SharedStringsHelper
/** Main namespace for the sharedStrings.xml file */ /** Main namespace for the sharedStrings.xml file */
const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
/**
* To avoid running out of memory when extracting the shared strings, they will be saved to temporary files
* instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory
* and the string will be quickly retrieved.
* The performance bottleneck is not when creating these temporary files, but rather when loading their content.
* Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works
* best when the indexes of the shared strings are sorted in the sheet data.
* 10,000 was chosen because it creates small files that are fast to be loaded in memory.
*/
const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;
/** Value to use to escape the line feed character ("\n") */
const ESCAPED_LINE_FEED_CHARACTER = '_x000A_';
/** @var string Path of the XLSX file being read */ /** @var string Path of the XLSX file being read */
protected $filePath; protected $filePath;
/** @var string Temporary folder where the temporary files to store shared strings will be stored */ /** @var string Temporary folder where the temporary files to store shared strings will be stored */
protected $tempFolder; protected $tempFolder;
/** @var \Box\Spout\Writer\Helper\XLSX\FileSystemHelper Helper to perform file system operations */ /** @var CachingStrategyInterface The best caching strategy for storing shared strings */
protected $fileSystemHelper; protected $cachingStrategy;
/** @var resource Pointer to the last temp file a shared string was written to */
protected $tempFilePointer;
/**
* @var string Path of the temporary file whose contents is currently stored in memory
* @see MAX_NUM_STRINGS_PER_TEMP_FILE
*/
protected $inMemoryTempFilePath;
/**
* @var string Contents of the temporary file that was last read
* @see MAX_NUM_STRINGS_PER_TEMP_FILE
*/
protected $inMemoryTempFileContents;
/** /**
* @param string $filePath Path of the XLSX file being read * @param string $filePath Path of the XLSX file being read
@ -65,10 +36,7 @@ class SharedStringsHelper
public function __construct($filePath, $tempFolder = null) public function __construct($filePath, $tempFolder = null)
{ {
$this->filePath = $filePath; $this->filePath = $filePath;
$this->tempFolder = $tempFolder;
$rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
$this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
$this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));
} }
/** /**
@ -108,20 +76,22 @@ class SharedStringsHelper
{ {
$xmlReader = new \XMLReader(); $xmlReader = new \XMLReader();
$sharedStringIndex = 0; $sharedStringIndex = 0;
$this->tempFilePointer = null;
$escaper = new \Box\Spout\Common\Escaper\XLSX(); $escaper = new \Box\Spout\Common\Escaper\XLSX();
$sharedStringsFilePath = $this->getSharedStringsFilePath(); $sharedStringsFilePath = $this->getSharedStringsFilePath();
if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) { if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) {
throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".'); throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
} }
$sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
$this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
while ($xmlReader->read() && $xmlReader->name !== 'si') { while ($xmlReader->read() && $xmlReader->name !== 'si') {
// do nothing until a 'si' tag is reached // do nothing until a 'si' tag is reached
} }
while ($xmlReader->name === 'si') { while ($xmlReader->name === 'si') {
$node = new \SimpleXMLElement($xmlReader->readOuterXml()); $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
$node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
// removes nodes that should not be read, like the pronunciation of the Kanji characters // removes nodes that should not be read, like the pronunciation of the Kanji characters
@ -140,12 +110,7 @@ class SharedStringsHelper
} }
$unescapedTextValue = $escaper->unescape($textValue); $unescapedTextValue = $escaper->unescape($textValue);
$this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
// The shared string retrieval logic expects each cell data to be on one line only
// Encoding the line feed character allows to preserve this assumption
$lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue);
$this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex);
$sharedStringIndex++; $sharedStringIndex++;
@ -153,10 +118,7 @@ class SharedStringsHelper
$xmlReader->next('si'); $xmlReader->next('si');
} }
// close pointer to the last temp file that was written $this->cachingStrategy->closeCache();
if ($this->tempFilePointer) {
fclose($this->tempFilePointer);
}
$xmlReader->close(); $xmlReader->close();
} }
@ -169,6 +131,80 @@ class SharedStringsHelper
return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH; return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
} }
/**
* Returns the shared strings unique count, as specified in <sst> tag.
*
* @param \XMLReader $xmlReader XMLReader instance
* @return int Number of unique shared strings in the sharedStrings.xml file
* @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
*/
protected function getSharedStringsUniqueCount($xmlReader)
{
    // Use internal errors to avoid displaying lots of warning messages in case of invalid file
    // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
    libxml_clear_errors();
    $wasUsingInternalErrors = libxml_use_internal_errors(true);

    $xmlReader->next('sst');

    // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
    while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) {
        $xmlReader->read();
    }

    // The error must be fetched before the setting is restored
    // (disabling internal errors discards the collected errors)
    $readError = libxml_get_last_error();

    // Put the XML errors setting back the way it was found, whether the read succeeded or not,
    // so that the failure path does not leave internal errors enabled for the rest of the script
    libxml_use_internal_errors($wasUsingInternalErrors);

    if ($readError !== false) {
        throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readError->message}]");
    }

    // A missing "uniqueCount" attribute is read as 0
    return intval($xmlReader->getAttribute('uniqueCount'));
}
/**
* Returns the best shared strings caching strategy.
*
* @param int $sharedStringsUniqueCount
* @return CachingStrategyInterface
*/
protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
{
    // The factory picks the strategy; the helper's temp folder is passed along for strategies that need one
    $cachingStrategyFactory = CachingStrategyFactory::getInstance();

    return $cachingStrategyFactory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
}
/**
* Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
* This is to simplify the parsing of the subtree.
*
* @param \XMLReader $xmlReader
* @return \SimpleXMLElement
* @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
*/
protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
{
    // Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node.
    // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
    libxml_clear_errors();
    $wasUsingInternalErrors = libxml_use_internal_errors(true);

    try {
        $node = new \SimpleXMLElement($xmlReader->readOuterXml());
    } catch (\Exception $exception) {
        // fetch the error before restoring the setting (disabling internal errors discards them)
        $error = libxml_get_last_error();
        libxml_use_internal_errors($wasUsingInternalErrors);

        // libxml may not have recorded any error: fall back to the message of the caught exception
        $errorMessage = ($error !== false) ? $error->message : $exception->getMessage();

        throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($errorMessage) . '].');
    }

    // put the XML errors setting back the way it was found
    libxml_use_internal_errors($wasUsingInternalErrors);

    return $node;
}
/** /**
* Removes nodes that should not be read, like the pronunciation of the Kanji characters. * Removes nodes that should not be read, like the pronunciation of the Kanji characters.
* By keeping them, their text content would be added to the read string. * By keeping them, their text content would be added to the read string.
@ -219,42 +255,7 @@ class SharedStringsHelper
} }
/** /**
* Writes the given string to its associated temp file. * Returns the shared string at the given index, using the previously chosen caching strategy.
* A new temporary file is created when the previous one has reached its max capacity.
*
* @param string $sharedString Shared string to write to the temp file
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
* @return void
*/
protected function writeSharedStringToTempFile($sharedString, $sharedStringIndex)
{
$tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
if (!file_exists($tempFilePath)) {
if ($this->tempFilePointer) {
fclose($this->tempFilePointer);
}
$this->tempFilePointer = fopen($tempFilePath, 'w');
}
fwrite($this->tempFilePointer, $sharedString . PHP_EOL);
}
/**
* Returns the path for the temp file that should contain the string for the given index
*
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
* @return string The temp file path for the given index
*/
protected function getSharedStringTempFilePath($sharedStringIndex)
{
$numTempFile = intval($sharedStringIndex / self::MAX_NUM_STRINGS_PER_TEMP_FILE);
return $this->tempFolder . '/sharedstrings' . $numTempFile;
}
/**
* Returns the shared string at the given index.
* Because the strings have been split into different files, it looks for the value in the correct file.
* *
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
* @return string The shared string at the given index * @return string The shared string at the given index
@ -262,63 +263,18 @@ class SharedStringsHelper
*/ */
public function getStringAtIndex($sharedStringIndex) public function getStringAtIndex($sharedStringIndex)
{ {
$tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
$indexInFile = $sharedStringIndex % self::MAX_NUM_STRINGS_PER_TEMP_FILE;
if (!file_exists($tempFilePath)) {
throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex");
}
if ($this->inMemoryTempFilePath !== $tempFilePath) {
// free memory
unset($this->inMemoryTempFileContents);
$this->inMemoryTempFileContents = explode(PHP_EOL, file_get_contents($tempFilePath));
$this->inMemoryTempFilePath = $tempFilePath;
}
$sharedString = null;
if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) {
$escapedSharedString = $this->inMemoryTempFileContents[$indexInFile];
$sharedString = $this->unescapeLineFeed($escapedSharedString);
}
if ($sharedString === null) {
throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
}
return rtrim($sharedString, PHP_EOL);
} }
/** /**
* Escapes the line feed character (\n) * Destroys the cache, freeing memory and removing any created artifacts
*
* @param string $unescapedString
* @return string
*/
private function escapeLineFeed($unescapedString)
{
return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString);
}
/**
* Unescapes the line feed character (\n)
*
* @param string $escapedString
* @return string
*/
private function unescapeLineFeed($escapedString)
{
return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString);
}
/**
* Deletes the created temporary folder and all its contents
* *
* @return void * @return void
*/ */
public function cleanup() public function cleanup()
{ {
$this->fileSystemHelper->deleteFolderRecursively($this->tempFolder); if ($this->cachingStrategy) {
$this->cachingStrategy->clearCache();
}
} }
} }

View File

@ -166,7 +166,7 @@ class XLSX extends AbstractReader
$worksheetDataXMLFilePath = $worksheet->getDataXmlFilePath(); $worksheetDataXMLFilePath = $worksheet->getDataXmlFilePath();
$worksheetDataFilePath = 'zip://' . $this->filePath . '#' . $worksheetDataXMLFilePath; $worksheetDataFilePath = 'zip://' . $this->filePath . '#' . $worksheetDataXMLFilePath;
if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) { if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NONET) === false) {
throw new IOException('Could not open "' . $worksheetDataXMLFilePath . '".'); throw new IOException('Could not open "' . $worksheetDataXMLFilePath . '".');
} }
} }

View File

@ -0,0 +1,99 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
/**
* Class CachingStrategyFactoryTest
*
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
*/
class CachingStrategyFactoryTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Data sets: [number of unique shared strings, memory limit in KB, expected strategy class name]
     *
     * @return array
     */
    public function dataProviderForTestGetBestCachingStrategy()
    {
        $maxNumStringsPerTempFile = CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE;
        $memoryNeededForTenStringsInKB = CachingStrategyFactory::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * 10;

        return [
            // memory limit reported as -1: only the number of strings is taken into account
            [$maxNumStringsPerTempFile, -1, 'FileBasedStrategy'],
            [$maxNumStringsPerTempFile + 10, -1, 'FileBasedStrategy'],
            [$maxNumStringsPerTempFile - 10, -1, 'InMemoryStrategy'],
            // memory limit known: just enough memory for 10 strings
            [10, $memoryNeededForTenStringsInKB, 'FileBasedStrategy'],
            [15, $memoryNeededForTenStringsInKB, 'FileBasedStrategy'],
            [5, $memoryNeededForTenStringsInKB, 'InMemoryStrategy'],
        ];
    }

    /**
     * @dataProvider dataProviderForTestGetBestCachingStrategy
     *
     * @param int $sharedStringsUniqueCount
     * @param int $memoryLimitInKB
     * @param string $expectedStrategyClassName
     * @return void
     */
    public function testGetBestCachingStrategy($sharedStringsUniqueCount, $memoryLimitInKB, $expectedStrategyClassName)
    {
        $factoryStub = $this->createFactoryStubWithMethod('getMemoryLimitInKB', $memoryLimitInKB);

        \ReflectionHelper::setStaticValue('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory', 'instance', $factoryStub);

        $chosenStrategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null);
        $chosenStrategyClassName = get_class($chosenStrategy);

        $fullExpectedStrategyClassName = 'Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\\' . $expectedStrategyClassName;
        $this->assertEquals($fullExpectedStrategyClassName, $chosenStrategyClassName);

        $chosenStrategy->clearCache();
        \ReflectionHelper::reset();
    }

    /**
     * Data sets: [formatted "memory_limit" value, expected memory limit in KB]
     *
     * @return array
     */
    public function dataProviderForTestGetMemoryLimitInKB()
    {
        return [
            ['-1', -1],
            ['invalid', -1],
            ['1024B', 1],
            ['128K', 128],
            ['256KB', 256],
            ['512M', 512 * 1024],
            ['2MB', 2 * 1024],
            ['1G', 1 * 1024 * 1024],
            ['10GB', 10 * 1024 * 1024],
            ['2T', 2 * 1024 * 1024 * 1024],
            ['5TB', 5 * 1024 * 1024 * 1024],
        ];
    }

    /**
     * @dataProvider dataProviderForTestGetMemoryLimitInKB
     *
     * @param string $memoryLimitFormatted
     * @param float $expectedMemoryLimitInKB
     * @return void
     */
    public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB)
    {
        $factoryStub = $this->createFactoryStubWithMethod('getMemoryLimitFromIni', $memoryLimitFormatted);

        $actualMemoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB');

        $this->assertEquals($expectedMemoryLimitInKB, $actualMemoryLimitInKB);
    }

    /**
     * Builds a CachingStrategyFactory mock (original constructor disabled) whose given method
     * is replaced and always returns the given value.
     *
     * @param string $stubbedMethodName Name of the method to replace
     * @param mixed $returnedValue Value the replaced method will return
     * @return CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject
     */
    private function createFactoryStubWithMethod($stubbedMethodName, $returnedValue)
    {
        /** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */
        $factoryStub = $this
            ->getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory')
            ->disableOriginalConstructor()
            ->setMethods([$stubbedMethodName])
            ->getMock();

        $factoryStub->method($stubbedMethodName)->willReturn($returnedValue);

        return $factoryStub;
    }
}

View File

@ -2,6 +2,9 @@
namespace Box\Spout\Reader\Helper\XLSX; namespace Box\Spout\Reader\Helper\XLSX;
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\FileBasedStrategy;
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\InMemoryStrategy;
use Box\Spout\TestUsingResource; use Box\Spout\TestUsingResource;
/** /**
@ -33,46 +36,6 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
$this->sharedStringsHelper->cleanup(); $this->sharedStringsHelper->cleanup();
} }
/**
 * Checks that extracting the shared strings leaves exactly one file in the helper's
 * temp folder and that this file contains the expected strings on its 1st and 25th lines.
 *
 * @return void
 */
public function testExtractSharedStringsShouldCreateTempFileWithSharedStrings()
{
    $this->sharedStringsHelper->extractSharedStrings();

    // The temp folder is not exposed by the helper, so it is read through reflection
    $createdTempFiles = $this->getFilesInFolder(
        \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'tempFolder')
    );
    $this->assertEquals(1, count($createdTempFiles), 'One temp file should have been created in the temp folder.');

    // Split the file contents on line endings and check the first and the 25th entries
    $sharedStringsPerLine = explode(PHP_EOL, file_get_contents($createdTempFiles[0]));
    $this->assertEquals('s1--A1', $sharedStringsPerLine[0]);
    $this->assertEquals('s1--E5', $sharedStringsPerLine[24]);
}
/**
 * Lists the paths of the regular files located directly inside the given folder.
 * Entries that are not files (sub-folders, "." and "..") are left out
 * and sub-folders are not traversed.
 *
 * @param string $folderPath Path of the folder to inspect
 * @return array Path names of the files found
 */
private function getFilesInFolder($folderPath)
{
    $filePaths = [];

    foreach (new \DirectoryIterator($folderPath) as $folderEntry) {
        if (!$folderEntry->isFile()) {
            // folders and dot entries are ignored
            continue;
        }

        $filePaths[] = $folderEntry->getPathname();
    }

    return $filePaths;
}
/** /**
* @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException * @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException
* @return void * @return void
@ -95,6 +58,9 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
$sharedString = $this->sharedStringsHelper->getStringAtIndex(24); $sharedString = $this->sharedStringsHelper->getStringAtIndex(24);
$this->assertEquals('s1--E5', $sharedString); $this->assertEquals('s1--E5', $sharedString);
$usedCachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy');
$this->assertTrue($usedCachingStrategy instanceof InMemoryStrategy);
} }
/** /**
@ -115,4 +81,32 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
$sharedStringsHelper->cleanup(); $sharedStringsHelper->cleanup();
} }
/**
 * Checks that shared strings can be retrieved when the file-based caching strategy is used,
 * including a string whose index is greater than CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE.
 *
 * The "memory_limit" ini setting is changed for the duration of the test. It is restored,
 * and the helper is cleaned up, even when an assertion fails or an exception is thrown,
 * so that a failure here cannot alter the environment of the tests run afterwards.
 *
 * @return void
 * @throws \Exception Any exception raised while running the checks, re-thrown after the cleanup
 */
public function testGetStringAtIndexWithFileBasedStrategy()
{
    // force the file-based strategy by setting no memory limit
    $originalMemoryLimit = ini_get('memory_limit');
    ini_set('memory_limit', '-1');

    $sharedStringsHelper = null;
    $caughtException = null;

    try {
        $resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx');
        $sharedStringsHelper = new SharedStringsHelper($resourcePath);

        $sharedStringsHelper->extractSharedStrings();

        $sharedString = $sharedStringsHelper->getStringAtIndex(0);
        $this->assertEquals('str', $sharedString);

        $sharedString = $sharedStringsHelper->getStringAtIndex(CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 1);
        $this->assertEquals('str', $sharedString);

        $usedCachingStrategy = \ReflectionHelper::getValueOnObject($sharedStringsHelper, 'cachingStrategy');
        $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy);
    } catch (\Exception $exception) {
        // Failed assertions are reported through exceptions as well:
        // hold on to it so that the cleanup below always runs before it is re-thrown.
        $caughtException = $exception;
    }

    // The helper may not exist if its creation is what failed
    if ($sharedStringsHelper !== null) {
        $sharedStringsHelper->cleanup();
    }
    ini_set('memory_limit', $originalMemoryLimit);

    if ($caughtException !== null) {
        throw $caughtException;
    }
}
} }

View File

@ -2,6 +2,7 @@
namespace Box\Spout\Reader; namespace Box\Spout\Reader;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Type; use Box\Spout\Common\Type;
use Box\Spout\TestUsingResource; use Box\Spout\TestUsingResource;
@ -245,18 +246,39 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
$this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.'); $this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.');
} }
/** /**
* @return array
*/
public function dataProviderForTestReadShouldBeProtectedAgainstAttacks()
{
    $attackFileNames = [
        'attack_billion_laughs.xlsx',
        'attack_quadratic_blowup.xlsx',
    ];

    // Each data set is made of a single argument: the name of the file to read
    $dataSets = [];
    foreach ($attackFileNames as $attackFileName) {
        $dataSets[] = [$attackFileName];
    }

    return $dataSets;
}
/**
* @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks
* @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used)
*
* @param string $fileName
* @return void * @return void
*/ */
public function testReadShouldBeProtectedAgainstBillionLaughsAttack() public function testReadShouldBeProtectedAgainstAttacks($fileName)
{ {
$allRows = $this->getAllRowsForFile('billion_laughs_test_file.xlsx'); $startTime = microtime(true);
$expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB try {
$this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); $this->getAllRowsForFile($fileName);
$this->fail('An exception should have been thrown');
} catch (IOException $exception) {
$duration = microtime(true) - $startTime;
$this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.');
$expectedFirstRow = ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1']; $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB
$this->assertEquals($expectedFirstRow, $allRows[0], 'Entities should be ignored when reading XML files.'); $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.');
}
} }
/** /**

View File

@ -89,4 +89,27 @@ class ReflectionHelper
return $value; return $value;
} }
/**
 * Invokes the given method on the given object, after making it accessible through reflection.
 * Any argument passed after the method name is forwarded, in the same order, to the invoked method.
 *
 * @param object $object Object to invoke the method on
 * @param string $methodName Name of the method to invoke
 * @param mixed|null $params,... Optional arguments to pass to the invoked method
 *
 * @return mixed|null Whatever the invoked method returns
 */
public static function callMethodOnObject($object, $methodName)
{
    // Everything after the first two arguments (object and method name) is meant for the invoked method
    $forwardedArguments = array_slice(func_get_args(), 2);

    $reflectedMethod = new ReflectionMethod($object, $methodName);
    $reflectedMethod->setAccessible(true);

    return $reflectedMethod->invokeArgs($object, $forwardedArguments);
}
} }

Binary file not shown.