commit
6ae79b63b3
@ -106,6 +106,19 @@ class GlobalFunctionsHelper
|
|||||||
return fputcsv($handle, $fields, $delimiter, $enclosure);
|
return fputcsv($handle, $fields, $delimiter, $enclosure);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Delegates to the global function fwrite().
 * @see fwrite()
 *
 * @param resource $handle Stream resource to write to
 * @param string $string Data to write
 * @return int Value returned by the global fwrite() call
 */
public function fwrite($handle, $string)
{
    $writeResult = fwrite($handle, $string);

    return $writeResult;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrapper around global function fclose()
|
* Wrapper around global function fclose()
|
||||||
* @see fclose()
|
* @see fclose()
|
||||||
|
@ -0,0 +1,154 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class CachingStrategyFactory
|
||||||
|
*
|
||||||
|
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
|
||||||
|
*/
|
||||||
|
class CachingStrategyFactory
{
    /**
     * The memory amount needed to store a string was obtained empirically from this data:
     *
     *        ------------------------------------
     *        | Number of chars⁺ | Memory needed |
     *        ------------------------------------
     *        |           3,000  |         1 MB  |
     *        |          15,000  |         2 MB  |
     *        |          30,000  |         5 MB  |
     *        |          75,000  |        11 MB  |
     *        |         150,000  |        21 MB  |
     *        |         300,000  |        43 MB  |
     *        |         750,000  |       105 MB  |
     *        |       1,500,000  |       210 MB  |
     *        |       2,250,000  |       315 MB  |
     *        |       3,000,000  |       420 MB  |
     *        |       4,500,000  |       630 MB  |
     *        ------------------------------------
     *
     *        ⁺ All characters were 1 byte long
     *
     * This gives a linear graph where each 1-byte character requires about 150 bytes to be stored.
     * Given that some characters can take up to 4 bytes, we need 600 bytes per character to be safe.
     * Also, there is on average about 20 characters per cell (this is entirely empirical data...).
     *
     * This means that in order to store one shared string in memory, the memory amount needed is:
     *   => 20 * 600 ≈ 12KB
     */
    const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12;

    /**
     * To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files
     * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory
     * and the string will be quickly retrieved.
     * The performance bottleneck is not when creating these temporary files, but rather when loading their content.
     * Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works
     * best when the indexes of the shared strings are sorted in the sheet data.
     * 10,000 was chosen because it creates small files that are fast to be loaded in memory.
     */
    const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;

    /** @var CachingStrategyFactory|null Singleton instance */
    protected static $instance = null;

    /**
     * Private constructor for singleton
     */
    private function __construct()
    {
    }

    /**
     * Returns the singleton instance of the factory
     *
     * @return CachingStrategyFactory
     */
    public static function getInstance()
    {
        if (self::$instance === null) {
            self::$instance = new CachingStrategyFactory();
        }

        return self::$instance;
    }

    /**
     * Returns the best caching strategy, given the number of unique shared strings
     * and the configured PHP memory limit.
     *
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @return CachingStrategyInterface The best caching strategy
     */
    public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null)
    {
        if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) {
            return new InMemoryStrategy($sharedStringsUniqueCount);
        } else {
            return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE);
        }
    }

    /**
     * Returns whether it is safe to use in-memory caching, given the number of unique shared strings
     * and the configured PHP memory limit.
     *
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     * @return bool
     */
    protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)
    {
        $memoryAvailable = $this->getMemoryLimitInKB();

        if ($memoryAvailable === -1) {
            // if cannot get memory limit or if memory limit set as unlimited, don't trust and play safe
            return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE);
        } else {
            $memoryNeeded = $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB;
            return ($memoryAvailable > $memoryNeeded);
        }
    }

    /**
     * Returns the PHP "memory_limit" in Kilobytes.
     * The value is expected to be a whole number, optionally followed by a unit
     * ("b", "k", "m", "g" or "t", itself optionally followed by "b", case insensitive).
     * A value without any unit is a number of bytes.
     *
     * @return int|float Memory limit in Kilobytes, or -1 if there is no limit or if the value cannot be parsed
     */
    protected function getMemoryLimitInKB()
    {
        $memoryLimitFormatted = $this->getMemoryLimitFromIni();
        $memoryLimitFormatted = strtolower(trim($memoryLimitFormatted));

        // No memory limit
        if ($memoryLimitFormatted === '-1') {
            return -1;
        }

        // The pattern is anchored so that only fully understood values are accepted (e.g. "1.5g" is rejected
        // instead of being read as "5g"). The unit is optional: "134217728" is a valid limit, expressed in bytes.
        if (preg_match('/^(\d+)([bkmgt])?b?$/', $memoryLimitFormatted, $matches)) {
            $amount = intval($matches[1]);
            $unit = (!empty($matches[2])) ? $matches[2] : 'b';

            switch ($unit) {
                case 'b': return ($amount / 1024);
                case 'k': return $amount;
                case 'm': return ($amount * 1024);
                case 'g': return ($amount * 1024 * 1024);
                case 't': return ($amount * 1024 * 1024 * 1024);
            }
        }

        // Unknown format: report it the same way as "no limit" so that the caller plays safe
        return -1;
    }

    /**
     * Returns the formatted "memory_limit" value
     *
     * @return string
     */
    protected function getMemoryLimitFromIni()
    {
        return ini_get('memory_limit');
    }
}
|
@ -0,0 +1,44 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Interface CachingStrategyInterface
|
||||||
|
*
|
||||||
|
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
|
||||||
|
*/
|
||||||
|
interface CachingStrategyInterface
{
    /**
     * Stores a shared string in the cache, under the given index.
     *
     * @param string $sharedString Shared string to store
     * @param int $sharedStringIndex Position of the shared string in the sharedStrings.xml file
     * @return void
     */
    public function addStringForIndex($sharedString, $sharedStringIndex);

    /**
     * Marks the cache as complete. To be called once the last shared string has been added;
     * no more strings are supposed to be added afterwards.
     *
     * @return void
     */
    public function closeCache();

    /**
     * Fetches from the cache the shared string stored under the given index.
     *
     * @param int $sharedStringIndex Position of the shared string in the sharedStrings.xml file
     * @return string The shared string stored under the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If the cache holds no shared string for the given index
     */
    public function getStringAtIndex($sharedStringIndex);

    /**
     * Tears the cache down: releases the memory it uses and removes any artifact it created.
     *
     * @return void
     */
    public function clearCache();
}
|
@ -0,0 +1,188 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
|
||||||
|
|
||||||
|
use Box\Spout\Common\Helper\FileSystemHelper;
|
||||||
|
use Box\Spout\Common\Helper\GlobalFunctionsHelper;
|
||||||
|
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class FileBasedStrategy
|
||||||
|
*
|
||||||
|
* This class implements the file-based caching strategy for shared strings.
|
||||||
|
* Shared strings are stored in small files (with a max number of strings per file).
|
||||||
|
* This strategy is slower than an in-memory strategy but is used to avoid out of memory crashes.
|
||||||
|
*
|
||||||
|
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
|
||||||
|
*/
|
||||||
|
class FileBasedStrategy implements CachingStrategyInterface
{
    /** Value to use to escape the line feed character ("\n") */
    const ESCAPED_LINE_FEED_CHARACTER = '_x000A_';

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var \Box\Spout\Common\Helper\FileSystemHelper Helper to perform file system operations */
    protected $fileSystemHelper;

    /** @var string Folder created by this strategy to hold its temporary files */
    protected $tempFolder;

    /**
     * @var int Maximum number of strings that can be stored in one temp file
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $maxNumStringsPerTempFile;

    /** @var resource|null Pointer to the last temp file a shared string was written to */
    protected $tempFilePointer;

    /**
     * @var string|null Path of the temporary file whose contents is currently stored in memory
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $inMemoryTempFilePath;

    /**
     * @var string[]|null Escaped shared strings of the temporary file that was last read (one entry per string)
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $inMemoryTempFileContents;

    /**
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @param int $maxNumStringsPerTempFile Maximum number of strings that can be stored in one temp file
     */
    public function __construct($tempFolder, $maxNumStringsPerTempFile)
    {
        $rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
        $this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
        $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));

        $this->maxNumStringsPerTempFile = $maxNumStringsPerTempFile;

        $this->globalFunctionsHelper = new GlobalFunctionsHelper();
        $this->tempFilePointer = null;
    }

    /**
     * Adds the given string to the cache.
     * A new temp file is started whenever the file associated to the given index does not exist yet.
     *
     * @param string $sharedString The string to be added to the cache
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return void
     */
    public function addStringForIndex($sharedString, $sharedStringIndex)
    {
        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);

        if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) {
            if ($this->tempFilePointer) {
                $this->globalFunctionsHelper->fclose($this->tempFilePointer);
            }
            $this->tempFilePointer = $this->globalFunctionsHelper->fopen($tempFilePath, 'w');
        }

        // The shared string retrieval logic expects each cell data to be on one line only
        // Encoding the line feed character allows to preserve this assumption
        $lineFeedEncodedSharedString = $this->escapeLineFeed($sharedString);

        $this->globalFunctionsHelper->fwrite($this->tempFilePointer, $lineFeedEncodedSharedString . PHP_EOL);
    }

    /**
     * Returns the path for the temp file that should contain the string for the given index
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The temp file path for the given index
     */
    protected function getSharedStringTempFilePath($sharedStringIndex)
    {
        $numTempFile = intval($sharedStringIndex / $this->maxNumStringsPerTempFile);
        return $this->tempFolder . '/sharedstrings' . $numTempFile;
    }

    /**
     * Closes the cache after the last shared string was added.
     * This prevents any additional string from being added to the cache.
     *
     * @return void
     */
    public function closeCache()
    {
        // close pointer to the last temp file that was written
        if ($this->tempFilePointer) {
            $this->globalFunctionsHelper->fclose($this->tempFilePointer);

            // forget the closed handle, so that it does not get closed a second time
            $this->tempFilePointer = null;
        }
    }

    /**
     * Returns the string located at the given index from the cache.
     * The contents of the last read temp file are kept in memory until a string from another file is requested.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
        $indexInFile = $sharedStringIndex % $this->maxNumStringsPerTempFile;

        if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) {
            throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex");
        }

        if ($this->inMemoryTempFilePath !== $tempFilePath) {
            // free memory
            $this->inMemoryTempFileContents = null;

            $this->inMemoryTempFileContents = explode(PHP_EOL, $this->globalFunctionsHelper->file_get_contents($tempFilePath));

            // Every string is written followed by PHP_EOL, so the last chunk is what comes after the last
            // separator, not a shared string. Keeping it would make the index right after the last string valid.
            array_pop($this->inMemoryTempFileContents);

            $this->inMemoryTempFilePath = $tempFilePath;
        }

        if (!array_key_exists($indexInFile, $this->inMemoryTempFileContents)) {
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        // The string is returned as is: trimming it would remove line feeds that are part of its value
        return $this->unescapeLineFeed($this->inMemoryTempFileContents[$indexInFile]);
    }

    /**
     * Escapes the line feed characters (\n)
     *
     * @param string $unescapedString
     * @return string
     */
    private function escapeLineFeed($unescapedString)
    {
        return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString);
    }

    /**
     * Unescapes the line feed characters (\n)
     *
     * @param string $escapedString
     * @return string
     */
    private function unescapeLineFeed($escapedString)
    {
        return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts
     *
     * @return void
     */
    public function clearCache()
    {
        // make sure no handle stays open on a file that is about to be deleted
        $this->closeCache();

        // free memory
        $this->inMemoryTempFileContents = null;
        $this->inMemoryTempFilePath = null;

        if ($this->tempFolder) {
            $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder);
        }
    }
}
|
@ -0,0 +1,82 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
|
||||||
|
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class InMemoryStrategy
|
||||||
|
*
|
||||||
|
* This class implements the in-memory caching strategy for shared strings.
|
||||||
|
* This strategy is used when the number of unique strings is low, compared to the memory available.
|
||||||
|
*
|
||||||
|
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
|
||||||
|
*/
|
||||||
|
class InMemoryStrategy implements CachingStrategyInterface
{
    /** @var \SplFixedArray Array used to cache the shared strings */
    protected $inMemoryCache;

    /** @var bool Whether the cache has been closed */
    protected $isCacheClosed;

    /**
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     */
    public function __construct($sharedStringsUniqueCount)
    {
        $this->inMemoryCache = new \SplFixedArray($sharedStringsUniqueCount);
        $this->isCacheClosed = false;
    }

    /**
     * Adds the given string to the cache.
     * Nothing is added once the cache has been closed.
     *
     * @param string $sharedString The string to be added to the cache
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return void
     */
    public function addStringForIndex($sharedString, $sharedStringIndex)
    {
        if ($this->isCacheClosed) {
            return;
        }

        // The announced number of unique strings may be lower than the actual one (it is 0 when unknown).
        // Grow the array instead of failing; doubling the size avoids resizing for every added string.
        $cacheSize = $this->inMemoryCache->getSize();
        if ($sharedStringIndex >= $cacheSize) {
            $this->inMemoryCache->setSize(max($sharedStringIndex + 1, $cacheSize * 2));
        }

        $this->inMemoryCache->offsetSet($sharedStringIndex, $sharedString);
    }

    /**
     * Closes the cache after the last shared string was added.
     * This prevents any additional string from being added to the cache.
     *
     * @return void
     */
    public function closeCache()
    {
        $this->isCacheClosed = true;
    }

    /**
     * Returns the string located at the given index from the cache.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        try {
            $sharedString = $this->inMemoryCache->offsetGet($sharedStringIndex);
        } catch (\RuntimeException $e) {
            // index outside of the array
            $sharedString = null;
        }

        // A slot that was never written holds null: no shared string was stored for this index
        if ($sharedString === null) {
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        return $sharedString;
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts
     *
     * @return void
     */
    public function clearCache()
    {
        // Swap in an empty array rather than unsetting the property: the stored strings are released
        // and the other methods can still be called without failing on a missing property.
        $this->inMemoryCache = new \SplFixedArray(0);
        $this->isCacheClosed = false;
    }
}
|
@ -3,8 +3,8 @@
|
|||||||
namespace Box\Spout\Reader\Helper\XLSX;
|
namespace Box\Spout\Reader\Helper\XLSX;
|
||||||
|
|
||||||
use Box\Spout\Common\Exception\IOException;
|
use Box\Spout\Common\Exception\IOException;
|
||||||
use Box\Spout\Common\Helper\FileSystemHelper;
|
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory;
|
||||||
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
|
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class SharedStringsHelper
|
* Class SharedStringsHelper
|
||||||
@ -20,43 +20,14 @@ class SharedStringsHelper
|
|||||||
/** Main namespace for the sharedStrings.xml file */
|
/** Main namespace for the sharedStrings.xml file */
|
||||||
const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
|
const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
|
||||||
|
|
||||||
/**
|
|
||||||
* To avoid running out of memory when extracting the shared strings, they will be saved to temporary files
|
|
||||||
* instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory
|
|
||||||
* and the string will be quickly retrieved.
|
|
||||||
* The performance bottleneck is not when creating these temporary files, but rather when loading their content.
|
|
||||||
* Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works
|
|
||||||
* best when the indexes of the shared strings are sorted in the sheet data.
|
|
||||||
* 10,000 was chosen because it creates small files that are fast to be loaded in memory.
|
|
||||||
*/
|
|
||||||
const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;
|
|
||||||
|
|
||||||
/** Value to use to escape the line feed character ("\n") */
|
|
||||||
const ESCAPED_LINE_FEED_CHARACTER = '_x000A_';
|
|
||||||
|
|
||||||
/** @var string Path of the XLSX file being read */
|
/** @var string Path of the XLSX file being read */
|
||||||
protected $filePath;
|
protected $filePath;
|
||||||
|
|
||||||
/** @var string Temporary folder where the temporary files to store shared strings will be stored */
|
/** @var string Temporary folder where the temporary files to store shared strings will be stored */
|
||||||
protected $tempFolder;
|
protected $tempFolder;
|
||||||
|
|
||||||
/** @var \Box\Spout\Writer\Helper\XLSX\FileSystemHelper Helper to perform file system operations */
|
/** @var CachingStrategyInterface The best caching strategy for storing shared strings */
|
||||||
protected $fileSystemHelper;
|
protected $cachingStrategy;
|
||||||
|
|
||||||
/** @var resource Pointer to the last temp file a shared string was written to */
|
|
||||||
protected $tempFilePointer;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @var string Path of the temporary file whose contents is currently stored in memory
|
|
||||||
* @see MAX_NUM_STRINGS_PER_TEMP_FILE
|
|
||||||
*/
|
|
||||||
protected $inMemoryTempFilePath;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @var string Contents of the temporary file that was last read
|
|
||||||
* @see MAX_NUM_STRINGS_PER_TEMP_FILE
|
|
||||||
*/
|
|
||||||
protected $inMemoryTempFileContents;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param string $filePath Path of the XLSX file being read
|
* @param string $filePath Path of the XLSX file being read
|
||||||
@ -65,10 +36,7 @@ class SharedStringsHelper
|
|||||||
public function __construct($filePath, $tempFolder = null)
|
public function __construct($filePath, $tempFolder = null)
|
||||||
{
|
{
|
||||||
$this->filePath = $filePath;
|
$this->filePath = $filePath;
|
||||||
|
$this->tempFolder = $tempFolder;
|
||||||
$rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
|
|
||||||
$this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
|
|
||||||
$this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -108,20 +76,22 @@ class SharedStringsHelper
|
|||||||
{
|
{
|
||||||
$xmlReader = new \XMLReader();
|
$xmlReader = new \XMLReader();
|
||||||
$sharedStringIndex = 0;
|
$sharedStringIndex = 0;
|
||||||
$this->tempFilePointer = null;
|
|
||||||
$escaper = new \Box\Spout\Common\Escaper\XLSX();
|
$escaper = new \Box\Spout\Common\Escaper\XLSX();
|
||||||
|
|
||||||
$sharedStringsFilePath = $this->getSharedStringsFilePath();
|
$sharedStringsFilePath = $this->getSharedStringsFilePath();
|
||||||
if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) {
|
if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) {
|
||||||
throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
|
throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
|
||||||
|
$this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
|
||||||
|
|
||||||
while ($xmlReader->read() && $xmlReader->name !== 'si') {
|
while ($xmlReader->read() && $xmlReader->name !== 'si') {
|
||||||
// do nothing until a 'si' tag is reached
|
// do nothing until a 'si' tag is reached
|
||||||
}
|
}
|
||||||
|
|
||||||
while ($xmlReader->name === 'si') {
|
while ($xmlReader->name === 'si') {
|
||||||
$node = new \SimpleXMLElement($xmlReader->readOuterXml());
|
$node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
|
||||||
$node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
|
$node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
|
||||||
|
|
||||||
// removes nodes that should not be read, like the pronunciation of the Kanji characters
|
// removes nodes that should not be read, like the pronunciation of the Kanji characters
|
||||||
@ -140,12 +110,7 @@ class SharedStringsHelper
|
|||||||
}
|
}
|
||||||
|
|
||||||
$unescapedTextValue = $escaper->unescape($textValue);
|
$unescapedTextValue = $escaper->unescape($textValue);
|
||||||
|
$this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
|
||||||
// The shared string retrieval logic expects each cell data to be on one line only
|
|
||||||
// Encoding the line feed character allows to preserve this assumption
|
|
||||||
$lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue);
|
|
||||||
|
|
||||||
$this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex);
|
|
||||||
|
|
||||||
$sharedStringIndex++;
|
$sharedStringIndex++;
|
||||||
|
|
||||||
@ -153,10 +118,7 @@ class SharedStringsHelper
|
|||||||
$xmlReader->next('si');
|
$xmlReader->next('si');
|
||||||
}
|
}
|
||||||
|
|
||||||
// close pointer to the last temp file that was written
|
$this->cachingStrategy->closeCache();
|
||||||
if ($this->tempFilePointer) {
|
|
||||||
fclose($this->tempFilePointer);
|
|
||||||
}
|
|
||||||
|
|
||||||
$xmlReader->close();
|
$xmlReader->close();
|
||||||
}
|
}
|
||||||
@ -169,6 +131,80 @@ class SharedStringsHelper
|
|||||||
return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
|
return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the shared strings unique count, as specified in <sst> tag.
 * The value is 0 if the tag does not have a "uniqueCount" attribute.
 *
 * @param \XMLReader $xmlReader XMLReader instance
 * @return int Number of unique shared strings in the sharedStrings.xml file
 * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
 */
protected function getSharedStringsUniqueCount($xmlReader)
{
    // Use internal errors to avoid displaying lots of warning messages in case of invalid file
    // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
    libxml_clear_errors();
    $previousUseInternalErrors = libxml_use_internal_errors(true);

    $xmlReader->next('sst');

    // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
    while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) {
        $xmlReader->read();
    }

    $readError = libxml_get_last_error();

    // Put the setting back the way the caller had it, before throwing or returning.
    // This must happen after the error was retrieved, as disabling internal errors clears them.
    libxml_use_internal_errors($previousUseInternalErrors);

    if ($readError !== false) {
        throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readError->message}]");
    }

    return intval($xmlReader->getAttribute('uniqueCount'));
}
|
||||||
|
|
||||||
|
/**
 * Asks the caching strategy factory for the strategy to use for the given number of unique shared strings.
 * The temporary folder of this helper is passed along to the factory.
 *
 * @param int $sharedStringsUniqueCount
 * @return CachingStrategyInterface
 */
protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
{
    $cachingStrategyFactory = CachingStrategyFactory::getInstance();

    return $cachingStrategyFactory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
}
|
||||||
|
|
||||||
|
/**
 * Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
 * This is to simplify the parsing of the subtree.
 *
 * @param \XMLReader $xmlReader
 * @return \SimpleXMLElement
 * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
 */
protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
{
    // Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node.
    // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
    libxml_clear_errors();
    $previousUseInternalErrors = libxml_use_internal_errors(true);

    try {
        $node = new \SimpleXMLElement($xmlReader->readOuterXml());
    } catch (\Exception $exception) {
        $error = libxml_get_last_error();
        libxml_use_internal_errors($previousUseInternalErrors);

        // libxml does not necessarily have an error to report: fall back to the message of the exception
        $errorMessage = ($error !== false) ? $error->message : $exception->getMessage();

        throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($errorMessage) . '].');
    }

    // put the setting back the way the caller had it
    libxml_use_internal_errors($previousUseInternalErrors);

    return $node;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes nodes that should not be read, like the pronunciation of the Kanji characters.
|
* Removes nodes that should not be read, like the pronunciation of the Kanji characters.
|
||||||
* By keeping them, their text content would be added to the read string.
|
* By keeping them, their text content would be added to the read string.
|
||||||
@ -219,42 +255,7 @@ class SharedStringsHelper
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes the given string to its associated temp file.
|
* Returns the shared string at the given index, using the previously chosen caching strategy.
|
||||||
* A new temporary file is created when the previous one has reached its max capacity.
|
|
||||||
*
|
|
||||||
* @param string $sharedString Shared string to write to the temp file
|
|
||||||
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
|
|
||||||
* @return void
|
|
||||||
*/
|
|
||||||
protected function writeSharedStringToTempFile($sharedString, $sharedStringIndex)
|
|
||||||
{
|
|
||||||
$tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
|
|
||||||
|
|
||||||
if (!file_exists($tempFilePath)) {
|
|
||||||
if ($this->tempFilePointer) {
|
|
||||||
fclose($this->tempFilePointer);
|
|
||||||
}
|
|
||||||
$this->tempFilePointer = fopen($tempFilePath, 'w');
|
|
||||||
}
|
|
||||||
|
|
||||||
fwrite($this->tempFilePointer, $sharedString . PHP_EOL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the path for the temp file that should contain the string for the given index
|
|
||||||
*
|
|
||||||
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
|
|
||||||
* @return string The temp file path for the given index
|
|
||||||
*/
|
|
||||||
protected function getSharedStringTempFilePath($sharedStringIndex)
|
|
||||||
{
|
|
||||||
$numTempFile = intval($sharedStringIndex / self::MAX_NUM_STRINGS_PER_TEMP_FILE);
|
|
||||||
return $this->tempFolder . '/sharedstrings' . $numTempFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the shared string at the given index.
|
|
||||||
* Because the strings have been split into different files, it looks for the value in the correct file.
|
|
||||||
*
|
*
|
||||||
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
|
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
|
||||||
* @return string The shared string at the given index
|
* @return string The shared string at the given index
|
||||||
@ -262,63 +263,18 @@ class SharedStringsHelper
|
|||||||
*/
|
*/
|
||||||
public function getStringAtIndex($sharedStringIndex)
|
public function getStringAtIndex($sharedStringIndex)
|
||||||
{
|
{
|
||||||
$tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
|
return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
|
||||||
$indexInFile = $sharedStringIndex % self::MAX_NUM_STRINGS_PER_TEMP_FILE;
|
|
||||||
|
|
||||||
if (!file_exists($tempFilePath)) {
|
|
||||||
throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex");
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($this->inMemoryTempFilePath !== $tempFilePath) {
|
|
||||||
// free memory
|
|
||||||
unset($this->inMemoryTempFileContents);
|
|
||||||
|
|
||||||
$this->inMemoryTempFileContents = explode(PHP_EOL, file_get_contents($tempFilePath));
|
|
||||||
$this->inMemoryTempFilePath = $tempFilePath;
|
|
||||||
}
|
|
||||||
|
|
||||||
$sharedString = null;
|
|
||||||
if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) {
|
|
||||||
$escapedSharedString = $this->inMemoryTempFileContents[$indexInFile];
|
|
||||||
$sharedString = $this->unescapeLineFeed($escapedSharedString);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($sharedString === null) {
|
|
||||||
throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
|
|
||||||
}
|
|
||||||
|
|
||||||
return rtrim($sharedString, PHP_EOL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Escapes the line feed character (\n)
|
* Destroys the cache, freeing memory and removing any created artifacts
|
||||||
*
|
|
||||||
* @param string $unescapedString
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
private function escapeLineFeed($unescapedString)
|
|
||||||
{
|
|
||||||
return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Unescapes the line feed character (\n)
|
|
||||||
*
|
|
||||||
* @param string $escapedString
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
private function unescapeLineFeed($escapedString)
|
|
||||||
{
|
|
||||||
return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Deletes the created temporary folder and all its contents
|
|
||||||
*
|
*
|
||||||
* @return void
|
* @return void
|
||||||
*/
|
*/
|
||||||
public function cleanup()
|
public function cleanup()
|
||||||
{
|
{
|
||||||
$this->fileSystemHelper->deleteFolderRecursively($this->tempFolder);
|
if ($this->cachingStrategy) {
|
||||||
|
$this->cachingStrategy->clearCache();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -166,7 +166,7 @@ class XLSX extends AbstractReader
|
|||||||
$worksheetDataXMLFilePath = $worksheet->getDataXmlFilePath();
|
$worksheetDataXMLFilePath = $worksheet->getDataXmlFilePath();
|
||||||
|
|
||||||
$worksheetDataFilePath = 'zip://' . $this->filePath . '#' . $worksheetDataXMLFilePath;
|
$worksheetDataFilePath = 'zip://' . $this->filePath . '#' . $worksheetDataXMLFilePath;
|
||||||
if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) {
|
if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NONET) === false) {
|
||||||
throw new IOException('Could not open "' . $worksheetDataXMLFilePath . '".');
|
throw new IOException('Could not open "' . $worksheetDataXMLFilePath . '".');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,99 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class CachingStrategyFactoryTest
|
||||||
|
*
|
||||||
|
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
|
||||||
|
*/
|
||||||
|
class CachingStrategyFactoryTest extends \PHPUnit_Framework_TestCase
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* @return array
|
||||||
|
*/
|
||||||
|
public function dataProviderForTestGetBestCachingStrategy()
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
[CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE, -1, 'FileBasedStrategy'],
|
||||||
|
[CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 10, -1, 'FileBasedStrategy'],
|
||||||
|
[CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE - 10, -1, 'InMemoryStrategy'],
|
||||||
|
[10 , CachingStrategyFactory::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * 10, 'FileBasedStrategy'],
|
||||||
|
[15, CachingStrategyFactory::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * 10, 'FileBasedStrategy'],
|
||||||
|
[5 , CachingStrategyFactory::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * 10, 'InMemoryStrategy'],
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @dataProvider dataProviderForTestGetBestCachingStrategy
|
||||||
|
*
|
||||||
|
* @param int $sharedStringsUniqueCount
|
||||||
|
* @param int $memoryLimitInKB
|
||||||
|
* @param string $expectedStrategyClassName
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
public function testGetBestCachingStrategy($sharedStringsUniqueCount, $memoryLimitInKB, $expectedStrategyClassName)
|
||||||
|
{
|
||||||
|
/** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */
|
||||||
|
$factoryStub = $this
|
||||||
|
->getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory')
|
||||||
|
->disableOriginalConstructor()
|
||||||
|
->setMethods(['getMemoryLimitInKB'])
|
||||||
|
->getMock();
|
||||||
|
|
||||||
|
$factoryStub->method('getMemoryLimitInKB')->willReturn($memoryLimitInKB);
|
||||||
|
|
||||||
|
\ReflectionHelper::setStaticValue('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory', 'instance', $factoryStub);
|
||||||
|
|
||||||
|
$strategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null);
|
||||||
|
|
||||||
|
$fullExpectedStrategyClassName = 'Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\\' . $expectedStrategyClassName;
|
||||||
|
$this->assertEquals($fullExpectedStrategyClassName, get_class($strategy));
|
||||||
|
|
||||||
|
$strategy->clearCache();
|
||||||
|
\ReflectionHelper::reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return array
|
||||||
|
*/
|
||||||
|
public function dataProviderForTestGetMemoryLimitInKB()
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
['-1', -1],
|
||||||
|
['invalid', -1],
|
||||||
|
['1024B', 1],
|
||||||
|
['128K', 128],
|
||||||
|
['256KB', 256],
|
||||||
|
['512M', 512 * 1024],
|
||||||
|
['2MB', 2 * 1024],
|
||||||
|
['1G', 1 * 1024 * 1024],
|
||||||
|
['10GB', 10 * 1024 * 1024],
|
||||||
|
['2T', 2 * 1024 * 1024 * 1024],
|
||||||
|
['5TB', 5 * 1024 * 1024 * 1024],
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @dataProvider dataProviderForTestGetMemoryLimitInKB
|
||||||
|
*
|
||||||
|
* @param string $memoryLimitFormatted
|
||||||
|
* @param float $expectedMemoryLimitInKB
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB)
|
||||||
|
{
|
||||||
|
/** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */
|
||||||
|
$factoryStub = $this
|
||||||
|
->getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory')
|
||||||
|
->disableOriginalConstructor()
|
||||||
|
->setMethods(['getMemoryLimitFromIni'])
|
||||||
|
->getMock();
|
||||||
|
|
||||||
|
$factoryStub->method('getMemoryLimitFromIni')->willReturn($memoryLimitFormatted);
|
||||||
|
|
||||||
|
$memoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB');
|
||||||
|
|
||||||
|
$this->assertEquals($expectedMemoryLimitInKB, $memoryLimitInKB);
|
||||||
|
}
|
||||||
|
}
|
@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
namespace Box\Spout\Reader\Helper\XLSX;
|
namespace Box\Spout\Reader\Helper\XLSX;
|
||||||
|
|
||||||
|
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory;
|
||||||
|
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\FileBasedStrategy;
|
||||||
|
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\InMemoryStrategy;
|
||||||
use Box\Spout\TestUsingResource;
|
use Box\Spout\TestUsingResource;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -33,46 +36,6 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
|
|||||||
$this->sharedStringsHelper->cleanup();
|
$this->sharedStringsHelper->cleanup();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @return void
|
|
||||||
*/
|
|
||||||
public function testExtractSharedStringsShouldCreateTempFileWithSharedStrings()
|
|
||||||
{
|
|
||||||
$this->sharedStringsHelper->extractSharedStrings();
|
|
||||||
|
|
||||||
$tempFolder = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'tempFolder');
|
|
||||||
|
|
||||||
$filesInTempFolder = $this->getFilesInFolder($tempFolder);
|
|
||||||
$this->assertEquals(1, count($filesInTempFolder), 'One temp file should have been created in the temp folder.');
|
|
||||||
|
|
||||||
$tempFileContents = file_get_contents($filesInTempFolder[0]);
|
|
||||||
$tempFileContentsPerLine = explode(PHP_EOL, $tempFileContents);
|
|
||||||
|
|
||||||
$this->assertEquals('s1--A1', $tempFileContentsPerLine[0]);
|
|
||||||
$this->assertEquals('s1--E5', $tempFileContentsPerLine[24]);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns all files that are in the given folder.
|
|
||||||
* It does not include "." and ".." and is not recursive.
|
|
||||||
*
|
|
||||||
* @param string $folderPath
|
|
||||||
* @return array
|
|
||||||
*/
|
|
||||||
private function getFilesInFolder($folderPath)
|
|
||||||
{
|
|
||||||
$files = [];
|
|
||||||
$directoryIterator = new \DirectoryIterator($folderPath);
|
|
||||||
|
|
||||||
foreach ($directoryIterator as $fileInfo) {
|
|
||||||
if ($fileInfo->isFile()) {
|
|
||||||
$files[] = $fileInfo->getPathname();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $files;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException
|
* @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException
|
||||||
* @return void
|
* @return void
|
||||||
@ -95,6 +58,9 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
|
|||||||
|
|
||||||
$sharedString = $this->sharedStringsHelper->getStringAtIndex(24);
|
$sharedString = $this->sharedStringsHelper->getStringAtIndex(24);
|
||||||
$this->assertEquals('s1--E5', $sharedString);
|
$this->assertEquals('s1--E5', $sharedString);
|
||||||
|
|
||||||
|
$usedCachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy');
|
||||||
|
$this->assertTrue($usedCachingStrategy instanceof InMemoryStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -115,4 +81,32 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
|
|||||||
|
|
||||||
$sharedStringsHelper->cleanup();
|
$sharedStringsHelper->cleanup();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
public function testGetStringAtIndexWithFileBasedStrategy()
|
||||||
|
{
|
||||||
|
// force the file-based strategy by setting no memory limit
|
||||||
|
$originalMemoryLimit = ini_get('memory_limit');
|
||||||
|
ini_set('memory_limit', '-1');
|
||||||
|
|
||||||
|
$resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx');
|
||||||
|
$sharedStringsHelper = new SharedStringsHelper($resourcePath);
|
||||||
|
|
||||||
|
$sharedStringsHelper->extractSharedStrings();
|
||||||
|
|
||||||
|
$sharedString = $sharedStringsHelper->getStringAtIndex(0);
|
||||||
|
$this->assertEquals('str', $sharedString);
|
||||||
|
|
||||||
|
$sharedString = $sharedStringsHelper->getStringAtIndex(CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 1);
|
||||||
|
$this->assertEquals('str', $sharedString);
|
||||||
|
|
||||||
|
$usedCachingStrategy = \ReflectionHelper::getValueOnObject($sharedStringsHelper, 'cachingStrategy');
|
||||||
|
$this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy);
|
||||||
|
|
||||||
|
$sharedStringsHelper->cleanup();
|
||||||
|
|
||||||
|
ini_set('memory_limit', $originalMemoryLimit);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
namespace Box\Spout\Reader;
|
namespace Box\Spout\Reader;
|
||||||
|
|
||||||
|
use Box\Spout\Common\Exception\IOException;
|
||||||
use Box\Spout\Common\Type;
|
use Box\Spout\Common\Type;
|
||||||
use Box\Spout\TestUsingResource;
|
use Box\Spout\TestUsingResource;
|
||||||
|
|
||||||
@ -245,18 +246,39 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
|
|||||||
$this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.');
|
$this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @return array
|
||||||
|
*/
|
||||||
|
public function dataProviderForTestReadShouldBeProtectedAgainstAttacks()
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
['attack_billion_laughs.xlsx'],
|
||||||
|
['attack_quadratic_blowup.xlsx'],
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks
|
||||||
|
* @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used)
|
||||||
|
*
|
||||||
|
* @param string $fileName
|
||||||
* @return void
|
* @return void
|
||||||
*/
|
*/
|
||||||
public function testReadShouldBeProtectedAgainstBillionLaughsAttack()
|
public function testReadShouldBeProtectedAgainstAttacks($fileName)
|
||||||
{
|
{
|
||||||
$allRows = $this->getAllRowsForFile('billion_laughs_test_file.xlsx');
|
$startTime = microtime(true);
|
||||||
|
|
||||||
|
try {
|
||||||
|
$this->getAllRowsForFile($fileName);
|
||||||
|
$this->fail('An exception should have been thrown');
|
||||||
|
} catch (IOException $exception) {
|
||||||
|
$duration = microtime(true) - $startTime;
|
||||||
|
$this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.');
|
||||||
|
|
||||||
$expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB
|
$expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB
|
||||||
$this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.');
|
$this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.');
|
||||||
|
}
|
||||||
$expectedFirstRow = ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'];
|
|
||||||
$this->assertEquals($expectedFirstRow, $allRows[0], 'Entities should be ignored when reading XML files.');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -89,4 +89,27 @@ class ReflectionHelper
|
|||||||
|
|
||||||
return $value;
|
return $value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Invoke a the given public or protected method on the given object.
|
||||||
|
*
|
||||||
|
* @param object $object
|
||||||
|
* @param string $methodName
|
||||||
|
* @param *mixed|null $params
|
||||||
|
*
|
||||||
|
* @return mixed|null
|
||||||
|
*/
|
||||||
|
public static function callMethodOnObject($object, $methodName)
|
||||||
|
{
|
||||||
|
$params = func_get_args();
|
||||||
|
array_shift($params); // object
|
||||||
|
array_shift($params); // methodName
|
||||||
|
|
||||||
|
$className = get_class($object);
|
||||||
|
$class = new ReflectionClass($className);
|
||||||
|
$method = $class->getMethod($methodName);
|
||||||
|
$method->setAccessible(true);
|
||||||
|
|
||||||
|
return $method->invokeArgs($object, $params);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Binary file not shown.
BIN
tests/resources/xlsx/attack_quadratic_blowup.xlsx
Normal file
BIN
tests/resources/xlsx/attack_quadratic_blowup.xlsx
Normal file
Binary file not shown.
BIN
tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx
Normal file
BIN
tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user