Add in-memory caching strategy for shared strings
In-memory implementation using SplFixedArray Updated code and tests to support errors when reading XML nodes (useful when reading XML files used for attacks) Removed LIBXML_NOENT option (which DOES substitute entities...) Added test for Quadratic Blowup attack
This commit is contained in:
parent
2dcb86aae9
commit
334f7087da
@ -28,9 +28,13 @@ class CachingStrategyFactory
|
||||
* @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored
|
||||
* @return CachingStrategyInterface The best caching strategy
|
||||
*/
|
||||
public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null)
|
||||
public static function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null)
|
||||
{
|
||||
// TODO add in-memory strategy
|
||||
return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE);
|
||||
// TODO: take available memory into account
|
||||
if ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE) {
|
||||
return new InMemoryStrategy($sharedStringsUniqueCount);
|
||||
} else {
|
||||
return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,82 @@
|
||||
<?php
|
||||
|
||||
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
|
||||
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
|
||||
|
||||
/**
|
||||
* Class InMemoryStrategy
|
||||
*
|
||||
* This class implements the in-memory caching strategy for shared strings.
|
||||
* This strategy is used when the number of unique strings is low, compared to the memory available.
|
||||
*
|
||||
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
|
||||
*/
|
||||
class InMemoryStrategy implements CachingStrategyInterface
|
||||
{
|
||||
/** @var \SplFixedArray Array used to cache the shared strings */
|
||||
protected $inMemoryCache;
|
||||
|
||||
/** @var bool Whether the cache has been closed */
|
||||
protected $isCacheClosed;
|
||||
|
||||
/**
|
||||
* @param int $sharedStringsUniqueCount Number of unique shared strings
|
||||
*/
|
||||
public function __construct($sharedStringsUniqueCount)
|
||||
{
|
||||
$this->inMemoryCache = new \SplFixedArray($sharedStringsUniqueCount);
|
||||
$this->isCacheClosed = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the given string to the cache.
|
||||
*
|
||||
* @param string $sharedString The string to be added to the cache
|
||||
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
|
||||
* @return void
|
||||
*/
|
||||
public function addStringForIndex($sharedString, $sharedStringIndex)
|
||||
{
|
||||
if (!$this->isCacheClosed) {
|
||||
$this->inMemoryCache->offsetSet($sharedStringIndex, $sharedString);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the cache after the last shared string was added.
|
||||
* This prevents any additional string from being added to the cache.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
public function closeCache()
|
||||
{
|
||||
$this->isCacheClosed = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the string located at the given index from the cache.
|
||||
*
|
||||
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
|
||||
* @return string The shared string at the given index
|
||||
* @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
|
||||
*/
|
||||
public function getStringAtIndex($sharedStringIndex)
|
||||
{
|
||||
try {
|
||||
return $this->inMemoryCache->offsetGet($sharedStringIndex);
|
||||
} catch (\RuntimeException $e) {
|
||||
throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Destroys the cache, freeing memory and removing any created artifacts
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
public function clearCache()
|
||||
{
|
||||
unset($this->inMemoryCache);
|
||||
$this->isCacheClosed = false;
|
||||
}
|
||||
}
|
@ -3,8 +3,6 @@
|
||||
namespace Box\Spout\Reader\Helper\XLSX;
|
||||
|
||||
use Box\Spout\Common\Exception\IOException;
|
||||
use Box\Spout\Common\Helper\FileSystemHelper;
|
||||
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
|
||||
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory;
|
||||
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyInterface;
|
||||
|
||||
@ -81,7 +79,7 @@ class SharedStringsHelper
|
||||
$escaper = new \Box\Spout\Common\Escaper\XLSX();
|
||||
|
||||
$sharedStringsFilePath = $this->getSharedStringsFilePath();
|
||||
if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) {
|
||||
if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) {
|
||||
throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
|
||||
}
|
||||
|
||||
@ -93,7 +91,7 @@ class SharedStringsHelper
|
||||
}
|
||||
|
||||
while ($xmlReader->name === 'si') {
|
||||
$node = new \SimpleXMLElement($xmlReader->readOuterXml());
|
||||
$node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
|
||||
$node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
|
||||
|
||||
// removes nodes that should not be read, like the pronunciation of the Kanji characters
|
||||
@ -138,10 +136,30 @@ class SharedStringsHelper
|
||||
*
|
||||
* @param \XMLReader $xmlReader XMLReader instance
|
||||
* @return int Number of unique shared strings in the sharedStrings.xml file
|
||||
* @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
|
||||
*/
|
||||
protected function getSharedStringsUniqueCount($xmlReader)
|
||||
{
|
||||
// Use internal errors to avoid displaying lots of warning messages in case of invalid file
|
||||
// For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
|
||||
libxml_clear_errors();
|
||||
libxml_use_internal_errors(true);
|
||||
|
||||
$xmlReader->next('sst');
|
||||
|
||||
// Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
|
||||
while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) {
|
||||
$xmlReader->read();
|
||||
}
|
||||
|
||||
$readError = libxml_get_last_error();
|
||||
if ($readError !== false) {
|
||||
throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readError->message}]");
|
||||
}
|
||||
|
||||
// reset the setting to display XML warnings/errors
|
||||
libxml_use_internal_errors(false);
|
||||
|
||||
return intval($xmlReader->getAttribute('uniqueCount'));
|
||||
}
|
||||
|
||||
@ -153,8 +171,37 @@ class SharedStringsHelper
|
||||
*/
|
||||
protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
|
||||
{
|
||||
$factory = new CachingStrategyFactory();
|
||||
return $factory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
|
||||
return CachingStrategyFactory::getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
|
||||
* This is to simplify the parsing of the subtree.
|
||||
*
|
||||
* @param \XMLReader $xmlReader
|
||||
* @return \SimpleXMLElement
|
||||
* @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
|
||||
*/
|
||||
protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
|
||||
{
|
||||
// Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node.
|
||||
// For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
|
||||
libxml_clear_errors();
|
||||
libxml_use_internal_errors(true);
|
||||
|
||||
$node = null;
|
||||
try {
|
||||
$node = new \SimpleXMLElement($xmlReader->readOuterXml());
|
||||
} catch (\Exception $exception) {
|
||||
$error = libxml_get_last_error();
|
||||
libxml_use_internal_errors(false);
|
||||
|
||||
throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($error->message) . '].');
|
||||
}
|
||||
|
||||
libxml_use_internal_errors(false);
|
||||
|
||||
return $node;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -158,7 +158,7 @@ class XLSX extends AbstractReader
|
||||
$worksheetDataXMLFilePath = $worksheet->getDataXmlFilePath();
|
||||
|
||||
$worksheetDataFilePath = 'zip://' . $this->filePath . '#' . $worksheetDataXMLFilePath;
|
||||
if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NOENT|LIBXML_NONET) === false) {
|
||||
if ($this->xmlReader->open($worksheetDataFilePath, null, LIBXML_NONET) === false) {
|
||||
throw new IOException('Could not open "' . $worksheetDataXMLFilePath . '".');
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,9 @@
|
||||
|
||||
namespace Box\Spout\Reader\Helper\XLSX;
|
||||
|
||||
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory;
|
||||
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\FileBasedStrategy;
|
||||
use Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\InMemoryStrategy;
|
||||
use Box\Spout\TestUsingResource;
|
||||
|
||||
/**
|
||||
@ -33,47 +36,6 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
|
||||
$this->sharedStringsHelper->cleanup();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return void
|
||||
*/
|
||||
public function testExtractSharedStringsShouldCreateTempFileWithSharedStrings()
|
||||
{
|
||||
$this->sharedStringsHelper->extractSharedStrings();
|
||||
|
||||
$cachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy');
|
||||
$tempFolder = \ReflectionHelper::getValueOnObject($cachingStrategy, 'tempFolder');
|
||||
|
||||
$filesInTempFolder = $this->getFilesInFolder($tempFolder);
|
||||
$this->assertEquals(1, count($filesInTempFolder), 'One temp file should have been created in the temp folder.');
|
||||
|
||||
$tempFileContents = file_get_contents($filesInTempFolder[0]);
|
||||
$tempFileContentsPerLine = explode(PHP_EOL, $tempFileContents);
|
||||
|
||||
$this->assertEquals('s1--A1', $tempFileContentsPerLine[0]);
|
||||
$this->assertEquals('s1--E5', $tempFileContentsPerLine[24]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all files that are in the given folder.
|
||||
* It does not include "." and ".." and is not recursive.
|
||||
*
|
||||
* @param string $folderPath
|
||||
* @return array
|
||||
*/
|
||||
private function getFilesInFolder($folderPath)
|
||||
{
|
||||
$files = [];
|
||||
$directoryIterator = new \DirectoryIterator($folderPath);
|
||||
|
||||
foreach ($directoryIterator as $fileInfo) {
|
||||
if ($fileInfo->isFile()) {
|
||||
$files[] = $fileInfo->getPathname();
|
||||
}
|
||||
}
|
||||
|
||||
return $files;
|
||||
}
|
||||
|
||||
/**
|
||||
* @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException
|
||||
* @return void
|
||||
@ -96,6 +58,9 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
|
||||
|
||||
$sharedString = $this->sharedStringsHelper->getStringAtIndex(24);
|
||||
$this->assertEquals('s1--E5', $sharedString);
|
||||
|
||||
$usedCachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy');
|
||||
$this->assertTrue($usedCachingStrategy instanceof InMemoryStrategy);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -116,4 +81,26 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
|
||||
|
||||
$sharedStringsHelper->cleanup();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return void
|
||||
*/
|
||||
public function testGetStringAtIndexWithFileBasedStrategy()
|
||||
{
|
||||
$resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx');
|
||||
$sharedStringsHelper = new SharedStringsHelper($resourcePath);
|
||||
|
||||
$sharedStringsHelper->extractSharedStrings();
|
||||
|
||||
$sharedString = $sharedStringsHelper->getStringAtIndex(0);
|
||||
$this->assertEquals('str', $sharedString);
|
||||
|
||||
$sharedString = $sharedStringsHelper->getStringAtIndex(CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 1);
|
||||
$this->assertEquals('str', $sharedString);
|
||||
|
||||
$usedCachingStrategy = \ReflectionHelper::getValueOnObject($sharedStringsHelper, 'cachingStrategy');
|
||||
$this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy);
|
||||
|
||||
$sharedStringsHelper->cleanup();
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
namespace Box\Spout\Reader;
|
||||
|
||||
use Box\Spout\Common\Exception\IOException;
|
||||
use Box\Spout\Common\Type;
|
||||
use Box\Spout\TestUsingResource;
|
||||
|
||||
@ -243,18 +244,39 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
|
||||
$this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.');
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function dataProviderForTestReadShouldBeProtectedAgainstAttacks()
|
||||
{
|
||||
return [
|
||||
['attack_billion_laughs.xlsx'],
|
||||
['attack_quadratic_blowup.xlsx'],
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks
|
||||
* @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used)
|
||||
*
|
||||
* @param string $fileName
|
||||
* @return void
|
||||
*/
|
||||
public function testReadShouldBeProtectedAgainstBillionLaughsAttack()
|
||||
public function testReadShouldBeProtectedAgainstAttacks($fileName)
|
||||
{
|
||||
$allRows = $this->getAllRowsForFile('billion_laughs_test_file.xlsx');
|
||||
$startTime = microtime(true);
|
||||
|
||||
$expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB
|
||||
$this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.');
|
||||
try {
|
||||
$this->getAllRowsForFile($fileName);
|
||||
$this->fail('An exception should have been thrown');
|
||||
} catch (IOException $exception) {
|
||||
$duration = microtime(true) - $startTime;
|
||||
$this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.');
|
||||
|
||||
$expectedFirstRow = ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'];
|
||||
$this->assertEquals($expectedFirstRow, $allRows[0], 'Entities should be ignored when reading XML files.');
|
||||
$expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB
|
||||
$this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
Binary file not shown.
BIN
tests/resources/xlsx/attack_quadratic_blowup.xlsx
Normal file
BIN
tests/resources/xlsx/attack_quadratic_blowup.xlsx
Normal file
Binary file not shown.
BIN
tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx
Normal file
BIN
tests/resources/xlsx/sheet_with_lots_of_shared_strings.xlsx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user