From 494c506d5672ee4e5b84450dcde385368fdfc1a2 Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Tue, 14 Jul 2015 01:12:52 -0700 Subject: [PATCH] Add logic to automatically select the best caching strategy Based on the number of unique shared strings as well as the available memory amount, one strategy will be chosen over the other. The algorithm is based on empirical data and super safe so it may need to be tuned. --- .../CachingStrategyFactory.php | 122 +++++++++++++++++- .../Helper/XLSX/SharedStringsHelper.php | 3 +- .../CachingStrategyFactoryTest.php | 99 ++++++++++++++ .../Helper/XLSX/SharedStringsHelperTest.php | 6 + tests/Spout/ReflectionHelper.php | 23 ++++ 5 files changed, 248 insertions(+), 5 deletions(-) create mode 100644 tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php index 51bf497..642647a 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php @@ -10,7 +10,37 @@ namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching; class CachingStrategyFactory { /** - * To avoid running out of memory when extracting the shared strings, they will be saved to temporary files + * The memory amount needed to store a string was obtained empirically from this data: + * + * ------------------------------------ + * | Number of chars⁺ | Memory needed | + * ------------------------------------ + * | 3,000 | 1 MB | + * | 15,000 | 2 MB | + * | 30,000 | 5 MB | + * | 75,000 | 11 MB | + * | 150,000 | 21 MB | + * | 300,000 | 43 MB | + * | 750,000 | 105 MB | + * | 1,500,000 | 210 MB | + * | 2,250,000 | 315 MB | + * | 3,000,000 | 420 MB | + * | 4,500,000 | 630 MB | + * ------------------------------------ + * + * ⁺ All characters were 1 byte long + * + * This 
gives a linear graph where each 1-byte character requires about 150 bytes to be stored. + * Given that some characters can take up to 4 bytes, we need 600 bytes per character to be safe. + * Also, there are on average about 20 characters per cell (this is entirely empirical data...). + * + * This means that in order to store one shared string in memory, the memory amount needed is: + * => 20 * 600 ≈ 12KB + */ + const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12; + + /** + * To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory * and the string will be quickly retrieved. * The performance bottleneck is not when creating these temporary files, but rather when loading their content. @@ -20,6 +50,30 @@ class CachingStrategyFactory */ const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; + /** @var CachingStrategyFactory|null Singleton instance */ + protected static $instance = null; + + /** + * Private constructor for singleton + */ + private function __construct() + { + } + + /** + * Returns the singleton instance of the factory + * + * @return CachingStrategyFactory + */ + public static function getInstance() + { + if (self::$instance === null) { + self::$instance = new CachingStrategyFactory(); + } + + return self::$instance; + } + /** * Returns the best caching strategy, given the number of unique shared strings * and the amount of memory available. 
@@ -28,13 +82,73 @@ class CachingStrategyFactory * @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored * @return CachingStrategyInterface The best caching strategy */ - public static function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) + public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) { - // TODO: take available memory into account - if ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE) { + if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) { return new InMemoryStrategy($sharedStringsUniqueCount); } else { return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); } } + + /** + * Returns whether it is safe to use in-memory caching, given the number of unique shared strings + * and the amount of memory available. + * + * @param int $sharedStringsUniqueCount Number of unique shared strings + * @return bool + */ + protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount) + { + $memoryAvailable = $this->getMemoryLimitInKB(); + + if ($memoryAvailable === -1) { + // if cannot get memory limit or if memory limit set as unlimited, don't trust and play safe + return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } else { + $memoryNeeded = $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB; + return ($memoryAvailable > $memoryNeeded); + } + } + + /** + * Returns the PHP "memory_limit" in Kilobytes; a value without unit suffix is read as bytes, and -1 is returned when there is no limit or the value cannot be parsed + * + * @return float|int + */ + protected function getMemoryLimitInKB() + { + $memoryLimitFormatted = $this->getMemoryLimitFromIni(); + $memoryLimitFormatted = strtolower(trim($memoryLimitFormatted)); + + // No memory limit + if ($memoryLimitFormatted === '-1') { + return -1; + } + + if (preg_match('/^(\d+)([bkmgt]?)b?$/', $memoryLimitFormatted, $matches)) { + $amount = intval($matches[1]); + $unit = ($matches[2] === '') ? 'b' : $matches[2]; + + switch ($unit) { + case 'b': return 
($amount / 1024); + case 'k': return $amount; + case 'm': return ($amount * 1024); + case 'g': return ($amount * 1024 * 1024); + case 't': return ($amount * 1024 * 1024 * 1024); + } + } + + return -1; + } + + /** + * Returns the formatted "memory_limit" value + * + * @return string + */ + protected function getMemoryLimitFromIni() + { + return ini_get('memory_limit'); + } } diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php index 0ced513..0f6d21d 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php @@ -171,7 +171,8 @@ class SharedStringsHelper */ protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) { - return CachingStrategyFactory::getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + return CachingStrategyFactory::getInstance() + ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); } /** diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php new file mode 100644 index 0000000..18b1c74 --- /dev/null +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php @@ -0,0 +1,99 @@ +getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitInKB']) + ->getMock(); + + $factoryStub->method('getMemoryLimitInKB')->willReturn($memoryLimitInKB); + + \ReflectionHelper::setStaticValue('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory', 'instance', $factoryStub); + + $strategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null); + + $fullExpectedStrategyClassName = 'Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\\' . 
$expectedStrategyClassName; + $this->assertEquals($fullExpectedStrategyClassName, get_class($strategy)); + + $strategy->clearCache(); + \ReflectionHelper::reset(); + } + + /** + * @return array + */ + public function dataProviderForTestGetMemoryLimitInKB() + { + return [ + ['-1', -1], + ['invalid', -1], + ['1024B', 1], + ['128K', 128], + ['256KB', 256], + ['512M', 512 * 1024], + ['2MB', 2 * 1024], + ['1G', 1 * 1024 * 1024], + ['10GB', 10 * 1024 * 1024], + ['2T', 2 * 1024 * 1024 * 1024], + ['5TB', 5 * 1024 * 1024 * 1024], + ]; + } + + /** + * @dataProvider dataProviderForTestGetMemoryLimitInKB + * + * @param string $memoryLimitFormatted + * @param float $expectedMemoryLimitInKB + * @return void + */ + public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB) + { + /** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */ + $factoryStub = $this + ->getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitFromIni']) + ->getMock(); + + $factoryStub->method('getMemoryLimitFromIni')->willReturn($memoryLimitFormatted); + + $memoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB'); + + $this->assertEquals($expectedMemoryLimitInKB, $memoryLimitInKB); + } +} diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php index 386e93a..82631bc 100644 --- a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php @@ -87,6 +87,10 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase */ public function testGetStringAtIndexWithFileBasedStrategy() { + // force the file-based strategy by setting no memory limit + $originalMemoryLimit = ini_get('memory_limit'); + ini_set('memory_limit', '-1'); + $resourcePath = 
$this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx'); $sharedStringsHelper = new SharedStringsHelper($resourcePath); @@ -102,5 +106,7 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy); $sharedStringsHelper->cleanup(); + + ini_set('memory_limit', $originalMemoryLimit); } } diff --git a/tests/Spout/ReflectionHelper.php b/tests/Spout/ReflectionHelper.php index 3fb78e4..df02de8 100644 --- a/tests/Spout/ReflectionHelper.php +++ b/tests/Spout/ReflectionHelper.php @@ -89,4 +89,27 @@ class ReflectionHelper return $value; } + + /** + * Invokes the given public or protected method on the given object. + * + * @param object $object + * @param string $methodName + * @param mixed|null ...$params Any extra arguments are passed to the invoked method + * + * @return mixed|null + */ + public static function callMethodOnObject($object, $methodName) + { + $params = func_get_args(); + array_shift($params); // object + array_shift($params); // methodName + + $className = get_class($object); + $class = new ReflectionClass($className); + $method = $class->getMethod($methodName); + $method->setAccessible(true); + + return $method->invokeArgs($object, $params); + } }