diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php index 51bf497..642647a 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactory.php @@ -10,7 +10,37 @@ namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching; class CachingStrategyFactory { /** - * To avoid running out of memory when extracting the shared strings, they will be saved to temporary files + * The memory amount needed to store a string was obtained empirically from this data: + * + * ------------------------------------ + * | Number of chars⁺ | Memory needed | + * ------------------------------------ + * | 3,000 | 1 MB | + * | 15,000 | 2 MB | + * | 30,000 | 5 MB | + * | 75,000 | 11 MB | + * | 150,000 | 21 MB | + * | 300,000 | 43 MB | + * | 750,000 | 105 MB | + * | 1,500,000 | 210 MB | + * | 2,250,000 | 315 MB | + * | 3,000,000 | 420 MB | + * | 4,500,000 | 630 MB | + * ------------------------------------ + * + * ⁺ All characters were 1 byte long + * + * This gives a linear graph where each 1-byte character requires about 150 bytes to be stored. + * Given that some characters can take up to 4 bytes, we need 600 bytes per character to be safe. + * Also, there are on average about 20 characters per cell (this is entirely empirical data...). + * + * This means that in order to store one shared string in memory, the memory amount needed is: + * => 20 * 600 ≈ 12KB + */ + const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12; + + /** + * To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory * and the string will be quickly retrieved.
* The performance bottleneck is not when creating these temporary files, but rather when loading their content. @@ -20,6 +50,30 @@ class CachingStrategyFactory */ const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; + /** @var CachingStrategyFactory|null Singleton instance */ + protected static $instance = null; + + /** + * Private constructor for singleton + */ + private function __construct() + { + } + + /** + * Returns the singleton instance of the factory + * + * @return CachingStrategyFactory + */ + public static function getInstance() + { + if (self::$instance === null) { + self::$instance = new CachingStrategyFactory(); + } + + return self::$instance; + } + /** * Returns the best caching strategy, given the number of unique shared strings * and the amount of memory available. @@ -28,13 +82,73 @@ class CachingStrategyFactory * @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored * @return CachingStrategyInterface The best caching strategy */ - public static function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) + public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) { - // TODO: take available memory into account - if ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE) { + if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) { return new InMemoryStrategy($sharedStringsUniqueCount); } else { return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); } } + + /** + * Returns whether it is safe to use in-memory caching, given the number of unique shared strings + * and the amount of memory available. 
+ * + * @param int $sharedStringsUniqueCount Number of unique shared strings + * @return bool + */ + protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount) + { + $memoryAvailable = $this->getMemoryLimitInKB(); + + if ($memoryAvailable === -1) { + // if cannot get memory limit or if memory limit set as unlimited, don't trust and play safe + return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } else { + $memoryNeeded = $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB; + return ($memoryAvailable > $memoryNeeded); + } + } + + /** + * Returns the PHP "memory_limit" in Kilobytes + * + * @return float + */ + protected function getMemoryLimitInKB() + { + $memoryLimitFormatted = $this->getMemoryLimitFromIni(); + $memoryLimitFormatted = strtolower(trim($memoryLimitFormatted)); + + // No memory limit + if ($memoryLimitFormatted === '-1') { + return -1; + } + + if (preg_match('/(\d+)([bkmgt])b?/', $memoryLimitFormatted, $matches)) { + $amount = intval($matches[1]); + $unit = $matches[2]; + + switch ($unit) { + case 'b': return ($amount / 1024); + case 'k': return $amount; + case 'm': return ($amount * 1024); + case 'g': return ($amount * 1024 * 1024); + case 't': return ($amount * 1024 * 1024 * 1024); + } + } + + return -1; + } + + /** + * Returns the formatted "memory_limit" value + * + * @return string + */ + protected function getMemoryLimitFromIni() + { + return ini_get('memory_limit'); + } } diff --git a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php index 0ced513..0f6d21d 100644 --- a/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php +++ b/src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php @@ -171,7 +171,8 @@ class SharedStringsHelper */ protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) { - return CachingStrategyFactory::getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); + return 
CachingStrategyFactory::getInstance() + ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); } /** diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php new file mode 100644 index 0000000..18b1c74 --- /dev/null +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsCaching/CachingStrategyFactoryTest.php @@ -0,0 +1,99 @@ +getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitInKB']) + ->getMock(); + + $factoryStub->method('getMemoryLimitInKB')->willReturn($memoryLimitInKB); + + \ReflectionHelper::setStaticValue('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory', 'instance', $factoryStub); + + $strategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null); + + $fullExpectedStrategyClassName = 'Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\\' . 
$expectedStrategyClassName; + $this->assertEquals($fullExpectedStrategyClassName, get_class($strategy)); + + $strategy->clearCache(); + \ReflectionHelper::reset(); + } + + /** + * @return array + */ + public function dataProviderForTestGetMemoryLimitInKB() + { + return [ + ['-1', -1], + ['invalid', -1], + ['1024B', 1], + ['128K', 128], + ['256KB', 256], + ['512M', 512 * 1024], + ['2MB', 2 * 1024], + ['1G', 1 * 1024 * 1024], + ['10GB', 10 * 1024 * 1024], + ['2T', 2 * 1024 * 1024 * 1024], + ['5TB', 5 * 1024 * 1024 * 1024], + ]; + } + + /** + * @dataProvider dataProviderForTestGetMemoryLimitInKB + * + * @param string $memoryLimitFormatted + * @param float $expectedMemoryLimitInKB + * @return void + */ + public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB) + { + /** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */ + $factoryStub = $this + ->getMockBuilder('\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitFromIni']) + ->getMock(); + + $factoryStub->method('getMemoryLimitFromIni')->willReturn($memoryLimitFormatted); + + $memoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB'); + + $this->assertEquals($expectedMemoryLimitInKB, $memoryLimitInKB); + } +} diff --git a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php index 386e93a..82631bc 100644 --- a/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/Helper/XLSX/SharedStringsHelperTest.php @@ -87,6 +87,10 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase */ public function testGetStringAtIndexWithFileBasedStrategy() { + // force the file-based strategy by setting no memory limit + $originalMemoryLimit = ini_get('memory_limit'); + ini_set('memory_limit', '-1'); + $resourcePath = 
$this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx'); $sharedStringsHelper = new SharedStringsHelper($resourcePath); @@ -102,5 +106,7 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy); $sharedStringsHelper->cleanup(); + + ini_set('memory_limit', $originalMemoryLimit); } } diff --git a/tests/Spout/ReflectionHelper.php b/tests/Spout/ReflectionHelper.php index 3fb78e4..df02de8 100644 --- a/tests/Spout/ReflectionHelper.php +++ b/tests/Spout/ReflectionHelper.php @@ -89,4 +89,27 @@ class ReflectionHelper return $value; } + + /** + * Invoke the given public or protected method on the given object. + * + * @param object $object + * @param string $methodName + * @param *mixed|null $params + * + * @return mixed|null + */ + public static function callMethodOnObject($object, $methodName) + { + $params = func_get_args(); + array_shift($params); // object + array_shift($params); // methodName + + $className = get_class($object); + $class = new ReflectionClass($className); + $method = $class->getMethod($methodName); + $method->setAccessible(true); + + return $method->invokeArgs($object, $params); + } }