Add logic to automatically select the best caching strategy

Based on the number of unique shared strings as well as the available memory amount,
one strategy will be chosen over the other.
The algorithm is based on empirical data and is deliberately conservative, so it may need to be tuned.
This commit is contained in:
Adrien Loison 2015-07-14 01:12:52 -07:00
parent 334f7087da
commit 494c506d56
5 changed files with 248 additions and 5 deletions

View File

@ -10,7 +10,37 @@ namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
class CachingStrategyFactory class CachingStrategyFactory
{ {
/** /**
* To avoid running out of memory when extracting the shared strings, they will be saved to temporary files * The memory amount needed to store a string was obtained empirically from this data:
*
* ------------------------------------
* | Number of chars⁺ | Memory needed |
* ------------------------------------
* | 3,000 | 1 MB |
* | 15,000 | 2 MB |
* | 30,000 | 5 MB |
* | 75,000 | 11 MB |
* | 150,000 | 21 MB |
* | 300,000 | 43 MB |
* | 750,000 | 105 MB |
* | 1,500,000 | 210 MB |
* | 2,250,000 | 315 MB |
* | 3,000,000 | 420 MB |
* | 4,500,000 | 630 MB |
* ------------------------------------
*
* All characters were 1 byte long
*
* This gives a linear graph where each 1-byte character requires about 150 bytes to be stored.
* Given that some characters can take up to 4 bytes, we need 600 bytes per character to be safe.
* Also, there is on average about 20 characters per cell (this is entirely empirical data...).
*
* This means that in order to store one shared string in memory, the memory amount needed is:
* => 20 * 600 ≈ 12KB
*/
const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12;
/**
* To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files
* instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory
* and the string will be quickly retrieved. * and the string will be quickly retrieved.
* The performance bottleneck is not when creating these temporary files, but rather when loading their content. * The performance bottleneck is not when creating these temporary files, but rather when loading their content.
@ -20,6 +50,30 @@ class CachingStrategyFactory
*/ */
const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;
/** @var CachingStrategyFactory|null Singleton instance */
protected static $instance = null;
/**
* Private constructor for singleton
*/
private function __construct()
{
    // Intentionally empty: being private, this constructor prevents instantiation
    // from outside the class, which leaves getInstance() as the way to obtain the factory.
}
/**
* Returns the singleton instance of the factory
*
* @return CachingStrategyFactory
*/
public static function getInstance()
{
    // Reuse the factory created by a previous call, if any
    if (self::$instance !== null) {
        return self::$instance;
    }

    // First call: create the factory and remember it for subsequent calls
    self::$instance = new self();

    return self::$instance;
}
/** /**
* Returns the best caching strategy, given the number of unique shared strings * Returns the best caching strategy, given the number of unique shared strings
* and the amount of memory available. * and the amount of memory available.
@ -28,13 +82,73 @@ class CachingStrategyFactory
* @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored * @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored
* @return CachingStrategyInterface The best caching strategy * @return CachingStrategyInterface The best caching strategy
*/ */
public static function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null)
{ {
// TODO: take available memory into account if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) {
if ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE) {
return new InMemoryStrategy($sharedStringsUniqueCount); return new InMemoryStrategy($sharedStringsUniqueCount);
} else { } else {
return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE);
} }
} }
/**
* Returns whether it is safe to use in-memory caching, given the number of unique shared strings
* and the amount of memory available.
*
* @param int $sharedStringsUniqueCount Number of unique shared strings
* @return bool
*/
protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)
{
    $memoryLimitInKB = $this->getMemoryLimitInKB();

    if ($memoryLimitInKB !== -1) {
        // A usable limit is known: compare it with the estimated footprint of all the strings
        $estimatedMemoryInKB = self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * $sharedStringsUniqueCount;

        return ($estimatedMemoryInKB < $memoryLimitInKB);
    }

    // The limit is unknown or unlimited (-1): do not trust it and fall back
    // to the fixed threshold on the number of strings instead
    return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE);
}
/**
* Returns the PHP "memory_limit" in Kilobytes
*
* @return float
*/
protected function getMemoryLimitInKB()
{
    // Normalize the raw ini value so that "128M", " 128m " and "128MB" are all handled the same way
    $memoryLimitFormatted = strtolower(trim($this->getMemoryLimitFromIni()));

    // No memory limit
    if ($memoryLimitFormatted === '-1') {
        return -1;
    }

    // The pattern is anchored so that the whole value has to be well-formed: a partial match
    // would misread a value such as "1.5g" as 5 GB. The unit is optional because PHP
    // interprets a plain integer "memory_limit" as a number of bytes.
    if (!preg_match('/^(\d+)([bkmgt]?)b?$/', $memoryLimitFormatted, $matches)) {
        // Unrecognized format: report it like "no limit" (-1) so that the caller plays safe
        return -1;
    }

    $amount = intval($matches[1]);

    switch ($matches[2]) {
        case 'k': return $amount;
        case 'm': return ($amount * 1024);
        case 'g': return ($amount * 1024 * 1024);
        case 't': return ($amount * 1024 * 1024 * 1024);
        default:  return ($amount / 1024); // '' or 'b': the amount is expressed in bytes
    }
}
/**
* Returns the formatted "memory_limit" value
*
* @return string
*/
protected function getMemoryLimitFromIni()
{
    // Isolated in its own method so that the raw ini value can be substituted
    // when the parsing logic has to be exercised without touching the PHP configuration
    $rawMemoryLimit = ini_get('memory_limit');

    return $rawMemoryLimit;
}
} }

View File

@ -171,7 +171,8 @@ class SharedStringsHelper
*/ */
protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
{ {
return CachingStrategyFactory::getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder); return CachingStrategyFactory::getInstance()
->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
} }
/** /**

View File

@ -0,0 +1,99 @@
<?php
namespace Box\Spout\Reader\Helper\XLSX\SharedStringsCaching;
/**
* Class CachingStrategyFactoryTest
*
* @package Box\Spout\Reader\Helper\XLSX\SharedStringsCaching
*/
class CachingStrategyFactoryTest extends \PHPUnit_Framework_TestCase
{
    /** Fully qualified name of the factory under test, as given to the mock builder and to ReflectionHelper */
    const FACTORY_CLASS_NAME = '\Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\CachingStrategyFactory';

    /**
     * Builds a partial mock of the factory, created without running its constructor,
     * where only the given method is stubbed to return the given value.
     *
     * @param string $stubbedMethodName Name of the method to stub
     * @param mixed $returnValue Value the stubbed method will return
     * @return CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject
     */
    private function createFactoryStub($stubbedMethodName, $returnValue)
    {
        $mockBuilder = $this->getMockBuilder(self::FACTORY_CLASS_NAME);
        $mockBuilder->disableOriginalConstructor();
        $mockBuilder->setMethods([$stubbedMethodName]);

        $factoryStub = $mockBuilder->getMock();
        $factoryStub->method($stubbedMethodName)->willReturn($returnValue);

        return $factoryStub;
    }

    /**
     * Each data set is: [number of unique shared strings, memory limit in KB, expected strategy class].
     *
     * @return array
     */
    public function dataProviderForTestGetBestCachingStrategy()
    {
        $maxNumStringsPerTempFile = CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE;
        $memoryNeededForTenStrings = CachingStrategyFactory::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * 10;

        return [
            // memory limit reported as -1: the decision only depends on the number of strings
            [$maxNumStringsPerTempFile, -1, 'FileBasedStrategy'],
            [$maxNumStringsPerTempFile + 10, -1, 'FileBasedStrategy'],
            [$maxNumStringsPerTempFile - 10, -1, 'InMemoryStrategy'],

            // memory limit known: the decision depends on the estimated memory needed
            [10, $memoryNeededForTenStrings, 'FileBasedStrategy'],
            [15, $memoryNeededForTenStrings, 'FileBasedStrategy'],
            [5, $memoryNeededForTenStrings, 'InMemoryStrategy'],
        ];
    }

    /**
     * Checks which strategy gets selected for a given number of strings and memory limit.
     *
     * @dataProvider dataProviderForTestGetBestCachingStrategy
     *
     * @param int $sharedStringsUniqueCount
     * @param int $memoryLimitInKB
     * @param string $expectedStrategyClassName
     * @return void
     */
    public function testGetBestCachingStrategy($sharedStringsUniqueCount, $memoryLimitInKB, $expectedStrategyClassName)
    {
        $factoryStub = $this->createFactoryStub('getMemoryLimitInKB', $memoryLimitInKB);
        \ReflectionHelper::setStaticValue(self::FACTORY_CLASS_NAME, 'instance', $factoryStub);

        $strategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null);
        $actualStrategyClassName = get_class($strategy);

        $strategiesNamespace = 'Box\Spout\Reader\Helper\XLSX\SharedStringsCaching\\';
        $this->assertEquals($strategiesNamespace . $expectedStrategyClassName, $actualStrategyClassName);

        // cleanup: clear the cache of the created strategy and reset what ReflectionHelper changed
        $strategy->clearCache();
        \ReflectionHelper::reset();
    }

    /**
     * Each data set is: [raw "memory_limit" value, expected amount in KB (-1 when no limit or invalid)].
     *
     * @return array
     */
    public function dataProviderForTestGetMemoryLimitInKB()
    {
        $kbPerMB = 1024;
        $kbPerGB = 1024 * 1024;
        $kbPerTB = 1024 * 1024 * 1024;

        return [
            ['-1', -1],
            ['invalid', -1],
            ['1024B', 1],
            ['128K', 128],
            ['256KB', 256],
            ['512M', 512 * $kbPerMB],
            ['2MB', 2 * $kbPerMB],
            ['1G', 1 * $kbPerGB],
            ['10GB', 10 * $kbPerGB],
            ['2T', 2 * $kbPerTB],
            ['5TB', 5 * $kbPerTB],
        ];
    }

    /**
     * Checks the conversion of the raw "memory_limit" value into kilobytes.
     *
     * @dataProvider dataProviderForTestGetMemoryLimitInKB
     *
     * @param string $memoryLimitFormatted
     * @param float $expectedMemoryLimitInKB
     * @return void
     */
    public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB)
    {
        $factoryStub = $this->createFactoryStub('getMemoryLimitFromIni', $memoryLimitFormatted);

        // the method under test is not public, hence the call through reflection
        $actualMemoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB');

        $this->assertEquals($expectedMemoryLimitInKB, $actualMemoryLimitInKB);
    }
}

View File

@ -87,6 +87,10 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
*/ */
public function testGetStringAtIndexWithFileBasedStrategy() public function testGetStringAtIndexWithFileBasedStrategy()
{ {
// force the file-based strategy by setting no memory limit
$originalMemoryLimit = ini_get('memory_limit');
ini_set('memory_limit', '-1');
$resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx'); $resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx');
$sharedStringsHelper = new SharedStringsHelper($resourcePath); $sharedStringsHelper = new SharedStringsHelper($resourcePath);
@ -102,5 +106,7 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
$this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy); $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy);
$sharedStringsHelper->cleanup(); $sharedStringsHelper->cleanup();
ini_set('memory_limit', $originalMemoryLimit);
} }
} }

View File

@ -89,4 +89,27 @@ class ReflectionHelper
return $value; return $value;
} }
/**
* Invoke the given public or protected method on the given object.
*
* @param object $object
* @param string $methodName
* @param mixed|null ...$params Optional arguments passed on to the invoked method
*
* @return mixed|null
*/
public static function callMethodOnObject($object, $methodName)
{
    // Everything passed after ($object, $methodName) is forwarded to the invoked method
    $methodArguments = array_slice(func_get_args(), 2);

    $reflectedClass = new ReflectionClass(get_class($object));
    $reflectedMethod = $reflectedClass->getMethod($methodName);

    // Lift the visibility restriction so that a non-public method can be invoked as well
    $reflectedMethod->setAccessible(true);

    return $reflectedMethod->invokeArgs($object, $methodArguments);
}
} }