Improve XLSX Escaper performance (#305)

This commit is contained in:
Adrien Loison 2016-09-03 11:04:25 -07:00 committed by GitHub
parent 5e7a1745ac
commit 435a9a016e
2 changed files with 45 additions and 6 deletions

View File

@ -14,15 +14,23 @@ class XLSX implements EscaperInterface
{
use Singleton;
/** @var string[] Control characters to be escaped */
/** @var string Regex pattern to detect control characters that need to be escaped */
protected $escapableControlCharactersPattern;
/** @var string[] Map containing control characters to be escaped (key) and their escaped value (value) */
protected $controlCharactersEscapingMap;
/** @var string[] Map containing control characters to be escaped (value) and their escaped value (key) */
protected $controlCharactersEscapingReverseMap;
/**
 * Initializes the singleton instance by precomputing the escaping lookup data:
 * the detection pattern, the escaping map and its flipped counterpart.
 *
 * @return void
 */
protected function init()
{
    // The pattern has to be stored first: getControlCharactersEscapingMap() reads it
    // to decide which control characters belong in the map.
    $this->escapableControlCharactersPattern = $this->getEscapableControlCharactersPattern();

    $escapingMap = $this->getControlCharactersEscapingMap();
    $this->controlCharactersEscapingMap = $escapingMap;

    // Flipped copy (control character => escaped value) for direct lookups when escaping.
    $this->controlCharactersEscapingReverseMap = array_flip($escapingMap);
}
/**
@ -53,6 +61,20 @@ class XLSX implements EscaperInterface
return $unescapedString;
}
/**
 * Builds the regex character class that matches every control character
 * needing to be escaped. The returned value carries no delimiters, so callers
 * wrap it themselves (e.g. "/{$pattern}/").
 *
 * @return string Regex pattern containing all escapable control characters
 */
protected function getEscapableControlCharactersPattern()
{
    // ASCII control characters span 0x00 to 0x1F. Tab (0x09), line feed (0x0A)
    // and carriage return (0x0D) should not be escaped, so the ranges below
    // deliberately step over them.
    $escapableRanges = [
        '\x00-\x08',
        '\x0B-\x0C',
        '\x0E-\x1F',
    ];

    return '[' . implode('', $escapableRanges) . ']';
}
/**
* Builds the map containing control characters to be escaped
* mapped to their escaped values.
@ -66,14 +88,14 @@ class XLSX implements EscaperInterface
protected function getControlCharactersEscapingMap()
{
$controlCharactersEscapingMap = [];
$whitelistedControlCharacters = ["\t", "\r", "\n"];
// control characters values are from 0 to 1F (hex values) in the ASCII table
for ($charValue = 0x0; $charValue <= 0x1F; $charValue++) {
if (!in_array(chr($charValue), $whitelistedControlCharacters)) {
for ($charValue = 0x00; $charValue <= 0x1F; $charValue++) {
$character = chr($charValue);
if (preg_match("/{$this->escapableControlCharactersPattern}/", $character)) {
$charHexValue = dechex($charValue);
$escapedChar = '_x' . sprintf('%04s' , strtoupper($charHexValue)) . '_';
$controlCharactersEscapingMap[$escapedChar] = chr($charValue);
$controlCharactersEscapingMap[$escapedChar] = $character;
}
}
@ -96,7 +118,15 @@ class XLSX implements EscaperInterface
protected function escapeControlCharacters($string)
{
    // The escape character is handled first, through escapeEscapeCharacter(),
    // before any control character gets replaced by its escaped form.
    $escapedString = $this->escapeEscapeCharacter($string);

    // Delimited once and shared by the detection and the replacement below.
    $pattern = "/{$this->escapableControlCharactersPattern}/";

    // Fast path: a string without any escapable control character is returned
    // as is, skipping the replacement pass entirely.
    if (!preg_match($pattern, $escapedString)) {
        return $escapedString;
    }

    // Each matched control character is swapped for its escaped value, looked up
    // in the reverse map (control character => escaped value) built in init().
    return preg_replace_callback($pattern, function ($matches) {
        return $this->controlCharactersEscapingReverseMap[$matches[0]];
    }, $escapedString);
}
/**
@ -126,6 +156,7 @@ class XLSX implements EscaperInterface
protected function unescapeControlCharacters($string)
{
$unescapedString = $string;
foreach ($this->controlCharactersEscapingMap as $escapedCharValue => $charValue) {
// only unescape characters that don't contain the escaped escape character for now
$unescapedString = preg_replace("/(?<!_x005F)($escapedCharValue)/", $charValue, $unescapedString);

View File

@ -17,7 +17,11 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
return [
['test', 'test'],
['adam\'s "car"', 'adam&#039;s &quot;car&quot;'],
["\n", "\n"],
["\r", "\r"],
["\t", "\t"],
[chr(0), '_x0000_'],
[chr(4), '_x0004_'],
['_x0000_', '_x005F_x0000_'],
[chr(21), '_x0015_'],
['control '.chr(21).' character', 'control _x0015_ character'],
@ -49,7 +53,11 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
return [
['test', 'test'],
['adam&#039;s &quot;car&quot;', 'adam\'s "car"'],
["\n", "\n"],
["\r", "\r"],
["\t", "\t"],
['_x0000_', chr(0)],
['_x0004_', chr(4)],
['_x005F_x0000_', '_x0000_'],
['_x0015_', chr(21)],
['control _x0015_ character', 'control '.chr(21).' character'],