Improve XLSX Escaper performance

This commit is contained in:
Adrien Loison 2016-09-02 19:40:51 -07:00
parent 5e7a1745ac
commit a0749c63b9
2 changed files with 45 additions and 6 deletions

View File

@@ -14,15 +14,23 @@ class XLSX implements EscaperInterface
{ {
use Singleton; use Singleton;
/** @var string[] Control characters to be escaped */ /** @var string Regex pattern to detect control characters that need to be escaped */
protected $escapableControlCharactersPattern;
/** @var string[] Map containing control characters to be escaped (key) and their escaped value (value) */
protected $controlCharactersEscapingMap; protected $controlCharactersEscapingMap;
/** @var string[] Map containing control characters to be escaped (value) and their escaped value (key) */
protected $controlCharactersEscapingReverseMap;
/** /**
* Initializes the singleton instance * Initializes the singleton instance
*/ */
protected function init() protected function init()
{ {
$this->escapableControlCharactersPattern = $this->getEscapableControlCharactersPattern();
$this->controlCharactersEscapingMap = $this->getControlCharactersEscapingMap(); $this->controlCharactersEscapingMap = $this->getControlCharactersEscapingMap();
$this->controlCharactersEscapingReverseMap = array_flip($this->controlCharactersEscapingMap);
} }
/** /**
@@ -53,6 +61,20 @@ class XLSX implements EscaperInterface
return $unescapedString; return $unescapedString;
} }
/**
 * Builds the regex character class that matches every control character
 * which has to be escaped.
 *
 * Control characters span 0x00 to 0x1F in the ASCII table. Tab (0x09),
 * line feed (0x0A) and carriage return (0x0D) are deliberately left out
 * of the class, so they are never matched.
 *
 * @return string Bracketed character class, without regex delimiters
 */
protected function getEscapableControlCharactersPattern()
{
    // 0x00-0x08: everything before "\t" (0x09) and "\n" (0x0A)
    $pattern = '[\x00-\x08';
    // 0x0B-0x0C: the two characters between "\n" and "\r" (0x0D)
    $pattern .= '\x0B-\x0C';
    // 0x0E-0x1F: everything after "\r", up to the last control character
    $pattern .= '\x0E-\x1F]';

    return $pattern;
}
/** /**
* Builds the map containing control characters to be escaped * Builds the map containing control characters to be escaped
* mapped to their escaped values. * mapped to their escaped values.
@@ -66,14 +88,14 @@ class XLSX implements EscaperInterface
protected function getControlCharactersEscapingMap() protected function getControlCharactersEscapingMap()
{ {
$controlCharactersEscapingMap = []; $controlCharactersEscapingMap = [];
$whitelistedControlCharacters = ["\t", "\r", "\n"];
// control characters values are from 0 to 1F (hex values) in the ASCII table // control characters values are from 0 to 1F (hex values) in the ASCII table
for ($charValue = 0x0; $charValue <= 0x1F; $charValue++) { for ($charValue = 0x00; $charValue <= 0x1F; $charValue++) {
if (!in_array(chr($charValue), $whitelistedControlCharacters)) { $character = chr($charValue);
if (preg_match("/{$this->escapableControlCharactersPattern}/", $character)) {
$charHexValue = dechex($charValue); $charHexValue = dechex($charValue);
$escapedChar = '_x' . sprintf('%04s' , strtoupper($charHexValue)) . '_'; $escapedChar = '_x' . sprintf('%04s' , strtoupper($charHexValue)) . '_';
$controlCharactersEscapingMap[$escapedChar] = chr($charValue); $controlCharactersEscapingMap[$escapedChar] = $character;
} }
} }
@@ -96,7 +118,15 @@ class XLSX implements EscaperInterface
protected function escapeControlCharacters($string) protected function escapeControlCharacters($string)
{ {
$escapedString = $this->escapeEscapeCharacter($string); $escapedString = $this->escapeEscapeCharacter($string);
return str_replace(array_values($this->controlCharactersEscapingMap), array_keys($this->controlCharactersEscapingMap), $escapedString);
// if no control characters
if (!preg_match("/{$this->escapableControlCharactersPattern}/", $escapedString)) {
return $escapedString;
}
return preg_replace_callback("/({$this->escapableControlCharactersPattern})/", function($matches) {
return $this->controlCharactersEscapingReverseMap[$matches[0]];
}, $escapedString);
} }
/** /**
@@ -126,6 +156,7 @@ class XLSX implements EscaperInterface
protected function unescapeControlCharacters($string) protected function unescapeControlCharacters($string)
{ {
$unescapedString = $string; $unescapedString = $string;
foreach ($this->controlCharactersEscapingMap as $escapedCharValue => $charValue) { foreach ($this->controlCharactersEscapingMap as $escapedCharValue => $charValue) {
// only unescape characters that don't contain the escaped escape character for now // only unescape characters that don't contain the escaped escape character for now
$unescapedString = preg_replace("/(?<!_x005F)($escapedCharValue)/", $charValue, $unescapedString); $unescapedString = preg_replace("/(?<!_x005F)($escapedCharValue)/", $charValue, $unescapedString);

View File

@@ -17,7 +17,11 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
return [ return [
['test', 'test'], ['test', 'test'],
['adam\'s "car"', 'adam&#039;s &quot;car&quot;'], ['adam\'s "car"', 'adam&#039;s &quot;car&quot;'],
["\n", "\n"],
["\r", "\r"],
["\t", "\t"],
[chr(0), '_x0000_'], [chr(0), '_x0000_'],
[chr(4), '_x0004_'],
['_x0000_', '_x005F_x0000_'], ['_x0000_', '_x005F_x0000_'],
[chr(21), '_x0015_'], [chr(21), '_x0015_'],
['control '.chr(21).' character', 'control _x0015_ character'], ['control '.chr(21).' character', 'control _x0015_ character'],
@@ -49,7 +53,11 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
return [ return [
['test', 'test'], ['test', 'test'],
['adam&#039;s &quot;car&quot;', 'adam\'s "car"'], ['adam&#039;s &quot;car&quot;', 'adam\'s "car"'],
["\n", "\n"],
["\r", "\r"],
["\t", "\t"],
['_x0000_', chr(0)], ['_x0000_', chr(0)],
['_x0004_', chr(4)],
['_x005F_x0000_', '_x0000_'], ['_x005F_x0000_', '_x0000_'],
['_x0015_', chr(21)], ['_x0015_', chr(21)],
['control _x0015_ character', 'control '.chr(21).' character'], ['control _x0015_ character', 'control '.chr(21).' character'],