From 048105461c79502db3af6b2ab5a4c89ec77909c2 Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Fri, 28 Apr 2017 02:27:33 +0200 Subject: [PATCH] Fix shared strings XML Entities auto decode (#411) When converting an XMLReader node to a SimpleXMLElement, the conversion would automatically decode the XML entities. This resulted in a double decode. For example: "&#34;" was converted to """ when imported into a SimpleXMLElement and was again converted into " (quote). This commit changes the way the XLSX Shared Strings file is processed. It also changes the unescaping logic for both XLSX and ODS. Finally, it removes any usage of the SimpleXML library (yay!). --- README.md | 1 - composer.json | 3 +- src/Spout/Common/Escaper/ODS.php | 17 +- src/Spout/Common/Escaper/XLSX.php | 13 +- src/Spout/Reader/ODS/RowIterator.php | 1 + src/Spout/Reader/Wrapper/SimpleXMLElement.php | 159 ------------------ src/Spout/Reader/Wrapper/XMLReader.php | 1 + .../XLSX/Helper/SharedStringsHelper.php | 128 +++++--------- src/Spout/Reader/XLSX/RowIterator.php | 1 + tests/Spout/Common/Escaper/ODSTest.php | 2 +- tests/Spout/Common/Escaper/XLSXTest.php | 8 +- .../Reader/Wrapper/SimpleXMLElementTest.php | 127 -------------- .../XLSX/Helper/SharedStringsHelperTest.php | 16 ++ tests/Spout/Reader/XLSX/ReaderTest.php | 38 +++-- tests/Spout/Writer/ODS/WriterTest.php | 2 +- tests/Spout/Writer/XLSX/WriterTest.php | 8 +- ..._sheet_with_pre_encoded_html_entities.xlsx | Bin 0 -> 3201 bytes 17 files changed, 120 insertions(+), 405 deletions(-) delete mode 100644 src/Spout/Reader/Wrapper/SimpleXMLElement.php delete mode 100644 tests/Spout/Reader/Wrapper/SimpleXMLElementTest.php create mode 100644 tests/resources/xlsx/one_sheet_with_pre_encoded_html_entities.xlsx diff --git a/README.md b/README.md index 951d984..500f7ce 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,6 @@ require_once '[PATH/TO]/src/Spout/Autoloader/autoload.php'; // don't forget to c * PHP version 5.4.0 or higher * PHP extension `php_zip` enabled * PHP extension `php_xmlreader` enabled -* PHP extension `php_simplexml` enabled ## Basic usage diff --git a/composer.json b/composer.json index 5c38dfb..76485c9 100644 --- a/composer.json +++ b/composer.json @@ -14,8 +14,7 @@ "require": { "php": ">=5.4.0", "ext-zip": "*", - "ext-xmlreader" : "*", - "ext-simplexml": "*" + "ext-xmlreader" : "*" }, "require-dev": { "phpunit/phpunit": "^4.8.0" diff --git a/src/Spout/Common/Escaper/ODS.php b/src/Spout/Common/Escaper/ODS.php index 5f8a7c9..4ffd766 100644 --- a/src/Spout/Common/Escaper/ODS.php +++ b/src/Spout/Common/Escaper/ODS.php @@ -26,10 +26,13 @@ class ODS implements EscaperInterface // 'ENT_DISALLOWED' ensures that invalid characters in the given document type are replaced. // Otherwise control characters like a vertical tab "\v" will make the XML document unreadable by the XML processor // @link https://github.com/box/spout/issues/329 - $replacedString = htmlspecialchars($string, ENT_QUOTES | ENT_DISALLOWED); + $replacedString = htmlspecialchars($string, ENT_NOQUOTES | ENT_DISALLOWED); } else { - // We are on hhvm or any other engine that does not support ENT_DISALLOWED - $escapedString = htmlspecialchars($string, ENT_QUOTES); + // We are on hhvm or any other engine that does not support ENT_DISALLOWED. + // + // @NOTE: Using ENT_NOQUOTES as only XML entities ('<', '>', '&') need to be encoded. + // Single and double quotes can be left as is. + $escapedString = htmlspecialchars($string, ENT_NOQUOTES); // control characters values are from 0 to 1F (hex values) in the ASCII table // some characters should not be escaped though: "\t", "\r" and "\n". @@ -52,6 +55,12 @@ class ODS implements EscaperInterface */ public function unescape($string) { - return htmlspecialchars_decode($string, ENT_QUOTES); + // ============== + // = WARNING = + // ============== + // It is assumed that the given string has already had its XML entities decoded. + // This is true if the string is coming from a DOMNode (as DOMNode already decode XML entities on creation). + // Therefore there is no need to call "htmlspecialchars_decode()". + return $string; } } diff --git a/src/Spout/Common/Escaper/XLSX.php b/src/Spout/Common/Escaper/XLSX.php index 8ca317f..7c6c61b 100644 --- a/src/Spout/Common/Escaper/XLSX.php +++ b/src/Spout/Common/Escaper/XLSX.php @@ -42,7 +42,9 @@ class XLSX implements EscaperInterface public function escape($string) { $escapedString = $this->escapeControlCharacters($string); - $escapedString = htmlspecialchars($escapedString, ENT_QUOTES); + // @NOTE: Using ENT_NOQUOTES as only XML entities ('<', '>', '&') need to be encoded. + // Single and double quotes can be left as is. + $escapedString = htmlspecialchars($escapedString, ENT_NOQUOTES); return $escapedString; } @@ -55,8 +57,13 @@ class XLSX implements EscaperInterface */ public function unescape($string) { - $unescapedString = htmlspecialchars_decode($string, ENT_QUOTES); - $unescapedString = $this->unescapeControlCharacters($unescapedString); + // ============== + // = WARNING = + // ============== + // It is assumed that the given string has already had its XML entities decoded. + // This is true if the string is coming from a DOMNode (as DOMNode already decode XML entities on creation). + // Therefore there is no need to call "htmlspecialchars_decode()". + $unescapedString = $this->unescapeControlCharacters($string); return $unescapedString; } diff --git a/src/Spout/Reader/ODS/RowIterator.php b/src/Spout/Reader/ODS/RowIterator.php index ca38e05..46ff33e 100644 --- a/src/Spout/Reader/ODS/RowIterator.php +++ b/src/Spout/Reader/ODS/RowIterator.php @@ -202,6 +202,7 @@ class RowIterator implements IteratorInterface { $currentNumColumnsRepeated = $this->getNumColumnsRepeatedForCurrentNode($xmlReader); + // NOTE: expand() will automatically decode all XML entities of the child nodes $node = $xmlReader->expand(); $currentCellValue = $this->getCellValue($node); diff --git a/src/Spout/Reader/Wrapper/SimpleXMLElement.php b/src/Spout/Reader/Wrapper/SimpleXMLElement.php deleted file mode 100644 index a6f723c..0000000 --- a/src/Spout/Reader/Wrapper/SimpleXMLElement.php +++ /dev/null @@ -1,159 +0,0 @@ -useXMLInternalErrors(); - - try { - $this->simpleXMLElement = new \SimpleXMLElement($xmlData); - } catch (\Exception $exception) { - // if the data is invalid, the constructor will throw an Exception - $this->resetXMLInternalErrorsSetting(); - throw new XMLProcessingException($this->getLastXMLErrorMessage()); - } - - $this->resetXMLInternalErrorsSetting(); - } - - /** - * Returns the attribute for the given name. - * - * @param string $name Attribute name - * @param string|null|void $namespace An optional namespace for the retrieved attributes - * @return string|null The attribute value or NULL if attribute not found - */ - public function getAttribute($name, $namespace = null) - { - $isPrefix = ($namespace !== null); - $attributes = $this->simpleXMLElement->attributes($namespace, $isPrefix); - $attributeValue = $attributes->{$name}; - - return ($attributeValue !== null) ? (string) $attributeValue : null; - } - - /** - * Creates a prefix/ns context for the next XPath query - * @see \SimpleXMLElement::registerXPathNamespace - * - * @param string $prefix The namespace prefix to use in the XPath query for the namespace given in "namespace". - * @param string $namespace The namespace to use for the XPath query. This must match a namespace in - * use by the XML document or the XPath query using "prefix" will not return any results. - * @return bool TRUE on success or FALSE on failure. - */ - public function registerXPathNamespace($prefix, $namespace) - { - return $this->simpleXMLElement->registerXPathNamespace($prefix, $namespace); - } - - /** - * Runs XPath query on XML data - * @see \SimpleXMLElement::xpath - * - * @param string $path An XPath path - * @return SimpleXMLElement[]|bool an array of SimpleXMLElement objects or FALSE in case of an error. - */ - public function xpath($path) - { - $elements = $this->simpleXMLElement->xpath($path); - - if ($elements !== false) { - $wrappedElements = []; - foreach ($elements as $element) { - $wrappedElement = $this->wrapSimpleXMLElement($element); - - if ($wrappedElement !== null) { - $wrappedElements[] = $this->wrapSimpleXMLElement($element); - } - } - - $elements = $wrappedElements; - } - - return $elements; - } - - /** - * Wraps the given element into an instance of the wrapper - * - * @param \SimpleXMLElement $element Element to be wrapped - * @return SimpleXMLElement|null The wrapped element or NULL if the given element is invalid - */ - protected function wrapSimpleXMLElement(\SimpleXMLElement $element) - { - $wrappedElement = null; - $elementAsXML = $element->asXML(); - - if ($elementAsXML !== false) { - $wrappedElement = new SimpleXMLElement($elementAsXML); - } - - return $wrappedElement; - } - - /** - * Remove all nodes matching the given XPath query. - * It does not map to any \SimpleXMLElement function. - * - * @param string $path An XPath path - * @return void - */ - public function removeNodesMatchingXPath($path) - { - $nodesToRemove = $this->simpleXMLElement->xpath($path); - - foreach ($nodesToRemove as $nodeToRemove) { - unset($nodeToRemove[0]); - } - } - - /** - * Returns the first child matching the given tag name - * - * @param string $tagName - * @return SimpleXMLElement|null The first child matching the tag name or NULL if none found - */ - public function getFirstChildByTagName($tagName) - { - $doesElementExist = isset($this->simpleXMLElement->{$tagName}); - - /** @var \SimpleXMLElement $realElement */ - $realElement = $this->simpleXMLElement->{$tagName}; - - return $doesElementExist ? $this->wrapSimpleXMLElement($realElement) : null; - } - - /** - * @return string - */ - public function __toString() - { - return $this->simpleXMLElement->__toString(); - } -} diff --git a/src/Spout/Reader/Wrapper/XMLReader.php b/src/Spout/Reader/Wrapper/XMLReader.php index ffe6818..2e20327 100644 --- a/src/Spout/Reader/Wrapper/XMLReader.php +++ b/src/Spout/Reader/Wrapper/XMLReader.php @@ -1,6 +1,7 @@ getSharedStringsFilePath(); if ($xmlReader->open($sharedStringsFilePath) === false) { @@ -91,14 +100,14 @@ class SharedStringsHelper $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); - $xmlReader->readUntilNodeFound('si'); + $xmlReader->readUntilNodeFound(self::XML_NODE_SI); - while ($xmlReader->name === 'si') { - $this->processSharedStringsItem($xmlReader, $sharedStringIndex, $escaper); + while ($xmlReader->name === self::XML_NODE_SI) { + $this->processSharedStringsItem($xmlReader, $sharedStringIndex); $sharedStringIndex++; - // jump to the next 'si' tag - $xmlReader->next('si'); + // jump to the next '' tag + $xmlReader->next(self::XML_NODE_SI); } $this->cachingStrategy->closeCache(); @@ -127,19 +136,19 @@ class SharedStringsHelper */ protected function getSharedStringsUniqueCount($xmlReader) { - $xmlReader->next('sst'); + $xmlReader->next(self::XML_NODE_SST); // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE) - while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== XMLReader::ELEMENT) { + while ($xmlReader->name === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) { $xmlReader->read(); } - $uniqueCount = $xmlReader->getAttribute('uniqueCount'); + $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT); // some software do not add the "uniqueCount" attribute but only use the "count" one // @see https://github.com/box/spout/issues/254 if ($uniqueCount === null) { - $uniqueCount = $xmlReader->getAttribute('count'); + $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT); } return ($uniqueCount !== null) ? intval($uniqueCount) : null; @@ -160,99 +169,54 @@ class SharedStringsHelper /** * Processes the shared strings item XML node which the given XML reader is positioned on. * - * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "" node * @param int $sharedStringIndex Index of the processed shared strings item - * @param \Box\Spout\Common\Escaper\XLSX $escaper Helper to escape values * @return void */ - protected function processSharedStringsItem($xmlReader, $sharedStringIndex, $escaper) + protected function processSharedStringsItem($xmlReader, $sharedStringIndex) { - $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader); - $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); + $sharedStringValue = ''; - // removes nodes that should not be read, like the pronunciation of the Kanji characters - $cleanNode = $this->removeSuperfluousTextNodes($node); + // NOTE: expand() will automatically decode all XML entities of the child nodes + $siNode = $xmlReader->expand(); + $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T); - // find all text nodes "t"; there can be multiple if the cell contains formatting - $textNodes = $cleanNode->xpath('//ns:t'); + foreach ($textNodes as $textNode) { + if ($this->shouldExtractTextNodeValue($textNode)) { + $textNodeValue = $textNode->nodeValue; + $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode); - $textValue = $this->extractTextValueForNodes($textNodes); - $unescapedTextValue = $escaper->unescape($textValue); + $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : trim($textNodeValue); + } + } - $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex); + $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex); } /** - * Returns a SimpleXMLElement node from the current node in the given XMLReader instance. - * This is to simplify the parsing of the subtree. + * Not all text nodes' values must be extracted. + * Some text nodes are part of a node describing the pronunciation for instance. + * We'll only consider the nodes whose parents are "" or "". * - * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader - * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement - * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read + * @param \DOMElement $textNode Text node to check + * @return bool Whether the given text node's value must be extracted */ - protected function getSimpleXmlElementNodeFromXMLReader($xmlReader) + protected function shouldExtractTextNodeValue($textNode) { - $node = null; - try { - $node = new SimpleXMLElement($xmlReader->readOuterXml()); - } catch (XMLProcessingException $exception) { - throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}]."); - } - - return $node; - } - - /** - * Removes nodes that should not be read, like the pronunciation of the Kanji characters. - * By keeping them, their text content would be added to the read string. - * - * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove - * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node - */ - protected function removeSuperfluousTextNodes($parentNode) - { - $tagsToRemove = [ - 'rPh', // Pronunciation of the text - 'pPr', // Paragraph Properties / Previous Paragraph Properties - 'rPr', // Run Properties for the Paragraph Mark / Previous Run Properties for the Paragraph Mark - ]; - - foreach ($tagsToRemove as $tagToRemove) { - $xpath = '//ns:' . $tagToRemove; - $parentNode->removeNodesMatchingXPath($xpath); - } - - return $parentNode; - } - - /** - * @param array $textNodes Text XML nodes ("") - * @return string The value associated with the given text node(s) - */ - protected function extractTextValueForNodes($textNodes) - { - $textValue = ''; - - foreach ($textNodes as $nodeIndex => $textNode) { - $textNodeAsString = $textNode->__toString(); - $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode); - - $textValue .= ($shouldPreserveWhitespace) ? $textNodeAsString : trim($textNodeAsString); - } - - return $textValue; + $parentTagName = $textNode->parentNode->localName; + return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R); } /** * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace. * - * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element () whitespace may be preserved + * @param \DOMElement $textNode The text node element () whose whitespace may be preserved * @return bool Whether whitespace should be preserved */ protected function shouldPreserveWhitespace($textNode) { - $spaceValue = $textNode->getAttribute('space', 'xml'); - return ($spaceValue === 'preserve'); + $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE); + return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE); } /** diff --git a/src/Spout/Reader/XLSX/RowIterator.php b/src/Spout/Reader/XLSX/RowIterator.php index 7716e2c..2440593 100644 --- a/src/Spout/Reader/XLSX/RowIterator.php +++ b/src/Spout/Reader/XLSX/RowIterator.php @@ -260,6 +260,7 @@ class RowIterator implements IteratorInterface { $currentColumnIndex = $this->getColumnIndex($xmlReader); + // NOTE: expand() will automatically decode all XML entities of the child nodes $node = $xmlReader->expand(); $this->currentlyProcessedRowData[$currentColumnIndex] = $this->getCellValue($node); $this->lastColumnIndexProcessed = $currentColumnIndex; diff --git a/tests/Spout/Common/Escaper/ODSTest.php b/tests/Spout/Common/Escaper/ODSTest.php index fb5c960..77b4913 100644 --- a/tests/Spout/Common/Escaper/ODSTest.php +++ b/tests/Spout/Common/Escaper/ODSTest.php @@ -16,7 +16,7 @@ class ODSTest extends \PHPUnit_Framework_TestCase { return [ ['test', 'test'], - ['carl\'s "pokemon"', 'carl's "pokemon"'], + ['carl\'s "pokemon"', 'carl\'s "pokemon"'], ["\n", "\n"], ["\r", "\r"], ["\t", "\t"], diff --git a/tests/Spout/Common/Escaper/XLSXTest.php b/tests/Spout/Common/Escaper/XLSXTest.php index 7cede8b..0f2098f 100644 --- a/tests/Spout/Common/Escaper/XLSXTest.php +++ b/tests/Spout/Common/Escaper/XLSXTest.php @@ -16,7 +16,7 @@ class XLSXTest extends \PHPUnit_Framework_TestCase { return [ ['test', 'test'], - ['adam\'s "car"', 'adam's "car"'], + ['adam\'s "car"', 'adam\'s "car"'], ["\n", "\n"], ["\r", "\r"], ["\t", "\t"], @@ -25,7 +25,7 @@ class XLSXTest extends \PHPUnit_Framework_TestCase ['_x0000_', '_x005F_x0000_'], [chr(21), '_x0015_'], ['control '.chr(21).' character', 'control _x0015_ character'], - ['control\'s '.chr(21).' "character"', 'control's _x0015_ "character"'], + ['control\'s '.chr(21).' "character"', 'control\'s _x0015_ "character"'], ]; } @@ -52,7 +52,7 @@ class XLSXTest extends \PHPUnit_Framework_TestCase { return [ ['test', 'test'], - ['adam's "car"', 'adam\'s "car"'], + ['adam's "car"', 'adam's "car"'], ["\n", "\n"], ["\r", "\r"], ["\t", "\t"], @@ -61,7 +61,7 @@ class XLSXTest extends \PHPUnit_Framework_TestCase ['_x005F_x0000_', '_x0000_'], ['_x0015_', chr(21)], ['control _x0015_ character', 'control '.chr(21).' character'], - ['control's _x0015_ "character"', 'control\'s '.chr(21).' "character"'], + ['control's _x0015_ "character"', 'control's '.chr(21).' "character"'], ]; } diff --git a/tests/Spout/Reader/Wrapper/SimpleXMLElementTest.php b/tests/Spout/Reader/Wrapper/SimpleXMLElementTest.php deleted file mode 100644 index bb074d7..0000000 --- a/tests/Spout/Reader/Wrapper/SimpleXMLElementTest.php +++ /dev/null @@ -1,127 +0,0 @@ -'; - new SimpleXMLElement($invalidXML); - } - - /** - * @return array - */ - public function dataProviderForTestGetAttribute() - { - $xmlWithoutNamespace = << - -XML; - - $xmlWithHalfNamespace = << - -XML; - - $xmlWithFullNamespace = << - -XML; - - return [ - [$xmlWithoutNamespace, null, ['foo' => 'bar', 'type' => 'test']], - [$xmlWithHalfNamespace, null, ['foo' => 'bar', 'type' => null]], - [$xmlWithFullNamespace, null, ['foo' => null, 'type' => null]], - [$xmlWithoutNamespace, 'r', ['foo' => null, 'type' => null]], - [$xmlWithHalfNamespace, 'r', ['foo' => null, 'type' => 'test']], - [$xmlWithFullNamespace, 'r', ['foo' => 'bar', 'type' => 'test']], - ]; - } - - /** - * @dataProvider dataProviderForTestGetAttribute - * - * @param string $xml - * @param string|null $namespace - * @param array $expectedAttributes - * @return void - */ - public function testGetAttribute($xml, $namespace, $expectedAttributes) - { - $element = new SimpleXMLElement($xml); - - foreach ($expectedAttributes as $name => $expectedValue) { - $value = $element->getAttribute($name, $namespace); - $this->assertEquals($expectedValue, $value); - } - } - - /** - * @return void - */ - public function testXPath() - { - $xml = << - - - - 0 - 1 - - - -XML; - - $element = new SimpleXMLElement($xml); - $matchedElements = $element->xpath('//c'); - - $this->assertEquals(2, count($matchedElements)); - $this->assertTrue($matchedElements[0] instanceof SimpleXMLElement, 'The SimpleXMLElement should be wrapped'); - $this->assertEquals('A2', $matchedElements[1]->getAttribute('r')); - } - - /** - * @return void - */ - public function testRemoveNodeMatchingXPath() - { - $xml = << - - - - 0 - 1 - - - -XML; - - $element = new SimpleXMLElement($xml); - $this->assertNotNull($element->getFirstChildByTagName('sheetData')); - - $element->removeNodesMatchingXPath('//sheetData'); - $this->assertNull($element->getFirstChildByTagName('sheetData')); - } -} diff --git a/tests/Spout/Reader/XLSX/Helper/SharedStringsHelperTest.php b/tests/Spout/Reader/XLSX/Helper/SharedStringsHelperTest.php index 31f9c71..17d1b52 100644 --- a/tests/Spout/Reader/XLSX/Helper/SharedStringsHelperTest.php +++ b/tests/Spout/Reader/XLSX/Helper/SharedStringsHelperTest.php @@ -98,6 +98,22 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase $sharedStringsHelper->cleanup(); } + /** + * @return void + */ + public function testGetStringAtIndexShouldNotDoubleDecodeHTMLEntities() + { + $resourcePath = $this->getResourcePath('one_sheet_with_pre_encoded_html_entities.xlsx'); + $sharedStringsHelper = new SharedStringsHelper($resourcePath); + + $sharedStringsHelper->extractSharedStrings(); + + $sharedString = $sharedStringsHelper->getStringAtIndex(0); + $this->assertEquals('quote: " - ampersand: &', $sharedString); + + $sharedStringsHelper->cleanup(); + } + /** * @return void */ diff --git a/tests/Spout/Reader/XLSX/ReaderTest.php b/tests/Spout/Reader/XLSX/ReaderTest.php index 51b79d4..b9e032e 100644 --- a/tests/Spout/Reader/XLSX/ReaderTest.php +++ b/tests/Spout/Reader/XLSX/ReaderTest.php @@ -426,42 +426,46 @@ class ReaderTest extends \PHPUnit_Framework_TestCase $this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.'); } - /** - * @return array - */ - public function dataProviderForTestReadShouldBeProtectedAgainstAttacks() - { - return [ - ['attack_billion_laughs.xlsx'], - ['attack_quadratic_blowup.xlsx'], - ]; - } - - /** - * @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) * - * @param string $fileName * @return void */ - public function testReadShouldBeProtectedAgainstAttacks($fileName) + public function testReadShouldBeProtectedAgainstBillionLaughsAttack() { $startTime = microtime(true); try { // using @ to prevent warnings/errors from being displayed - @$this->getAllRowsForFile($fileName); + @$this->getAllRowsForFile('attack_billion_laughs.xlsx'); $this->fail('An exception should have been thrown'); } catch (IOException $exception) { $duration = microtime(true) - $startTime; $this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.'); - $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB + $expectedMaxMemoryUsage = 40 * 1024 * 1024; // 40MB $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); } } + /** + * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) + * + * @return void + */ + public function testReadShouldBeProtectedAgainstQuadraticBlowupAttack() + { + $startTime = microtime(true); + + $this->getAllRowsForFile('attack_quadratic_blowup.xlsx'); + + $duration = microtime(true) - $startTime; + $this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.'); + + $expectedMaxMemoryUsage = 40 * 1024 * 1024; // 40MB + $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); + } + /** * @return void */ diff --git a/tests/Spout/Writer/ODS/WriterTest.php b/tests/Spout/Writer/ODS/WriterTest.php index 172f8bd..836e679 100644 --- a/tests/Spout/Writer/ODS/WriterTest.php +++ b/tests/Spout/Writer/ODS/WriterTest.php @@ -421,7 +421,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase $this->writeToODSFile($dataRows, $fileName); - $this->assertValueWasWritten($fileName, 'I'm in "great" mood', 'Quotes should be escaped'); + $this->assertValueWasWritten($fileName, 'I\'m in "great" mood', 'Quotes should not be escaped'); $this->assertValueWasWritten($fileName, 'This <must> be escaped & tested', '<, > and & should be escaped'); } diff --git a/tests/Spout/Writer/XLSX/WriterTest.php b/tests/Spout/Writer/XLSX/WriterTest.php index 86ae1f6..c7f4f96 100644 --- a/tests/Spout/Writer/XLSX/WriterTest.php +++ b/tests/Spout/Writer/XLSX/WriterTest.php @@ -473,7 +473,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase $this->writeToXLSXFile($dataRows, $fileName); - $this->assertInlineDataWasWrittenToSheet($fileName, 1, 'I'm in "great" mood', 'Quotes should be escaped'); + $this->assertInlineDataWasWrittenToSheet($fileName, 1, 'I\'m in "great" mood', 'Quotes should not be escaped'); $this->assertInlineDataWasWrittenToSheet($fileName, 1, 'This <must> be escaped & tested', '<, > and & should be escaped'); } @@ -482,14 +482,14 @@ class WriterTest extends \PHPUnit_Framework_TestCase */ public function testAddRowShouldEscapeControlCharacters() { - $fileName = 'test_add_row_should_escape_html_special_characters.xlsx'; + $fileName = 'test_add_row_should_escape_control_characters.xlsx'; $dataRows = [ - ['control\'s '.chr(21).' "character"'], + ['control '.chr(21).' character'], ]; $this->writeToXLSXFile($dataRows, $fileName); - $this->assertInlineDataWasWrittenToSheet($fileName, 1, 'control's _x0015_ "character"'); + $this->assertInlineDataWasWrittenToSheet($fileName, 1, 'control _x0015_ character'); } /** diff --git a/tests/resources/xlsx/one_sheet_with_pre_encoded_html_entities.xlsx b/tests/resources/xlsx/one_sheet_with_pre_encoded_html_entities.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..326540376cbdfb5ab59281a8fdc10b5829695870 GIT binary patch literal 3201 zcmaJ@2{@Ds7ake=R`&d*A<=}voshVMvF}@CC!;Zgk$uY&ZuUVG(M|Ta)=;(>WXTeS zWM7hq%Zw#z!vDGb|8>p1|3CB0oM-0w-t#=~d(L~#G1Q?3bAmu1dXOx0mbUZ$LtjlQ z5J(ve0&xKUTB%~)y^-$TcBTOyNG}_hzng1KYL9-Kdb+>#h5_f~jmd5RSl`}T3 zaN_}2dj9ATEt~WNRn?fcskrZ{ifkvGG{!5@PbC|T*IE#`E0WL9-K{%O&1qDeW_|Z1 zKMotos2K4UxrVM-F3K(s3{x+2gRRk3me|2h=Spik-{mk(=E7OA)4g+-2zk^<`UWnU zJWm4G;_&YfjaPjHxSCyUi;TKayJ-d}zT9hcHNL5>Qo>1mR$;4R>*i~Dq=-%Hg70Ey z&A6;Y6;&*o?p4gU-H@1_k=r_$8QT$z4ar!QgN?*BJ`ekJjW+7s%6Hy0CHA?#IEcmh=z3kIpqJqlO3%!%;psG=6vGuU$)r+e<3c?sB{d(89GlYb#xl22; zcL?O>oH?H3eASiIByRy+S5JUIEI;^qc?Y;6y$;z%rM$(oK{z|t$)fFKG~avPHjDR= ziXPbS_U8A~5kwxe&hzGwx9PP|`Q9G#T+owpgZzf)*O!EXlir#7H$WxX3u+!;R*htg>dK4WG78Nb4l+K+@{2bt^% ziy8b>nMe;*J#FdyWq-N&ZuJ$-@%X{d?RBZ*tQD3|hZU#3`8CrjLM*l>bFFs=&3LCJ zm+h=y-hC~tVY6}pLX39m>t);_w%f>-I*7)|FMBm#nUs?rk;(PklHqaw`dbLsuQ-xl z#R@k%(OAs|mfKQdz6;-j`4Ae#0vEN&?RF_tsa%A7XI#rE!6`(O8QNzU?Q zEd|FnFj{egVoOG5VN0GsaV(qp3cSwEypKiS@Jb4u#touwmnCcdK%Maz^OAshvLsQD z=l;FciDA(ba1|WdWVGsFtQXPno*j0^hrz@@ce|v=6#LR0n zGvyov|CGf<8|OR596kM-&u2?-oT5Qp^i5d{0T~V@ISG86us01ou^wI$4jvwdJ*t<8 zXKa-`iCEu@O$yT6h6uXELVl%Lp^Nk`33rkevKnlbg%o*Z2DC{A`gcJ1lj1~HKL(Cy zQ7xf1{8v8uDAdS98LhHhA(6JPvb*q?zhyy{o!QieX~dcu;e}1cucrjYtBQF=GoTth zBa4|i0bf`X7aq`ep$R@#GzPR;_Gp5i;Ujg^WmRpwe9v}9osgwP$-$BdUyK{?9WcRV z8=i2z`Q9SS@fyFlh`9{`T&E7y_|#uX-Nbq#4_W1>C>umb0?rF&94ZOtwvZu!ldL3B7H}!KG($<21aF48Fw*sz^W#0 z%?NeYL>ePcLvuAkZQ`e{x`IYH(eWxv2SdeE9;i+oYydB(Csq3$lrr|7Tj-9B`b<__d!D?xC$jtD>$1-oX?C>#CDtNlmC%iDgF33kY}TfzeQ#Nj z^|7{_HL6JM6g|Xsqxxq<@y@4hO4sS1MlS4Xd*(j~F$*Hu?YL$vgw)X{I_`4h7s7h% z#SY3|eWyJx=VQ{Y#V8;n0+5prh``YP;hMjW?*BOGEwD71JJy3@yLLv-(nSso*$C&^ zI4E38tq-l1iCJZSKk?x%xD*`S7QA_jOb&^ynrcdj2oQLnNXwL&16IX%*0YsHefrS$ zjn#JqP1cO)sTZ_)(!Qj3lF6ojncpZ_Blo_bFymDhK^zRzBQE^OpzJ~;w8v_pj?Ppx zw?f>_M*psOn0U4-)4m&(4 z06Dx+4xUIS6K_u!_gjCAb{~U2ZE9uaSHL8RnjXJV6771ckPa_0K*#HQ)6ZT6{@5w* zRnxO*S?cg$hOZ&tm*El~`NGeLVLN2jaDOqA`OUC(Y^TiOtal5 zDseUD&4%U$J-dBbqeFbSEOM@jJDZ)Ur+i7-2r(0L=4Na3YE zToiW?z)~7u%lm`vVcESM}>1})k!ubzG_7-3!>@<>_i?B@@K-M-?q$ld@BpsoObqrmZAcY+T4neN!SY zy8^xH9@}{X6QXzjzPvG-O*5Y+v+=FhMdtoViLA<}aA~_Vk06qv2f6B8;26}fABKOp9Vrt~UOhkHAW&_1 z0H7+x4MYj1JRy$2E`T2ooA1wqf)Yx(>yDsg>SLRa5=glfj)06bf8P<5Xv*<>gr4~; z`oF`N5>7d8j^Hr5|2KRn^HBQqD9_LI|Lj`I43xcpltGl?&$|8135Gf}w1-bJ0B