Fix shared strings XML Entities auto decode (#411)

When converting an XMLReader node to a SimpleXMLElement, the conversion would automatically decode the XML entities. This resulted in a double decode.
For example: "&amp;quot;" was converted to "&quot;" when imported into a SimpleXMLElement and was again converted into " (quote).

This commit changes the way the XLSX Shared Strings file is processed. It also changes the unescaping logic for both XLSX and ODS.

Finally, it removes any usage of the SimpleXML library (yay!).
This commit is contained in:
Adrien Loison 2017-04-28 02:27:33 +02:00 committed by GitHub
parent 1eb01a3d2a
commit 048105461c
17 changed files with 120 additions and 405 deletions

View File

@ -42,7 +42,6 @@ require_once '[PATH/TO]/src/Spout/Autoloader/autoload.php'; // don't forget to c
* PHP version 5.4.0 or higher * PHP version 5.4.0 or higher
* PHP extension `php_zip` enabled * PHP extension `php_zip` enabled
* PHP extension `php_xmlreader` enabled * PHP extension `php_xmlreader` enabled
* PHP extension `php_simplexml` enabled
## Basic usage ## Basic usage

View File

@ -14,8 +14,7 @@
"require": { "require": {
"php": ">=5.4.0", "php": ">=5.4.0",
"ext-zip": "*", "ext-zip": "*",
"ext-xmlreader" : "*", "ext-xmlreader" : "*"
"ext-simplexml": "*"
}, },
"require-dev": { "require-dev": {
"phpunit/phpunit": "^4.8.0" "phpunit/phpunit": "^4.8.0"

View File

@ -26,10 +26,13 @@ class ODS implements EscaperInterface
// 'ENT_DISALLOWED' ensures that invalid characters in the given document type are replaced. // 'ENT_DISALLOWED' ensures that invalid characters in the given document type are replaced.
// Otherwise control characters like a vertical tab "\v" will make the XML document unreadable by the XML processor // Otherwise control characters like a vertical tab "\v" will make the XML document unreadable by the XML processor
// @link https://github.com/box/spout/issues/329 // @link https://github.com/box/spout/issues/329
$replacedString = htmlspecialchars($string, ENT_QUOTES | ENT_DISALLOWED); $replacedString = htmlspecialchars($string, ENT_NOQUOTES | ENT_DISALLOWED);
} else { } else {
// We are on hhvm or any other engine that does not support ENT_DISALLOWED // We are on hhvm or any other engine that does not support ENT_DISALLOWED.
$escapedString = htmlspecialchars($string, ENT_QUOTES); //
// @NOTE: Using ENT_NOQUOTES as only XML entities ('<', '>', '&') need to be encoded.
// Single and double quotes can be left as is.
$escapedString = htmlspecialchars($string, ENT_NOQUOTES);
// control characters values are from 0 to 1F (hex values) in the ASCII table // control characters values are from 0 to 1F (hex values) in the ASCII table
// some characters should not be escaped though: "\t", "\r" and "\n". // some characters should not be escaped though: "\t", "\r" and "\n".
@ -52,6 +55,12 @@ class ODS implements EscaperInterface
*/ */
public function unescape($string) public function unescape($string)
{ {
return htmlspecialchars_decode($string, ENT_QUOTES); // ==============
// = WARNING =
// ==============
// It is assumed that the given string has already had its XML entities decoded.
// This is true if the string is coming from a DOMNode (as DOMNode already decodes XML entities on creation).
// Therefore there is no need to call "htmlspecialchars_decode()".
return $string;
} }
} }

View File

@ -42,7 +42,9 @@ class XLSX implements EscaperInterface
public function escape($string) public function escape($string)
{ {
$escapedString = $this->escapeControlCharacters($string); $escapedString = $this->escapeControlCharacters($string);
$escapedString = htmlspecialchars($escapedString, ENT_QUOTES); // @NOTE: Using ENT_NOQUOTES as only XML entities ('<', '>', '&') need to be encoded.
// Single and double quotes can be left as is.
$escapedString = htmlspecialchars($escapedString, ENT_NOQUOTES);
return $escapedString; return $escapedString;
} }
@ -55,8 +57,13 @@ class XLSX implements EscaperInterface
*/ */
public function unescape($string) public function unescape($string)
{ {
$unescapedString = htmlspecialchars_decode($string, ENT_QUOTES); // ==============
$unescapedString = $this->unescapeControlCharacters($unescapedString); // = WARNING =
// ==============
// It is assumed that the given string has already had its XML entities decoded.
// This is true if the string is coming from a DOMNode (as DOMNode already decodes XML entities on creation).
// Therefore there is no need to call "htmlspecialchars_decode()".
$unescapedString = $this->unescapeControlCharacters($string);
return $unescapedString; return $unescapedString;
} }

View File

@ -202,6 +202,7 @@ class RowIterator implements IteratorInterface
{ {
$currentNumColumnsRepeated = $this->getNumColumnsRepeatedForCurrentNode($xmlReader); $currentNumColumnsRepeated = $this->getNumColumnsRepeatedForCurrentNode($xmlReader);
// NOTE: expand() will automatically decode all XML entities of the child nodes
$node = $xmlReader->expand(); $node = $xmlReader->expand();
$currentCellValue = $this->getCellValue($node); $currentCellValue = $this->getCellValue($node);

View File

@ -1,159 +0,0 @@
<?php
namespace Box\Spout\Reader\Wrapper;
use Box\Spout\Reader\Exception\XMLProcessingException;
/**
 * Class SimpleXMLElement
 * Wrapper around the built-in SimpleXMLElement. This class does not extend \SimpleXMLElement
 * because its constructor is final... Instead, it is used as a passthrough.
 * @see \SimpleXMLElement
 *
 * @package Box\Spout\Reader\Wrapper
 */
class SimpleXMLElement
{
    use XMLInternalErrorsHelper;

    /** @var \SimpleXMLElement Instance of the wrapped SimpleXMLElement object */
    protected $simpleXMLElement;

    /**
     * Creates a new SimpleXMLElement object
     * @see \SimpleXMLElement::__construct
     *
     * @param string $xmlData A well-formed XML string
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException If the XML string is not well-formed
     */
    public function __construct($xmlData)
    {
        $this->useXMLInternalErrors();

        try {
            $this->simpleXMLElement = new \SimpleXMLElement($xmlData);
        } catch (\Exception $exception) {
            // if the data is invalid, the constructor will throw an Exception
            $this->resetXMLInternalErrorsSetting();
            throw new XMLProcessingException($this->getLastXMLErrorMessage());
        }

        $this->resetXMLInternalErrorsSetting();
    }

    /**
     * Returns the attribute for the given name.
     *
     * @param string $name Attribute name
     * @param string|null|void $namespace An optional namespace for the retrieved attributes
     * @return string|null The attribute value or NULL if attribute not found
     */
    public function getAttribute($name, $namespace = null)
    {
        // When a namespace is given, it is passed to attributes() as a prefix (not as a namespace URI)
        $isPrefix = ($namespace !== null);
        $attributes = $this->simpleXMLElement->attributes($namespace, $isPrefix);
        $attributeValue = $attributes->{$name};

        return ($attributeValue !== null) ? (string) $attributeValue : null;
    }

    /**
     * Creates a prefix/ns context for the next XPath query
     * @see \SimpleXMLElement::registerXPathNamespace
     *
     * @param string $prefix The namespace prefix to use in the XPath query for the namespace given in "namespace".
     * @param string $namespace The namespace to use for the XPath query. This must match a namespace in
     *                          use by the XML document or the XPath query using "prefix" will not return any results.
     * @return bool TRUE on success or FALSE on failure.
     */
    public function registerXPathNamespace($prefix, $namespace)
    {
        return $this->simpleXMLElement->registerXPathNamespace($prefix, $namespace);
    }

    /**
     * Runs XPath query on XML data
     * @see \SimpleXMLElement::xpath
     *
     * @param string $path An XPath path
     * @return SimpleXMLElement[]|bool an array of SimpleXMLElement objects or FALSE in case of an error.
     */
    public function xpath($path)
    {
        $elements = $this->simpleXMLElement->xpath($path);

        if ($elements !== false) {
            $wrappedElements = [];
            foreach ($elements as $element) {
                $wrappedElement = $this->wrapSimpleXMLElement($element);

                if ($wrappedElement !== null) {
                    // Reuse the wrapper built above: wrapping serializes and re-parses
                    // the element, so wrapping it a second time is wasted work.
                    $wrappedElements[] = $wrappedElement;
                }
            }

            $elements = $wrappedElements;
        }

        return $elements;
    }

    /**
     * Wraps the given element into an instance of the wrapper
     *
     * @param \SimpleXMLElement $element Element to be wrapped
     * @return SimpleXMLElement|null The wrapped element or NULL if the given element is invalid
     */
    protected function wrapSimpleXMLElement(\SimpleXMLElement $element)
    {
        $wrappedElement = null;
        $elementAsXML = $element->asXML();

        if ($elementAsXML !== false) {
            $wrappedElement = new SimpleXMLElement($elementAsXML);
        }

        return $wrappedElement;
    }

    /**
     * Remove all nodes matching the given XPath query.
     * It does not map to any \SimpleXMLElement function.
     *
     * @param string $path An XPath path
     * @return void
     */
    public function removeNodesMatchingXPath($path)
    {
        $nodesToRemove = $this->simpleXMLElement->xpath($path);

        // The query yields FALSE on error; in that case there is nothing to iterate over or remove.
        if (!is_array($nodesToRemove)) {
            return;
        }

        foreach ($nodesToRemove as $nodeToRemove) {
            // Unsetting index 0 of the matched element detaches that node from the wrapped document
            unset($nodeToRemove[0]);
        }
    }

    /**
     * Returns the first child matching the given tag name
     *
     * @param string $tagName
     * @return SimpleXMLElement|null The first child matching the tag name or NULL if none found
     */
    public function getFirstChildByTagName($tagName)
    {
        $doesElementExist = isset($this->simpleXMLElement->{$tagName});

        /** @var \SimpleXMLElement $realElement */
        $realElement = $this->simpleXMLElement->{$tagName};

        return $doesElementExist ? $this->wrapSimpleXMLElement($realElement) : null;
    }

    /**
     * @return string
     */
    public function __toString()
    {
        return $this->simpleXMLElement->__toString();
    }
}

View File

@ -1,6 +1,7 @@
<?php <?php
namespace Box\Spout\Reader\Wrapper; namespace Box\Spout\Reader\Wrapper;
use DOMNode;
/** /**

View File

@ -4,7 +4,6 @@ namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Common\Exception\IOException; use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\Exception\XMLProcessingException; use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
use Box\Spout\Reader\Wrapper\XMLReader; use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory; use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface; use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;
@ -23,6 +22,18 @@ class SharedStringsHelper
/** Main namespace for the sharedStrings.xml file */ /** Main namespace for the sharedStrings.xml file */
const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
/** Definition of XML nodes names used to parse data */
const XML_NODE_SST = 'sst';
const XML_NODE_SI = 'si';
const XML_NODE_R = 'r';
const XML_NODE_T = 't';
/** Definition of XML attributes used to parse data */
const XML_ATTRIBUTE_COUNT = 'count';
const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';
/** @var string Path of the XLSX file being read */ /** @var string Path of the XLSX file being read */
protected $filePath; protected $filePath;
@ -79,8 +90,6 @@ class SharedStringsHelper
{ {
$xmlReader = new XMLReader(); $xmlReader = new XMLReader();
$sharedStringIndex = 0; $sharedStringIndex = 0;
/** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
$escaper = \Box\Spout\Common\Escaper\XLSX::getInstance();
$sharedStringsFilePath = $this->getSharedStringsFilePath(); $sharedStringsFilePath = $this->getSharedStringsFilePath();
if ($xmlReader->open($sharedStringsFilePath) === false) { if ($xmlReader->open($sharedStringsFilePath) === false) {
@ -91,14 +100,14 @@ class SharedStringsHelper
$sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
$this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
$xmlReader->readUntilNodeFound('si'); $xmlReader->readUntilNodeFound(self::XML_NODE_SI);
while ($xmlReader->name === 'si') { while ($xmlReader->name === self::XML_NODE_SI) {
$this->processSharedStringsItem($xmlReader, $sharedStringIndex, $escaper); $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
$sharedStringIndex++; $sharedStringIndex++;
// jump to the next 'si' tag // jump to the next '<si>' tag
$xmlReader->next('si'); $xmlReader->next(self::XML_NODE_SI);
} }
$this->cachingStrategy->closeCache(); $this->cachingStrategy->closeCache();
@ -127,19 +136,19 @@ class SharedStringsHelper
*/ */
protected function getSharedStringsUniqueCount($xmlReader) protected function getSharedStringsUniqueCount($xmlReader)
{ {
$xmlReader->next('sst'); $xmlReader->next(self::XML_NODE_SST);
// Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE) // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== XMLReader::ELEMENT) { while ($xmlReader->name === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
$xmlReader->read(); $xmlReader->read();
} }
$uniqueCount = $xmlReader->getAttribute('uniqueCount'); $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);
// some software do not add the "uniqueCount" attribute but only use the "count" one // some software do not add the "uniqueCount" attribute but only use the "count" one
// @see https://github.com/box/spout/issues/254 // @see https://github.com/box/spout/issues/254
if ($uniqueCount === null) { if ($uniqueCount === null) {
$uniqueCount = $xmlReader->getAttribute('count'); $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
} }
return ($uniqueCount !== null) ? intval($uniqueCount) : null; return ($uniqueCount !== null) ? intval($uniqueCount) : null;
@ -160,99 +169,54 @@ class SharedStringsHelper
/** /**
* Processes the shared strings item XML node which the given XML reader is positioned on. * Processes the shared strings item XML node which the given XML reader is positioned on.
* *
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
* @param int $sharedStringIndex Index of the processed shared strings item * @param int $sharedStringIndex Index of the processed shared strings item
* @param \Box\Spout\Common\Escaper\XLSX $escaper Helper to escape values
* @return void * @return void
*/ */
protected function processSharedStringsItem($xmlReader, $sharedStringIndex, $escaper) protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
{ {
$node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader); $sharedStringValue = '';
$node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
// removes nodes that should not be read, like the pronunciation of the Kanji characters // NOTE: expand() will automatically decode all XML entities of the child nodes
$cleanNode = $this->removeSuperfluousTextNodes($node); $siNode = $xmlReader->expand();
$textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);
// find all text nodes "t"; there can be multiple if the cell contains formatting foreach ($textNodes as $textNode) {
$textNodes = $cleanNode->xpath('//ns:t'); if ($this->shouldExtractTextNodeValue($textNode)) {
$textNodeValue = $textNode->nodeValue;
$shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);
$textValue = $this->extractTextValueForNodes($textNodes); $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : trim($textNodeValue);
$unescapedTextValue = $escaper->unescape($textValue); }
}
$this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex); $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
} }
/** /**
* Returns a SimpleXMLElement node from the current node in the given XMLReader instance. * Not all text nodes' values must be extracted.
* This is to simplify the parsing of the subtree. * Some text nodes are part of a node describing the pronunciation for instance.
* We'll only consider the nodes whose parents are "<si>" or "<r>".
* *
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader * @param \DOMElement $textNode Text node to check
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement * @return bool Whether the given text node's value must be extracted
* @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
*/ */
protected function getSimpleXmlElementNodeFromXMLReader($xmlReader) protected function shouldExtractTextNodeValue($textNode)
{ {
$node = null; $parentTagName = $textNode->parentNode->localName;
try { return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
$node = new SimpleXMLElement($xmlReader->readOuterXml());
} catch (XMLProcessingException $exception) {
throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}].");
}
return $node;
}
/**
* Removes nodes that should not be read, like the pronunciation of the Kanji characters.
* By keeping them, their text content would be added to the read string.
*
* @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node
*/
protected function removeSuperfluousTextNodes($parentNode)
{
$tagsToRemove = [
'rPh', // Pronunciation of the text
'pPr', // Paragraph Properties / Previous Paragraph Properties
'rPr', // Run Properties for the Paragraph Mark / Previous Run Properties for the Paragraph Mark
];
foreach ($tagsToRemove as $tagToRemove) {
$xpath = '//ns:' . $tagToRemove;
$parentNode->removeNodesMatchingXPath($xpath);
}
return $parentNode;
}
/**
* @param array $textNodes Text XML nodes ("<t>")
* @return string The value associated with the given text node(s)
*/
protected function extractTextValueForNodes($textNodes)
{
$textValue = '';
foreach ($textNodes as $nodeIndex => $textNode) {
$textNodeAsString = $textNode->__toString();
$shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);
$textValue .= ($shouldPreserveWhitespace) ? $textNodeAsString : trim($textNodeAsString);
}
return $textValue;
} }
/** /**
* If the text node has the attribute 'xml:space="preserve"', then preserve whitespace. * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
* *
* @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
* @return bool Whether whitespace should be preserved * @return bool Whether whitespace should be preserved
*/ */
protected function shouldPreserveWhitespace($textNode) protected function shouldPreserveWhitespace($textNode)
{ {
$spaceValue = $textNode->getAttribute('space', 'xml'); $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);
return ($spaceValue === 'preserve'); return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
} }
/** /**

View File

@ -260,6 +260,7 @@ class RowIterator implements IteratorInterface
{ {
$currentColumnIndex = $this->getColumnIndex($xmlReader); $currentColumnIndex = $this->getColumnIndex($xmlReader);
// NOTE: expand() will automatically decode all XML entities of the child nodes
$node = $xmlReader->expand(); $node = $xmlReader->expand();
$this->currentlyProcessedRowData[$currentColumnIndex] = $this->getCellValue($node); $this->currentlyProcessedRowData[$currentColumnIndex] = $this->getCellValue($node);
$this->lastColumnIndexProcessed = $currentColumnIndex; $this->lastColumnIndexProcessed = $currentColumnIndex;

View File

@ -16,7 +16,7 @@ class ODSTest extends \PHPUnit_Framework_TestCase
{ {
return [ return [
['test', 'test'], ['test', 'test'],
['carl\'s "pokemon"', 'carl&#039;s &quot;pokemon&quot;'], ['carl\'s "pokemon"', 'carl\'s "pokemon"'],
["\n", "\n"], ["\n", "\n"],
["\r", "\r"], ["\r", "\r"],
["\t", "\t"], ["\t", "\t"],

View File

@ -16,7 +16,7 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
{ {
return [ return [
['test', 'test'], ['test', 'test'],
['adam\'s "car"', 'adam&#039;s &quot;car&quot;'], ['adam\'s "car"', 'adam\'s "car"'],
["\n", "\n"], ["\n", "\n"],
["\r", "\r"], ["\r", "\r"],
["\t", "\t"], ["\t", "\t"],
@ -25,7 +25,7 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
['_x0000_', '_x005F_x0000_'], ['_x0000_', '_x005F_x0000_'],
[chr(21), '_x0015_'], [chr(21), '_x0015_'],
['control '.chr(21).' character', 'control _x0015_ character'], ['control '.chr(21).' character', 'control _x0015_ character'],
['control\'s '.chr(21).' "character"', 'control&#039;s _x0015_ &quot;character&quot;'], ['control\'s '.chr(21).' "character"', 'control\'s _x0015_ "character"'],
]; ];
} }
@ -52,7 +52,7 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
{ {
return [ return [
['test', 'test'], ['test', 'test'],
['adam&#039;s &quot;car&quot;', 'adam\'s "car"'], ['adam&#039;s &quot;car&quot;', 'adam&#039;s &quot;car&quot;'],
["\n", "\n"], ["\n", "\n"],
["\r", "\r"], ["\r", "\r"],
["\t", "\t"], ["\t", "\t"],
@ -61,7 +61,7 @@ class XLSXTest extends \PHPUnit_Framework_TestCase
['_x005F_x0000_', '_x0000_'], ['_x005F_x0000_', '_x0000_'],
['_x0015_', chr(21)], ['_x0015_', chr(21)],
['control _x0015_ character', 'control '.chr(21).' character'], ['control _x0015_ character', 'control '.chr(21).' character'],
['control&#039;s _x0015_ &quot;character&quot;', 'control\'s '.chr(21).' "character"'], ['control&#039;s _x0015_ &quot;character&quot;', 'control&#039;s '.chr(21).' &quot;character&quot;'],
]; ];
} }

View File

@ -1,127 +0,0 @@
<?php
namespace Box\Spout\Reader\Wrapper;
use Box\Spout\TestUsingResource;
/**
 * Class SimpleXMLElementTest
 * Exercises the SimpleXMLElement wrapper: construction, attribute lookup,
 * XPath queries and node removal.
 *
 * @package Box\Spout\Reader\Wrapper
 */
class SimpleXMLElementTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /**
     * Building the wrapper from malformed XML must raise an XMLProcessingException.
     *
     * @expectedException \Box\Spout\Reader\Exception\XMLProcessingException
     *
     * @return void
     */
    public function testConstructShouldThrowExceptionIfInvalidData()
    {
        new SimpleXMLElement('<invalid><xml></invalid>');
    }

    /**
     * Each data set is: [XML document, namespace prefix (or null), expected attribute values by name].
     *
     * @return array
     */
    public function dataProviderForTestGetAttribute()
    {
        $noNamespaceXml = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet foo="bar" type="test" />
XML;

        $halfNamespaceXml = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
foo="bar" r:type="test" />
XML;

        $fullNamespaceXml = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
r:foo="bar" r:type="test" />
XML;

        $dataSets = [];

        // Lookups without any namespace
        $dataSets[] = [$noNamespaceXml, null, ['foo' => 'bar', 'type' => 'test']];
        $dataSets[] = [$halfNamespaceXml, null, ['foo' => 'bar', 'type' => null]];
        $dataSets[] = [$fullNamespaceXml, null, ['foo' => null, 'type' => null]];

        // Lookups using the "r" prefix
        $dataSets[] = [$noNamespaceXml, 'r', ['foo' => null, 'type' => null]];
        $dataSets[] = [$halfNamespaceXml, 'r', ['foo' => null, 'type' => 'test']];
        $dataSets[] = [$fullNamespaceXml, 'r', ['foo' => 'bar', 'type' => 'test']];

        return $dataSets;
    }

    /**
     * @dataProvider dataProviderForTestGetAttribute
     *
     * @param string $xml XML document to load
     * @param string|null $namespace Namespace prefix used for the lookup
     * @param array $expectedAttributes Expected values, indexed by attribute name
     * @return void
     */
    public function testGetAttribute($xml, $namespace, $expectedAttributes)
    {
        $wrapper = new SimpleXMLElement($xml);

        foreach ($expectedAttributes as $attributeName => $expectedValue) {
            $this->assertEquals($expectedValue, $wrapper->getAttribute($attributeName, $namespace));
        }
    }

    /**
     * An XPath query must return every match, each one wrapped in the wrapper class.
     *
     * @return void
     */
    public function testXPath()
    {
        $xml = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet>
<sheetData>
<row r="1">
<c r="A1"><v>0</v></c>
<c r="A2"><v>1</v></c>
</row>
</sheetData>
</worksheet>
XML;

        $wrapper = new SimpleXMLElement($xml);
        $cells = $wrapper->xpath('//c');

        $this->assertCount(2, $cells);
        $this->assertInstanceOf(__NAMESPACE__ . '\SimpleXMLElement', $cells[0], 'The SimpleXMLElement should be wrapped');
        $this->assertEquals('A2', $cells[1]->getAttribute('r'));
    }

    /**
     * Once the nodes matching an XPath are removed, they can no longer be found as children.
     *
     * @return void
     */
    public function testRemoveNodeMatchingXPath()
    {
        $xml = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet>
<sheetData>
<row r="1">
<c r="A1"><v>0</v></c>
<c r="A2"><v>1</v></c>
</row>
</sheetData>
</worksheet>
XML;

        $wrapper = new SimpleXMLElement($xml);
        $this->assertNotNull($wrapper->getFirstChildByTagName('sheetData'));

        $wrapper->removeNodesMatchingXPath('//sheetData');

        $this->assertNull($wrapper->getFirstChildByTagName('sheetData'));
    }
}

View File

@ -98,6 +98,22 @@ class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
$sharedStringsHelper->cleanup(); $sharedStringsHelper->cleanup();
} }
/**
* @return void
*/
public function testGetStringAtIndexShouldNotDoubleDecodeHTMLEntities()
{
$resourcePath = $this->getResourcePath('one_sheet_with_pre_encoded_html_entities.xlsx');
$sharedStringsHelper = new SharedStringsHelper($resourcePath);
$sharedStringsHelper->extractSharedStrings();
$sharedString = $sharedStringsHelper->getStringAtIndex(0);
$this->assertEquals('quote: &#34; - ampersand: &amp;', $sharedString);
$sharedStringsHelper->cleanup();
}
/** /**
* @return void * @return void
*/ */

View File

@ -426,42 +426,46 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
$this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.'); $this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.');
} }
/** /**
* @return array
*/
public function dataProviderForTestReadShouldBeProtectedAgainstAttacks()
{
return [
['attack_billion_laughs.xlsx'],
['attack_quadratic_blowup.xlsx'],
];
}
/**
* @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks
* @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used)
* *
* @param string $fileName
* @return void * @return void
*/ */
public function testReadShouldBeProtectedAgainstAttacks($fileName) public function testReadShouldBeProtectedAgainstBillionLaughsAttack()
{ {
$startTime = microtime(true); $startTime = microtime(true);
try { try {
// using @ to prevent warnings/errors from being displayed // using @ to prevent warnings/errors from being displayed
@$this->getAllRowsForFile($fileName); @$this->getAllRowsForFile('attack_billion_laughs.xlsx');
$this->fail('An exception should have been thrown'); $this->fail('An exception should have been thrown');
} catch (IOException $exception) { } catch (IOException $exception) {
$duration = microtime(true) - $startTime; $duration = microtime(true) - $startTime;
$this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.'); $this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.');
$expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB $expectedMaxMemoryUsage = 40 * 1024 * 1024; // 40MB
$this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.');
} }
} }
/**
* @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used)
*
* @return void
*/
public function testReadShouldBeProtectedAgainstQuadraticBlowupAttack()
{
$startTime = microtime(true);
$this->getAllRowsForFile('attack_quadratic_blowup.xlsx');
$duration = microtime(true) - $startTime;
$this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.');
$expectedMaxMemoryUsage = 40 * 1024 * 1024; // 40MB
$this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.');
}
/** /**
* @return void * @return void
*/ */

View File

@ -421,7 +421,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase
$this->writeToODSFile($dataRows, $fileName); $this->writeToODSFile($dataRows, $fileName);
$this->assertValueWasWritten($fileName, 'I&#039;m in &quot;great&quot; mood', 'Quotes should be escaped'); $this->assertValueWasWritten($fileName, 'I\'m in "great" mood', 'Quotes should not be escaped');
$this->assertValueWasWritten($fileName, 'This &lt;must&gt; be escaped &amp; tested', '<, > and & should be escaped'); $this->assertValueWasWritten($fileName, 'This &lt;must&gt; be escaped &amp; tested', '<, > and & should be escaped');
} }

View File

@ -473,7 +473,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase
$this->writeToXLSXFile($dataRows, $fileName); $this->writeToXLSXFile($dataRows, $fileName);
$this->assertInlineDataWasWrittenToSheet($fileName, 1, 'I&#039;m in &quot;great&quot; mood', 'Quotes should be escaped'); $this->assertInlineDataWasWrittenToSheet($fileName, 1, 'I\'m in "great" mood', 'Quotes should not be escaped');
$this->assertInlineDataWasWrittenToSheet($fileName, 1, 'This &lt;must&gt; be escaped &amp; tested', '<, > and & should be escaped'); $this->assertInlineDataWasWrittenToSheet($fileName, 1, 'This &lt;must&gt; be escaped &amp; tested', '<, > and & should be escaped');
} }
@ -482,14 +482,14 @@ class WriterTest extends \PHPUnit_Framework_TestCase
*/ */
public function testAddRowShouldEscapeControlCharacters() public function testAddRowShouldEscapeControlCharacters()
{ {
$fileName = 'test_add_row_should_escape_html_special_characters.xlsx'; $fileName = 'test_add_row_should_escape_control_characters.xlsx';
$dataRows = [ $dataRows = [
['control\'s '.chr(21).' "character"'], ['control '.chr(21).' character'],
]; ];
$this->writeToXLSXFile($dataRows, $fileName); $this->writeToXLSXFile($dataRows, $fileName);
$this->assertInlineDataWasWrittenToSheet($fileName, 1, 'control&#039;s _x0015_ &quot;character&quot;'); $this->assertInlineDataWasWrittenToSheet($fileName, 1, 'control _x0015_ character');
} }
/** /**