Add wrappers around XMLReader and SimpleXMLElement to improve error handling

This commit is contained in:
Adrien Loison 2015-07-23 15:32:43 -07:00
parent be3932af18
commit 1ba10ed2b0
16 changed files with 789 additions and 155 deletions

View File

@ -40,7 +40,7 @@ class FileSystemHelper
$wasCreationSuccessful = mkdir($folderPath, 0777, true);
if (!$wasCreationSuccessful) {
throw new IOException('Unable to create folder: ' . $folderPath);
throw new IOException("Unable to create folder: $folderPath");
}
return $folderPath;
@ -64,7 +64,7 @@ class FileSystemHelper
$wasCreationSuccessful = file_put_contents($filePath, $fileContents);
if ($wasCreationSuccessful === false) {
throw new IOException('Unable to create file: ' . $filePath);
throw new IOException("Unable to create file: $filePath");
}
return $filePath;
@ -126,7 +126,7 @@ class FileSystemHelper
{
$isInBaseFolder = (strpos($operationFolderPath, $this->baseFolderPath) === 0);
if (!$isInBaseFolder) {
throw new IOException('Cannot perform I/O operation outside of the base folder: ' . $this->baseFolderPath);
throw new IOException("Cannot perform I/O operation outside of the base folder: {$this->baseFolderPath}");
}
}
}

View File

@ -64,9 +64,9 @@ abstract class AbstractReader implements ReaderInterface
if (!$this->isPhpStream($filePath)) {
// we skip the checks if the provided file path points to a PHP stream
if (!$this->globalFunctionsHelper->file_exists($filePath)) {
throw new IOException('Could not open ' . $filePath . ' for reading! File does not exist.');
throw new IOException("Could not open $filePath for reading! File does not exist.");
} else if (!$this->globalFunctionsHelper->is_readable($filePath)) {
throw new IOException('Could not open ' . $filePath . ' for reading! File is not readable.');
throw new IOException("Could not open $filePath for reading! File is not readable.");
}
}
@ -74,7 +74,7 @@ abstract class AbstractReader implements ReaderInterface
$this->openReader($filePath);
$this->isStreamOpened = true;
} catch (\Exception $exception) {
throw new IOException('Could not open ' . $filePath . ' for reading! (' . $exception->getMessage() . ')');
throw new IOException("Could not open $filePath for reading! ({$exception->getMessage()})");
}
}

View File

@ -64,7 +64,7 @@ class Reader extends AbstractReader
{
$this->filePointer = $this->globalFunctionsHelper->fopen($filePath, 'r');
if (!$this->filePointer) {
throw new IOException('Could not open file ' . $filePath . ' for reading.');
throw new IOException("Could not open file $filePath for reading.");
}
$this->sheetIterator = new SheetIterator($this->filePointer, $this->fieldDelimiter, $this->fieldEnclosure, $this->globalFunctionsHelper);

View File

@ -0,0 +1,12 @@
<?php
namespace Box\Spout\Reader\Exception;
/**
 * Class XMLProcessingException
 * Reader exception dedicated to failures that happen while XML data is being processed.
 * It adds no behavior of its own: it only provides a distinct type, so that callers
 * can catch XML processing failures separately from other reader exceptions.
 *
 * @package Box\Spout\Reader\Exception
 */
class XMLProcessingException extends ReaderException
{
}

View File

@ -0,0 +1,161 @@
<?php
namespace Box\Spout\Reader\Wrapper;
use Box\Spout\Reader\Exception\XMLProcessingException;
/**
* Class SimpleXMLElement
* Wrapper around the built-in SimpleXMLElement. This class does not extend \SimpleXMLElement
* because its constructor is final... Instead, it is used as a passthrough.
* @see \SimpleXMLElement
*
* @package Box\Spout\Reader\Wrapper
*/
class SimpleXMLElement
{
    use XMLInternalErrorsHelper;

    /** @var \SimpleXMLElement Instance of the wrapped SimpleXMLElement object */
    protected $simpleXMLElement;

    /**
     * Creates a new SimpleXMLElement object.
     * libxml errors are collected internally while the data is parsed, so that
     * invalid data results in an exception instead of warnings being displayed.
     * @see \SimpleXMLElement::__construct
     *
     * @param string $xmlData A well-formed XML string
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException If the XML string is not well-formed
     */
    public function __construct($xmlData)
    {
        $this->useXMLInternalErrors();

        try {
            $this->simpleXMLElement = new \SimpleXMLElement($xmlData);
        } catch (\Exception $exception) {
            // If the data is invalid, the native constructor throws an Exception.
            // The libxml message must be read BEFORE the setting is restored,
            // as restoring it may discard the errors that were collected.
            $errorMessage = $this->getLastXMLErrorMessage();
            $this->resetXMLInternalErrorsSetting();

            // fall back to the native exception message if libxml did not report anything
            throw new XMLProcessingException(($errorMessage !== null) ? $errorMessage : $exception->getMessage());
        }

        $this->resetXMLInternalErrorsSetting();
    }

    /**
     * Returns the attribute for the given name.
     *
     * @param string $name Attribute name
     * @param string|null|void $namespace An optional namespace prefix for the retrieved attributes
     * @return string|null The attribute value or NULL if attribute not found
     */
    public function getAttribute($name, $namespace = null)
    {
        // when a namespace is given, it is interpreted as a prefix (e.g. "xml"), not as a namespace URI
        $isPrefix = ($namespace !== null);
        $attributes = $this->simpleXMLElement->attributes($namespace, $isPrefix);
        $attributeValue = $attributes->{$name};

        return ($attributeValue !== null) ? (string) $attributeValue : null;
    }

    /**
     * Creates a prefix/ns context for the next XPath query
     * @see \SimpleXMLElement::registerXPathNamespace
     *
     * @param string $prefix The namespace prefix to use in the XPath query for the namespace given in "namespace".
     * @param string $namespace The namespace to use for the XPath query. This must match a namespace in
     *                          use by the XML document or the XPath query using "prefix" will not return any results.
     * @return bool TRUE on success or FALSE on failure.
     */
    public function registerXPathNamespace($prefix, $namespace)
    {
        return $this->simpleXMLElement->registerXPathNamespace($prefix, $namespace);
    }

    /**
     * Runs XPath query on XML data. Every matched element is returned wrapped;
     * elements that cannot be wrapped are left out of the result.
     * @see \SimpleXMLElement::xpath
     *
     * @param string $path An XPath path
     * @return SimpleXMLElement[]|bool an array of SimpleXMLElement objects or FALSE in case of an error.
     */
    public function xpath($path)
    {
        $elements = $this->simpleXMLElement->xpath($path);

        if ($elements !== false) {
            $wrappedElements = [];

            foreach ($elements as $element) {
                // wrap each element only once: wrapping serializes and re-parses the element
                $wrappedElement = $this->wrapSimpleXMLElement($element);

                if ($wrappedElement !== null) {
                    $wrappedElements[] = $wrappedElement;
                }
            }

            $elements = $wrappedElements;
        }

        return $elements;
    }

    /**
     * Wraps the given element into an instance of the wrapper.
     * The element is serialized to XML and the wrapper is built from that XML string.
     *
     * @param \SimpleXMLElement $element Element to be wrapped
     * @return SimpleXMLElement|null The wrapped element or NULL if the given element is invalid
     */
    protected function wrapSimpleXMLElement(\SimpleXMLElement $element)
    {
        $elementAsXML = $element->asXML();

        if ($elementAsXML === false) {
            return null;
        }

        return new SimpleXMLElement($elementAsXML);
    }

    /**
     * Remove all nodes matching the given XPath query.
     * It does not map to any \SimpleXMLElement function.
     * Nothing is removed if the query fails.
     *
     * @param string $path An XPath path
     * @return void
     */
    public function removeNodesMatchingXPath($path)
    {
        $nodesToRemove = $this->simpleXMLElement->xpath($path);

        // a failed query does not return an array: there is nothing to iterate over
        if (!is_array($nodesToRemove)) {
            return;
        }

        foreach ($nodesToRemove as $nodeToRemove) {
            // This is how to remove a node from the XML
            unset($nodeToRemove[0]);
        }
    }

    /**
     * Returns the first child matching the given tag name
     *
     * @param string $tagName
     * @return SimpleXMLElement|null The first child matching the tag name or NULL if none found
     */
    public function getFirstChildByTagName($tagName)
    {
        // only access the child once it is known to exist
        if (!isset($this->simpleXMLElement->{$tagName})) {
            return null;
        }

        /** @var \SimpleXMLElement $realElement */
        $realElement = $this->simpleXMLElement->{$tagName};

        return $this->wrapSimpleXMLElement($realElement);
    }

    /**
     * Returns the string content of the wrapped element.
     * @see \SimpleXMLElement::__toString
     *
     * @return string
     */
    public function __toString()
    {
        return $this->simpleXMLElement->__toString();
    }
}

View File

@ -0,0 +1,82 @@
<?php
namespace Box\Spout\Reader\Wrapper;
use Box\Spout\Reader\Exception\XMLProcessingException;
/**
* Trait XMLInternalErrorsHelper
*
* @package Box\Spout\Reader\Wrapper
*/
trait XMLInternalErrorsHelper
{
    /** @var bool Stores whether XML errors were initially stored internally - used to reset */
    protected $initialUseInternalErrorsValue;

    /**
     * To avoid displaying lots of warning/error messages on screen,
     * stores errors internally instead.
     * Previously recorded errors are cleared, so that only the errors
     * happening after this call get reported.
     *
     * @return void
     */
    protected function useXMLInternalErrors()
    {
        libxml_clear_errors();
        $this->initialUseInternalErrorsValue = libxml_use_internal_errors(true);
    }

    /**
     * Throws an XMLProcessingException if an error occurred.
     * It also always resets the "libxml_use_internal_errors" setting back to its initial value.
     *
     * @return void
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException
     */
    protected function resetXMLInternalErrorsSettingAndThrowIfXMLErrorOccured()
    {
        // The error state must be captured BEFORE the setting is restored:
        // disabling the internal errors handling also clears the libxml errors.
        $hasErrorOccurred = $this->hasXMLErrorOccured();
        $errorMessage = $this->getLastXMLErrorMessage();

        $this->resetXMLInternalErrorsSetting();

        if ($hasErrorOccurred) {
            throw new XMLProcessingException($errorMessage);
        }
    }

    /**
     * Returns whether an XML error has occurred since the last time errors were cleared.
     *
     * @return bool TRUE if an error occurred, FALSE otherwise
     */
    private function hasXMLErrorOccured()
    {
        return (libxml_get_last_error() !== false);
    }

    /**
     * Returns the error message for the last XML error that occurred.
     * @see libxml_get_last_error
     *
     * @return string|null Last XML error message (trimmed) or null if no error
     */
    private function getLastXMLErrorMessage()
    {
        $error = libxml_get_last_error();

        return ($error !== false) ? trim($error->message) : null;
    }

    /**
     * Restores the "libxml_use_internal_errors" setting to the value
     * it had when useXMLInternalErrors() was called.
     *
     * @return void
     */
    protected function resetXMLInternalErrorsSetting()
    {
        libxml_use_internal_errors($this->initialUseInternalErrorsValue);
    }
}

View File

@ -0,0 +1,123 @@
<?php
namespace Box\Spout\Reader\Wrapper;
/**
* Class XMLReader
* Wrapper around the built-in XMLReader
* @see \XMLReader
*
* @package Box\Spout\Reader\Wrapper
*/
class XMLReader extends \XMLReader
{
    use XMLInternalErrorsHelper;

    /**
     * Set the URI containing the XML to parse.
     * Network access is always disabled (LIBXML_NONET), whatever the given options are.
     * NOTE(review): recent PHP versions document \XMLReader::open as a static method;
     * overriding it with an instance method may not be allowed there - to be confirmed.
     * @see \XMLReader::open
     *
     * @param string $URI URI pointing to the document
     * @param string|null|void $encoding The document encoding or NULL
     * @param int|void $options A bitmask of the LIBXML_* constants
     * @return bool TRUE on success or FALSE on failure
     */
    public function open($URI, $encoding = null, $options = 0)
    {
        // HHVM does not check if file exists within zip file
        // @link https://github.com/facebook/hhvm/issues/5779
        if ($this->isRunningHHVM() && $this->isZipStream($URI) && !$this->fileExistsWithinZip($URI)) {
            return false;
        }

        return parent::open($URI, $encoding, $options | LIBXML_NONET);
    }

    /**
     * Returns whether the given URI is a zip stream.
     *
     * @param string $URI URI pointing to a document
     * @return bool TRUE if URI is a zip stream, FALSE otherwise
     */
    protected function isZipStream($URI)
    {
        return (strpos($URI, 'zip://') === 0);
    }

    /**
     * Returns whether the current environment is HHVM
     *
     * @return bool TRUE if running on HHVM, FALSE otherwise
     */
    protected function isRunningHHVM()
    {
        return defined('HHVM_VERSION');
    }

    /**
     * Returns whether the file at the given location exists
     *
     * @param string $zipStreamURI URI of a zip stream, e.g. "zip://file.zip#path/inside.xml"
     * @return bool TRUE if the file exists, FALSE otherwise
     */
    protected function fileExistsWithinZip($zipStreamURI)
    {
        // everything before the first "#" is the path of the zip file,
        // everything after it is the path of the file inside the archive
        $pattern = '/zip:\/\/([^#]+)#(.*)/';
        if (!preg_match($pattern, $zipStreamURI, $matches)) {
            return false;
        }

        $zipFilePath = $matches[1];
        $innerFilePath = $matches[2];

        $zip = new \ZipArchive();
        if ($zip->open($zipFilePath) !== true) {
            return false;
        }

        $doesFileExists = ($zip->locateName($innerFilePath) !== false);
        $zip->close();

        return $doesFileExists;
    }

    /**
     * Move to next node in document
     * @see \XMLReader::read
     *
     * @return bool TRUE on success or FALSE on failure
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException If an error/warning occurred
     */
    public function read()
    {
        $this->useXMLInternalErrors();

        $wasReadSuccessful = parent::read();

        $this->resetXMLInternalErrorsSettingAndThrowIfXMLErrorOccured();

        return $wasReadSuccessful;
    }

    /**
     * Move cursor to next node skipping all subtrees
     * @see \XMLReader::next
     *
     * @param string|void $localName The name of the next node to move to
     * @return bool TRUE on success or FALSE on failure
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException If an error/warning occurred
     */
    public function next($localName = null)
    {
        $this->useXMLInternalErrors();

        // Only forward the name when one was given: an explicit NULL is not guaranteed
        // to be handled like an omitted argument by the native method on older PHP versions.
        $wasNextSuccessful = ($localName === null) ? parent::next() : parent::next($localName);

        $this->resetXMLInternalErrorsSettingAndThrowIfXMLErrorOccured();

        return $wasNextSuccessful;
    }
}

View File

@ -3,6 +3,9 @@
namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;
@ -74,7 +77,7 @@ class SharedStringsHelper
*/
public function extractSharedStrings()
{
$xmlReader = new \XMLReader();
$xmlReader = new XMLReader();
$sharedStringIndex = 0;
$escaper = new \Box\Spout\Common\Escaper\XLSX();
@ -83,39 +86,44 @@ class SharedStringsHelper
throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
}
$sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
$this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
try {
$sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
$this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
while ($xmlReader->read() && $xmlReader->name !== 'si') {
// do nothing until a 'si' tag is reached
}
while ($xmlReader->name === 'si') {
$node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
$node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
// removes nodes that should not be read, like the pronunciation of the Kanji characters
$cleanNode = $this->removeSuperfluousTextNodes($node);
// find all text nodes 't'; there can be multiple if the cell contains formatting
$textNodes = $cleanNode->xpath('//ns:t');
$textValue = '';
foreach ($textNodes as $textNode) {
if ($this->shouldPreserveWhitespace($textNode)) {
$textValue .= $textNode->__toString();
} else {
$textValue .= trim($textNode->__toString());
}
while ($xmlReader->read() && $xmlReader->name !== 'si') {
// do nothing until a 'si' tag is reached
}
$unescapedTextValue = $escaper->unescape($textValue);
$this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
while ($xmlReader->name === 'si') {
$node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
$node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
$sharedStringIndex++;
// removes nodes that should not be read, like the pronunciation of the Kanji characters
$cleanNode = $this->removeSuperfluousTextNodes($node);
// jump to the next 'si' tag
$xmlReader->next('si');
// find all text nodes 't'; there can be multiple if the cell contains formatting
$textNodes = $cleanNode->xpath('//ns:t');
$textValue = '';
foreach ($textNodes as $textNode) {
if ($this->shouldPreserveWhitespace($textNode)) {
$textValue .= $textNode->__toString();
} else {
$textValue .= trim($textNode->__toString());
}
}
$unescapedTextValue = $escaper->unescape($textValue);
$this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
$sharedStringIndex++;
// jump to the next 'si' tag
$xmlReader->next('si');
}
} catch (XMLProcessingException $exception) {
throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
}
$this->cachingStrategy->closeCache();
@ -134,33 +142,19 @@ class SharedStringsHelper
/**
* Returns the shared strings unique count, as specified in <sst> tag.
*
* @param \XMLReader $xmlReader XMLReader instance
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
* @return int Number of unique shared strings in the sharedStrings.xml file
* @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
*/
protected function getSharedStringsUniqueCount($xmlReader)
{
// Use internal errors to avoid displaying lots of warning messages in case of invalid file
// For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
libxml_clear_errors();
libxml_use_internal_errors(true);
$xmlReader->next('sst');
// Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) {
while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== XMLReader::ELEMENT) {
$xmlReader->read();
}
$readError = libxml_get_last_error();
if ($readError !== false) {
$readErrorMessage = trim($readError->message);
throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readErrorMessage}]");
}
// reset the setting to display XML warnings/errors
libxml_use_internal_errors(false);
return intval($xmlReader->getAttribute('uniqueCount'));
}
@ -180,29 +174,19 @@ class SharedStringsHelper
* Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
* This is to simplify the parsing of the subtree.
*
* @param \XMLReader $xmlReader
* @return \SimpleXMLElement
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement
* @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
*/
protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
{
// Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node.
// For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
libxml_clear_errors();
libxml_use_internal_errors(true);
$node = null;
try {
$node = new \SimpleXMLElement($xmlReader->readOuterXml());
} catch (\Exception $exception) {
$error = libxml_get_last_error();
libxml_use_internal_errors(false);
throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($error->message) . '].');
$node = new SimpleXMLElement($xmlReader->readOuterXml());
} catch (XMLProcessingException $exception) {
throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}].");
}
libxml_use_internal_errors(false);
return $node;
}
@ -210,8 +194,8 @@ class SharedStringsHelper
* Removes nodes that should not be read, like the pronunciation of the Kanji characters.
* By keeping them, their text content would be added to the read string.
*
* @param \SimpleXMLElement $parentNode Parent node that may contain nodes to remove
* @return \SimpleXMLElement Cleaned parent node
* @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node
*/
protected function removeSuperfluousTextNodes($parentNode)
{
@ -221,12 +205,7 @@ class SharedStringsHelper
foreach ($tagsToRemove as $tagToRemove) {
$xpath = '//ns:' . $tagToRemove;
$nodesToRemove = $parentNode->xpath($xpath);
foreach ($nodesToRemove as $nodeToRemove) {
// This is how to remove a node from the XML
unset($nodeToRemove[0]);
}
$parentNode->removeNodesMatchingXPath($xpath);
}
return $parentNode;
@ -235,24 +214,13 @@ class SharedStringsHelper
/**
* If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
*
* @param \SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved
* @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved
* @return bool Whether whitespace should be preserved
*/
protected function shouldPreserveWhitespace($textNode)
{
$shouldPreserveWhitespace = false;
$attributes = $textNode->attributes('xml', true);
if ($attributes) {
foreach ($attributes as $attributeName => $attributeValue) {
if ($attributeName === 'space' && $attributeValue->__toString() === 'preserve') {
$shouldPreserveWhitespace = true;
break;
}
}
}
return $shouldPreserveWhitespace;
$spaceValue = $textNode->getAttribute('space', 'xml');
return ($spaceValue === 'preserve');
}
/**

View File

@ -2,6 +2,7 @@
namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
use Box\Spout\Reader\XLSX\Sheet;
/**
@ -37,10 +38,10 @@ class SheetHelper
/** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
protected $globalFunctionsHelper;
/** @var \SimpleXMLElement XML element representing the workbook.xml.rels file */
/** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml.rels file */
protected $workbookXMLRelsAsXMLElement;
/** @var \SimpleXMLElement XML element representing the workbook.xml file */
/** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml file */
protected $workbookXMLAsXMLElement;
/**
@ -76,7 +77,7 @@ class SheetHelper
for ($i = 0; $i < $numSheetNodes; $i++) {
$sheetNode = $sheetNodes[$i];
$sheetDataXMLFilePath = (string) $sheetNode->attributes()->PartName;
$sheetDataXMLFilePath = $sheetNode->getAttribute('PartName');
$sheets[] = $this->getSheetFromXML($sheetDataXMLFilePath, $i);
}
@ -115,15 +116,15 @@ class SheetHelper
if (count($relationshipNodes) === 1) {
$relationshipNode = $relationshipNodes[0];
$sheetId = (string) $relationshipNode->attributes()->Id;
$sheetId = $relationshipNode->getAttribute('Id');
$workbookXMLElement = $this->getWorkbookXMLAsXMLElement();
$sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $sheetId . '"]');
if (count($sheetNodes) === 1) {
$sheetNode = $sheetNodes[0];
$sheetId = (int) $sheetNode->attributes()->sheetId;
$escapedSheetName = (string) $sheetNode->attributes()->name;
$sheetId = (int) $sheetNode->getAttribute('sheetId');
$escapedSheetName = $sheetNode->getAttribute('name');
$escaper = new \Box\Spout\Common\Escaper\XLSX();
$sheetName = $escaper->unescape($escapedSheetName);
@ -149,7 +150,7 @@ class SheetHelper
* Returns a representation of the workbook.xml.rels file, ready to be parsed.
* The returned value is cached.
*
* @return \SimpleXMLElement XML element representating the workbook.xml.rels file
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml.rels file
*/
protected function getWorkbookXMLRelsAsXMLElement()
{
@ -167,7 +168,7 @@ class SheetHelper
* Returns a representation of the workbook.xml file, ready to be parsed.
* The returned value is cached.
*
* @return \SimpleXMLElement XML element representating the workbook.xml.rels file
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml file
*/
protected function getWorkbookXMLAsXMLElement()
{
@ -186,13 +187,13 @@ class SheetHelper
*
* @param string $xmlFilePath The path of the XML file inside the XLSX file
* @param string $mainNamespace The main XPath namespace to register
* @return \SimpleXMLElement The XML element representing the file
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement The XML element representing the file
*/
protected function getFileAsXMLElementWithNamespace($xmlFilePath, $mainNamespace)
{
$xmlContents = $this->globalFunctionsHelper->file_get_contents('zip://' . $this->filePath . '#' . $xmlFilePath);
$xmlElement = new \SimpleXMLElement($xmlContents);
$xmlElement = new SimpleXMLElement($xmlContents);
$xmlElement->registerXPathNamespace('ns', $mainNamespace);
return $xmlElement;

View File

@ -61,7 +61,7 @@ class Reader extends AbstractReader
$this->sheetIterator = new SheetIterator($filePath, $this->sharedStringsHelper, $this->globalFunctionsHelper);
} else {
throw new IOException('Could not open ' . $filePath . ' for reading.');
throw new IOException("Could not open $filePath for reading.");
}
}

View File

@ -3,7 +3,9 @@
namespace Box\Spout\Reader\XLSX;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Helper\CellHelper;
/**
@ -45,7 +47,7 @@ class RowIterator implements IteratorInterface
/** @var Helper\SharedStringsHelper Helper to work with shared strings */
protected $sharedStringsHelper;
/** @var \XMLReader The XMLReader object that will help read sheet's XML data */
/** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */
protected $xmlReader;
/** @var \Box\Spout\Common\Escaper\XLSX Used to unescape XML data */
@ -74,7 +76,7 @@ class RowIterator implements IteratorInterface
$this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath);
$this->sharedStringsHelper = $sharedStringsHelper;
$this->xmlReader = new \XMLReader();
$this->xmlReader = new XMLReader();
$this->escaper = new \Box\Spout\Common\Escaper\XLSX();
}
@ -102,8 +104,8 @@ class RowIterator implements IteratorInterface
$this->xmlReader->close();
$sheetDataFilePath = 'zip://' . $this->filePath . '#' . $this->sheetDataXMLFilePath;
if ($this->xmlReader->open($sheetDataFilePath, null, LIBXML_NONET) === false) {
throw new IOException('Could not open "' . $this->sheetDataXMLFilePath . '".');
if ($this->xmlReader->open($sheetDataFilePath) === false) {
throw new IOException("Could not open \"{$this->sheetDataXMLFilePath}\".");
}
$this->numReadRows = 0;
@ -138,59 +140,52 @@ class RowIterator implements IteratorInterface
$isInsideRowTag = false;
$rowData = [];
// Use internal errors to avoid displaying lots of warning messages in case of invalid file
// For instance on HHVM, XMLReader->open() won't fail when trying to read a unexisting file within a zip...
// But the XMLReader->read() will fail!
libxml_clear_errors();
libxml_use_internal_errors(true);
try {
while ($this->xmlReader->read()) {
if ($this->xmlReader->nodeType == XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_DIMENSION) {
// Read dimensions of the sheet
$dimensionRef = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet)
if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) {
$lastCellIndex = $matches[1];
$this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1;
}
while ($this->xmlReader->read()) {
if ($this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_DIMENSION) {
// Read dimensions of the sheet
$dimensionRef = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet)
if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) {
$lastCellIndex = $matches[1];
$this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1;
} else if ($this->xmlReader->nodeType == XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) {
// Start of the row description
$isInsideRowTag = true;
// Read spans info if present
$numberOfColumnsForRow = $this->numColumns;
$spans = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance
if ($spans) {
list(, $numberOfColumnsForRow) = explode(':', $spans);
$numberOfColumnsForRow = intval($numberOfColumnsForRow);
}
$rowData = ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : [];
} else if ($isInsideRowTag && $this->xmlReader->nodeType == XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_CELL) {
// Start of a cell description
$currentCellIndex = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX);
$currentColumnIndex = CellHelper::getColumnIndexFromCellIndex($currentCellIndex);
$node = $this->xmlReader->expand();
$rowData[$currentColumnIndex] = $this->getCellValue($node);
} else if ($this->xmlReader->nodeType == XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) {
// End of the row description
// If needed, we fill the empty cells
$rowData = ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData);
$this->numReadRows++;
break;
} else if ($this->xmlReader->nodeType == XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_WORKSHEET) {
// The closing "</worksheet>" marks the end of the file
$this->hasReachedEndOfFile = true;
}
} else if ($this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) {
// Start of the row description
$isInsideRowTag = true;
// Read spans info if present
$numberOfColumnsForRow = $this->numColumns;
$spans = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance
if ($spans) {
list(, $numberOfColumnsForRow) = explode(':', $spans);
$numberOfColumnsForRow = intval($numberOfColumnsForRow);
}
$rowData = ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : [];
} else if ($isInsideRowTag && $this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_CELL) {
// Start of a cell description
$currentCellIndex = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX);
$currentColumnIndex = CellHelper::getColumnIndexFromCellIndex($currentCellIndex);
$node = $this->xmlReader->expand();
$rowData[$currentColumnIndex] = $this->getCellValue($node);
} else if ($this->xmlReader->nodeType == \XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) {
// End of the row description
// If needed, we fill the empty cells
$rowData = ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData);
$this->numReadRows++;
break;
} else if ($this->xmlReader->nodeType == \XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_WORKSHEET) {
// The closing "</worksheet>" marks the end of the file
$this->hasReachedEndOfFile = true;
}
}
$readError = libxml_get_last_error();
if ($readError !== false) {
$readErrorMessage = trim($readError->message);
throw new IOException("The {$this->sheetDataXMLFilePath} file cannot be read. [{$readErrorMessage}]");
} catch (XMLProcessingException $exception) {
throw new IOException("The {$this->sheetDataXMLFilePath} file cannot be read. [{$exception->getMessage()}]");
}
$this->rowDataBuffer = $rowData;

View File

@ -156,7 +156,7 @@ EOD;
$wasWriteSuccessful = fwrite($this->sheetFilePointer, $data);
if ($wasWriteSuccessful === false) {
throw new IOException('Unable to write data in ' . $this->worksheetFilePath);
throw new IOException("Unable to write data in {$this->worksheetFilePath}");
}
// only update the count if the write worked

View File

@ -0,0 +1,127 @@
<?php
namespace Box\Spout\Reader\Wrapper;
use Box\Spout\TestUsingResource;
/**
* Class SimpleXMLElementTest
*
* @package Box\Spout\Reader\Wrapper
*/
class SimpleXMLElementTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /**
     * The wrapper must convert the failure of the native constructor
     * into an XMLProcessingException.
     *
     * @expectedException \Box\Spout\Reader\Exception\XMLProcessingException
     *
     * @return void
     */
    public function testConstructShouldThrowExceptionIfInvalidData()
    {
        $invalidXML = '<invalid><xml></invalid>';
        new SimpleXMLElement($invalidXML);
    }

    /**
     * Each data set is: [XML string, namespace prefix (or null), expected value for each attribute name].
     * An expected value of NULL means that the attribute must not be found.
     *
     * @return array
     */
    public function dataProviderForTestGetAttribute()
    {
        $xmlWithoutNamespace = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet foo="bar" type="test" />
XML;

        $xmlWithHalfNamespace = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
foo="bar" r:type="test" />
XML;

        $xmlWithFullNamespace = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
r:foo="bar" r:type="test" />
XML;

        return [
            [$xmlWithoutNamespace, null, ['foo' => 'bar', 'type' => 'test']],
            [$xmlWithHalfNamespace, null, ['foo' => 'bar', 'type' => null]],
            [$xmlWithFullNamespace, null, ['foo' => null, 'type' => null]],
            [$xmlWithoutNamespace, 'r', ['foo' => null, 'type' => null]],
            [$xmlWithHalfNamespace, 'r', ['foo' => null, 'type' => 'test']],
            [$xmlWithFullNamespace, 'r', ['foo' => 'bar', 'type' => 'test']],
        ];
    }

    /**
     * @dataProvider dataProviderForTestGetAttribute
     *
     * @param string $xml
     * @param string|null $namespace
     * @param array $expectedAttributes
     * @return void
     */
    public function testGetAttribute($xml, $namespace, $expectedAttributes)
    {
        $element = new SimpleXMLElement($xml);

        foreach ($expectedAttributes as $name => $expectedValue) {
            $value = $element->getAttribute($name, $namespace);

            // strict comparison: a missing attribute must be NULL, not just an empty string
            $this->assertSame($expectedValue, $value);
        }
    }

    /**
     * @return void
     */
    public function testXPath()
    {
        $xml = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet>
<sheetData>
<row r="1">
<c r="A1"><v>0</v></c>
<c r="A2"><v>1</v></c>
</row>
</sheetData>
</worksheet>
XML;

        $element = new SimpleXMLElement($xml);
        $matchedElements = $element->xpath('//c');

        $this->assertCount(2, $matchedElements);
        $this->assertInstanceOf(
            '\Box\Spout\Reader\Wrapper\SimpleXMLElement',
            $matchedElements[0],
            'The SimpleXMLElement should be wrapped'
        );
        $this->assertSame('A2', $matchedElements[1]->getAttribute('r'));
    }

    /**
     * @return void
     */
    public function testRemoveNodeMatchingXPath()
    {
        $xml = <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<worksheet>
<sheetData>
<row r="1">
<c r="A1"><v>0</v></c>
<c r="A2"><v>1</v></c>
</row>
</sheetData>
</worksheet>
XML;

        $element = new SimpleXMLElement($xml);
        $this->assertNotNull($element->getFirstChildByTagName('sheetData'));

        $element->removeNodesMatchingXPath('//sheetData');
        $this->assertNull($element->getFirstChildByTagName('sheetData'));
    }
}

View File

@ -0,0 +1,166 @@
<?php
namespace Box\Spout\Reader\Wrapper;
use Box\Spout\TestUsingResource;
use Box\Spout\Reader\Exception\XMLProcessingException;
/**
 * Class XMLReaderTest
 * Exercises the XMLReader wrapper of this namespace: opening files located
 * inside a zip archive, error reporting of read()/next() and the private
 * helpers isZipStream() and fileExistsWithinZip().
 *
 * @package Box\Spout\Reader\Wrapper
 */
class XMLReaderTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /**
     * open() must return FALSE when the targeted entry is missing from the archive.
     *
     * @return void
     */
    public function testOpenShouldFailIfFileInsideZipDoesNotExist()
    {
        $resourcePath = $this->getResourcePath('one_sheet_with_inline_strings.xlsx');
        $nonExistingXMLFilePath = 'zip://' . $resourcePath . '#path/to/fake/file.xml';

        $xmlReader = new XMLReader();

        // using "@" to prevent errors/warning to be displayed
        $wasOpenSuccessful = @$xmlReader->open($nonExistingXMLFilePath);

        // assertFalse() only passes for a strict boolean FALSE
        $this->assertFalse($wasOpenSuccessful);
    }

    /**
     * Testing a HHVM bug: https://github.com/facebook/hhvm/issues/5779
     * The associated code in XMLReader::open() can be removed when the issue is fixed (and this test starts failing).
     * @see XMLReader::open()
     *
     * @return void
     */
    public function testHHVMStillDoesNotComplainWhenCallingOpenWithFileInsideZipNotExisting()
    {
        // Test should only be run on HHVM: report it as skipped elsewhere,
        // instead of letting it pass without running a single assertion
        if (!$this->isRunningHHVM()) {
            $this->markTestSkipped('This test only applies to HHVM');
        }

        $resourcePath = $this->getResourcePath('one_sheet_with_inline_strings.xlsx');
        $nonExistingXMLFilePath = 'zip://' . $resourcePath . '#path/to/fake/file.xml';

        libxml_clear_errors();
        $initialUseInternalErrorsSetting = libxml_use_internal_errors(true);

        // using the built-in XMLReader
        $xmlReader = new \XMLReader();
        $wasOpenSuccessful = $xmlReader->open($nonExistingXMLFilePath);
        $lastError = libxml_get_last_error();

        // Restore the libxml setting BEFORE asserting, so that a failed assertion
        // does not leave internal error handling enabled for the following tests
        libxml_use_internal_errors($initialUseInternalErrorsSetting);

        $this->assertTrue($wasOpenSuccessful !== false, 'HHVM now reports the failed open(): the workaround may be removed');
        $this->assertFalse($lastError, 'HHVM now registers a libxml error: the workaround may be removed');
    }

    /**
     * @return bool TRUE if running on HHVM, FALSE otherwise
     */
    private function isRunningHHVM()
    {
        return defined('HHVM_VERSION');
    }

    /**
     * Reading through the sheet data of the resource must end up throwing.
     *
     * @expectedException \Box\Spout\Reader\Exception\XMLProcessingException
     *
     * @return void
     */
    public function testReadShouldThrowExceptionOnError()
    {
        $resourcePath = $this->getResourcePath('one_sheet_with_invalid_xml_characters.xlsx');
        $sheetDataXMLFilePath = 'zip://' . $resourcePath . '#xl/worksheets/sheet1.xml';

        $xmlReader = new XMLReader();
        if ($xmlReader->open($sheetDataXMLFilePath) === false) {
            $this->fail("Unable to open $sheetDataXMLFilePath");
        }

        // using "@" to prevent errors/warning to be displayed
        while (@$xmlReader->read()) {
            // do nothing
        }
    }

    /**
     * @expectedException \Box\Spout\Reader\Exception\XMLProcessingException
     *
     * @return void
     */
    public function testNextShouldThrowExceptionOnError()
    {
        // The sharedStrings.xml file in "attack_billion_laughs.xlsx" contains
        // a doctype element that causes read errors
        $resourcePath = $this->getResourcePath('attack_billion_laughs.xlsx');
        $sheetDataXMLFilePath = 'zip://' . $resourcePath . '#xl/sharedStrings.xml';

        $xmlReader = new XMLReader();

        // Same guard as in testReadShouldThrowExceptionOnError(): a failed open()
        // is reported as such, rather than as a missing expected exception
        if ($xmlReader->open($sheetDataXMLFilePath) === false) {
            $this->fail("Unable to open $sheetDataXMLFilePath");
        }

        // using "@" to prevent errors/warning to be displayed
        @$xmlReader->next('sst');
    }

    /**
     * Each data set is: [URI, whether it is expected to be detected as a zip stream].
     *
     * @return array
     */
    public function dataProviderForTestIsZipStream()
    {
        return [
            ['/absolute/path/to/file.xlsx', false],
            ['relative/path/to/file.xlsx', false],
            ['php://temp', false],
            ['zip:///absolute/path/to/file.xlsx', true],
            ['zip://relative/path/to/file.xlsx', true],
        ];
    }

    /**
     * @dataProvider dataProviderForTestIsZipStream
     *
     * @param string $URI
     * @param bool $expectedResult
     * @return void
     */
    public function testIsZipStream($URI, $expectedResult)
    {
        $xmlReader = new XMLReader();

        // isZipStream() is invoked through reflection
        $isZipStream = \ReflectionHelper::callMethodOnObject($xmlReader, 'isZipStream', $URI);

        $this->assertEquals($expectedResult, $isZipStream);
    }

    /**
     * Each data set is: [path of the entry inside the archive, whether it is expected to exist].
     *
     * @return array
     */
    public function dataProviderForTestFileExistsWithinZip()
    {
        return [
            ['[Content_Types].xml', true],
            ['xl/sharedStrings.xml', true],
            ['xl/worksheets/sheet1.xml', true],
            ['/invalid/file.xml', false],
            ['another/invalid/file.xml', false],
        ];
    }

    /**
     * @dataProvider dataProviderForTestFileExistsWithinZip
     *
     * @param string $innerFilePath
     * @param bool $expectedResult
     * @return void
     */
    public function testFileExistsWithinZip($innerFilePath, $expectedResult)
    {
        $resourcePath = $this->getResourcePath('one_sheet_with_inline_strings.xlsx');
        $zipStreamURI = 'zip://' . $resourcePath . '#' . $innerFilePath;

        $xmlReader = new XMLReader();

        // fileExistsWithinZip() is invoked through reflection
        $fileExistsWithinZip = \ReflectionHelper::callMethodOnObject($xmlReader, 'fileExistsWithinZip', $zipStreamURI);

        $this->assertEquals($expectedResult, $fileExistsWithinZip);
    }
}

View File

@ -6,7 +6,6 @@ use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Type;
use Box\Spout\Reader\ReaderFactory;
use Box\Spout\TestUsingResource;
use Symfony\Component\Config\Definition\Exception\Exception;
/**
* Class ReaderTest