diff --git a/src/Spout/Common/Helper/FileSystemHelper.php b/src/Spout/Common/Helper/FileSystemHelper.php index 6186822..4d4f0be 100644 --- a/src/Spout/Common/Helper/FileSystemHelper.php +++ b/src/Spout/Common/Helper/FileSystemHelper.php @@ -40,7 +40,7 @@ class FileSystemHelper $wasCreationSuccessful = mkdir($folderPath, 0777, true); if (!$wasCreationSuccessful) { - throw new IOException('Unable to create folder: ' . $folderPath); + throw new IOException("Unable to create folder: $folderPath"); } return $folderPath; @@ -64,7 +64,7 @@ class FileSystemHelper $wasCreationSuccessful = file_put_contents($filePath, $fileContents); if ($wasCreationSuccessful === false) { - throw new IOException('Unable to create file: ' . $filePath); + throw new IOException("Unable to create file: $filePath"); } return $filePath; @@ -126,7 +126,7 @@ class FileSystemHelper { $isInBaseFolder = (strpos($operationFolderPath, $this->baseFolderPath) === 0); if (!$isInBaseFolder) { - throw new IOException('Cannot perform I/O operation outside of the base folder: ' . $this->baseFolderPath); + throw new IOException("Cannot perform I/O operation outside of the base folder: {$this->baseFolderPath}"); } } } diff --git a/src/Spout/Reader/AbstractReader.php b/src/Spout/Reader/AbstractReader.php index bfbedf8..e55bdec 100644 --- a/src/Spout/Reader/AbstractReader.php +++ b/src/Spout/Reader/AbstractReader.php @@ -64,9 +64,9 @@ abstract class AbstractReader implements ReaderInterface if (!$this->isPhpStream($filePath)) { // we skip the checks if the provided file path points to a PHP stream if (!$this->globalFunctionsHelper->file_exists($filePath)) { - throw new IOException('Could not open ' . $filePath . ' for reading! File does not exist.'); + throw new IOException("Could not open $filePath for reading! File does not exist."); } else if (!$this->globalFunctionsHelper->is_readable($filePath)) { - throw new IOException('Could not open ' . $filePath . ' for reading! 
File is not readable.'); + throw new IOException("Could not open $filePath for reading! File is not readable."); } } @@ -74,7 +74,7 @@ abstract class AbstractReader implements ReaderInterface $this->openReader($filePath); $this->isStreamOpened = true; } catch (\Exception $exception) { - throw new IOException('Could not open ' . $filePath . ' for reading! (' . $exception->getMessage() . ')'); + throw new IOException("Could not open $filePath for reading! ({$exception->getMessage()})"); } } diff --git a/src/Spout/Reader/CSV/Reader.php b/src/Spout/Reader/CSV/Reader.php index 9f9e56f..523eaca 100644 --- a/src/Spout/Reader/CSV/Reader.php +++ b/src/Spout/Reader/CSV/Reader.php @@ -64,7 +64,7 @@ class Reader extends AbstractReader { $this->filePointer = $this->globalFunctionsHelper->fopen($filePath, 'r'); if (!$this->filePointer) { - throw new IOException('Could not open file ' . $filePath . ' for reading.'); + throw new IOException("Could not open file $filePath for reading."); } $this->sheetIterator = new SheetIterator($this->filePointer, $this->fieldDelimiter, $this->fieldEnclosure, $this->globalFunctionsHelper); diff --git a/src/Spout/Reader/Exception/XMLProcessingException.php b/src/Spout/Reader/Exception/XMLProcessingException.php new file mode 100644 index 0000000..70f630b --- /dev/null +++ b/src/Spout/Reader/Exception/XMLProcessingException.php @@ -0,0 +1,12 @@ +useXMLInternalErrors(); + + try { + $this->simpleXMLElement = new \SimpleXMLElement($xmlData); + } catch (\Exception $exception) { + // if the data is invalid, the constructor will throw an Exception + $this->resetXMLInternalErrorsSetting(); + throw new XMLProcessingException($this->getLastXMLErrorMessage()); + } + + $this->resetXMLInternalErrorsSetting(); + + return $this->simpleXMLElement; + } + + /** + * Returns the attribute for the given name. 
+ * + * @param string $name Attribute name + * @param string|null|void $namespace An optional namespace for the retrieved attributes + * @return string|null The attribute value or NULL if attribute not found + */ + public function getAttribute($name, $namespace = null) + { + $isPrefix = ($namespace !== null); + $attributes = $this->simpleXMLElement->attributes($namespace, $isPrefix); + $attributeValue = $attributes->{$name}; + + return ($attributeValue !== null) ? (string) $attributeValue : null; + } + + /** + * Creates a prefix/ns context for the next XPath query + * @see \SimpleXMLElement::registerXPathNamespace + * + * @param string $prefix The namespace prefix to use in the XPath query for the namespace given in "namespace". + * @param string $namespace The namespace to use for the XPath query. This must match a namespace in + * use by the XML document or the XPath query using "prefix" will not return any results. + * @return bool TRUE on success or FALSE on failure. + */ + public function registerXPathNamespace($prefix, $namespace) + { + return $this->simpleXMLElement->registerXPathNamespace($prefix, $namespace); + } + + /** + * Runs XPath query on XML data + * @see \SimpleXMLElement::xpath + * + * @param string $path An XPath path + * @return SimpleXMLElement[]|bool an array of SimpleXMLElement objects or FALSE in case of an error. 
+ */ + public function xpath($path) + { + $elements = $this->simpleXMLElement->xpath($path); + + if ($elements !== false) { + $wrappedElements = []; + foreach ($elements as $element) { + $wrappedElement = $this->wrapSimpleXMLElement($element); + + if ($wrappedElement !== null) { + $wrappedElements[] = $wrappedElement; // reuse the wrapper built above instead of serializing and parsing the element a second time + } + } + + $elements = $wrappedElements; + } + + return $elements; + } + + /** + * Wraps the given element into an instance of the wrapper + * + * @param \SimpleXMLElement $element Element to be wrapped + * @return SimpleXMLElement|null The wrapped element or NULL if the given element is invalid + */ + protected function wrapSimpleXMLElement(\SimpleXMLElement $element) + { + $wrappedElement = null; + $elementAsXML = $element->asXML(); + + if ($elementAsXML !== false) { + $wrappedElement = new SimpleXMLElement($elementAsXML); + } + + return $wrappedElement; + } + + /** + * Remove all nodes matching the given XPath query. + * It does not map to any \SimpleXMLElement function. + * + * @param string $path An XPath path + * @return void + */ + public function removeNodesMatchingXPath($path) + { + $nodesToRemove = $this->simpleXMLElement->xpath($path); + + foreach ($nodesToRemove ?: [] as $nodeToRemove) { // the native xpath() returns FALSE on error: nothing to remove in that case + unset($nodeToRemove[0]); + } + } + + /** + * Returns the first child matching the given tag name + * + * @param string $tagName + * @return SimpleXMLElement|null The first child matching the tag name or NULL if none found + */ + public function getFirstChildByTagName($tagName) + { + $doesElementExist = isset($this->simpleXMLElement->{$tagName}); + + /** @var \SimpleXMLElement $realElement */ + $realElement = $this->simpleXMLElement->{$tagName}; + + return $doesElementExist ? 
$this->wrapSimpleXMLElement($realElement) : null; + } + + /** + * @return string + */ + public function __toString() + { + return $this->simpleXMLElement->__toString(); + } +} diff --git a/src/Spout/Reader/Wrapper/XMLInternalErrorsHelper.php b/src/Spout/Reader/Wrapper/XMLInternalErrorsHelper.php new file mode 100644 index 0000000..8b4464a --- /dev/null +++ b/src/Spout/Reader/Wrapper/XMLInternalErrorsHelper.php @@ -0,0 +1,82 @@ +initialUseInternalErrorsValue = libxml_use_internal_errors(true); + } + + /** + * Throws an XMLProcessingException if an error occurred. + * It also always resets the "libxml_use_internal_errors" setting back to its initial value. + * + * @return void + * @throws \Box\Spout\Reader\Exception\XMLProcessingException + */ + protected function resetXMLInternalErrorsSettingAndThrowIfXMLErrorOccured() + { + if ($this->hasXMLErrorOccured()) { + $this->resetXMLInternalErrorsSetting(); + throw new XMLProcessingException($this->getLastXMLErrorMessage()); + } + + $this->resetXMLInternalErrorsSetting(); + } + + /** + * Returns whether an XML error has occurred since the last time errors were cleared. + * + * @return bool TRUE if an error occurred, FALSE otherwise + */ + private function hasXMLErrorOccured() + { + return (libxml_get_last_error() !== false); + } + + /** + * Returns the error message for the last XML error that occurred. 
+ * @see libxml_get_last_error + * + * @return String|null Last XML error message or null if no error + */ + private function getLastXMLErrorMessage() + { + $errorMessage = null; + $error = libxml_get_last_error(); + + if ($error !== false) { + $errorMessage = trim($error->message); + } + + return $errorMessage; + } + + /** + * @return void + */ + protected function resetXMLInternalErrorsSetting() + { + libxml_use_internal_errors($this->initialUseInternalErrorsValue); + } + +} diff --git a/src/Spout/Reader/Wrapper/XMLReader.php b/src/Spout/Reader/Wrapper/XMLReader.php new file mode 100644 index 0000000..5cc3838 --- /dev/null +++ b/src/Spout/Reader/Wrapper/XMLReader.php @@ -0,0 +1,123 @@ +isRunningHHVM() && $this->isZipStream($URI)) { + if ($this->fileExistsWithinZip($URI)) { + $wasOpenSuccessful = parent::open($URI, null, LIBXML_NONET); + } + } else { + $wasOpenSuccessful = parent::open($URI, null, LIBXML_NONET); + } + + return $wasOpenSuccessful; + } + + /** + * Returns whether the given URI is a zip stream. + * + * @param string $URI URI pointing to a document + * @return bool TRUE if URI is a zip stream, FALSE otherwise + */ + protected function isZipStream($URI) + { + return (strpos($URI, 'zip://') === 0); + } + + /** + * Returns whether the current environment is HHVM + * + * @return bool TRUE if running on HHVM, FALSE otherwise + */ + protected function isRunningHHVM() + { + return defined('HHVM_VERSION'); + } + + /** + * Returns whether the file at the given location exists + * + * @param string $zipStreamURI URI of a zip stream, e.g. 
"zip://file.zip#path/inside.xml" + * @return bool TRUE if the file exists, FALSE otherwise + */ + protected function fileExistsWithinZip($zipStreamURI) + { + $doesFileExists = false; + + $pattern = '/zip:\/\/([^#]+)#(.*)/'; + if (preg_match($pattern, $zipStreamURI, $matches)) { + $zipFilePath = $matches[1]; + $innerFilePath = $matches[2]; + + $zip = new \ZipArchive(); + if ($zip->open($zipFilePath) === true) { + $doesFileExists = ($zip->locateName($innerFilePath) !== false); + $zip->close(); + } + } + + return $doesFileExists; + } + + /** + * Move to next node in document + * @see \XMLReader::read + * + * @return bool TRUE on success or FALSE on failure + * @throws \Box\Spout\Reader\Exception\XMLProcessingException If an error/warning occurred + */ + public function read() + { + $this->useXMLInternalErrors(); + + $wasReadSuccessful = parent::read(); + + $this->resetXMLInternalErrorsSettingAndThrowIfXMLErrorOccured(); + + return $wasReadSuccessful; + } + + /** + * Move cursor to next node skipping all subtrees + * @see \XMLReader::next + * + * @param string|void $localName The name of the next node to move to + * @return bool TRUE on success or FALSE on failure + * @throws \Box\Spout\Reader\Exception\XMLProcessingException If an error/warning occurred + */ + public function next($localName = null) + { + $this->useXMLInternalErrors(); + + $wasNextSuccessful = parent::next($localName); + + $this->resetXMLInternalErrorsSettingAndThrowIfXMLErrorOccured(); + + return $wasNextSuccessful; + } +} diff --git a/src/Spout/Reader/XLSX/Helper/SharedStringsHelper.php b/src/Spout/Reader/XLSX/Helper/SharedStringsHelper.php index 75f8989..f01150c 100644 --- a/src/Spout/Reader/XLSX/Helper/SharedStringsHelper.php +++ b/src/Spout/Reader/XLSX/Helper/SharedStringsHelper.php @@ -3,6 +3,9 @@ namespace Box\Spout\Reader\XLSX\Helper; use Box\Spout\Common\Exception\IOException; +use Box\Spout\Reader\Exception\XMLProcessingException; +use Box\Spout\Reader\Wrapper\SimpleXMLElement; +use 
Box\Spout\Reader\Wrapper\XMLReader; use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory; use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface; @@ -74,7 +77,7 @@ class SharedStringsHelper */ public function extractSharedStrings() { - $xmlReader = new \XMLReader(); + $xmlReader = new XMLReader(); $sharedStringIndex = 0; $escaper = new \Box\Spout\Common\Escaper\XLSX(); @@ -83,39 +86,44 @@ class SharedStringsHelper throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".'); } - $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); - $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); + try { + $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); + $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); - while ($xmlReader->read() && $xmlReader->name !== 'si') { - // do nothing until a 'si' tag is reached - } - - while ($xmlReader->name === 'si') { - $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader); - $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); - - // removes nodes that should not be read, like the pronunciation of the Kanji characters - $cleanNode = $this->removeSuperfluousTextNodes($node); - - // find all text nodes 't'; there can be multiple if the cell contains formatting - $textNodes = $cleanNode->xpath('//ns:t'); - - $textValue = ''; - foreach ($textNodes as $textNode) { - if ($this->shouldPreserveWhitespace($textNode)) { - $textValue .= $textNode->__toString(); - } else { - $textValue .= trim($textNode->__toString()); - } + while ($xmlReader->read() && $xmlReader->name !== 'si') { + // do nothing until a 'si' tag is reached } - $unescapedTextValue = $escaper->unescape($textValue); - $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex); + while ($xmlReader->name === 'si') { + 
$node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader); + $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); - $sharedStringIndex++; + // removes nodes that should not be read, like the pronunciation of the Kanji characters + $cleanNode = $this->removeSuperfluousTextNodes($node); - // jump to the next 'si' tag - $xmlReader->next('si'); + // find all text nodes 't'; there can be multiple if the cell contains formatting + $textNodes = $cleanNode->xpath('//ns:t'); + + $textValue = ''; + foreach ($textNodes as $textNode) { + if ($this->shouldPreserveWhitespace($textNode)) { + $textValue .= $textNode->__toString(); + } else { + $textValue .= trim($textNode->__toString()); + } + } + + $unescapedTextValue = $escaper->unescape($textValue); + $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex); + + $sharedStringIndex++; + + // jump to the next 'si' tag + $xmlReader->next('si'); + } + + } catch (XMLProcessingException $exception) { + throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]"); } $this->cachingStrategy->closeCache(); @@ -134,33 +142,19 @@ class SharedStringsHelper /** * Returns the shared strings unique count, as specified in tag. 
* - * @param \XMLReader $xmlReader XMLReader instance + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance * @return int Number of unique shared strings in the sharedStrings.xml file * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read */ protected function getSharedStringsUniqueCount($xmlReader) { - // Use internal errors to avoid displaying lots of warning messages in case of invalid file - // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks - libxml_clear_errors(); - libxml_use_internal_errors(true); - $xmlReader->next('sst'); // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE) - while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) { + while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== XMLReader::ELEMENT) { $xmlReader->read(); } - $readError = libxml_get_last_error(); - if ($readError !== false) { - $readErrorMessage = trim($readError->message); - throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readErrorMessage}]"); - } - - // reset the setting to display XML warnings/errors - libxml_use_internal_errors(false); - return intval($xmlReader->getAttribute('uniqueCount')); } @@ -180,29 +174,19 @@ class SharedStringsHelper * Returns a SimpleXMLElement node from the current node in the given XMLReader instance. * This is to simplify the parsing of the subtree. * - * @param \XMLReader $xmlReader - * @return \SimpleXMLElement + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader + * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read */ protected function getSimpleXmlElementNodeFromXMLReader($xmlReader) { - // Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node. 
- // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks - libxml_clear_errors(); - libxml_use_internal_errors(true); - $node = null; try { - $node = new \SimpleXMLElement($xmlReader->readOuterXml()); - } catch (\Exception $exception) { - $error = libxml_get_last_error(); - libxml_use_internal_errors(false); - - throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($error->message) . '].'); + $node = new SimpleXMLElement($xmlReader->readOuterXml()); + } catch (XMLProcessingException $exception) { + throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}]."); } - libxml_use_internal_errors(false); - return $node; } @@ -210,8 +194,8 @@ class SharedStringsHelper * Removes nodes that should not be read, like the pronunciation of the Kanji characters. * By keeping them, their text content would be added to the read string. * - * @param \SimpleXMLElement $parentNode Parent node that may contain nodes to remove - * @return \SimpleXMLElement Cleaned parent node + * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove + * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node */ protected function removeSuperfluousTextNodes($parentNode) { @@ -221,12 +205,7 @@ class SharedStringsHelper foreach ($tagsToRemove as $tagToRemove) { $xpath = '//ns:' . $tagToRemove; - $nodesToRemove = $parentNode->xpath($xpath); - - foreach ($nodesToRemove as $nodeToRemove) { - // This is how to remove a node from the XML - unset($nodeToRemove[0]); - } + $parentNode->removeNodesMatchingXPath($xpath); } return $parentNode; @@ -235,24 +214,13 @@ class SharedStringsHelper /** * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace. 
* - * @param \SimpleXMLElement $textNode The text node element () whitespace may be preserved + * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element () whitespace may be preserved * @return bool Whether whitespace should be preserved */ protected function shouldPreserveWhitespace($textNode) { - $shouldPreserveWhitespace = false; - - $attributes = $textNode->attributes('xml', true); - if ($attributes) { - foreach ($attributes as $attributeName => $attributeValue) { - if ($attributeName === 'space' && $attributeValue->__toString() === 'preserve') { - $shouldPreserveWhitespace = true; - break; - } - } - } - - return $shouldPreserveWhitespace; + $spaceValue = $textNode->getAttribute('space', 'xml'); + return ($spaceValue === 'preserve'); } /** diff --git a/src/Spout/Reader/XLSX/Helper/SheetHelper.php b/src/Spout/Reader/XLSX/Helper/SheetHelper.php index a3431ae..308e94e 100644 --- a/src/Spout/Reader/XLSX/Helper/SheetHelper.php +++ b/src/Spout/Reader/XLSX/Helper/SheetHelper.php @@ -2,6 +2,7 @@ namespace Box\Spout\Reader\XLSX\Helper; +use Box\Spout\Reader\Wrapper\SimpleXMLElement; use Box\Spout\Reader\XLSX\Sheet; /** @@ -37,10 +38,10 @@ class SheetHelper /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */ protected $globalFunctionsHelper; - /** @var \SimpleXMLElement XML element representing the workbook.xml.rels file */ + /** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml.rels file */ protected $workbookXMLRelsAsXMLElement; - /** @var \SimpleXMLElement XML element representing the workbook.xml file */ + /** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml file */ protected $workbookXMLAsXMLElement; /** @@ -76,7 +77,7 @@ class SheetHelper for ($i = 0; $i < $numSheetNodes; $i++) { $sheetNode = $sheetNodes[$i]; - $sheetDataXMLFilePath = (string) $sheetNode->attributes()->PartName; + $sheetDataXMLFilePath = 
$sheetNode->getAttribute('PartName'); $sheets[] = $this->getSheetFromXML($sheetDataXMLFilePath, $i); } @@ -115,15 +116,15 @@ class SheetHelper if (count($relationshipNodes) === 1) { $relationshipNode = $relationshipNodes[0]; - $sheetId = (string) $relationshipNode->attributes()->Id; + $sheetId = $relationshipNode->getAttribute('Id'); $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $sheetId . '"]'); if (count($sheetNodes) === 1) { $sheetNode = $sheetNodes[0]; - $sheetId = (int) $sheetNode->attributes()->sheetId; - $escapedSheetName = (string) $sheetNode->attributes()->name; + $sheetId = (int) $sheetNode->getAttribute('sheetId'); + $escapedSheetName = $sheetNode->getAttribute('name'); $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sheetName = $escaper->unescape($escapedSheetName); @@ -149,7 +150,7 @@ class SheetHelper * Returns a representation of the workbook.xml.rels file, ready to be parsed. * The returned value is cached. * - * @return \SimpleXMLElement XML element representating the workbook.xml.rels file + * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml.rels file */ protected function getWorkbookXMLRelsAsXMLElement() { @@ -167,7 +168,7 @@ class SheetHelper * Returns a representation of the workbook.xml file, ready to be parsed. * The returned value is cached. 
* - * @return \SimpleXMLElement XML element representating the workbook.xml.rels file + * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml file */ protected function getWorkbookXMLAsXMLElement() { @@ -186,13 +187,13 @@ class SheetHelper * * @param string $xmlFilePath The path of the XML file inside the XLSX file * @param string $mainNamespace The main XPath namespace to register - * @return \SimpleXMLElement The XML element representing the file + * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement The XML element representing the file */ protected function getFileAsXMLElementWithNamespace($xmlFilePath, $mainNamespace) { $xmlContents = $this->globalFunctionsHelper->file_get_contents('zip://' . $this->filePath . '#' . $xmlFilePath); - $xmlElement = new \SimpleXMLElement($xmlContents); + $xmlElement = new SimpleXMLElement($xmlContents); $xmlElement->registerXPathNamespace('ns', $mainNamespace); return $xmlElement; diff --git a/src/Spout/Reader/XLSX/Reader.php b/src/Spout/Reader/XLSX/Reader.php index f24d185..cf13517 100644 --- a/src/Spout/Reader/XLSX/Reader.php +++ b/src/Spout/Reader/XLSX/Reader.php @@ -61,7 +61,7 @@ class Reader extends AbstractReader $this->sheetIterator = new SheetIterator($filePath, $this->sharedStringsHelper, $this->globalFunctionsHelper); } else { - throw new IOException('Could not open ' . $filePath . 
' for reading.'); + throw new IOException("Could not open $filePath for reading."); } } diff --git a/src/Spout/Reader/XLSX/RowIterator.php b/src/Spout/Reader/XLSX/RowIterator.php index 6fc1dde..0225171 100644 --- a/src/Spout/Reader/XLSX/RowIterator.php +++ b/src/Spout/Reader/XLSX/RowIterator.php @@ -3,7 +3,9 @@ namespace Box\Spout\Reader\XLSX; use Box\Spout\Common\Exception\IOException; +use Box\Spout\Reader\Exception\XMLProcessingException; use Box\Spout\Reader\IteratorInterface; +use Box\Spout\Reader\Wrapper\XMLReader; use Box\Spout\Reader\XLSX\Helper\CellHelper; /** @@ -45,7 +47,7 @@ class RowIterator implements IteratorInterface /** @var Helper\SharedStringsHelper Helper to work with shared strings */ protected $sharedStringsHelper; - /** @var \XMLReader The XMLReader object that will help read sheet's XML data */ + /** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */ protected $xmlReader; /** @var \Box\Spout\Common\Escaper\XLSX Used to unescape XML data */ @@ -74,7 +76,7 @@ class RowIterator implements IteratorInterface $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath); $this->sharedStringsHelper = $sharedStringsHelper; - $this->xmlReader = new \XMLReader(); + $this->xmlReader = new XMLReader(); $this->escaper = new \Box\Spout\Common\Escaper\XLSX(); } @@ -102,8 +104,8 @@ class RowIterator implements IteratorInterface $this->xmlReader->close(); $sheetDataFilePath = 'zip://' . $this->filePath . '#' . $this->sheetDataXMLFilePath; - if ($this->xmlReader->open($sheetDataFilePath, null, LIBXML_NONET) === false) { - throw new IOException('Could not open "' . $this->sheetDataXMLFilePath . 
'".'); + if ($this->xmlReader->open($sheetDataFilePath) === false) { + throw new IOException("Could not open \"{$this->sheetDataXMLFilePath}\"."); } $this->numReadRows = 0; @@ -138,59 +140,52 @@ class RowIterator implements IteratorInterface $isInsideRowTag = false; $rowData = []; - // Use internal errors to avoid displaying lots of warning messages in case of invalid file - // For instance on HHVM, XMLReader->open() won't fail when trying to read a unexisting file within a zip... - // But the XMLReader->read() will fail! - libxml_clear_errors(); - libxml_use_internal_errors(true); + try { + while ($this->xmlReader->read()) { + if ($this->xmlReader->nodeType == XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_DIMENSION) { + // Read dimensions of the sheet + $dimensionRef = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet) + if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) { + $lastCellIndex = $matches[1]; + $this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1; + } - while ($this->xmlReader->read()) { - if ($this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_DIMENSION) { - // Read dimensions of the sheet - $dimensionRef = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet) - if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) { - $lastCellIndex = $matches[1]; - $this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1; + } else if ($this->xmlReader->nodeType == XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) { + // Start of the row description + $isInsideRowTag = true; + + // Read spans info if present + $numberOfColumnsForRow = $this->numColumns; + $spans = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance + if ($spans) { + list(, 
$numberOfColumnsForRow) = explode(':', $spans); + $numberOfColumnsForRow = intval($numberOfColumnsForRow); + } + $rowData = ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : []; + + } else if ($isInsideRowTag && $this->xmlReader->nodeType == XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_CELL) { + // Start of a cell description + $currentCellIndex = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX); + $currentColumnIndex = CellHelper::getColumnIndexFromCellIndex($currentCellIndex); + + $node = $this->xmlReader->expand(); + $rowData[$currentColumnIndex] = $this->getCellValue($node); + + } else if ($this->xmlReader->nodeType == XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) { + // End of the row description + // If needed, we fill the empty cells + $rowData = ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData); + $this->numReadRows++; + break; + + } else if ($this->xmlReader->nodeType == XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_WORKSHEET) { + // The closing "" marks the end of the file + $this->hasReachedEndOfFile = true; } - - } else if ($this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) { - // Start of the row description - $isInsideRowTag = true; - - // Read spans info if present - $numberOfColumnsForRow = $this->numColumns; - $spans = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance - if ($spans) { - list(, $numberOfColumnsForRow) = explode(':', $spans); - $numberOfColumnsForRow = intval($numberOfColumnsForRow); - } - $rowData = ($numberOfColumnsForRow !== 0) ? 
array_fill(0, $numberOfColumnsForRow, '') : []; - - } else if ($isInsideRowTag && $this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_CELL) { - // Start of a cell description - $currentCellIndex = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX); - $currentColumnIndex = CellHelper::getColumnIndexFromCellIndex($currentCellIndex); - - $node = $this->xmlReader->expand(); - $rowData[$currentColumnIndex] = $this->getCellValue($node); - - } else if ($this->xmlReader->nodeType == \XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) { - // End of the row description - // If needed, we fill the empty cells - $rowData = ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData); - $this->numReadRows++; - break; - - } else if ($this->xmlReader->nodeType == \XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_WORKSHEET) { - // The closing "" marks the end of the file - $this->hasReachedEndOfFile = true; } - } - $readError = libxml_get_last_error(); - if ($readError !== false) { - $readErrorMessage = trim($readError->message); - throw new IOException("The {$this->sheetDataXMLFilePath} file cannot be read. [{$readErrorMessage}]"); + } catch (XMLProcessingException $exception) { + throw new IOException("The {$this->sheetDataXMLFilePath} file cannot be read. [{$exception->getMessage()}]"); } $this->rowDataBuffer = $rowData; diff --git a/src/Spout/Writer/XLSX/Internal/Worksheet.php b/src/Spout/Writer/XLSX/Internal/Worksheet.php index f091004..ef41ec2 100644 --- a/src/Spout/Writer/XLSX/Internal/Worksheet.php +++ b/src/Spout/Writer/XLSX/Internal/Worksheet.php @@ -156,7 +156,7 @@ EOD; $wasWriteSuccessful = fwrite($this->sheetFilePointer, $data); if ($wasWriteSuccessful === false) { - throw new IOException('Unable to write data in ' . 
$this->worksheetFilePath); + throw new IOException("Unable to write data in {$this->worksheetFilePath}"); } // only update the count if the write worked diff --git a/tests/Spout/Reader/Wrapper/SimpleXMLElementTest.php b/tests/Spout/Reader/Wrapper/SimpleXMLElementTest.php new file mode 100644 index 0000000..bb074d7 --- /dev/null +++ b/tests/Spout/Reader/Wrapper/SimpleXMLElementTest.php @@ -0,0 +1,127 @@ +'; + new SimpleXMLElement($invalidXML); + } + + /** + * @return array + */ + public function dataProviderForTestGetAttribute() + { + $xmlWithoutNamespace = << + +XML; + + $xmlWithHalfNamespace = << + +XML; + + $xmlWithFullNamespace = << + +XML; + + return [ + [$xmlWithoutNamespace, null, ['foo' => 'bar', 'type' => 'test']], + [$xmlWithHalfNamespace, null, ['foo' => 'bar', 'type' => null]], + [$xmlWithFullNamespace, null, ['foo' => null, 'type' => null]], + [$xmlWithoutNamespace, 'r', ['foo' => null, 'type' => null]], + [$xmlWithHalfNamespace, 'r', ['foo' => null, 'type' => 'test']], + [$xmlWithFullNamespace, 'r', ['foo' => 'bar', 'type' => 'test']], + ]; + } + + /** + * @dataProvider dataProviderForTestGetAttribute + * + * @param string $xml + * @param string|null $namespace + * @param array $expectedAttributes + * @return void + */ + public function testGetAttribute($xml, $namespace, $expectedAttributes) + { + $element = new SimpleXMLElement($xml); + + foreach ($expectedAttributes as $name => $expectedValue) { + $value = $element->getAttribute($name, $namespace); + $this->assertEquals($expectedValue, $value); + } + } + + /** + * @return void + */ + public function testXPath() + { + $xml = << + + + + 0 + 1 + + + +XML; + + $element = new SimpleXMLElement($xml); + $matchedElements = $element->xpath('//c'); + + $this->assertEquals(2, count($matchedElements)); + $this->assertTrue($matchedElements[0] instanceof SimpleXMLElement, 'The SimpleXMLElement should be wrapped'); + $this->assertEquals('A2', $matchedElements[1]->getAttribute('r')); + } + + /** + * @return void + 
*/ + public function testRemoveNodeMatchingXPath() + { + $xml = << + + + + 0 + 1 + + + +XML; + + $element = new SimpleXMLElement($xml); + $this->assertNotNull($element->getFirstChildByTagName('sheetData')); + + $element->removeNodesMatchingXPath('//sheetData'); + $this->assertNull($element->getFirstChildByTagName('sheetData')); + } +} diff --git a/tests/Spout/Reader/Wrapper/XMLReaderTest.php b/tests/Spout/Reader/Wrapper/XMLReaderTest.php new file mode 100644 index 0000000..097a5fa --- /dev/null +++ b/tests/Spout/Reader/Wrapper/XMLReaderTest.php @@ -0,0 +1,166 @@ +getResourcePath('one_sheet_with_inline_strings.xlsx'); + $nonExistingXMLFilePath = 'zip://' . $resourcePath . '#path/to/fake/file.xml'; + + $xmlReader = new XMLReader(); + + // using "@" to prevent errors/warning to be displayed + $wasOpenSuccessful = @$xmlReader->open($nonExistingXMLFilePath); + + $this->assertTrue($wasOpenSuccessful === false); + } + + /** + * Testing a HHVM bug: https://github.com/facebook/hhvm/issues/5779 + * The associated code in XMLReader::open() can be removed when the issue is fixed (and this test starts failing). + * @see XMLReader::open() + * + * @return void + */ + public function testHHVMStillDoesNotComplainWhenCallingOpenWithFileInsideZipNotExisting() + { + // Test should only be run on HHVM + if ($this->isRunningHHVM()) { + $resourcePath = $this->getResourcePath('one_sheet_with_inline_strings.xlsx'); + $nonExistingXMLFilePath = 'zip://' . $resourcePath . 
'#path/to/fake/file.xml'; + + libxml_clear_errors(); + $initialUseInternalErrorsSetting = libxml_use_internal_errors(true); + + // using the built-in XMLReader + $xmlReader = new \XMLReader(); + $this->assertTrue($xmlReader->open($nonExistingXMLFilePath) !== false); + $this->assertTrue(libxml_get_last_error() === false); + + libxml_use_internal_errors($initialUseInternalErrorsSetting); + } + } + + /** + * @return bool TRUE if running on HHVM, FALSE otherwise + */ + private function isRunningHHVM() + { + return defined('HHVM_VERSION'); + } + + /** + * @expectedException \Box\Spout\Reader\Exception\XMLProcessingException + * + * @return void + */ + public function testReadShouldThrowExceptionOnError() + { + $resourcePath = $this->getResourcePath('one_sheet_with_invalid_xml_characters.xlsx'); + $sheetDataXMLFilePath = 'zip://' . $resourcePath . '#xl/worksheets/sheet1.xml'; + + $xmlReader = new XMLReader(); + if ($xmlReader->open($sheetDataXMLFilePath) === false) { + $this->fail(); + } + + // using "@" to prevent errors/warning to be displayed + while (@$xmlReader->read()) { + // do nothing + } + } + + /** + * @expectedException \Box\Spout\Reader\Exception\XMLProcessingException + * + * @return void + */ + public function testNextShouldThrowExceptionOnError() + { + // The sharedStrings.xml file in "attack_billion_laughs.xlsx" contains + // a doctype element that causes read errors + $resourcePath = $this->getResourcePath('attack_billion_laughs.xlsx'); + $sheetDataXMLFilePath = 'zip://' . $resourcePath . 
'#xl/sharedStrings.xml'; + + $xmlReader = new XMLReader(); + if ($xmlReader->open($sheetDataXMLFilePath) !== false) { + @$xmlReader->next('sst'); + } + } + + /** + * @return array + */ + public function dataProviderForTestIsZipStream() + { + return [ + ['/absolute/path/to/file.xlsx', false], + ['relative/path/to/file.xlsx', false], + ['php://temp', false], + ['zip:///absolute/path/to/file.xlsx', true], + ['zip://relative/path/to/file.xlsx', true], + ]; + } + + /** + * @dataProvider dataProviderForTestIsZipStream + * + * @param string $URI + * @param bool $expectedResult + * @return void + */ + public function testIsZipStream($URI, $expectedResult) + { + $xmlReader = new XMLReader(); + $isZipStream = \ReflectionHelper::callMethodOnObject($xmlReader, 'isZipStream', $URI); + + $this->assertEquals($expectedResult, $isZipStream); + } + + /** + * @return array + */ + public function dataProviderForTestFileExistsWithinZip() + { + return [ + ['[Content_Types].xml', true], + ['xl/sharedStrings.xml', true], + ['xl/worksheets/sheet1.xml', true], + ['/invalid/file.xml', false], + ['another/invalid/file.xml', false], + ]; + } + + /** + * @dataProvider dataProviderForTestFileExistsWithinZip + * + * @param string $innerFilePath + * @param bool $expectedResult + * @return void + */ + public function testFileExistsWithinZip($innerFilePath, $expectedResult) + { + $resourcePath = $this->getResourcePath('one_sheet_with_inline_strings.xlsx'); + $zipStreamURI = 'zip://' . $resourcePath . '#' . 
$innerFilePath; + + $xmlReader = new XMLReader(); + $isZipStream = \ReflectionHelper::callMethodOnObject($xmlReader, 'fileExistsWithinZip', $zipStreamURI); + + $this->assertEquals($expectedResult, $isZipStream); + } +} diff --git a/tests/Spout/Reader/XLSX/ReaderTest.php b/tests/Spout/Reader/XLSX/ReaderTest.php index 0037ca9..1ec4290 100644 --- a/tests/Spout/Reader/XLSX/ReaderTest.php +++ b/tests/Spout/Reader/XLSX/ReaderTest.php @@ -6,7 +6,6 @@ use Box\Spout\Common\Exception\IOException; use Box\Spout\Common\Type; use Box\Spout\Reader\ReaderFactory; use Box\Spout\TestUsingResource; -use Symfony\Component\Config\Definition\Exception\Exception; /** * Class ReaderTest diff --git a/tests/resources/xlsx/one_sheet_with_invalid_xml_characters.xlsx b/tests/resources/xlsx/one_sheet_with_invalid_xml_characters.xlsx new file mode 100644 index 0000000..49293f5 Binary files /dev/null and b/tests/resources/xlsx/one_sheet_with_invalid_xml_characters.xlsx differ