Support XLSX with prefixed XML files (#237)

Although XLSX XML files normally use unprefixed element names, some XLSX files contain XML whose elements carry a namespace prefix (for instance "<x:sheet>" instead of "<sheet>").
Microsoft has a tool that generates such files: https://msdn.microsoft.com/en-us/library/office/gg278316.aspx
This commit is contained in:
Adrien Loison 2016-05-29 22:16:59 -07:00
parent 2ed30321b4
commit 03866a6604
5 changed files with 119 additions and 86 deletions

View File

@ -138,9 +138,10 @@ class XMLReader extends \XMLReader
*/
public function readUntilNodeFound($nodeName)
{
while (($wasReadSuccessful = $this->read()) && ($this->nodeType !== \XMLReader::ELEMENT || $this->name !== $nodeName)) {
// do nothing
}
do {
$wasReadSuccessful = $this->read();
$isNotPositionedOnStartingNode = !$this->isPositionedOnStartingNode($nodeName);
} while ($wasReadSuccessful && $isNotPositionedOnStartingNode);
return $wasReadSuccessful;
}
@ -170,7 +171,7 @@ class XMLReader extends \XMLReader
*/
public function isPositionedOnStartingNode($nodeName)
{
return ($this->nodeType === XMLReader::ELEMENT && $this->name === $nodeName);
return $this->isPositionedOnNode($nodeName, XMLReader::ELEMENT);
}
/**
@ -179,6 +180,22 @@ class XMLReader extends \XMLReader
*/
public function isPositionedOnEndingNode($nodeName)
{
return ($this->nodeType === XMLReader::END_ELEMENT && $this->name === $nodeName);
return $this->isPositionedOnNode($nodeName, XMLReader::END_ELEMENT);
}
/**
 * Checks both the type and the name of the node the reader is currently positioned on.
 *
 * @param string $nodeName Name the current node must have; may or may not include a prefix
 * @param int $nodeType Node type the current node must have
 * @return bool Whether the XML Reader is currently positioned on the node with given name and type
 */
private function isPositionedOnNode($nodeName, $nodeType)
{
    // A node may be written with a prefix (for instance, "<sheet>" can also be "<x:sheet>").
    // When the requested name carries a prefix, it is compared against the full name of the current node;
    // otherwise it is compared against the unprefixed name ("localName").
    // @see https://github.com/box/spout/issues/233
    $isRequestedNamePrefixed = (strpos($nodeName, ':') !== false);

    if ($this->nodeType !== $nodeType) {
        return false;
    }

    if ($isRequestedNamePrefixed) {
        return ($this->name === $nodeName);
    }

    return ($this->localName === $nodeName);
}
}

View File

@ -2,7 +2,7 @@
namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Sheet;
/**
@ -17,10 +17,6 @@ class SheetHelper
const WORKBOOK_XML_RELS_FILE_PATH = 'xl/_rels/workbook.xml.rels';
const WORKBOOK_XML_FILE_PATH = 'xl/workbook.xml';
/** Namespaces for the XML files */
const MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS = 'http://schemas.openxmlformats.org/package/2006/relationships';
const MAIN_NAMESPACE_FOR_WORKBOOK_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
/** @var string Path of the XLSX file being read */
protected $filePath;
@ -33,12 +29,6 @@ class SheetHelper
/** @var bool Whether date/time values should be returned as PHP objects or be formatted as strings */
protected $shouldFormatDates;
/** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml.rels file */
protected $workbookXMLRelsAsXMLElement;
/** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml file */
protected $workbookXMLAsXMLElement;
/**
* @param string $filePath Path of the XLSX file being read
* @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper Helper to work with shared strings
@ -62,13 +52,21 @@ class SheetHelper
public function getSheets()
{
$sheets = [];
$sheetIndex = 0;
// Starting from "workbook.xml" as this file is the source of truth for the sheets order
$workbookXMLElement = $this->getWorkbookXMLAsXMLElement();
$sheetNodes = $workbookXMLElement->xpath('//ns:sheet');
$xmlReader = new XMLReader();
if ($xmlReader->open('zip://' . $this->filePath . '#' . self::WORKBOOK_XML_FILE_PATH)) {
while ($xmlReader->read()) {
if ($xmlReader->isPositionedOnStartingNode('sheet')) {
$sheets[] = $this->getSheetFromSheetXMLNode($xmlReader, $sheetIndex);
$sheetIndex++;
} else if ($xmlReader->isPositionedOnEndingNode('sheets')) {
// stop reading once all sheets have been read
break;
}
}
foreach ($sheetNodes as $sheetIndex => $sheetNode) {
$sheets[] = $this->getSheetFromSheetXMLNode($sheetNode, $sheetIndex);
$xmlReader->close();
}
return $sheets;
@ -79,88 +77,56 @@ class SheetHelper
* We can find the XML file path describing the sheet inside "workbook.xml.rels", by mapping with the sheet ID
* ("r:id" in "workbook.xml", "Id" in "workbook.xml.rels").
*
* @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $sheetNode XML Node describing the sheet, as defined in "workbook.xml"
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReaderOnSheetNode XML Reader instance, pointing on the node describing the sheet, as defined in "workbook.xml"
* @param int $sheetIndexZeroBased Index of the sheet, based on order of appearance in the workbook (zero-based)
* @return \Box\Spout\Reader\XLSX\Sheet Sheet instance
*/
protected function getSheetFromSheetXMLNode($sheetNode, $sheetIndexZeroBased)
protected function getSheetFromSheetXMLNode($xmlReaderOnSheetNode, $sheetIndexZeroBased)
{
// To retrieve namespaced attributes, some versions of LibXML will accept prefixing the attribute
// with the namespace directly (tested on LibXML 2.9.3). For older versions (tested on LibXML 2.7.8),
// attributes need to be retrieved without the namespace hint.
$sheetId = $sheetNode->getAttribute('r:id');
if ($sheetId === null) {
$sheetId = $sheetNode->getAttribute('id');
}
$escapedSheetName = $sheetNode->getAttribute('name');
$sheetId = $xmlReaderOnSheetNode->getAttribute('r:id');
$escapedSheetName = $xmlReaderOnSheetNode->getAttribute('name');
/** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
$escaper = new \Box\Spout\Common\Escaper\XLSX();
$sheetName = $escaper->unescape($escapedSheetName);
// find the file path of the sheet, by looking at the "workbook.xml.res" file
$workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement();
$relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Id="' . $sheetId . '"]');
$relationshipNode = $relationshipNodes[0];
// In workbook.xml.rels, it is only "worksheets/sheet1.xml"
// In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml"
$sheetDataXMLFilePath = '/xl/' . $relationshipNode->getAttribute('Target');
$sheetDataXMLFilePath = $this->getSheetDataXMLFilePathForSheetId($sheetId);
return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $this->shouldFormatDates, $sheetIndexZeroBased, $sheetName);
}
/**
* Returns a representation of the workbook.xml.rels file, ready to be parsed.
* The returned value is cached.
*
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representating the workbook.xml.rels file
* @param string $sheetId The sheet ID, as defined in "workbook.xml"
* @return string The XML file path describing the sheet inside "workbook.xml.rels", for the given sheet ID
*/
protected function getWorkbookXMLRelsAsXMLElement()
protected function getSheetDataXMLFilePathForSheetId($sheetId)
{
if (!$this->workbookXMLRelsAsXMLElement) {
$this->workbookXMLRelsAsXMLElement = $this->getFileAsXMLElementWithNamespace(
self::WORKBOOK_XML_RELS_FILE_PATH,
self::MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS
);
}
$sheetDataXMLFilePath = '';
return $this->workbookXMLRelsAsXMLElement;
}
// find the file path of the sheet, by looking at the "workbook.xml.rels" file
$xmlReader = new XMLReader();
if ($xmlReader->open('zip://' . $this->filePath . '#' . self::WORKBOOK_XML_RELS_FILE_PATH)) {
while ($xmlReader->read()) {
if ($xmlReader->isPositionedOnStartingNode('Relationship')) {
$relationshipSheetId = $xmlReader->getAttribute('Id');
/**
* Returns a representation of the workbook.xml file, ready to be parsed.
* The returned value is cached.
*
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representating the workbook.xml.rels file
*/
protected function getWorkbookXMLAsXMLElement()
{
if (!$this->workbookXMLAsXMLElement) {
$this->workbookXMLAsXMLElement = $this->getFileAsXMLElementWithNamespace(
self::WORKBOOK_XML_FILE_PATH,
self::MAIN_NAMESPACE_FOR_WORKBOOK_XML
);
}
if ($relationshipSheetId === $sheetId) {
// In workbook.xml.rels, it is only "worksheets/sheet1.xml"
// In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml"
$sheetDataXMLFilePath = $xmlReader->getAttribute('Target');
return $this->workbookXMLAsXMLElement;
}
/**
* Loads the contents of the given file in an XML parser and register the given XPath namespace.
*
* @param string $xmlFilePath The path of the XML file inside the XLSX file
* @param string $mainNamespace The main XPath namespace to register
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement The XML element representing the file
*/
protected function getFileAsXMLElementWithNamespace($xmlFilePath, $mainNamespace)
{
$xmlContents = $this->globalFunctionsHelper->file_get_contents('zip://' . $this->filePath . '#' . $xmlFilePath);
$xmlElement = new SimpleXMLElement($xmlContents);
$xmlElement->registerXPathNamespace('ns', $mainNamespace);
return $xmlElement;
// sometimes, the sheet data file path already contains "/xl/"...
if (strpos($sheetDataXMLFilePath, '/xl/') !== 0) {
$sheetDataXMLFilePath = '/xl/' . $sheetDataXMLFilePath;
break;
}
}
}
}
$xmlReader->close();
}
return $sheetDataXMLFilePath;
}
}

View File

@ -198,4 +198,37 @@ class XMLReaderTest extends \PHPUnit_Framework_TestCase
unlink($tempFolder . '/test.xlsx');
}
/**
 * Supplies one XML snippet per data set: the same "test" element written without and with a prefix.
 *
 * @return array
 */
public function dataProviderForTestIsPositionedOnStartingAndEndingNode()
{
    $unprefixedXML = '<test></test>';
    $prefixedXML = '<x:test xmlns:x="foo"></x:test>';

    return [
        [$unprefixedXML],
        [$prefixedXML],
    ];
}
/**
 * @dataProvider dataProviderForTestIsPositionedOnStartingAndEndingNode
 *
 * @param string $testXML XML snippet supplied by the data provider
 * @return void
 */
public function testIsPositionedOnStartingAndEndingNode($testXML)
{
    $reader = new XMLReader();
    $reader->XML($testXML);

    // first read: the pointer is expected to land on "<test>"
    $reader->read();
    $isOnOpeningTag = $reader->isPositionedOnStartingNode('test');
    $this->assertTrue($isOnOpeningTag);
    $isOnClosingTag = $reader->isPositionedOnEndingNode('test');
    $this->assertFalse($isOnClosingTag);

    // second read: the pointer is expected to land on "</test>"
    $reader->read();
    $isOnOpeningTag = $reader->isPositionedOnStartingNode('test');
    $this->assertFalse($isOnOpeningTag);
    $isOnClosingTag = $reader->isPositionedOnEndingNode('test');
    $this->assertTrue($isOnClosingTag);
}
}

View File

@ -95,6 +95,23 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
$this->assertEquals($expectedRows, $allRows);
}
/**
 * Reads a spreadsheet whose XML files use prefixed element names and compares the rows read
 * with the expected cell values.
 *
 * @return void
 */
public function testReadShouldSupportPrefixedXMLFiles()
{
    $expectedRows = [
        ['s1 - A1', 's1 - B1', 's1 - C1'],
        ['s1 - A2', 's1 - B2', 's1 - C2'],
        ['s1 - A3', 's1 - B3', 's1 - C3'],
    ];

    // The XML files of this fixture are prefixed: "<x:sheet>" is used instead of "<sheet>", etc.
    $actualRows = $this->getAllRowsForFile('sheet_with_prefixed_xml_files.xlsx');

    $this->assertEquals($expectedRows, $actualRows);
}
/**
* @return void
*/