Support XLSX with prefixed XML files (#237)
While the standard is for XML element names to be unprefixed, some XLSX files contain XML files whose elements carry a namespace prefix (for instance, "<x:sheet>" instead of "<sheet>"). Microsoft has a tool that generates such files: https://msdn.microsoft.com/en-us/library/office/gg278316.aspx
This commit is contained in:
parent
2ed30321b4
commit
03866a6604
@ -138,9 +138,10 @@ class XMLReader extends \XMLReader
|
|||||||
*/
|
*/
|
||||||
public function readUntilNodeFound($nodeName)
|
public function readUntilNodeFound($nodeName)
|
||||||
{
|
{
|
||||||
while (($wasReadSuccessful = $this->read()) && ($this->nodeType !== \XMLReader::ELEMENT || $this->name !== $nodeName)) {
|
do {
|
||||||
// do nothing
|
$wasReadSuccessful = $this->read();
|
||||||
}
|
$isNotPositionedOnStartingNode = !$this->isPositionedOnStartingNode($nodeName);
|
||||||
|
} while ($wasReadSuccessful && $isNotPositionedOnStartingNode);
|
||||||
|
|
||||||
return $wasReadSuccessful;
|
return $wasReadSuccessful;
|
||||||
}
|
}
|
||||||
@ -170,7 +171,7 @@ class XMLReader extends \XMLReader
|
|||||||
*/
|
*/
|
||||||
public function isPositionedOnStartingNode($nodeName)
|
public function isPositionedOnStartingNode($nodeName)
|
||||||
{
|
{
|
||||||
return ($this->nodeType === XMLReader::ELEMENT && $this->name === $nodeName);
|
return $this->isPositionedOnNode($nodeName, XMLReader::ELEMENT);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -179,6 +180,22 @@ class XMLReader extends \XMLReader
|
|||||||
*/
|
*/
|
||||||
public function isPositionedOnEndingNode($nodeName)
|
public function isPositionedOnEndingNode($nodeName)
|
||||||
{
|
{
|
||||||
return ($this->nodeType === XMLReader::END_ELEMENT && $this->name === $nodeName);
|
return $this->isPositionedOnNode($nodeName, XMLReader::END_ELEMENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param string $nodeName
|
||||||
|
* @param int $nodeType
|
||||||
|
* @return bool Whether the XML Reader is currently positioned on the node with given name and type
|
||||||
|
*/
|
||||||
|
private function isPositionedOnNode($nodeName, $nodeType)
|
||||||
|
{
|
||||||
|
// In some cases, the node has a prefix (for instance, "<sheet>" can also be "<x:sheet>").
|
||||||
|
// So if the given node name does not have a prefix, we need to look at the unprefixed name ("localName").
|
||||||
|
// @see https://github.com/box/spout/issues/233
|
||||||
|
$hasPrefix = (strpos($nodeName, ':') !== false);
|
||||||
|
$currentNodeName = ($hasPrefix) ? $this->name : $this->localName;
|
||||||
|
|
||||||
|
return ($this->nodeType === $nodeType && $currentNodeName === $nodeName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
namespace Box\Spout\Reader\XLSX\Helper;
|
namespace Box\Spout\Reader\XLSX\Helper;
|
||||||
|
|
||||||
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
|
use Box\Spout\Reader\Wrapper\XMLReader;
|
||||||
use Box\Spout\Reader\XLSX\Sheet;
|
use Box\Spout\Reader\XLSX\Sheet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -17,10 +17,6 @@ class SheetHelper
|
|||||||
const WORKBOOK_XML_RELS_FILE_PATH = 'xl/_rels/workbook.xml.rels';
|
const WORKBOOK_XML_RELS_FILE_PATH = 'xl/_rels/workbook.xml.rels';
|
||||||
const WORKBOOK_XML_FILE_PATH = 'xl/workbook.xml';
|
const WORKBOOK_XML_FILE_PATH = 'xl/workbook.xml';
|
||||||
|
|
||||||
/** Namespaces for the XML files */
|
|
||||||
const MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS = 'http://schemas.openxmlformats.org/package/2006/relationships';
|
|
||||||
const MAIN_NAMESPACE_FOR_WORKBOOK_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
|
|
||||||
|
|
||||||
/** @var string Path of the XLSX file being read */
|
/** @var string Path of the XLSX file being read */
|
||||||
protected $filePath;
|
protected $filePath;
|
||||||
|
|
||||||
@ -33,12 +29,6 @@ class SheetHelper
|
|||||||
/** @var bool Whether date/time values should be returned as PHP objects or be formatted as strings */
|
/** @var bool Whether date/time values should be returned as PHP objects or be formatted as strings */
|
||||||
protected $shouldFormatDates;
|
protected $shouldFormatDates;
|
||||||
|
|
||||||
/** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml.rels file */
|
|
||||||
protected $workbookXMLRelsAsXMLElement;
|
|
||||||
|
|
||||||
/** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml file */
|
|
||||||
protected $workbookXMLAsXMLElement;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param string $filePath Path of the XLSX file being read
|
* @param string $filePath Path of the XLSX file being read
|
||||||
* @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper Helper to work with shared strings
|
* @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper Helper to work with shared strings
|
||||||
@ -62,13 +52,21 @@ class SheetHelper
|
|||||||
public function getSheets()
|
public function getSheets()
|
||||||
{
|
{
|
||||||
$sheets = [];
|
$sheets = [];
|
||||||
|
$sheetIndex = 0;
|
||||||
|
|
||||||
// Starting from "workbook.xml" as this file is the source of truth for the sheets order
|
$xmlReader = new XMLReader();
|
||||||
$workbookXMLElement = $this->getWorkbookXMLAsXMLElement();
|
if ($xmlReader->open('zip://' . $this->filePath . '#' . self::WORKBOOK_XML_FILE_PATH)) {
|
||||||
$sheetNodes = $workbookXMLElement->xpath('//ns:sheet');
|
while ($xmlReader->read()) {
|
||||||
|
if ($xmlReader->isPositionedOnStartingNode('sheet')) {
|
||||||
|
$sheets[] = $this->getSheetFromSheetXMLNode($xmlReader, $sheetIndex);
|
||||||
|
$sheetIndex++;
|
||||||
|
} else if ($xmlReader->isPositionedOnEndingNode('sheets')) {
|
||||||
|
// stop reading once all sheets have been read
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
foreach ($sheetNodes as $sheetIndex => $sheetNode) {
|
$xmlReader->close();
|
||||||
$sheets[] = $this->getSheetFromSheetXMLNode($sheetNode, $sheetIndex);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return $sheets;
|
return $sheets;
|
||||||
@ -79,88 +77,56 @@ class SheetHelper
|
|||||||
* We can find the XML file path describing the sheet inside "workbook.xml.rels", by mapping with the sheet ID
|
* We can find the XML file path describing the sheet inside "workbook.xml.rels", by mapping with the sheet ID
|
||||||
* ("r:id" in "workbook.xml", "Id" in "workbook.xml.rels").
|
* ("r:id" in "workbook.xml", "Id" in "workbook.xml.rels").
|
||||||
*
|
*
|
||||||
* @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $sheetNode XML Node describing the sheet, as defined in "workbook.xml"
|
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReaderOnSheetNode XML Reader instance, pointing on the node describing the sheet, as defined in "workbook.xml"
|
||||||
* @param int $sheetIndexZeroBased Index of the sheet, based on order of appearance in the workbook (zero-based)
|
* @param int $sheetIndexZeroBased Index of the sheet, based on order of appearance in the workbook (zero-based)
|
||||||
* @return \Box\Spout\Reader\XLSX\Sheet Sheet instance
|
* @return \Box\Spout\Reader\XLSX\Sheet Sheet instance
|
||||||
*/
|
*/
|
||||||
protected function getSheetFromSheetXMLNode($sheetNode, $sheetIndexZeroBased)
|
protected function getSheetFromSheetXMLNode($xmlReaderOnSheetNode, $sheetIndexZeroBased)
|
||||||
{
|
{
|
||||||
// To retrieve namespaced attributes, some versions of LibXML will accept prefixing the attribute
|
$sheetId = $xmlReaderOnSheetNode->getAttribute('r:id');
|
||||||
// with the namespace directly (tested on LibXML 2.9.3). For older versions (tested on LibXML 2.7.8),
|
$escapedSheetName = $xmlReaderOnSheetNode->getAttribute('name');
|
||||||
// attributes need to be retrieved without the namespace hint.
|
|
||||||
$sheetId = $sheetNode->getAttribute('r:id');
|
|
||||||
if ($sheetId === null) {
|
|
||||||
$sheetId = $sheetNode->getAttribute('id');
|
|
||||||
}
|
|
||||||
|
|
||||||
$escapedSheetName = $sheetNode->getAttribute('name');
|
|
||||||
|
|
||||||
/** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
|
/** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
|
||||||
$escaper = new \Box\Spout\Common\Escaper\XLSX();
|
$escaper = new \Box\Spout\Common\Escaper\XLSX();
|
||||||
$sheetName = $escaper->unescape($escapedSheetName);
|
$sheetName = $escaper->unescape($escapedSheetName);
|
||||||
|
|
||||||
// find the file path of the sheet, by looking at the "workbook.xml.res" file
|
$sheetDataXMLFilePath = $this->getSheetDataXMLFilePathForSheetId($sheetId);
|
||||||
$workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement();
|
|
||||||
$relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Id="' . $sheetId . '"]');
|
|
||||||
$relationshipNode = $relationshipNodes[0];
|
|
||||||
|
|
||||||
// In workbook.xml.rels, it is only "worksheets/sheet1.xml"
|
|
||||||
// In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml"
|
|
||||||
$sheetDataXMLFilePath = '/xl/' . $relationshipNode->getAttribute('Target');
|
|
||||||
|
|
||||||
return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $this->shouldFormatDates, $sheetIndexZeroBased, $sheetName);
|
return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $this->shouldFormatDates, $sheetIndexZeroBased, $sheetName);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a representation of the workbook.xml.rels file, ready to be parsed.
|
* @param string $sheetId The sheet ID, as defined in "workbook.xml"
|
||||||
* The returned value is cached.
|
* @return string The path of the XML file describing the sheet, as found in "workbook.xml.rels" for the given sheet ID
|
||||||
*
|
|
||||||
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representating the workbook.xml.rels file
|
|
||||||
*/
|
*/
|
||||||
protected function getWorkbookXMLRelsAsXMLElement()
|
protected function getSheetDataXMLFilePathForSheetId($sheetId)
|
||||||
{
|
{
|
||||||
if (!$this->workbookXMLRelsAsXMLElement) {
|
$sheetDataXMLFilePath = '';
|
||||||
$this->workbookXMLRelsAsXMLElement = $this->getFileAsXMLElementWithNamespace(
|
|
||||||
self::WORKBOOK_XML_RELS_FILE_PATH,
|
// find the file path of the sheet, by looking at the "workbook.xml.rels" file
|
||||||
self::MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS
|
$xmlReader = new XMLReader();
|
||||||
);
|
if ($xmlReader->open('zip://' . $this->filePath . '#' . self::WORKBOOK_XML_RELS_FILE_PATH)) {
|
||||||
|
while ($xmlReader->read()) {
|
||||||
|
if ($xmlReader->isPositionedOnStartingNode('Relationship')) {
|
||||||
|
$relationshipSheetId = $xmlReader->getAttribute('Id');
|
||||||
|
|
||||||
|
if ($relationshipSheetId === $sheetId) {
|
||||||
|
// In workbook.xml.rels, it is only "worksheets/sheet1.xml"
|
||||||
|
// In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml"
|
||||||
|
$sheetDataXMLFilePath = $xmlReader->getAttribute('Target');
|
||||||
|
|
||||||
|
// sometimes, the sheet data file path already contains "/xl/"...
|
||||||
|
if (strpos($sheetDataXMLFilePath, '/xl/') !== 0) {
|
||||||
|
$sheetDataXMLFilePath = '/xl/' . $sheetDataXMLFilePath;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$xmlReader->close();
|
||||||
}
|
}
|
||||||
|
|
||||||
return $this->workbookXMLRelsAsXMLElement;
|
return $sheetDataXMLFilePath;
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a representation of the workbook.xml file, ready to be parsed.
|
|
||||||
* The returned value is cached.
|
|
||||||
*
|
|
||||||
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representating the workbook.xml.rels file
|
|
||||||
*/
|
|
||||||
protected function getWorkbookXMLAsXMLElement()
|
|
||||||
{
|
|
||||||
if (!$this->workbookXMLAsXMLElement) {
|
|
||||||
$this->workbookXMLAsXMLElement = $this->getFileAsXMLElementWithNamespace(
|
|
||||||
self::WORKBOOK_XML_FILE_PATH,
|
|
||||||
self::MAIN_NAMESPACE_FOR_WORKBOOK_XML
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $this->workbookXMLAsXMLElement;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Loads the contents of the given file in an XML parser and register the given XPath namespace.
|
|
||||||
*
|
|
||||||
* @param string $xmlFilePath The path of the XML file inside the XLSX file
|
|
||||||
* @param string $mainNamespace The main XPath namespace to register
|
|
||||||
* @return \Box\Spout\Reader\Wrapper\SimpleXMLElement The XML element representing the file
|
|
||||||
*/
|
|
||||||
protected function getFileAsXMLElementWithNamespace($xmlFilePath, $mainNamespace)
|
|
||||||
{
|
|
||||||
$xmlContents = $this->globalFunctionsHelper->file_get_contents('zip://' . $this->filePath . '#' . $xmlFilePath);
|
|
||||||
|
|
||||||
$xmlElement = new SimpleXMLElement($xmlContents);
|
|
||||||
$xmlElement->registerXPathNamespace('ns', $mainNamespace);
|
|
||||||
|
|
||||||
return $xmlElement;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -198,4 +198,37 @@ class XMLReaderTest extends \PHPUnit_Framework_TestCase
|
|||||||
|
|
||||||
unlink($tempFolder . '/test.xlsx');
|
unlink($tempFolder . '/test.xlsx');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return array
|
||||||
|
*/
|
||||||
|
public function dataProviderForTestIsPositionedOnStartingAndEndingNode()
|
||||||
|
{
|
||||||
|
return [
|
||||||
|
['<test></test>'], // not prefixed
|
||||||
|
['<x:test xmlns:x="foo"></x:test>'], // prefixed
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @dataProvider dataProviderForTestIsPositionedOnStartingAndEndingNode
|
||||||
|
*
|
||||||
|
* @param string $testXML
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
public function testIsPositionedOnStartingAndEndingNode($testXML)
|
||||||
|
{
|
||||||
|
$xmlReader = new XMLReader();
|
||||||
|
$xmlReader->XML($testXML);
|
||||||
|
|
||||||
|
// the first read moves the pointer to "<test>"
|
||||||
|
$xmlReader->read();
|
||||||
|
$this->assertTrue($xmlReader->isPositionedOnStartingNode('test'));
|
||||||
|
$this->assertFalse($xmlReader->isPositionedOnEndingNode('test'));
|
||||||
|
|
||||||
|
// the second read moves the pointer to "</test>"
|
||||||
|
$xmlReader->read();
|
||||||
|
$this->assertFalse($xmlReader->isPositionedOnStartingNode('test'));
|
||||||
|
$this->assertTrue($xmlReader->isPositionedOnEndingNode('test'));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -95,6 +95,23 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
|
|||||||
$this->assertEquals($expectedRows, $allRows);
|
$this->assertEquals($expectedRows, $allRows);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
public function testReadShouldSupportPrefixedXMLFiles()
|
||||||
|
{
|
||||||
|
// The XML files of this spreadsheet are prefixed.
|
||||||
|
// For instance, they use "<x:sheet>" instead of "<sheet>", etc.
|
||||||
|
$allRows = $this->getAllRowsForFile('sheet_with_prefixed_xml_files.xlsx');
|
||||||
|
|
||||||
|
$expectedRows = [
|
||||||
|
['s1 - A1', 's1 - B1', 's1 - C1'],
|
||||||
|
['s1 - A2', 's1 - B2', 's1 - C2'],
|
||||||
|
['s1 - A3', 's1 - B3', 's1 - C3'],
|
||||||
|
];
|
||||||
|
$this->assertEquals($expectedRows, $allRows);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return void
|
* @return void
|
||||||
*/
|
*/
|
||||||
|
BIN
tests/resources/xlsx/sheet_with_prefixed_xml_files.xlsx
Normal file
BIN
tests/resources/xlsx/sheet_with_prefixed_xml_files.xlsx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user