From 2a187a8eb2a63c7d6ea3542a8c28ff622f5f176e Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Sun, 29 May 2016 17:33:17 -0700 Subject: [PATCH] Support XLSX with prefixed XML files While the standard is not to have prefixes, some XLSX files have XML files containing a prefix. Microsoft has a tool that generates such files: https://msdn.microsoft.com/en-us/library/office/gg278316.aspx --- src/Spout/Reader/Wrapper/XMLReader.php | 27 +++- src/Spout/Reader/XLSX/Helper/SheetHelper.php | 128 +++++++----------- tests/Spout/Reader/Wrapper/XMLReaderTest.php | 33 +++++ tests/Spout/Reader/XLSX/ReaderTest.php | 17 +++ .../xlsx/sheet_with_prefixed_xml_files.xlsx | Bin 0 -> 7084 bytes 5 files changed, 119 insertions(+), 86 deletions(-) create mode 100644 tests/resources/xlsx/sheet_with_prefixed_xml_files.xlsx diff --git a/src/Spout/Reader/Wrapper/XMLReader.php b/src/Spout/Reader/Wrapper/XMLReader.php index 42bd92c..94b28eb 100644 --- a/src/Spout/Reader/Wrapper/XMLReader.php +++ b/src/Spout/Reader/Wrapper/XMLReader.php @@ -138,9 +138,10 @@ class XMLReader extends \XMLReader */ public function readUntilNodeFound($nodeName) { - while (($wasReadSuccessful = $this->read()) && ($this->nodeType !== \XMLReader::ELEMENT || $this->name !== $nodeName)) { - // do nothing - } + do { + $wasReadSuccessful = $this->read(); + $isNotPositionedOnStartingNode = !$this->isPositionedOnStartingNode($nodeName); + } while ($wasReadSuccessful && $isNotPositionedOnStartingNode); return $wasReadSuccessful; } @@ -170,7 +171,7 @@ class XMLReader extends \XMLReader */ public function isPositionedOnStartingNode($nodeName) { - return ($this->nodeType === XMLReader::ELEMENT && $this->name === $nodeName); + return $this->isPositionedOnNode($nodeName, XMLReader::ELEMENT); } /** @@ -179,6 +180,22 @@ class XMLReader extends \XMLReader */ public function isPositionedOnEndingNode($nodeName) { - return ($this->nodeType === XMLReader::END_ELEMENT && $this->name === $nodeName); + return 
$this->isPositionedOnNode($nodeName, XMLReader::END_ELEMENT); + } + + /** + * @param string $nodeName + * @param int $nodeType + * @return bool Whether the XML Reader is currently positioned on the node with given name and type + */ + private function isPositionedOnNode($nodeName, $nodeType) + { + // In some cases, the node has a prefix (for instance, "<sheet>" can also be "<x:sheet>"). + // So if the given node name does not have a prefix, we need to look at the unprefixed name ("localName"). + // @see https://github.com/box/spout/issues/233 + $hasPrefix = (strpos($nodeName, ':') !== false); + $currentNodeName = ($hasPrefix) ? $this->name : $this->localName; + + return ($this->nodeType === $nodeType && $currentNodeName === $nodeName); } } diff --git a/src/Spout/Reader/XLSX/Helper/SheetHelper.php b/src/Spout/Reader/XLSX/Helper/SheetHelper.php index 5f74f44..ae7b8e0 100644 --- a/src/Spout/Reader/XLSX/Helper/SheetHelper.php +++ b/src/Spout/Reader/XLSX/Helper/SheetHelper.php @@ -2,7 +2,7 @@ namespace Box\Spout\Reader\XLSX\Helper; -use Box\Spout\Reader\Wrapper\SimpleXMLElement; +use Box\Spout\Reader\Wrapper\XMLReader; use Box\Spout\Reader\XLSX\Sheet; /** @@ -17,10 +17,6 @@ class SheetHelper const WORKBOOK_XML_RELS_FILE_PATH = 'xl/_rels/workbook.xml.rels'; const WORKBOOK_XML_FILE_PATH = 'xl/workbook.xml'; - /** Namespaces for the XML files */ - const MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS = 'http://schemas.openxmlformats.org/package/2006/relationships'; - const MAIN_NAMESPACE_FOR_WORKBOOK_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; - /** @var string Path of the XLSX file being read */ protected $filePath; @@ -33,12 +29,6 @@ class SheetHelper /** @var bool Whether date/time values should be returned as PHP objects or be formatted as strings */ protected $shouldFormatDates; - /** @var \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml.rels file */ - protected $workbookXMLRelsAsXMLElement; - - /** @var 
\Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representing the workbook.xml file */ - protected $workbookXMLAsXMLElement; - /** * @param string $filePath Path of the XLSX file being read * @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper Helper to work with shared strings @@ -62,13 +52,21 @@ class SheetHelper public function getSheets() { $sheets = []; + $sheetIndex = 0; - // Starting from "workbook.xml" as this file is the source of truth for the sheets order - $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); - $sheetNodes = $workbookXMLElement->xpath('//ns:sheet'); + $xmlReader = new XMLReader(); + if ($xmlReader->open('zip://' . $this->filePath . '#' . self::WORKBOOK_XML_FILE_PATH)) { + while ($xmlReader->read()) { + if ($xmlReader->isPositionedOnStartingNode('sheet')) { + $sheets[] = $this->getSheetFromSheetXMLNode($xmlReader, $sheetIndex); + $sheetIndex++; + } else if ($xmlReader->isPositionedOnEndingNode('sheets')) { + // stop reading once all sheets have been read + break; + } + } - foreach ($sheetNodes as $sheetIndex => $sheetNode) { - $sheets[] = $this->getSheetFromSheetXMLNode($sheetNode, $sheetIndex); + $xmlReader->close(); } return $sheets; @@ -79,88 +77,56 @@ class SheetHelper * We can find the XML file path describing the sheet inside "workbook.xml.res", by mapping with the sheet ID * ("r:id" in "workbook.xml", "Id" in "workbook.xml.res"). 
* - * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $sheetNode XML Node describing the sheet, as defined in "workbook.xml" + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReaderOnSheetNode XML Reader instance, pointing on the node describing the sheet, as defined in "workbook.xml" * @param int $sheetIndexZeroBased Index of the sheet, based on order of appearance in the workbook (zero-based) * @return \Box\Spout\Reader\XLSX\Sheet Sheet instance */ - protected function getSheetFromSheetXMLNode($sheetNode, $sheetIndexZeroBased) + protected function getSheetFromSheetXMLNode($xmlReaderOnSheetNode, $sheetIndexZeroBased) { - // To retrieve namespaced attributes, some versions of LibXML will accept prefixing the attribute - // with the namespace directly (tested on LibXML 2.9.3). For older versions (tested on LibXML 2.7.8), - // attributes need to be retrieved without the namespace hint. - $sheetId = $sheetNode->getAttribute('r:id'); - if ($sheetId === null) { - $sheetId = $sheetNode->getAttribute('id'); - } - - $escapedSheetName = $sheetNode->getAttribute('name'); + $sheetId = $xmlReaderOnSheetNode->getAttribute('r:id'); + $escapedSheetName = $xmlReaderOnSheetNode->getAttribute('name'); /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sheetName = $escaper->unescape($escapedSheetName); - // find the file path of the sheet, by looking at the "workbook.xml.res" file - $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement(); - $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Id="' . $sheetId . '"]'); - $relationshipNode = $relationshipNodes[0]; - - // In workbook.xml.rels, it is only "worksheets/sheet1.xml" - // In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" - $sheetDataXMLFilePath = '/xl/' . 
$relationshipNode->getAttribute('Target'); + $sheetDataXMLFilePath = $this->getSheetDataXMLFilePathForSheetId($sheetId); return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $this->shouldFormatDates, $sheetIndexZeroBased, $sheetName); } /** - * Returns a representation of the workbook.xml.rels file, ready to be parsed. - * The returned value is cached. - * - * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representating the workbook.xml.rels file + * @param string $sheetId The sheet ID, as defined in "workbook.xml" + * @return string The XML file path describing the sheet inside "workbook.xml.res", for the given sheet ID */ - protected function getWorkbookXMLRelsAsXMLElement() + protected function getSheetDataXMLFilePathForSheetId($sheetId) { - if (!$this->workbookXMLRelsAsXMLElement) { - $this->workbookXMLRelsAsXMLElement = $this->getFileAsXMLElementWithNamespace( - self::WORKBOOK_XML_RELS_FILE_PATH, - self::MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS - ); + $sheetDataXMLFilePath = ''; + + // find the file path of the sheet, by looking at the "workbook.xml.res" file + $xmlReader = new XMLReader(); + if ($xmlReader->open('zip://' . $this->filePath . '#' . self::WORKBOOK_XML_RELS_FILE_PATH)) { + while ($xmlReader->read()) { + if ($xmlReader->isPositionedOnStartingNode('Relationship')) { + $relationshipSheetId = $xmlReader->getAttribute('Id'); + + if ($relationshipSheetId === $sheetId) { + // In workbook.xml.rels, it is only "worksheets/sheet1.xml" + // In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" + $sheetDataXMLFilePath = $xmlReader->getAttribute('Target'); + + // sometimes, the sheet data file path already contains "/xl/"... + if (strpos($sheetDataXMLFilePath, '/xl/') !== 0) { + $sheetDataXMLFilePath = '/xl/' . 
$sheetDataXMLFilePath; + break; + } + } + } + } + + $xmlReader->close(); } - return $this->workbookXMLRelsAsXMLElement; - } - - /** - * Returns a representation of the workbook.xml file, ready to be parsed. - * The returned value is cached. - * - * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement XML element representating the workbook.xml.rels file - */ - protected function getWorkbookXMLAsXMLElement() - { - if (!$this->workbookXMLAsXMLElement) { - $this->workbookXMLAsXMLElement = $this->getFileAsXMLElementWithNamespace( - self::WORKBOOK_XML_FILE_PATH, - self::MAIN_NAMESPACE_FOR_WORKBOOK_XML - ); - } - - return $this->workbookXMLAsXMLElement; - } - - /** - * Loads the contents of the given file in an XML parser and register the given XPath namespace. - * - * @param string $xmlFilePath The path of the XML file inside the XLSX file - * @param string $mainNamespace The main XPath namespace to register - * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement The XML element representing the file - */ - protected function getFileAsXMLElementWithNamespace($xmlFilePath, $mainNamespace) - { - $xmlContents = $this->globalFunctionsHelper->file_get_contents('zip://' . $this->filePath . '#' . $xmlFilePath); - - $xmlElement = new SimpleXMLElement($xmlContents); - $xmlElement->registerXPathNamespace('ns', $mainNamespace); - - return $xmlElement; + return $sheetDataXMLFilePath; } } diff --git a/tests/Spout/Reader/Wrapper/XMLReaderTest.php b/tests/Spout/Reader/Wrapper/XMLReaderTest.php index a4deacd..1f7ffc4 100644 --- a/tests/Spout/Reader/Wrapper/XMLReaderTest.php +++ b/tests/Spout/Reader/Wrapper/XMLReaderTest.php @@ -198,4 +198,37 @@ class XMLReaderTest extends \PHPUnit_Framework_TestCase unlink($tempFolder . 
'/test.xlsx'); } + + /** + * @return array + */ + public function dataProviderForTestIsPositionedOnStartingAndEndingNode() + { + return [ + [''], // not prefixed + [''], // prefixed + ]; + } + + /** + * @dataProvider dataProviderForTestIsPositionedOnStartingAndEndingNode + * + * @param string $testXML + * @return void + */ + public function testIsPositionedOnStartingAndEndingNode($testXML) + { + $xmlReader = new XMLReader(); + $xmlReader->XML($testXML); + + // the first read moves the pointer to "<test>" + $xmlReader->read(); + $this->assertTrue($xmlReader->isPositionedOnStartingNode('test')); + $this->assertFalse($xmlReader->isPositionedOnEndingNode('test')); + + // the second read moves the pointer to "</test>" + $xmlReader->read(); + $this->assertFalse($xmlReader->isPositionedOnStartingNode('test')); + $this->assertTrue($xmlReader->isPositionedOnEndingNode('test')); + } } diff --git a/tests/Spout/Reader/XLSX/ReaderTest.php b/tests/Spout/Reader/XLSX/ReaderTest.php index 8620ed5..2703799 100644 --- a/tests/Spout/Reader/XLSX/ReaderTest.php +++ b/tests/Spout/Reader/XLSX/ReaderTest.php @@ -95,6 +95,23 @@ class ReaderTest extends \PHPUnit_Framework_TestCase $this->assertEquals($expectedRows, $allRows); } + /** + * @return void + */ + public function testReadShouldSupportPrefixedXMLFiles() + { + // The XML files of this spreadsheet are prefixed. + // For instance, they use "<x:sheet>" instead of "<sheet>", etc. 
+ $allRows = $this->getAllRowsForFile('sheet_with_prefixed_xml_files.xlsx'); + + $expectedRows = [ + ['s1 - A1', 's1 - B1', 's1 - C1'], + ['s1 - A2', 's1 - B2', 's1 - C2'], + ['s1 - A3', 's1 - B3', 's1 - C3'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + /** * @return void */ diff --git a/tests/resources/xlsx/sheet_with_prefixed_xml_files.xlsx b/tests/resources/xlsx/sheet_with_prefixed_xml_files.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d8bbc39f5fda8ba0679a47e3df108e0b5f529216 GIT binary patch literal 7084 zcmd^EWmFVev>r-IX@N_3GlDb%0un-VQUU`=qjX3~DJdX0NJw{w z3JmZDYw+rIz4!a=AG2l+JI>kXJNxYYp{9g}P67Y`umDVXi#}}}TO3xi^&#AO#icMi zQO{fQ%^XVNf<>(1@1-+VS`_B!1N%AcL8=-wxvDbx3uS1$G?BpjIO)xrDwOzwY2(tI zWC;cJ+UYdUYg4s_f*gWuk|G8unAxv9%y|!{bF(slN@7+z)dshACqfS`g!)A!-K`Htd_|(M?%6hoo${6IN0v%bM#=%s}u$DPnL}9*4T7(UIz2~;X(SM z#bqO~vpZ<~Q|EIQRVhwW0J4!LY@+rt#m9{z9jDB%+rb!8w+xy!uSb|15-0C00&aNo zH`Uk$WckPq;j4YmE&#CpumOZf=uf$T_y%@1w{yAjL+9HGMA1Z>(IHMC6%_!${iy>y z)2w`?;?PP*AXgd)>kOiX>$bniO<5~oBWvan)-jQk9Ne=r+5$BDXfRBLZWD(=>o2mhIU(YCIwja0~n7OSX^Dnpxyv1 z=g>H;CvH}Q5378Mc&U#8s;?W!32-_%wh}LT^)ii-g*MUODwYq6)}dkL-@ZV;MK!>t z#kZ(A*-41TY%q-(L5*QPuUUsrm)cE~@~Cnp4_qk1yyd9+{xX^9BAS>%(`NC+Da=l> z&-IPo$3;|6=xK(#9|{kIL{SE1(Cejm(Wa}UpW?}9KDxpxE-!A%c4uQw;7V1jaR22i zHEjBiYkAn08xIQ=jrh53S3?$G7LWCxOszz7UkCh1b`STX_iJk6+(Su8_sj&L`vmQF&QGiN4KSaCJ2Uc_>JUCrPZm{&pp`8C(?zCaPf7RmgKR;!2F}YtlhKSU*BEn4 z_7W`ZfcCJ_hBqeBN~SjK{&4V@G!mU_MjFlyM0$(MrCw7S@I)llUu>-QGrYsp9 zVUDdzdDL=!_@v}QSW;^w*=}Nv*6*_D?BmFsgu#?l<)lUEvRrp|dY_?vvX@ly6UcbW z09#I%4$2U(m~aidV^5`jQnX%JNbqKU87~5oI2Jk2PAeCP|0!%)l{}het?*N`p+@9J zW@=y#qeL6Noe}v-G%jUqc}Cqw?FKLf)s;*5FIVzTmP0f*@_+6diPwMvOt`IYbE95U> zIR0v9>ga6F?S#OixwEVFx%l6T05nv7o&f<$4nz@9|3CRL6%-M;YGPq3Y{tXOFJ#6K z5fTwLHRt0s&Kb3Gx;sr3&MW3E?MbdoW(LQ;3jQ70ps9tmv>RfM8%+>Hb;S)?J?-hpIqGQS{ko+ zcCEChtSFx(yH5|jxjKfa?@#2LPmRM30YzfJ?*=ZZ4yTm~=wQ2wXOiy7u1$fGH8I$n z0|IGM4)*mT&BPMJD(K$A*0Ne&>20dAP3?82cFXdd!mJ+WhXZ+evV 
zzfp!L6aWvqznuJ8wLvKSsk+n7W8+L9{;Y~{at5rQYIhx-ZA}~tEa+LOktwB0{^Qgugs1|iZ16_&4T5S-vQ@*%t3qL-n zG<@12bx&<>^4g{HEOgnv!@9f0Awm2%OP?EaSM!rAT&Y<_E99TiyTM8{NigcpPXY9y zOLU;1m1kTgH@Mibp`xVscpkT74Da=|;SH)Kp~yySFQv^%8aZd@XXMcm8?}r?YS)Mj zh%R|h_?mQBb87JBsT9*Ytcb9|ZPgx!75LF5TV{_cdv@c-*0GGZc#l_Xr&fMi|{nhGr7^fpK5&{Rn zglN1!NIGhQM#DryIu*;>G$JdJ-=y}u1*Slzo;}q&Z@eKQkLh!I z;yF3F7Z^l55W=qi3hs=75?lF7S&(^+>ll7%at&B@+yUcb{VXrd<-|Q`cSuB2Y%ip5 zvBF6B+OQx2qzhHVHJi{8dSWdg1^Xh{Qf^XN*M)9Ev;mY5Q7*_9$k0ivU^YB5>p6Ve zxqzKYaQBsSQoCVs&0w>r>I@G1l~}sjW$~G!L9WMaL=jIrO=1Uww%#%(X?bzkmov@W zPjA~c;%l>1lUj09)V@Npn#(Z_(W7nVLwlHs4dm-%O>9yI*4xQg#a7C62Pp**Tu(Jv zEYW(@QxTF-?Jh)-wmY+Wgc31x5kIgq-QsD)cqu#_XoedfJ#}T=uw9QQ4qJ4azhV*G_Ep0sS9=Y53%6Tl8LKLVDo# z)=ficA+(op^14J4>CvzttQ0P%9tiP3gz2#_-rFG=QaTge*t(t5;rIE*Tx~#Cye{w< z?_dncE{v{I{B!ctCO!ia=03B>+J5 z4MpG7zJ8uTXgfn=fkjub1EyIEEntOevbx_;tTSo37B3nfgkZ8W+4R0!* z+j>H4S8rEShuCA>x*h5Ui+(c#S@nnCSFd1DCmj~Hufyk(fHCZgZ3=;7cebOigKIz0 zF-!q?qpZg(Q2i69P7_yc z>hg1}nJr{r_h~r_4nPCdCB%zlhBEF+>iJ6o=ORmn)o5{Hx}`9YhYu*qR@aYv}K)n)W*=_URT8#?96PPl=(`Y-_{KKVfAm%kH%41l$p(YR)3QwTJgyr7rgWs($^GcQj5cYkTX2m`KxS}q3>slVOf^{yd4RZ9)U*-HSRkS2 zJ>Mp?P2blGs3|c*t$D%YnelsUd`GUD`9nU*ol=p$g6{*o324ZNKkJr>C9#&3y->Hh zF2k&OH%wXVqFMwxXmj8nX_iUx2&L_b771voDn?DH6uybeZ8;CFo+3)&N7%QTlZz4= zDqP6+*myZ$a+v0MhV(Jy_Zc+87u~BUdI^ zlpCb3Vte94+(l|$;!j=npi7Nqs|TwAgMB!2%yk4G5}&l*aVIb6<2;}K3xYKbNL5h6 zdu7oC?0`G>7B)?o`f6YwJEg3E34491X|DSkeo;>F0t=PrYa?X6UOD=XZ&=CopUJ*Q zJ%vVkQufUn#gm@Of*Y5*$?hFo=JZ*2Fyy~&@KKjDKz*IXI~;qI3^&l>&Tw6Ah4NgNONslz?1Eyi|8S zzPO{93I7C5FEF5^sy-koZz%6`Z8j6c z(C!8e(0WDZM1uurXyxU=vEk#xv;RzB-O4T1?;W1EK&zFgY-8RN_K;C=QL+c?p>a2O zSX`%CBXY$8V=|`E@P(E7z-cqyV{911)>fWIpa4E0+xa=!Gr*!38vuRSb}0yaAAdBEdSc*HTB)%TvSzWoP{IC&>y{kTC7B6U=ja*ZeV z6jx)OaxGaj*d#`}g<{&`qaQ_rtJt}v^AaKsiu^UcgcUbWRgc$cpmR;6A9Mx8Gy<@4 zEX2YWib%TNZoD+KhEcze(HgE~qt7YMmBFP2=HSq2inb04ZnZnPxfyd#Tvp(J798n# zclJTB;XBds>Ea83g&DrS(1nsa&oZ`W)6%3qc$;V2YlLfhQ0#GKW-sqIJj!ft?VFiv ztw*iAf||S?C@=1Rk8$td`eojG@5WR+>hceui?!U5UUYSjraO|+#H8gjEl-HpX 
zN4dCX$TlxEsy+7WKSt}9_mFtCd)jhS{>w7f^b=f$=9cRR8k^QFo)#U62wvYKfc9`_03v(eKUY2;bkX9qn}>*d)(lF1@WiO!t;H zud-3o$;9OjZvDE20}D(43>h^^T zwI!>z%N6@Em{lX0{RXm<&?yHD>4nc(zU9L0*O{FC#Rqs>H48I`@OMcZT zkVnQRgn9a4v+CooSU#U2@;_(j>i^>uEg|SJD&{Y1{s;i^A-cH#ND<#ziqhiy5%>OL zNBF5J1KDStV8r3N~+;_eh<6gHdPD?i^bDf=2K=gPM4Mka>qJ;UMF}u6JFS#p><~)38(`0Mrxww$KHn)Mha+EXti} z;YCkLkei^S&Bve5K{dg4TmPDlZq#O7wQ6ro#=u$!9iMT~^M_%9V%Y}KqDGn29*a(c zEVN<-W=GUoF$`vMKxUm0x^YfDc z`qgSqPlhgs^4fbKdz(3;2^*ePLoLH*Jni?UDK#6?l&0C@S6L1EaatSOYxiJ0)XlTY zTORykPw;M@1dcU-;dkw6a_({be5gsaD}POpnkG7gb!a;5l(sXFvomlx3Ij@=f)_2{ z8n?W~M7|E2_xs%RPLe{W8CMg7EP0JNtEiV(Z=9R`O^%G3AYZbgOEgdcL!`q<|-#M-ye*esJA^nE_pZxwW zIj_H+$r-1E6mrb(49}1C{(c<4B1hz#!Jm=(Z_Wo9=k|HC^PSDVfcs~f1nI{O0VxEN zgd|P=w$)!(9*^_Y^v6K`W5a(QsgV)u&zt=mu#sW^+`l31qYzX(V#Vk8j5Pi|+h^?u zQZ6FK`|rj-n~`4^e*UpNYe$eG{G091<^;Kb-+P*~=Ljjc5dRwcbpcC& ihV*g|Q9=LU