From b8fd789ac016636c871df5560b4e7514e00e3cfe Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Thu, 19 May 2016 10:37:48 -0700 Subject: [PATCH] Retrieve XLSX sheets in order of appearance (#220) Instead of relying on the ID, sheets should be retrieved in the order they appear in the file. Workbook.xml describes the correct order. This allows the reader to read data in the correct order when sheets have been manually moved after creation. --- src/Spout/Reader/XLSX/Helper/SheetHelper.php | 73 +++++++----------- tests/Spout/Reader/XLSX/ReaderTest.php | 2 +- .../file_with_no_sheets_in_content_types.xlsx | Bin 3720 -> 0 bytes .../file_with_no_sheets_in_workbook_xml.xlsx | Bin 0 -> 3704 bytes 4 files changed, 28 insertions(+), 47 deletions(-) delete mode 100644 tests/resources/xlsx/file_with_no_sheets_in_content_types.xlsx create mode 100644 tests/resources/xlsx/file_with_no_sheets_in_workbook_xml.xlsx diff --git a/src/Spout/Reader/XLSX/Helper/SheetHelper.php b/src/Spout/Reader/XLSX/Helper/SheetHelper.php index 3400509..23a2b08 100644 --- a/src/Spout/Reader/XLSX/Helper/SheetHelper.php +++ b/src/Spout/Reader/XLSX/Helper/SheetHelper.php @@ -14,18 +14,13 @@ use Box\Spout\Reader\XLSX\Sheet; class SheetHelper { /** Paths of XML files relative to the XLSX file root */ - const CONTENT_TYPES_XML_FILE_PATH = '[Content_Types].xml'; const WORKBOOK_XML_RELS_FILE_PATH = 'xl/_rels/workbook.xml.rels'; const WORKBOOK_XML_FILE_PATH = 'xl/workbook.xml'; /** Namespaces for the XML files */ - const MAIN_NAMESPACE_FOR_CONTENT_TYPES_XML = 'http://schemas.openxmlformats.org/package/2006/content-types'; const MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS = 'http://schemas.openxmlformats.org/package/2006/relationships'; const MAIN_NAMESPACE_FOR_WORKBOOK_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; - /** Value of the Override attribute used in [Content_Types].xml to define sheets */ - const OVERRIDE_CONTENT_TYPES_ATTRIBUTE = 'application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml'; - /** @var string Path of the XLSX file being read */ protected $filePath; @@ -63,65 +58,51 @@ class SheetHelper { $sheets = []; - $contentTypesAsXMLElement = $this->getFileAsXMLElementWithNamespace( - self::CONTENT_TYPES_XML_FILE_PATH, - self::MAIN_NAMESPACE_FOR_CONTENT_TYPES_XML - ); + // Starting from "workbook.xml" as this file is the source of truth for the sheets order + $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); + $sheetNodes = $workbookXMLElement->xpath('//ns:sheet'); - // find all nodes defining a sheet - $sheetNodes = $contentTypesAsXMLElement->xpath('//ns:Override[@ContentType="' . self::OVERRIDE_CONTENT_TYPES_ATTRIBUTE . '"]'); - $numSheetNodes = count($sheetNodes); - - for ($i = 0; $i < $numSheetNodes; $i++) { - $sheetNode = $sheetNodes[$i]; - $sheetDataXMLFilePath = $sheetNode->getAttribute('PartName'); - - $sheets[] = $this->getSheetFromXML($sheetDataXMLFilePath); + foreach ($sheetNodes as $sheetIndex => $sheetNode) { + $sheets[] = $this->getSheetFromSheetXMLNode($sheetNode, $sheetIndex); } - // make sure the sheets are sorted by index - // (as the sheets are not necessarily in this order in the XML file) - usort($sheets, function ($sheet1, $sheet2) { - return ($sheet1->getIndex() - $sheet2->getIndex()); - }); - return $sheets; } /** - * Returns an instance of a sheet, given the path of its data XML file. - * We first look at "xl/_rels/workbook.xml.rels" to find the relationship ID of the sheet. - * Then we look at "xl/worbook.xml" to find the sheet entry associated to the found ID. - * The entry contains the ID and name of the sheet. + * Returns an instance of a sheet, given the XML node describing the sheet - from "workbook.xml". + * We can find the XML file path describing the sheet inside "workbook.xml.res", by mapping with the sheet ID + * ("r:id" in "workbook.xml", "Id" in "workbook.xml.res"). * - * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml + * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $sheetNode XML Node describing the sheet, as defined in "workbook.xml" + * @param int $sheetIndexZeroBased Index of the sheet, based on order of appearance in the workbook (zero-based) * @return \Box\Spout\Reader\XLSX\Sheet Sheet instance */ - protected function getSheetFromXML($sheetDataXMLFilePath) + protected function getSheetFromSheetXMLNode($sheetNode, $sheetIndexZeroBased) { - // In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" - // In workbook.xml.rels, it is only "worksheets/sheet1.xml" - $sheetDataXMLFilePathInWorkbookXMLRels = ltrim($sheetDataXMLFilePath, '/xl/'); - - // find the node associated to the given file path - $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement(); - $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]'); - $relationshipNode = $relationshipNodes[0]; - - $relationshipSheetId = $relationshipNode->getAttribute('Id'); - - $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); - $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $relationshipSheetId . '"]'); - $sheetNode = $sheetNodes[0]; + // To retrieve namespaced attributes, some versions of LibXML will accept prefixing the attribute + // with the namespace directly (tested on LibXML 2.9.3). For older versions (tested on LibXML 2.7.8), + // attributes need to be retrieved without the namespace hint. + $sheetId = $sheetNode->getAttribute('r:id'); + if ($sheetId === null) { + $sheetId = $sheetNode->getAttribute('id'); + } $escapedSheetName = $sheetNode->getAttribute('name'); - $sheetIdOneBased = $sheetNode->getAttribute('sheetId'); - $sheetIndexZeroBased = $sheetIdOneBased - 1; /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sheetName = $escaper->unescape($escapedSheetName); + // find the file path of the sheet, by looking at the "workbook.xml.res" file + $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement(); + $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Id="' . $sheetId . '"]'); + $relationshipNode = $relationshipNodes[0]; + + // In workbook.xml.rels, it is only "worksheets/sheet1.xml" + // In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" + $sheetDataXMLFilePath = '/xl/' . $relationshipNode->getAttribute('Target'); + return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $sheetIndexZeroBased, $sheetName); } diff --git a/tests/Spout/Reader/XLSX/ReaderTest.php b/tests/Spout/Reader/XLSX/ReaderTest.php index b1e6fdd..176f6d0 100644 --- a/tests/Spout/Reader/XLSX/ReaderTest.php +++ b/tests/Spout/Reader/XLSX/ReaderTest.php @@ -23,7 +23,7 @@ class ReaderTest extends \PHPUnit_Framework_TestCase { return [ ['/path/to/fake/file.xlsx'], - ['file_with_no_sheets_in_content_types.xlsx'], + ['file_with_no_sheets_in_workbook_xml.xlsx'], ['file_with_sheet_xml_not_matching_content_types.xlsx'], ['file_corrupted.xlsx'], ]; diff --git a/tests/resources/xlsx/file_with_no_sheets_in_content_types.xlsx b/tests/resources/xlsx/file_with_no_sheets_in_content_types.xlsx deleted file mode 100644 index 597b23078fd27a6e807c3eff3dbdfa9f4c2d34d9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3720 zcmai12{@E(_aE6N%UFx-JCQLmb|uD=T__PUG-V&g$k>vJDcQHknm4J}+MpGcrdJeR zdnt`AJNb%92KgQ-Bl`cpb6wAM%{9;Yo#l6*`<(Mwo6$4yfno=aD1YMC2uLLcT! z%XF#LSUfj(szuM$RNTEXam{%~TpIg*ZJEV{C`5(!fl?_IyqOMAZvS_J(^ED&m2+K! zM5o+>J=4A*Q{3BQ`?L8;<>^I&6MImHFODc>ymT=nyD47kEHBY>8C|;9O*ZsbPH?|G zct5&eIOT5-b}5CJ&w3PVmL+_9N=xH$so(-)iWAEo{d4!OmhD$>3wJNpCTHcIy+e$G z;*K;8;1CZGPr`go*P~SuR}?OY>Lg8;zVd>(Jf#RV%ZbS$*pGZXoke!2EYohn58tQP zx<|4L<$rn%Jhop#v^LFqCV0)4q^PhEC8d(278RAoWG?WK8|Ikhe&f*d@R-VKjpE4R zy_m8YIvkl|F5>jcl%Vg2l#|Z$%xgU&61azl@zJD2 z*nz517g@OE*P^6ps=@p@Ch+m*cao*$zEKapLcicdN5Ec-xfAM!#Od6^*B&A|F<&eO zGNY4I!sHcO(k_ISmNKw}n#@QXTs{5@dNf4dgLp%2my7gXFrV=f!m7A(RO6}4g}pjf zle(e_WBLuL&L>Fesv-UT z*K0K|x#8+AWln=R*6)ZWXO&NS1_veaI2Zrku%n@H*d)Y6fx3AA;hpi>Q$Yj# zitN#fwiiYLO!CddCJY?T)2r9jUF?poP3B^wYlw5CtLWc(y_0_}iH zwQugrf140t-mcnP_fJ7>b4Wayc%WleT;k;XQfk|n^W$3j-z!A<02s;#q&W*bTw7Ch zM`KXyx%#F!TamPO~VI`zEr4N7t-TuN-8T&k=G}L>mHKv ztSsT&7L6)Mg&OvTgYorm@neVMqGMZ=$eto@la4-*zI{|2%uUuV6ZdbXe|ZyjubN9Z z!=}Y_dGxV~7$X4!&l*&2mHUpm$HL6YhtADDhXvhUHN;J|d?c4%7a)iWUX6{Lw_W+Z zvY-{Adi-pE28dTZ{e`WFsOxh&4zaX5NZpY&ASx&-#&mSDSw8m%`xyjWB!aLhO_FQ$n>*_a3?!kgfqICVX67 zm@QSNsvEsW1SzM^Scw`@hQoxqWw{rX`R`oi=4Il`IgDi@zfns1_imQ1`INno&o%9t z30!iV!+2jfbh#2Aw^n%=FX=#GH2y+u?V zn?y_bv19Y89U=Lg(rr}NmSe_z;O=CW*ceAe-d=pB=vtq8xp(a{==bnXnwE@f0@rF3 zI864R;bTL5P#baESfEK_Z1q;}B9HXjSBVsRnmSUrJ;#GOE8xnzx%nCVdS>1v!6)sj zFkj|XE^v&5N7N96#SCWe4pzp+t2}$vBanlSM8D1qZj&%6yA=RtoD=&(nOfSP)V=SU z7u}Uxd-++(+)EtdZaw&nfFT`Mw{+GMHuyrW4(8;b86^kFYYty!%DS6ub+1`g2-%#; zcrPifG`Hm_s;+0IqGa9a)x|FTT3d@x-HrYV-{p5%#tt_J-^Qh> zZT7}d4RY(zgNX4#Hsk3%{PQehQVs`)1W?mF{1SQ?9;Bge7|2L)8_4P!m97&(%Sim<}~d>54WisE_Q>-p^32&;kwz= z1uoA#ck_~7L<@)i^*ogFtv%l%ajwPq9H40f`+n^7yFBCPL^QAuK1w+xc-e?B}hnbkE+?l2h&VbZsJu`d)*j&uyk6 z(6^%^LQqjt!!wn=Dna#|_CITGj*(V%)Lp-F-q~y*w44JyP$=y7Np^HuEjS3^rW_0c z@$b<60a0GRMj5>yEHA-$pM4JF8HifAF#Rn{*$lbQH-e5rC(z3-QBzcIIY3U_R~?rU z*eHDYKMD(xbUw?67Me59KdCO3n#34#`wG;~wGgG|&`NRVQor43aiR21q^@{a-R~Kp z*LGMOI#rI~W*jl=B*6USC>Ev6qnYQsWn45*s^8(X`jA5H@Vr;xClH!le!>cCohQzR z9DxxI2|YGqeIE{a7tK11t<(>{FmV2+?B}6W++9!MLOfN_9C6Od#7S1#o;0Z+C^Fai ztx7sJjMKF9`V$qZs)h}*&HjMhVT--ZP!!xQKHs?JQFAxAn6NL=Rig#WAn9r(!LRue zCZPV^=(0~Sf-sTXAk1mbt|HXnpTz|>dKPa0J6KzdINT*}jBOf%Y(rij=qt>cv=-Ci z^=D?VC9xU9gw8VHxnPo9@drN5+|8f7YlJ;X+HMe&F!5}Hl9|)7CYmtCXKh9Yw1)qC zO9swsi~M0{rWx2-8S5({P3(Xg_%(27PXDwb(lEG9%#Vrl|2bRVrD$>m$ZE@G^}E0A zyA=&qxfS+v)1o=uSZKGeD>QUAFn2ed{#ssW@SOy$&uN+rGW?v{zoOj<_&rHF#vgz` zle&G5)BMP7g>7wB+c@4xcl!jS*--<|_fPYXmhpy#?L&xWf%4PB=1AHO-QHViPzRu! z0M56DPXq5HZoNs-!~)>)CwQ}E{UUuM=Iu?3h7H+)rL{0?GsYh^4*@|ymcZdkt$*-< F{tH}$XDt8# diff --git a/tests/resources/xlsx/file_with_no_sheets_in_workbook_xml.xlsx b/tests/resources/xlsx/file_with_no_sheets_in_workbook_xml.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..74de5277f2c8b64188bc2dbc8d2b67a5365cfbfc GIT binary patch literal 3704 zcmai12{_bi7a#i+#x@}_mI-AUgUQxr8QWOKwIqs*Om?HOM`|XfE4yrkx^AdTma%55 zG+m`|6Q*0)%QBW+hLk1WU$R8s_r1^ae`cPU^E>A~zxVf^bN&`4EUZuv2m}U&Un(HT-gsG|xSDR=!zZnp)=T zQzqQ*QizBU?Pq-J3cB;@1xTTi6p!ywUv2P6)3 zq)*p{f8wdq3?~a0D_)>y$NVfO^2^jils-zS-}!BEqD zA9qIMe$VB3h}76sYc_wAnKs6;NL1z<>{yaUov^(|fu{x3v;r}!*!{8i#GO(*aTrP+ z<(y;+JN+C(*7bEhAf4@z-Kq`q-y^_QLzfCuC>wH=MZrJiCX6qj8E))go94Ih!Xh7H z^(Vwki1>T3=X}`2s%Szmf&Vp5o#V#~FN^M!@1yU!a>NU1E_JVzLX z%DGd1S7UaR76c0!e$m6`mken3Bc`C5+cEQBL4TC_UCEU4iq})JW8!I_T>3X>ATiaPsM(sDy zb)Vxda16etv}koeZvbqwi`lCX+Y*X1os%^xZhK;Ij$N*q@{E8&akW#^mNc2lzWCmV zBFeL{gFBX($7Ih14y@?1d@+1*eiUyPl5E@j=H@?f7ffFsqSgO(tFAdTwwI#NJ}D`6 zlJO20!p4TTjJrKSA4lZNvf)Q zIe`%n@ETSmo-tHN0TsV{m`*D~Xnizqqf!UzhdfK_Tshhbd#!&v%PQ@JR!a4dpQYXF zV8ev>8_X1|Tja2WR1}nyV4Sayiq+rmC2{T!=QnFob(XK-Zd8eF_=hxnEG9C#HKEr7 z=0dmkuKD~yu|F&E@I6VtW|pV7k(K3qVyTubn8l%5V+l4g1eMXR+ zTyDQ`u1s&Qw67BGX!r z6Z`6UA)_%ez5G{sny+-*j`D;|1=>F-w(YT^%fBKYbvZUBiaGw6V9k+df2-(8ILD9L z{B!PcQyjoTE%2;17l1T>cV`05)hdAC;p^tVN=9Iu&hE!ZUj2DeFu1%r8iCIB5F*LG zWQjcM&C;6NYKq3=yYDlmV7H!nE!t*v38-E{3x0}YibCxdcdzh!=nyPxp%7o6DUcJT zt}@#b)S(hnfS^PfyJ~v1QX|{XT@%b*IAR%ZE;sxOBe^~FCa-kcf>X;chC<-Z#NE+R z_KHv%DP4T!je3z+-6H76?VrRHjA#PaY6v(??(f?V2=&IT?c34}PnFlcoJ7F|n$To_C34 zk}l$3qzAW28Q;r02WFd+n3)^8Vfjty%G(S>h`^ye)5&1wv}l(z^^@Tu@O)=;{gQbOUfVk% zBf8mNM=rJ#v!07spu+rKnjQW-nNi-|#;1AM*dqSOq(2&fF>xe&~B9v zaJvjVJZl_py()3>W~=HjSX0NJ2sGxR+TgBP5GA{+(KX+7uy2w(tu`nqfjHk#78-g7 zCj3OjrnHq{VWb$GM&6AW=@W+5kh#|%$86T%n2Lo|%QMWRnT*-a#9HN4R>050 z_hOCc9sd8y$Z%awJNP@@6&>44cepuKsh;Mf1|1ORm~Zfzem2QKsd$ClizmnO>c4KtW@jydY^LPzj>83PZ&XH$YQcek#EQTmknj%MZxH3xTarra zF~5w2KAH{_=p)WXjDOBhHgVqTbAf4&Nrc8NRgqg{-q#zyw>&1vze()Mzw)yenYd;pY6TL zD>Bmez?@kj$547ur;MZKN%gzD*q)??c8|(iz9Q#u6rI2ZSY%5IIlo7ekBZhBaJ~zN zyp7}>3@FhJkLU}!EjvAs98&5bmP14FjGHK(-;T_r1xMCna>sX@R=>UM{OxY0+WbLpAwE8=luLKY@W%uts9J{1G!^=(xVS3iC- zBX2s`sf^W$Z+)%k-~bJ*_mj<_Kbrv8F;&Fq4a~QO^1p(uj!NrmMdvzD&$bLsKgQcW zI<3R1h>y0yz7JBHLD%Nh?bFC6x)5lq8$s7+lTG+ef>zt``dZPM2Hr%zHRbi)?gadp zBtP3Xz|TqD-lo^%R1r0|!nRhTZ5*$qyS>3}`Y{B~_h;>HGvhT6+k4EW2X>A%4;vk4 zEA(fja}(+bR1+Zh*6nYCcM`W+q}JDp&QXBJ@8FG+wNColp0^jRb!-*UV+VG#gjtxd TebYPy1Ob@?hYMYOp8@>`O9OBp literal 0 HcmV?d00001