From a804be4844bb3038a3e890730dec736ca25fce75 Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Fri, 8 Jan 2016 08:38:49 -0800 Subject: [PATCH] Support XLSX that are defined in random order Some software generate [Content_Types].xml file with sheets definition in random order. Instead of having the first sheet (id = 1) defined first, it may be defined in 3rd position. Therefore, to read the file in the correct order, sheets order need to be fixed. --- src/Spout/Reader/ODS/Sheet.php | 6 +- src/Spout/Reader/XLSX/Helper/SheetHelper.php | 62 ++++++------------ src/Spout/Reader/XLSX/Sheet.php | 6 +- src/Spout/Writer/Common/Sheet.php | 6 +- tests/Spout/Reader/XLSX/ReaderTest.php | 22 +++++++ ...th_sheets_definition_in_reverse_order.xlsx | Bin 0 -> 4253 bytes 6 files changed, 52 insertions(+), 50 deletions(-) create mode 100644 tests/resources/xlsx/two_sheets_with_sheets_definition_in_reverse_order.xlsx diff --git a/src/Spout/Reader/ODS/Sheet.php b/src/Spout/Reader/ODS/Sheet.php index 65f3183..c78e4aa 100644 --- a/src/Spout/Reader/ODS/Sheet.php +++ b/src/Spout/Reader/ODS/Sheet.php @@ -19,7 +19,7 @@ class Sheet implements SheetInterface /** @var int ID of the sheet */ protected $id; - /** @var int Index of the sheet, based on order of creation (zero-based) */ + /** @var int Index of the sheet, based on order in the workbook (zero-based) */ protected $index; /** @var string Name of the sheet */ @@ -27,7 +27,7 @@ class Sheet implements SheetInterface /** * @param XMLReader $xmlReader XML Reader, positioned on the "" element - * @param int $sheetIndex Index of the sheet, based on order of creation (zero-based) + * @param int $sheetIndex Index of the sheet, based on order in the workbook (zero-based) * @param string $sheetName Name of the sheet */ public function __construct($xmlReader, $sheetIndex, $sheetName) @@ -48,7 +48,7 @@ class Sheet implements SheetInterface /** * @api - * @return int Index of the sheet, based on order of creation (zero-based) + * @return int Index of the sheet, based on order in the workbook (zero-based) */ public function getIndex() { diff --git a/src/Spout/Reader/XLSX/Helper/SheetHelper.php b/src/Spout/Reader/XLSX/Helper/SheetHelper.php index b1c393e..3400509 100644 --- a/src/Spout/Reader/XLSX/Helper/SheetHelper.php +++ b/src/Spout/Reader/XLSX/Helper/SheetHelper.php @@ -13,9 +13,6 @@ use Box\Spout\Reader\XLSX\Sheet; */ class SheetHelper { - /** Extension for XML files */ - const XML_EXTENSION = '.xml'; - /** Paths of XML files relative to the XLSX file root */ const CONTENT_TYPES_XML_FILE_PATH = '[Content_Types].xml'; const WORKBOOK_XML_RELS_FILE_PATH = 'xl/_rels/workbook.xml.rels'; @@ -79,9 +76,15 @@ class SheetHelper $sheetNode = $sheetNodes[$i]; $sheetDataXMLFilePath = $sheetNode->getAttribute('PartName'); - $sheets[] = $this->getSheetFromXML($sheetDataXMLFilePath, $i); + $sheets[] = $this->getSheetFromXML($sheetDataXMLFilePath); } + // make sure the sheets are sorted by index + // (as the sheets are not necessarily in this order in the XML file) + usort($sheets, function ($sheet1, $sheet2) { + return ($sheet1->getIndex() - $sheet2->getIndex()); + }); + return $sheets; } @@ -91,60 +94,37 @@ class SheetHelper * Then we look at "xl/worbook.xml" to find the sheet entry associated to the found ID. * The entry contains the ID and name of the sheet. * - * If this piece of data can't be found by parsing the different XML files, the ID will default - * to the sheet index, based on order in [Content_Types].xml. Similarly, the sheet's name will - * default to the data sheet XML file name ("xl/worksheets/sheet2.xml" => "sheet2"). - * * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml - * @param int $sheetIndexZeroBased Index of the sheet, based on order in [Content_Types].xml (zero-based) * @return \Box\Spout\Reader\XLSX\Sheet Sheet instance */ - protected function getSheetFromXML($sheetDataXMLFilePath, $sheetIndexZeroBased) + protected function getSheetFromXML($sheetDataXMLFilePath) { - $sheetName = $this->getDefaultSheetName($sheetDataXMLFilePath); - - /* - * In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" - * In workbook.xml.rels, it is only "worksheets/sheet1.xml" - */ + // In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" + // In workbook.xml.rels, it is only "worksheets/sheet1.xml" $sheetDataXMLFilePathInWorkbookXMLRels = ltrim($sheetDataXMLFilePath, '/xl/'); // find the node associated to the given file path $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement(); $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]'); + $relationshipNode = $relationshipNodes[0]; - if (count($relationshipNodes) === 1) { - $relationshipNode = $relationshipNodes[0]; - $sheetId = $relationshipNode->getAttribute('Id'); + $relationshipSheetId = $relationshipNode->getAttribute('Id'); - $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); - $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $sheetId . '"]'); + $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); + $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $relationshipSheetId . '"]'); + $sheetNode = $sheetNodes[0]; - if (count($sheetNodes) === 1) { - $sheetNode = $sheetNodes[0]; - $escapedSheetName = $sheetNode->getAttribute('name'); + $escapedSheetName = $sheetNode->getAttribute('name'); + $sheetIdOneBased = $sheetNode->getAttribute('sheetId'); + $sheetIndexZeroBased = $sheetIdOneBased - 1; - /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ - $escaper = new \Box\Spout\Common\Escaper\XLSX(); - $sheetName = $escaper->unescape($escapedSheetName); - } - } + /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ + $escaper = new \Box\Spout\Common\Escaper\XLSX(); + $sheetName = $escaper->unescape($escapedSheetName); return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $sheetIndexZeroBased, $sheetName); } - /** - * Returns the default name of the sheet whose data is located - * at the given path. - * - * @param string $sheetDataXMLFilePath Path of the sheet data XML file - * @return string The default sheet name - */ - protected function getDefaultSheetName($sheetDataXMLFilePath) - { - return $this->globalFunctionsHelper->basename($sheetDataXMLFilePath, self::XML_EXTENSION); - } - /** * Returns a representation of the workbook.xml.rels file, ready to be parsed. * The returned value is cached. diff --git a/src/Spout/Reader/XLSX/Sheet.php b/src/Spout/Reader/XLSX/Sheet.php index 88128e4..85a4dc9 100644 --- a/src/Spout/Reader/XLSX/Sheet.php +++ b/src/Spout/Reader/XLSX/Sheet.php @@ -15,7 +15,7 @@ class Sheet implements SheetInterface /** @var \Box\Spout\Reader\XLSX\RowIterator To iterate over sheet's rows */ protected $rowIterator; - /** @var int Index of the sheet, based on order of creation (zero-based) */ + /** @var int Index of the sheet, based on order in the workbook (zero-based) */ protected $index; /** @var string Name of the sheet */ @@ -25,7 +25,7 @@ class Sheet implements SheetInterface * @param string $filePath Path of the XLSX file being read * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml * @param Helper\SharedStringsHelper Helper to work with shared strings - * @param int $sheetIndex Index of the sheet, based on order of creation (zero-based) + * @param int $sheetIndex Index of the sheet, based on order in the workbook (zero-based) * @param string $sheetName Name of the sheet */ public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $sheetIndex, $sheetName) @@ -46,7 +46,7 @@ class Sheet implements SheetInterface /** * @api - * @return int Index of the sheet, based on order of creation (zero-based) + * @return int Index of the sheet, based on order in the workbook (zero-based) */ public function getIndex() { diff --git a/src/Spout/Writer/Common/Sheet.php b/src/Spout/Writer/Common/Sheet.php index c46c876..f6a966b 100644 --- a/src/Spout/Writer/Common/Sheet.php +++ b/src/Spout/Writer/Common/Sheet.php @@ -24,7 +24,7 @@ class Sheet /** @var array Associative array [SHEET_INDEX] => [SHEET_NAME] keeping track of sheets' name to enforce uniqueness */ protected static $SHEETS_NAME_USED = []; - /** @var int Index of the sheet, based on order of creation (zero-based) */ + /** @var int Index of the sheet, based on order in the workbook (zero-based) */ protected $index; /** @var string Name of the sheet */ @@ -34,7 +34,7 @@ class Sheet protected $stringHelper; /** - * @param int $sheetIndex Index of the sheet, based on order of creation (zero-based) + * @param int $sheetIndex Index of the sheet, based on order in the workbook (zero-based) */ public function __construct($sheetIndex) { @@ -45,7 +45,7 @@ class Sheet /** * @api - * @return int Index of the sheet, based on order of creation (zero-based) + * @return int Index of the sheet, based on order in the workbook (zero-based) */ public function getIndex() { diff --git a/tests/Spout/Reader/XLSX/ReaderTest.php b/tests/Spout/Reader/XLSX/ReaderTest.php index c774749..2802d84 100644 --- a/tests/Spout/Reader/XLSX/ReaderTest.php +++ b/tests/Spout/Reader/XLSX/ReaderTest.php @@ -73,6 +73,28 @@ class ReaderTest extends \PHPUnit_Framework_TestCase } } + /** + * @return void + */ + public function testReadShouldSupportSheetsDefinitionInRandomOrder() + { + $allRows = $this->getAllRowsForFile('two_sheets_with_sheets_definition_in_reverse_order.xlsx'); + + $expectedRows = [ + ['s1 - A1', 's1 - B1', 's1 - C1', 's1 - D1', 's1 - E1'], + ['s1 - A2', 's1 - B2', 's1 - C2', 's1 - D2', 's1 - E2'], + ['s1 - A3', 's1 - B3', 's1 - C3', 's1 - D3', 's1 - E3'], + ['s1 - A4', 's1 - B4', 's1 - C4', 's1 - D4', 's1 - E4'], + ['s1 - A5', 's1 - B5', 's1 - C5', 's1 - D5', 's1 - E5'], + ['s2 - A1', 's2 - B1', 's2 - C1', 's2 - D1', 's2 - E1'], + ['s2 - A2', 's2 - B2', 's2 - C2', 's2 - D2', 's2 - E2'], + ['s2 - A3', 's2 - B3', 's2 - C3', 's2 - D3', 's2 - E3'], + ['s2 - A4', 's2 - B4', 's2 - C4', 's2 - D4', 's2 - E4'], + ['s2 - A5', 's2 - B5', 's2 - C5', 's2 - D5', 's2 - E5'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + /** * @return void */ diff --git a/tests/resources/xlsx/two_sheets_with_sheets_definition_in_reverse_order.xlsx b/tests/resources/xlsx/two_sheets_with_sheets_definition_in_reverse_order.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..adfce3f8ec064f8889d347b9c8e94657d9337d34 GIT binary patch literal 4253 zcmds4c{r478=o;`%94FMjD1PP7$ZarGZ~CAlNP%Z8X;z^*`mgpvXe^a_>#(+C5jYH zMBzl1lS4>lHz(UP)8f3+Wa~TU-|zaK>w2$wuX*p^{oMEOdG6=_Jp?R-T?hmMK|$Pw z%El;{JMB7P5J;2*1QG=PwMCIhengVruZM%Yh`#n}fu0@>X$Dv-T<9q+d?4qos%NDX zI=7!U93!RkyFhA>Rw3?ji1(?1*O#he&yD7xLR_Jti;FZ%Q=2wsaclS4ALj7I&a}<$jYM93g&$AWsS&f$$#*9RnN_IG?d+Q_ zK6>+xwKNQ=jdV;lgV{YZjW#4Xswif;Wwq{y`3ehi+#iz**-`e+K^ZB(kefKcM9-es z1U=H+D_>CLNvT>;n+cR2hPu-;N3{=>PGsghcZKOk&EN(uMJ6SOsP1e@4GS(PVBZpK zGAW1na^#tsageHO>2JH(927;Mq8J7WpI7ou=aEvFh(3O7pLG0t!}=8aqtvw5trE5L zBi%pv*vQ#tfRE#3XMzveYdvj32fGJqfzP{veayGo%+bqhX#-~^)t_h6;6u~6zM`7! zHKz^G2>I|p2e0lB<6tCwjMrp4lU{l6*2t8tfBp6RNoIzTKgB}cv0$X!X#0M=?3>cA zM_)+Qo%vHQhg48Q&?B4KP8Q%>f{14G%ErZQPmH`bsWz83`5=+p z?bP(oy5KSfDIwc$T3Fvk3u?)feEwu4njDzP{9_tZr~ zVEdI0Qo|ar>d_jg&gBOF=A$@MlMGRw+7pcP!a=6fVzP=~<*)Q>fH_07lq@mh58@bu zXTV(pEDksBDhR+~pvS*#JHX&))ta-XF{mHF_XE``vFvBrEP(8D(uqtyxwIbNinlhU z)tw6GK1+*u7|~wqoq`gVC#gHXnCRYf);moHSsMTL%iL6fM(5@$%fxiR1m<3%4f9>|A#f&k`;^5w|X~g;bEP&5+QW?!Hp|{QUAC{t?tqqYg^k zj2O3!t!V5X7s$0BS;-fVG6JzGdljU8>~qM25j}2ww-h!vAB{d#rb z;l*-`(8KO?uhP}%EOM8Ru-o&G8{+{os)1*z%K+T@x;pw0k00{$aU-4ZUE;()-eAjP zINt$!cmTA#Dpn1h>n0km`~ni`;sI&RZ8bxa$$j@`X|QWgPkyq@>=M+BL5nQJgQJiN z(ykR=kL&}K2|E%VToufT(nienpL&V7kgrx6WpZ5iWGgkY-8)qzm#J@&V6OW9;A~2J zP&S`p8}nGpL5wK0Gigg~l+8||?(j>}i?6hcPS$(^tqtGGG=D@FsI7Ovn(}-P-!I66 zxE!|yj!c!|uEqDB+23bb2Fr6ZwV4-i8}aWbMrv#p5Mw8GP4*-r$1KZyW@a_Rwho^T zt%#PAK~LQoEV&S;S@*0<;%fLA^7BgpZE_}sH@u-7G?|(CamKd9PVtZK;Bz-b#JFAy zFt*A%b%kH_9@0PNl*V<`B0Pt`oioXQQr%iU-FiW(u(RpVzI5|qNsHqb|CCo$ADN4) zuD@ybnaJs8L2PVDL%BH~(>B0rsuMcgU-BqE*V`?s?F2a|8q<3j!z-vI{2>L(mLBWm z{9x*|gnXW(`GFETC!gibz!Af&uOnyLDVfhC2*?nx7dXA&Q)bKi+W2+#ust@rcyHXV z3LP2b#!Lu{&0dFD?Nf~ov9)0qa%FcHUz^o6E0Lz=3_ugyxpV1x+wBMDnDqN%^UQgn zo-cM^z-u{HtIp0+C)ALDV(H_WlV+ce_t<*Cv8h==sHtxba`-|* zlvS>5e&XlQD|t8s$GgX((+QX=rfDjpZb$O%h~R@1gK=GSum4}wp<#h+@(qXn5)?e; zwV%>8^R+G1=k+5ETD(A#A3PmV)or@ZX{KyY+(a8P^7cYy&&+@(*o=(NqZGmR1E4@Vh@M(h1#O^Im?CamW9lO(+R@H?v>_o>U;23Zl! zo}F@JL!Jet;`@)aEbqOK^Ih0Mn;f{A{tu@Q?@IQ*u0*2W`o?DbX;*+hz)}$u1QOd| z`wgP8R8SYY`^_WZLUkWP1P3T{VG|!OYhWG4Jww6sV2N%Xxr*E(^MTh1;^h~TeH*1> z{@Olw2JG=^&s_7RQxD7Y6vliG2zW}=&{|3rXk_&ZCsIEC*5WYVJ(}{_6<^sk4AHZu z57`!>1UQDV9no--$~>-s^WCLWok|Y6R@%4t@cqfmcDK9NND{#rMMv>|ge+N6$6Ch<6?G%D}6f%VPwY-9lcR z?1!k_7`UVhdpJK_o~qHAPeYHt2FW2`PQpq7Oax;WHs!&MqZh8IokC?a|K-jXr>>N zvSjB9d?P_i8jCd+4QAQDD=t>F8v$#R6asFhs}vXuwjAbq4aS8%a#IB9to0o3~Apy(rt#Y;k8XyhW KBD35}xW54^Hc^ED literal 0 HcmV?d00001