From e9ec4e745c7ca9dc9d9e5340c9a928f3ad56bf4b Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Wed, 29 Apr 2015 00:16:03 -0700 Subject: [PATCH] Expose a Sheet object on Reader::XLSX::nextSheet() Added Sheet class for the XLSX reader that exposes basic sheet info, such as name or ID. When retrieving the sheet data XML, added extra XML parsing to retrieve sheet data. Added test --- .../Common/Helper/GlobalFunctionsHelper.php | 21 ++- .../Reader/Helper/XLSX/WorksheetHelper.php | 155 ++++++++++++++++-- src/Spout/Reader/Internal/XLSX/Worksheet.php | 17 +- src/Spout/Reader/Sheet.php | 57 +++++++ src/Spout/Reader/XLSX.php | 36 ++-- tests/Spout/Reader/XLSXTest.php | 26 +++ .../xlsx/two_sheets_with_custom_names.xlsx | Bin 0 -> 4258 bytes 7 files changed, 279 insertions(+), 33 deletions(-) create mode 100644 src/Spout/Reader/Sheet.php create mode 100644 tests/resources/xlsx/two_sheets_with_custom_names.xlsx diff --git a/src/Spout/Common/Helper/GlobalFunctionsHelper.php b/src/Spout/Common/Helper/GlobalFunctionsHelper.php index 5f22ed6..550a6b1 100644 --- a/src/Spout/Common/Helper/GlobalFunctionsHelper.php +++ b/src/Spout/Common/Helper/GlobalFunctionsHelper.php @@ -119,7 +119,7 @@ class GlobalFunctionsHelper * Wrapper around global function file_exists() * @see file_exists() * - * @param string $filename + * @param string $fileName * @return bool */ public function file_exists($fileName) @@ -127,11 +127,23 @@ class GlobalFunctionsHelper return file_exists($fileName); } + /** + * Wrapper around global function file_get_contents() + * @see file_get_contents() + * + * @param string $filePath + * @return string|false The file contents, or false on failure + */ + public function file_get_contents($filePath) + { + return file_get_contents($filePath); + } + /** * Wrapper around global function is_readable() * @see is_readable() * - * @param string $filename + * @param string $fileName * @return bool */ public function is_readable($fileName) @@ -144,11 +156,12 @@ class GlobalFunctionsHelper * @see basename() * * @param string
$path + * @param string|void $suffix * @return string */ - public function basename($path) + public function basename($path, $suffix = null) { - return basename($path); + return basename($path, $suffix); } /** diff --git a/src/Spout/Reader/Helper/XLSX/WorksheetHelper.php b/src/Spout/Reader/Helper/XLSX/WorksheetHelper.php index a105e01..f531b37 100644 --- a/src/Spout/Reader/Helper/XLSX/WorksheetHelper.php +++ b/src/Spout/Reader/Helper/XLSX/WorksheetHelper.php @@ -3,6 +3,7 @@ namespace Box\Spout\Reader\Helper\XLSX; use Box\Spout\Reader\Internal\XLSX\Worksheet; +use Box\Spout\Reader\Sheet; /** * Class WorksheetHelper @@ -12,11 +13,18 @@ use Box\Spout\Reader\Internal\XLSX\Worksheet; */ class WorksheetHelper { - /** Path of Content_Types XML file inside the XLSX file */ - const CONTENT_TYPES_XML_FILE_PATH = '[Content_Types].xml'; + /** Extension for XML files */ + const XML_EXTENSION = '.xml'; - /** Main namespace for the [Content_Types].xml file */ + /** Paths of XML files relative to the XLSX file root */ + const CONTENT_TYPES_XML_FILE_PATH = '[Content_Types].xml'; + const WORKBOOK_XML_RELS_FILE_PATH = 'xl/_rels/workbook.xml.rels'; + const WORKBOOK_XML_FILE_PATH = 'xl/workbook.xml'; + + /** Namespaces for the XML files */ const MAIN_NAMESPACE_FOR_CONTENT_TYPES_XML = 'http://schemas.openxmlformats.org/package/2006/content-types'; + const MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS = 'http://schemas.openxmlformats.org/package/2006/relationships'; + const MAIN_NAMESPACE_FOR_WORKBOOK_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; /** Value of the Override attribute used in [Content_Types].xml to define worksheets */ const OVERRIDE_CONTENT_TYPES_ATTRIBUTE = 'application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml'; @@ -24,12 +32,23 @@ class WorksheetHelper /** @var string Path of the XLSX file being read */ protected $filePath; + /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */ + protected 
$globalFunctionsHelper; + + /** @var \SimpleXMLElement XML element representing the workbook.xml.rels file */ + protected $workbookXMLRelsAsXMLElement; + + /** @var \SimpleXMLElement XML element representing the workbook.xml file */ + protected $workbookXMLAsXMLElement; + /** * @param string $filePath Path of the XLSX file being read + * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper */ - public function __construct($filePath) + public function __construct($filePath, $globalFunctionsHelper) { $this->filePath = $filePath; + $this->globalFunctionsHelper = $globalFunctionsHelper; } /** @@ -42,23 +61,139 @@ class WorksheetHelper { $worksheets = []; - $xmlContents = file_get_contents('zip://' . $this->filePath . '#' . self::CONTENT_TYPES_XML_FILE_PATH); - - $contentTypes = new \SimpleXMLElement($xmlContents); - $contentTypes->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_CONTENT_TYPES_XML); + $contentTypesAsXMLElement = $this->getFileAsXMLElementWithNamespace( + self::CONTENT_TYPES_XML_FILE_PATH, + self::MAIN_NAMESPACE_FOR_CONTENT_TYPES_XML + ); // find all nodes defining a worksheet - $sheetNodes = $contentTypes->xpath('//ns:Override[@ContentType="' . self::OVERRIDE_CONTENT_TYPES_ATTRIBUTE . '"]'); + $sheetNodes = $contentTypesAsXMLElement->xpath('//ns:Override[@ContentType="' . self::OVERRIDE_CONTENT_TYPES_ATTRIBUTE . '"]'); for ($i = 0; $i < count($sheetNodes); $i++) { $sheetNode = $sheetNodes[$i]; $sheetDataXMLFilePath = (string) $sheetNode->attributes()->PartName; - $worksheets[] = new Worksheet($i, $sheetDataXMLFilePath); + + $sheet = $this->getSheet($sheetDataXMLFilePath, $i); + $worksheets[] = new Worksheet($sheet, $i, $sheetDataXMLFilePath); } return $worksheets; } + /** + * Returns an instance of a sheet, given the path of its data XML file. + * We first look at "xl/_rels/workbook.xml.rels" to find the relationship ID of the sheet. + * Then we look at "xl/workbook.xml" to find the sheet entry associated to the found ID.
+ * The entry contains the ID and name of the sheet. + * + * If this piece of data can't be found by parsing the different XML files, the ID will default + * to the sheet index, based on order in [Content_Types].xml. Similarly, the sheet's name will + * default to the data sheet XML file name ("xl/worksheets/sheet2.xml" => "sheet2"). + * + * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml + * @param int $sheetNumberZeroBased Index of the sheet, based on order in [Content_Types].xml (zero-based) + * @return \Box\Spout\Reader\Sheet Sheet instance + */ + protected function getSheet($sheetDataXMLFilePath, $sheetNumberZeroBased) + { + $sheetId = $sheetNumberZeroBased + 1; + $sheetName = $this->getDefaultSheetName($sheetDataXMLFilePath); + + /* + * In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" + * In workbook.xml.rels, it is only "worksheets/sheet1.xml" + */ + $sheetDataXMLFilePathInWorkbookXMLRels = ltrim($sheetDataXMLFilePath, '/xl/'); + + // find the node associated to the given file path + $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement(); + $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]'); + + if (count($relationshipNodes) === 1) { + $relationshipNode = $relationshipNodes[0]; + $sheetId = (string) $relationshipNode->attributes()->Id; + + $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); + $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $sheetId . 
'"]'); + + if (count($sheetNodes) === 1) { + $sheetNode = $sheetNodes[0]; + $sheetId = (int) $sheetNode->attributes()->sheetId; + $escapedSheetName = (string) $sheetNode->attributes()->name; + + $escaper = new \Box\Spout\Common\Escaper\XLSX(); + $sheetName = $escaper->unescape($escapedSheetName); + } + } + + return new Sheet($sheetId, $sheetNumberZeroBased, $sheetName); + } + + /** + * Returns the default name of the sheet whose data is located + * at the given path. + * + * @param string $sheetDataXMLFilePath + * @return string The default sheet name + */ + protected function getDefaultSheetName($sheetDataXMLFilePath) + { + return $this->globalFunctionsHelper->basename($sheetDataXMLFilePath, self::XML_EXTENSION); + } + + /** + * Returns a representation of the workbook.xml.rels file, ready to be parsed. + * The returned value is cached. + * + * @return \SimpleXMLElement XML element representing the workbook.xml.rels file + */ + protected function getWorkbookXMLRelsAsXMLElement() + { + if (!$this->workbookXMLRelsAsXMLElement) { + $this->workbookXMLRelsAsXMLElement = $this->getFileAsXMLElementWithNamespace( + self::WORKBOOK_XML_RELS_FILE_PATH, + self::MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS + ); + } + + return $this->workbookXMLRelsAsXMLElement; + } + + /** + * Returns a representation of the workbook.xml file, ready to be parsed. + * The returned value is cached. + * + * @return \SimpleXMLElement XML element representing the workbook.xml file + */ + protected function getWorkbookXMLAsXMLElement() + { + if (!$this->workbookXMLAsXMLElement) { + $this->workbookXMLAsXMLElement = $this->getFileAsXMLElementWithNamespace( + self::WORKBOOK_XML_FILE_PATH, + self::MAIN_NAMESPACE_FOR_WORKBOOK_XML + ); + } + + return $this->workbookXMLAsXMLElement; + } + + /** + * Loads the contents of the given file in an XML parser and registers the given XPath namespace.
+ * + * @param string $xmlFilePath The path of the XML file inside the XLSX file + * @param string $mainNamespace The main XPath namespace to register + * @return \SimpleXMLElement The XML element representing the file + */ + protected function getFileAsXMLElementWithNamespace($xmlFilePath, $mainNamespace) + { + $xmlContents = $this->globalFunctionsHelper->file_get_contents('zip://' . $this->filePath . '#' . $xmlFilePath); + + $xmlElement = new \SimpleXMLElement($xmlContents); + $xmlElement->registerXPathNamespace('ns', $mainNamespace); + + return $xmlElement; + } + /** * Returns whether another worksheet exists after the current worksheet. * The order is determined by the order of appearance in the [Content_Types].xml file. diff --git a/src/Spout/Reader/Internal/XLSX/Worksheet.php b/src/Spout/Reader/Internal/XLSX/Worksheet.php index 552c53e..4fd6ca2 100644 --- a/src/Spout/Reader/Internal/XLSX/Worksheet.php +++ b/src/Spout/Reader/Internal/XLSX/Worksheet.php @@ -10,18 +10,23 @@ namespace Box\Spout\Reader\Internal\XLSX; */ class Worksheet { + /** @var \Box\Spout\Reader\Sheet The "external" sheet */ + protected $externalSheet; + /** @var int Worksheet number, based on the order of appareance in [Content_Types].xml (zero-based) */ protected $worksheetNumber; /** @var string Path of the XML file containing the worksheet data */ protected $dataXmlFilePath; - /** + /**\ + * @param \Box\Spout\Reader\Sheet $externalSheet The associated "external" sheet * @param int $worksheetNumber Worksheet number, based on the order of appareance in [Content_Types].xml (zero-based) * @param string $dataXmlFilePath Path of the XML file containing the worksheet data */ - public function __construct($worksheetNumber, $dataXmlFilePath) + public function __construct($externalSheet, $worksheetNumber, $dataXmlFilePath) { + $this->externalSheet = $externalSheet; $this->worksheetNumber = $worksheetNumber; $this->dataXmlFilePath = $dataXmlFilePath; } @@ -34,6 +39,14 @@ class Worksheet return 
ltrim($this->dataXmlFilePath, '/'); } + /** + * @return \Box\Spout\Reader\Sheet The "external" sheet + */ + public function getExternalSheet() + { + return $this->externalSheet; + } + /** * @return int */ diff --git a/src/Spout/Reader/Sheet.php b/src/Spout/Reader/Sheet.php new file mode 100644 index 0000000..5b6b0ab --- /dev/null +++ b/src/Spout/Reader/Sheet.php @@ -0,0 +1,57 @@ +id = $sheetId; + $this->number = $sheetNumber; + $this->name = $sheetName; + } + + /** + * @return int ID of the sheet + */ + public function getId() + { + return $this->id; + } + + /** + * @return int Number of the sheet, based on order of creation (zero-based) + */ + public function getNumber() + { + return $this->number; + } + + /** + * @return string Name of the sheet + */ + public function getName() + { + return $this->name; + } +} diff --git a/src/Spout/Reader/XLSX.php b/src/Spout/Reader/XLSX.php index db46707..2fa85f0 100644 --- a/src/Spout/Reader/XLSX.php +++ b/src/Spout/Reader/XLSX.php @@ -76,7 +76,7 @@ class XLSX extends AbstractReader $this->extractSharedStrings($filePath); // Fetch all available worksheets - $this->worksheetHelper = new WorksheetHelper($filePath); + $this->worksheetHelper = new WorksheetHelper($filePath, $this->globalFunctionsHelper); $this->worksheets = $this->worksheetHelper->getWorksheets($filePath); if (count($this->worksheets) === 0) { @@ -119,29 +119,31 @@ class XLSX extends AbstractReader * Moves the pointer to the current worksheet. * Moving to another worksheet will stop the reading in the current worksheet. 
* - * @return void + * @return \Box\Spout\Reader\Sheet The next sheet * @throws Exception\ReaderNotOpenedException If the stream was not opened first * @throws Exception\EndOfWorksheetsReachedException If there is no more worksheets to read */ public function nextSheet() { - if ($this->hasNextSheet()) { - if ($this->currentWorksheet === null) { - $nextWorksheet = $this->worksheets[0]; - } else { - $currentWorksheetNumber = $this->currentWorksheet->getWorksheetNumber(); - $nextWorksheet = $this->worksheets[$currentWorksheetNumber + 1]; - } - - $this->initXmlReaderForWorksheetData($nextWorksheet); - $this->currentWorksheet = $nextWorksheet; - - // make sure that we are ready to read more rows - $this->hasReachedEndOfFile = false; - $this->emptyRowDataBuffer(); - } else { + if (!$this->hasNextSheet()) { throw new EndOfWorksheetsReachedException('End of worksheets was reached. Cannot read more worksheets.'); } + + if ($this->currentWorksheet === null) { + $nextWorksheet = $this->worksheets[0]; + } else { + $currentWorksheetNumber = $this->currentWorksheet->getWorksheetNumber(); + $nextWorksheet = $this->worksheets[$currentWorksheetNumber + 1]; + } + + $this->initXmlReaderForWorksheetData($nextWorksheet); + $this->currentWorksheet = $nextWorksheet; + + // make sure that we are ready to read more rows + $this->hasReachedEndOfFile = false; + $this->emptyRowDataBuffer(); + + return $this->currentWorksheet->getExternalSheet(); } /** diff --git a/tests/Spout/Reader/XLSXTest.php b/tests/Spout/Reader/XLSXTest.php index 23d26bc..3a1a34f 100644 --- a/tests/Spout/Reader/XLSXTest.php +++ b/tests/Spout/Reader/XLSXTest.php @@ -200,6 +200,32 @@ class XLSXTest extends \PHPUnit_Framework_TestCase $this->assertEquals([], $allRows, 'Sheet with no cells should be correctly processed.'); } + /** + * @return void + */ + public function testNextSheetShouldReturnCorrectSheetInfos() + { + $resourcePath = $this->getResourcePath('two_sheets_with_custom_names.xlsx'); + $reader = 
ReaderFactory::create(Type::XLSX); + $reader->open($resourcePath); + + /** @var \Box\Spout\Reader\Sheet[] $sheets */ + $sheets = []; + while ($reader->hasNextSheet()) { + $sheets[] = $reader->nextSheet(); + } + + $reader->close(); + + $this->assertEquals('CustomName1', $sheets[0]->getName()); + $this->assertEquals(0, $sheets[0]->getNumber()); + $this->assertEquals(1, $sheets[0]->getId()); + + $this->assertEquals('CustomName2', $sheets[1]->getName()); + $this->assertEquals(1, $sheets[1]->getNumber()); + $this->assertEquals(2, $sheets[1]->getId()); + } + /** * @param string $fileName * @return array All the read rows the given file diff --git a/tests/resources/xlsx/two_sheets_with_custom_names.xlsx b/tests/resources/xlsx/two_sheets_with_custom_names.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..722e2125d82ae5db6b37d31759f50047b180cb35 GIT binary patch literal 4258 zcmds4c{r478=o=Clzo@OSjLu$iOCWvOvo5BX|X$5rmQperHL}O6k&+69w(`+p%PMQ z;-pZNafpzzYd*%(Q2Abo5q;nJ_q)F5y58%WYu@K~KllB+pZmUlFAfc17XpDmP*7rq zc8Q8?r>7?GbG|?y_rd zi*LC;t3tjQVkSv`tM>4@ZT_Eo5}Sn^erwm4{{S&SmzxA7+1EA*R0$|Y(%glukCbFF zA2(G#NRc>svn;$N_`tK$;Lw_bdL+AjrynV6P2Kp+p1)a|B9ZE&rp5JXtjitYus=Xz zyOOk$ddg(YgPFx=C#UEdH3`0`SjSy^@jVVALYli?l}Sdoef=D7fC%Den!xM_Lg(f* zy48%OOw<@13a|Xy&NV_T*^D@2#9R}+??3sJnLmB1+k$7YYJ?N_gJ=+F0hDC~aJ#xT z1HXRv#e4dxuIyP(Mv$RY*%0u~i46qeT-~Fxw6mCkK|e+CMbpWPGEJcwqxz^Wdldhy zLizg;vwSaCMxC7d*Et^9(F;d6`Jty;7PRQ(z%Ji=8PbJ51 z+_sj2AvKYXNoFv+=cdsH1V<(LEVr!Y128{f!A(^ovZS5m9~~5t@Y%eCv3V5Z)Mn_h z#x8hau~&HItjbid)Bx0dE^}D(VA)t^?h99#ZqyW}H!U(TiKM)%DLL$PVIlk0(}%`o z)xI8ku3{3R>{|AR2AhMt2vpRVX^71){iyXwAxuOUJF;IYe$b$fVt<^P@~&CDcJ3H$ zm5+^_Z2|ajAv)uHiQemJ6FN-mtp#511@1B5+HH>B-b)uaE3xhZ+ipG-o%5UFGkf(< z`Y1IxIoQFQMlv~#M2zqp+A%+O@BYo93EP0WYX#%;=|%zJ7I4SHp$elN2e2~l%i14( zB~-sDD5v%r8Mw}H7dQkZt1cW-GVF&Q+rs8ih;0hNo6RVil(alG^4YB1SoX{piR5af zrhd@|moo|JQQx7J~6@y7>8MLpif&6?!2Go7hg@G_9fQNr=sLn)R{8Fr5N%~l-G*GMxVKLDp z&O1hmZZ#1n6)|Q-K9+1RFIg#p?J9|?#UA$l-45>(q!Q*-J2-meN!|`K`36Y15*U8^ 
zmYHpC<;mm{+AuRLAjp9^{<0 zoSA3Xv^j$h-Qo12UXchMitKjc4oAbZU{Sa1Pzt)>4|? zsR*uf^oWNMt+hTB!)DO$*~_zz#FCkl6W-E}U~Nf~jH>&(%MOK)F5 z7Ff!`oDeu+c_l8Ddxpc@x>REP@C^|%71xrcX&t0v9vJ zEu9yI-ggp`6Cxw=>QQRTz+A61v7J57L=Rcb^QB&^E;_nc zVG(-NeY!JMWp0JsGnm#zxuf5~7jQq!v3hi#=)0Wm+(#4b;|0AgA0v-1iUh#4r18 zfgw_*xoWXpXAg8+mc#PhOl@Wa+=c?$N|3v^2#B#0+Q&N*kRz7mzEceKFy4VPp?9Js zrBM^N`${jysXu<+E}lU?OMH8w{j!~jH_D1Q>|weirzdsvOm?lM8d-5@>{sP>dC{WZ7z3!S~9nof}OqA^`pjCln22v1U=Y^kwM&JQNe zh{N+8%@3B&aqwB*2p%%X`Zjd7H9YgVI1WkjeudF_NnuoUxA1H0pgU|dc&=Afh7R>{ z8IK8zF|NU^_A5t|Y;ER?IJ0RbISg&HQYmU~FZ7Z-SKeIy4*TBe`MD~wnfV!^j<0rK z!7Pr)fJt~r`SwE^z-ZY?+P;bK?X?iRMuX4k6MlFaNd&FC|2VhN<}nHpUk>2zpO zL<2|i+nX4N81eDm4}%xF?UFJ$J3BA(pE?_M?+e=s1shus4~qgnS3KT-{d6-~&wfH5`5cDGM7C6o0;J$>>)cvc}wbu zfggmc(b;cJ-n9{2ncfvKGsoNA)lwjVgHL(YS2mJ5`Nup!w0;E}hV7g0jO; z7|Duh|Ai3O{ALv6nQs_9oLDExXU45AQRjVyA8PbC4uw#wsW8-gEo1Eete{#MlW27Tj)9}mx@Wwb6z#gB@v zq;k6~(ZE z`L)7lVd7UY-zCF;J8Ve`vsM-m)f!>^^Lgu~F$?x|E$oL-W*xfh?RvGzLYD%(={?|1-9(wdLhO-M`tZ;?T}f=@v@!u+J$8Y z2jurxN`@8jvW4~Xfn|Z0W7)#0m{<$_)l_6b{eZ>~9KP0l7I-6ZOY?-a0s$U>fLEu> z&+M(AFD&ez8?dZt!}4G`^7XBcWd#F#$^PuYcPrl;Ar8Iid*$!~LLddWBA>aIaQ_DR CzC_#r literal 0 HcmV?d00001