filePath = $filePath;
        $this->useSharedStringsFileCache = $useSharedStringsFileCache;

        $rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
        $this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
        $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        $hasSharedStrings = false;
        $zip = new \ZipArchive();

        if ($zip->open($this->filePath) === true) {
            $hasSharedStrings = ($zip->locateName(self::SHARED_STRINGS_XML_FILE_PATH) !== false);
            $zip->close();
        }

        return $hasSharedStrings;
    }

    /**
     * Reads all the shared strings of the workbook and stores them either in memory
     * or in temporary files, depending on the "useSharedStringsFileCache" option.
     * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
     * It is then accessed by the worksheet data, via the string index in the built table.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with worksheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     * SimpleXML does not provide streaming, but it is handy for parsing a few XML nodes,
     * so it is used on each individual 'si' node extracted by XMLReader.
     *
     * Entity substitution is intentionally disabled: the XLSX file may come from an untrusted
     * source, and expanding entities would allow XXE / entity expansion attacks. As a consequence,
     * a 'si' node referencing a custom (DTD-declared) entity is not expanded and will fail to parse.
     *
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
     */
    public function extractSharedStrings()
    {
        $xmlReader = new \XMLReader();
        $sharedStringIndex = 0;
        $this->tempFilePointer = null;
        $escaper = new \Box\Spout\Common\Escaper\XLSX();

        $sharedStringsFilePath = $this->getSharedStringsFilePath();

        // LIBXML_NONET forbids network access while parsing.
        // LIBXML_NOENT is deliberately not passed, as it would enable entity substitution.
        if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) {
            throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
        }

        try {
            while ($xmlReader->read() && $xmlReader->name !== 'si') {
                // do nothing until a 'si' tag is reached
            }

            while ($xmlReader->name === 'si') {
                $node = new \SimpleXMLElement($xmlReader->readOuterXml());
                $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);

                // removes nodes that should not be read, like the pronunciation of the Kanji characters
                $cleanNode = $this->removeSuperfluousTextNodes($node);

                // find all text nodes 't'; there can be multiple if the cell contains formatting
                $textNodes = $cleanNode->xpath('//ns:t');

                $textValue = '';
                foreach ($textNodes as $textNode) {
                    if ($this->shouldPreserveWhitespace($textNode)) {
                        $textValue .= $textNode->__toString();
                    } else {
                        $textValue .= trim($textNode->__toString());
                    }
                }

                $unescapedTextValue = $escaper->unescape($textValue);

                if ($this->useSharedStringsFileCache) {
                    // The shared string retrieval logic expects each cell data to be on one line only
                    // Encoding the line feed character allows to preserve this assumption
                    $lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue);

                    $this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex);
                } else {
                    if (!isset($this->inMemoryContents)) {
                        $this->inMemoryContents = [];
                    }

                    $this->inMemoryContents[$sharedStringIndex] = $unescapedTextValue;
                }

                $sharedStringIndex++;

                // jump to the next 'si' tag
                $xmlReader->next('si');
            }
        } catch (\Exception $exception) {
            // Stand-in for a "finally" clause: release the resources, then let the error propagate unchanged
            $this->releaseExtractionResources($xmlReader);
            throw $exception;
        }

        $this->releaseExtractionResources($xmlReader);
    }

    /**
     * Closes the resources opened during the extraction of the shared strings:
     * the pointer to the last temp file that was written (if any) and the XML reader.
     *
     * @param \XMLReader $xmlReader The reader opened on the shared strings XML file
     * @return void
     */
    private function releaseExtractionResources($xmlReader)
    {
        // close pointer to the last temp file that was written
        if ($this->tempFilePointer) {
            fclose($this->tempFilePointer);
            // reset it so the same handle can never be closed twice
            $this->tempFilePointer = null;
        }

        $xmlReader->close();
    }

    /**
     * @return string The path to the shared strings XML file
     */
    protected function getSharedStringsFilePath()
    {
        return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
    }

    /**
     * Removes nodes that should not be read, like the pronunciation of the Kanji characters.
     * By keeping them, their text content would be added to the read string.
     *
     * @param \SimpleXMLElement $parentNode Parent node that may contain nodes to remove
     * @return \SimpleXMLElement Cleaned parent node
     */
    protected function removeSuperfluousTextNodes($parentNode)
    {
        $tagsToRemove = [
            'rPh', // Pronunciation of the text
        ];

        foreach ($tagsToRemove as $tagToRemove) {
            $xpath = '//ns:' . $tagToRemove;
            $nodesToRemove = $parentNode->xpath($xpath);

            foreach ($nodesToRemove as $nodeToRemove) {
                // This is how to remove a node from the XML
                unset($nodeToRemove[0]);
            }
        }

        return $parentNode;
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \SimpleXMLElement $textNode The text node element (<t>) whose whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        // Only look at the attributes declared with the "xml" prefix (e.g. xml:space)
        $attributes = $textNode->attributes('xml', true);

        if ($attributes) {
            foreach ($attributes as $attributeName => $attributeValue) {
                if ($attributeName === 'space' && $attributeValue->__toString() === 'preserve') {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Writes the given string to its associated temp file.
     * A new temporary file is created when the file for the given index does not exist yet.
     *
     * @param string $sharedString Shared string to write to the temp file
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If the temp file cannot be created
     */
    protected function writeSharedStringToTempFile($sharedString, $sharedStringIndex)
    {
        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);

        if (!file_exists($tempFilePath)) {
            // Moving on to a new temp file: the previous one is complete and can be closed
            if ($this->tempFilePointer) {
                fclose($this->tempFilePointer);
            }

            $newTempFilePointer = fopen($tempFilePath, 'w');
            if ($newTempFilePointer === false) {
                // Do not keep a stale (already closed) or invalid handle around
                $this->tempFilePointer = null;
                throw new IOException("Could not create shared strings temp file: $tempFilePath");
            }

            $this->tempFilePointer = $newTempFilePointer;
        }

        // One shared string per line; getStringAtIndex() relies on this layout
        fwrite($this->tempFilePointer, $sharedString . PHP_EOL);
    }

    /**
     * Returns the path for the temp file that should contain the string for the given index
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The temp file path for the given index
     */
    protected function getSharedStringTempFilePath($sharedStringIndex)
    {
        $numTempFile = intval($sharedStringIndex / self::MAX_NUM_STRINGS_PER_TEMP_FILE);
        return $this->tempFolder . '/sharedstrings' . $numTempFile;
    }

    /**
     * Returns the shared string at the given index.
     * Because the strings have been split into different files, it looks for the value in the correct file.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        $sharedString = null;

        if ($this->useSharedStringsFileCache) {
            $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
            $indexInFile = $sharedStringIndex % self::MAX_NUM_STRINGS_PER_TEMP_FILE;

            if (!file_exists($tempFilePath)) {
                throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex");
            }

            // Only one temp file is kept in memory at a time; reload when the index lives in another file
            if ($this->inMemoryTempFilePath !== $tempFilePath) {
                // free memory, and forget the previously loaded file in case the read below fails
                $this->inMemoryContents = [];
                $this->inMemoryTempFilePath = null;

                $this->inMemoryContents = $this->readSharedStringsFromTempFile($tempFilePath, $sharedStringIndex);
                $this->inMemoryTempFilePath = $tempFilePath;
            }

            if (array_key_exists($indexInFile, $this->inMemoryContents)) {
                $escapedSharedString = $this->inMemoryContents[$indexInFile];
                $sharedString = $this->unescapeLineFeed($escapedSharedString);
            }
        } else {
            if (is_array($this->inMemoryContents) && array_key_exists($sharedStringIndex, $this->inMemoryContents)) {
                $sharedString = $this->inMemoryContents[$sharedStringIndex];
            }
        }

        if ($sharedString === null) {
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        // NOTE(review): rtrim() treats its second argument as a list of characters, so any trailing
        // end-of-line characters of the value itself are stripped here. Kept as is for compatibility.
        return rtrim($sharedString, PHP_EOL);
    }

    /**
     * Loads the (still line feed escaped) shared strings stored in the given temp file.
     *
     * @param string $tempFilePath Path of the temp file to load
     * @param int $sharedStringIndex Index being looked up, only used for the error message
     * @return string[] The entries of the file, indexed by their position in the file
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If the temp file cannot be read
     */
    private function readSharedStringsFromTempFile($tempFilePath, $sharedStringIndex)
    {
        $fileContents = file_get_contents($tempFilePath);
        if ($fileContents === false) {
            throw new SharedStringNotFoundException("Shared string temp file could not be read: $tempFilePath ; for index: $sharedStringIndex");
        }

        if ($fileContents === '') {
            return [];
        }

        $entries = explode(PHP_EOL, $fileContents);

        // Every entry is written followed by PHP_EOL, so the split produces one extra empty element
        // at the end. It must be dropped, otherwise the index right after the last entry would
        // resolve to an empty string instead of being reported as not found.
        if (substr($fileContents, -strlen(PHP_EOL)) === PHP_EOL) {
            array_pop($entries);
        }

        return $entries;
    }

    /**
     * Escapes the line feed character (\n)
     *
     * @param string $unescapedString
     * @return string
     */
    private function escapeLineFeed($unescapedString)
    {
        return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString);
    }

    /**
     * Unescapes the line feed character (\n)
     *
     * @param string $escapedString
     * @return string
     */
    private function unescapeLineFeed($escapedString)
    {
        return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString);
    }

    /**
     * Deletes the created temporary folder and all its contents
     *
     * @return void
     */
    public function cleanup()
    {
        $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder);
    }
}