diff --git a/src/Spout/Reader/AbstractReader2.php b/src/Spout/Reader/AbstractReader2.php new file mode 100644 index 0000000..ef24412 --- /dev/null +++ b/src/Spout/Reader/AbstractReader2.php @@ -0,0 +1,111 @@ +globalFunctionsHelper = $globalFunctionsHelper; + return $this; + } + + /** + * Prepares the reader to read the given file. It also makes sure + * that the file exists and is readable. + * + * @param string $filePath Path of the file to be read + * @return void + * @throws \Box\Spout\Common\Exception\IOException If the file at the given path does not exist, is not readable or is corrupted + */ + public function open($filePath) + { + if (!$this->isPhpStream($filePath)) { + // we skip the checks if the provided file path points to a PHP stream + if (!$this->globalFunctionsHelper->file_exists($filePath)) { + throw new IOException('Could not open ' . $filePath . ' for reading! File does not exist.'); + } else if (!$this->globalFunctionsHelper->is_readable($filePath)) { + throw new IOException('Could not open ' . $filePath . ' for reading! File is not readable.'); + } + } + + try { + $this->openReader($filePath); + $this->isStreamOpened = true; + } catch (\Exception $exception) { + throw new IOException('Could not open ' . $filePath . ' for reading! (' . $exception->getMessage() . ')'); + } + } + + /** + * Checks if a path is a PHP stream (like php://output, php://memory, ...) 
+ * + * @param string $filePath Path of the file to be read + * @return bool Whether the given path maps to a PHP stream + */ + protected function isPhpStream($filePath) + { + return (strpos($filePath, 'php://') === 0); + } + + /** + * Closes the reader, preventing any additional reading + * + * @return void + */ + public function close() + { + if ($this->isStreamOpened) { + $this->closeReader(); + + $sheetIterator = $this->getSheetIterator(); + if ($sheetIterator) { + $sheetIterator->end(); + } + + $this->isStreamOpened = false; + } + } +} diff --git a/src/Spout/Reader/CSV/Reader.php b/src/Spout/Reader/CSV/Reader.php new file mode 100644 index 0000000..3b164d5 --- /dev/null +++ b/src/Spout/Reader/CSV/Reader.php @@ -0,0 +1,95 @@ +fieldDelimiter = $fieldDelimiter; + return $this; + } + + /** + * Sets the field enclosure for the CSV. + * Needs to be called before opening the reader. + * + * @param string $fieldEnclosure Character that enclose fields + * @return Reader + */ + public function setFieldEnclosure($fieldEnclosure) + { + $this->fieldEnclosure = $fieldEnclosure; + return $this; + } + + /** + * Opens the file at the given path to make it ready to be read. + * The file must be UTF-8 encoded. + * @TODO add encoding detection/conversion + * + * @param string $filePath Path of the CSV file to be read + * @return void + * @throws \Box\Spout\Common\Exception\IOException + */ + protected function openReader($filePath) + { + $this->filePointer = $this->globalFunctionsHelper->fopen($filePath, 'r'); + if (!$this->filePointer) { + throw new IOException('Could not open file ' . $filePath . ' for reading.'); + } + + $this->sheetIterator = new SheetIterator($this->filePointer, $this->fieldDelimiter, $this->fieldEnclosure, $this->globalFunctionsHelper); + } + + /** + * Returns an iterator to iterate over sheets. 
+ * + * @return SheetIterator To iterate over sheets + */ + public function getSheetIterator() + { + return $this->sheetIterator; + } + + + /** + * Closes the reader. To be used after reading the file. + * + * @return void + */ + protected function closeReader() + { + if ($this->filePointer) { + $this->globalFunctionsHelper->fclose($this->filePointer); + } + } +} diff --git a/src/Spout/Reader/CSV/RowIterator.php b/src/Spout/Reader/CSV/RowIterator.php new file mode 100644 index 0000000..2316fa7 --- /dev/null +++ b/src/Spout/Reader/CSV/RowIterator.php @@ -0,0 +1,163 @@ +filePointer = $filePointer; + $this->fieldDelimiter = $fieldDelimiter; + $this->fieldEnclosure = $fieldEnclosure; + $this->globalFunctionsHelper = $globalFunctionsHelper; + } + + /** + * Rewind the Iterator to the first element + * @link http://php.net/manual/en/iterator.rewind.php + * + * @return void + */ + public function rewind() + { + $this->rewindAndSkipUtf8Bom(); + + $this->numReadRows = 0; + $this->rowDataBuffer = null; + + $this->next(); + } + + /** + * This rewinds and skips the UTF-8 BOM if inserted at the beginning of the file + * by moving the file pointer after it, so that it is not read. + * + * @return void + */ + protected function rewindAndSkipUtf8Bom() + { + $this->globalFunctionsHelper->rewind($this->filePointer); + + $hasUtf8Bom = ($this->globalFunctionsHelper->fgets($this->filePointer, 4) === self::UTF8_BOM); + + if ($hasUtf8Bom) { + // we skip the 3 first bytes (offsets 0 to 2), so reading resumes at offset 3 + $this->globalFunctionsHelper->fseek($this->filePointer, 3); + } else { + // if no BOM, reset the pointer to read from the beginning + $this->globalFunctionsHelper->fseek($this->filePointer, 0); + } + } + + /** + * Checks if current position is valid + * @link http://php.net/manual/en/iterator.valid.php + * + * @return boolean + */ + public function valid() + { + return ($this->filePointer && !$this->hasReachedEndOfFile); + } + + /** + * Move forward to next element. Empty rows are skipped. 
+     * @link http://php.net/manual/en/iterator.next.php
+     *
+     * The end of the data is detected in two ways: feof() before reading, and a failed
+     * fgetcsv() read. The second case is what happens when only a line break or empty
+     * lines are left, because feof() only reports true after a read attempt has hit the
+     * end of the file. In both cases the buffered row and the row counter are left
+     * untouched and the iterator is marked as finished.
+     *
+     * @return void
+     */
+    public function next()
+    {
+        $this->hasReachedEndOfFile = feof($this->filePointer);
+        if ($this->hasReachedEndOfFile) {
+            return;
+        }
+
+        do {
+            $lineData = $this->globalFunctionsHelper->fgetcsv($this->filePointer, 0, $this->fieldDelimiter, $this->fieldEnclosure);
+        } while ($lineData && $this->isEmptyLine($lineData));
+
+        if (is_array($lineData)) {
+            $this->rowDataBuffer = $lineData;
+            $this->numReadRows++;
+        } else {
+            // nothing could be read (fgetcsv() did not return a row): never expose that value as a row
+            $this->hasReachedEndOfFile = true;
+        }
+    }
+
+    /**
+     * A line is considered empty when it is made of one single cell whose value is null.
+     *
+     * @param array $lineData Array containing the cells value for the line
+     * @return bool Whether the given line is empty
+     */
+    protected function isEmptyLine($lineData)
+    {
+        return (count($lineData) === 1 && $lineData[0] === null);
+    }
+
+    /**
+     * Return the current element from the buffer
+     * @link http://php.net/manual/en/iterator.current.php
+     *
+     * @return array
+     */
+    public function current()
+    {
+        return $this->rowDataBuffer;
+    }
+
+    /**
+     * Return the key of the current element, which is the number of rows read so far
+     * @link http://php.net/manual/en/iterator.key.php
+     *
+     * @return int
+     */
+    public function key()
+    {
+        return $this->numReadRows;
+    }
+
+    /**
+     * Cleans up what was created to iterate over the object.
+ * + * @return void + */ + public function end() + { + // do nothing + } +} diff --git a/src/Spout/Reader/CSV/Sheet.php b/src/Spout/Reader/CSV/Sheet.php new file mode 100644 index 0000000..207fcae --- /dev/null +++ b/src/Spout/Reader/CSV/Sheet.php @@ -0,0 +1,35 @@ +rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper); + } + + /** + * @return RowIterator + */ + public function getRowIterator() + { + return $this->rowIterator; + } +} diff --git a/src/Spout/Reader/CSV/SheetIterator.php b/src/Spout/Reader/CSV/SheetIterator.php new file mode 100644 index 0000000..f424cd8 --- /dev/null +++ b/src/Spout/Reader/CSV/SheetIterator.php @@ -0,0 +1,96 @@ +sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper); + } + + /** + * Rewind the Iterator to the first element + * @link http://php.net/manual/en/iterator.rewind.php + * + * @return void + */ + public function rewind() + { + $this->hasReadUniqueSheet = false; + } + + /** + * Checks if current position is valid + * @link http://php.net/manual/en/iterator.valid.php + * + * @return boolean + */ + public function valid() + { + return (!$this->hasReadUniqueSheet); + } + + /** + * Move forward to next element + * @link http://php.net/manual/en/iterator.next.php + * + * @return void + */ + public function next() + { + $this->hasReadUniqueSheet = true; + } + + /** + * Return the current element + * @link http://php.net/manual/en/iterator.current.php + * + * @return Sheet + */ + public function current() + { + return $this->sheet; + } + + /** + * Return the key of the current element + * @link http://php.net/manual/en/iterator.key.php + * + * @return int + */ + public function key() + { + return 1; + } + + /** + * Cleans up what was created to iterate over the object. 
+ * + * @return void + */ + public function end() + { + // do nothing + } +} diff --git a/src/Spout/Reader/Exception/NoSheetsFoundException.php b/src/Spout/Reader/Exception/NoSheetsFoundException.php new file mode 100644 index 0000000..dfc4907 --- /dev/null +++ b/src/Spout/Reader/Exception/NoSheetsFoundException.php @@ -0,0 +1,12 @@ +setGlobalFunctionsHelper(new GlobalFunctionsHelper()); + + return $reader; + } +} diff --git a/src/Spout/Reader/ReaderInterface2.php b/src/Spout/Reader/ReaderInterface2.php new file mode 100644 index 0000000..a61c83c --- /dev/null +++ b/src/Spout/Reader/ReaderInterface2.php @@ -0,0 +1,35 @@ + 0 + * Z => 25 + * AA => 26 : (26^(2-1) * (0+1)) + 0 + * AB => 27 : (26^(2-1) * (0+1)) + 1 + * BC => 54 : (26^(2-1) * (1+1)) + 2 + * BCZ => 1455 : (26^(3-1) * (1+1)) + (26^(2-1) * (2+1)) + 25 + */ + foreach (str_split($column) as $single_cell_index) + { + $currentColumnIndex = ord($single_cell_index) - $capitalAAsciiValue; + + if ($columnLength == 1) { + $columnIndex += $currentColumnIndex; + } else { + $columnIndex += pow($step, ($columnLength - 1)) * ($currentColumnIndex + 1); + } + + $columnLength--; + } + + return $columnIndex; + } + + /** + * Returns whether a cell index is valid, in an Excel world. + * To be valid, the cell index should start with capital letters and be followed by numbers. + * + * @param string $cellIndex The Excel cell index ('A1', 'BC13', ...) 
+ * @return bool + */ + protected static function isValidCellIndex($cellIndex) + { + return (preg_match('/^[A-Z]+\d+$/', $cellIndex) === 1); + } +} diff --git a/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyFactory.php b/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyFactory.php new file mode 100644 index 0000000..8fffdb0 --- /dev/null +++ b/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyFactory.php @@ -0,0 +1,154 @@ + 20 * 600 ≈ 12KB + */ + const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12; + + /** + * To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files + * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory + * and the string will be quickly retrieved. + * The performance bottleneck is not when creating these temporary files, but rather when loading their content. + * Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works + * best when the indexes of the shared strings are sorted in the sheet data. + * 10,000 was chosen because it creates small files that are fast to be loaded in memory. + */ + const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000; + + /** @var CachingStrategyFactory|null Singleton instance */ + protected static $instance = null; + + /** + * Private constructor for singleton + */ + private function __construct() + { + } + + /** + * Returns the singleton instance of the factory + * + * @return CachingStrategyFactory + */ + public static function getInstance() + { + if (self::$instance === null) { + self::$instance = new CachingStrategyFactory(); + } + + return self::$instance; + } + + /** + * Returns the best caching strategy, given the number of unique shared strings + * and the amount of memory available. 
+ * + * @param int $sharedStringsUniqueCount Number of unique shared strings + * @param string|void $tempFolder Temporary folder where the temporary files to store shared strings will be stored + * @return CachingStrategyInterface The best caching strategy + */ + public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null) + { + if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) { + return new InMemoryStrategy($sharedStringsUniqueCount); + } else { + return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } + } + + /** + * Returns whether it is safe to use in-memory caching, given the number of unique shared strings + * and the amount of memory available. + * + * @param int $sharedStringsUniqueCount Number of unique shared strings + * @return bool + */ + protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount) + { + $memoryAvailable = $this->getMemoryLimitInKB(); + + if ($memoryAvailable === -1) { + // if cannot get memory limit or if memory limit set as unlimited, don't trust and play safe + return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE); + } else { + $memoryNeeded = $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB; + return ($memoryAvailable > $memoryNeeded); + } + } + + /** + * Returns the PHP "memory_limit" in Kilobytes + * + * @return float + */ + protected function getMemoryLimitInKB() + { + $memoryLimitFormatted = $this->getMemoryLimitFromIni(); + $memoryLimitFormatted = strtolower(trim($memoryLimitFormatted)); + + // No memory limit + if ($memoryLimitFormatted === '-1') { + return -1; + } + + if (preg_match('/(\d+)([bkmgt])b?/', $memoryLimitFormatted, $matches)) { + $amount = intval($matches[1]); + $unit = $matches[2]; + + switch ($unit) { + case 'b': return ($amount / 1024); + case 'k': return $amount; + case 'm': return ($amount * 1024); + case 'g': return ($amount * 1024 * 1024); + case 't': return ($amount * 1024 * 1024 * 
1024); + } + } + + return -1; + } + + /** + * Returns the formatted "memory_limit" value + * + * @return string + */ + protected function getMemoryLimitFromIni() + { + return ini_get('memory_limit'); + } +} diff --git a/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyInterface.php b/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyInterface.php new file mode 100644 index 0000000..631222a --- /dev/null +++ b/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyInterface.php @@ -0,0 +1,44 @@ +fileSystemHelper = new FileSystemHelper($rootTempFolder); + $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings')); + + $this->maxNumStringsPerTempFile = $maxNumStringsPerTempFile; + + $this->globalFunctionsHelper = new GlobalFunctionsHelper(); + $this->tempFilePointer = null; + } + + /** + * Adds the given string to the cache. + * + * @param string $sharedString The string to be added to the cache + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return void + */ + public function addStringForIndex($sharedString, $sharedStringIndex) + { + $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); + + if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) { + if ($this->tempFilePointer) { + $this->globalFunctionsHelper->fclose($this->tempFilePointer); + } + $this->tempFilePointer = $this->globalFunctionsHelper->fopen($tempFilePath, 'w'); + } + + // The shared string retrieval logic expects each cell data to be on one line only + // Encoding the line feed character allows to preserve this assumption + $lineFeedEncodedSharedString = $this->escapeLineFeed($sharedString); + + $this->globalFunctionsHelper->fwrite($this->tempFilePointer, $lineFeedEncodedSharedString . 
PHP_EOL); + } + + /** + * Returns the path for the temp file that should contain the string for the given index + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The temp file path for the given index + */ + protected function getSharedStringTempFilePath($sharedStringIndex) + { + $numTempFile = intval($sharedStringIndex / $this->maxNumStringsPerTempFile); + return $this->tempFolder . '/sharedstrings' . $numTempFile; + } + + /** + * Closes the cache after the last shared string was added. + * This prevents any additional string from being added to the cache. + * + * @return void + */ + public function closeCache() + { + // close pointer to the last temp file that was written + if ($this->tempFilePointer) { + $this->globalFunctionsHelper->fclose($this->tempFilePointer); + } + } + + + /** + * Returns the string located at the given index from the cache. + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The shared string at the given index + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index + */ + public function getStringAtIndex($sharedStringIndex) + { + $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex); + $indexInFile = $sharedStringIndex % $this->maxNumStringsPerTempFile; + + if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) { + throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex"); + } + + if ($this->inMemoryTempFilePath !== $tempFilePath) { + // free memory + unset($this->inMemoryTempFileContents); + + $this->inMemoryTempFileContents = explode(PHP_EOL, $this->globalFunctionsHelper->file_get_contents($tempFilePath)); + $this->inMemoryTempFilePath = $tempFilePath; + } + + $sharedString = null; + if (array_key_exists($indexInFile, $this->inMemoryTempFileContents)) { + 
$escapedSharedString = $this->inMemoryTempFileContents[$indexInFile]; + $sharedString = $this->unescapeLineFeed($escapedSharedString); + } + + if ($sharedString === null) { + throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); + } + + return rtrim($sharedString, PHP_EOL); + } + + /** + * Escapes the line feed characters (\n) + * + * @param string $unescapedString + * @return string + */ + private function escapeLineFeed($unescapedString) + { + return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString); + } + + /** + * Unescapes the line feed characters (\n) + * + * @param string $escapedString + * @return string + */ + private function unescapeLineFeed($escapedString) + { + return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString); + } + + /** + * Destroys the cache, freeing memory and removing any created artifacts + * + * @return void + */ + public function clearCache() + { + if ($this->tempFolder) { + $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder); + } + } +} diff --git a/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/InMemoryStrategy.php b/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/InMemoryStrategy.php new file mode 100644 index 0000000..c6a5321 --- /dev/null +++ b/src/Spout/Reader/XLSX/Helper/SharedStringsCaching/InMemoryStrategy.php @@ -0,0 +1,83 @@ +inMemoryCache = new \SplFixedArray($sharedStringsUniqueCount); + $this->isCacheClosed = false; + } + + /** + * Adds the given string to the cache. + * + * @param string $sharedString The string to be added to the cache + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return void + */ + public function addStringForIndex($sharedString, $sharedStringIndex) + { + if (!$this->isCacheClosed) { + $this->inMemoryCache->offsetSet($sharedStringIndex, $sharedString); + } + } + + /** + * Closes the cache after the last shared string was added. 
+ * This prevents any additional string from being added to the cache. + * + * @return void + */ + public function closeCache() + { + $this->isCacheClosed = true; + } + + /** + * Returns the string located at the given index from the cache. + * + * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file + * @return string The shared string at the given index + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index + */ + public function getStringAtIndex($sharedStringIndex) + { + try { + return $this->inMemoryCache->offsetGet($sharedStringIndex); + } catch (\RuntimeException $e) { + throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex"); + } + } + + /** + * Destroys the cache, freeing memory and removing any created artifacts + * + * @return void + */ + public function clearCache() + { + unset($this->inMemoryCache); + $this->isCacheClosed = false; + } +} diff --git a/src/Spout/Reader/XLSX/Helper/SharedStringsHelper.php b/src/Spout/Reader/XLSX/Helper/SharedStringsHelper.php new file mode 100644 index 0000000..5c8fb46 --- /dev/null +++ b/src/Spout/Reader/XLSX/Helper/SharedStringsHelper.php @@ -0,0 +1,280 @@ +filePath = $filePath; + $this->tempFolder = $tempFolder; + } + + /** + * Returns whether the XLSX file contains a shared strings XML file + * + * @return bool + */ + public function hasSharedStrings() + { + $hasSharedStrings = false; + $zip = new \ZipArchive(); + + if ($zip->open($this->filePath) === true) { + $hasSharedStrings = ($zip->locateName(self::SHARED_STRINGS_XML_FILE_PATH) !== false); + $zip->close(); + } + + return $hasSharedStrings; + } + + /** + * Builds an in-memory array containing all the shared strings of the sheet. + * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'. + * It is then accessed by the sheet data, via the string index in the built table. 
+     *
+     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
+     *
+     * The XML file can be really big with sheets containing a lot of data. That is why
+     * we need to use a XML reader that provides streaming like the XMLReader library.
+     * Please note that SimpleXML does not provide such a functionality but since it is faster
+     * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose.
+     *
+     * The XMLReader is closed on every exit path, including when the extraction fails.
+     *
+     * @return void
+     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
+     */
+    public function extractSharedStrings()
+    {
+        $xmlReader = new \XMLReader();
+        $sharedStringIndex = 0;
+        $escaper = new \Box\Spout\Common\Escaper\XLSX();
+
+        $sharedStringsFilePath = $this->getSharedStringsFilePath();
+        if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) {
+            throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
+        }
+
+        try {
+            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
+            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
+
+            while ($xmlReader->read() && $xmlReader->name !== 'si') {
+                // do nothing until a 'si' tag is reached
+            }
+
+            while ($xmlReader->name === 'si') {
+                $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
+                $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
+
+                // removes nodes that should not be read, like the pronunciation of the Kanji characters
+                $cleanNode = $this->removeSuperfluousTextNodes($node);
+
+                // find all text nodes 't'; there can be multiple if the cell contains formatting
+                // (xpath() does not return an array when the query fails: treated as "no text node")
+                $textNodes = $cleanNode->xpath('//ns:t') ?: [];
+
+                $textValue = '';
+                foreach ($textNodes as $textNode) {
+                    if ($this->shouldPreserveWhitespace($textNode)) {
+                        $textValue .= $textNode->__toString();
+                    } else {
+                        $textValue .= trim($textNode->__toString());
+                    }
+                }
+
+                $unescapedTextValue = $escaper->unescape($textValue);
+                $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
+
+                $sharedStringIndex++;
+
+                // jump to the next 'si' tag
+                $xmlReader->next('si');
+            }
+
+            $this->cachingStrategy->closeCache();
+        } catch (\Exception $exception) {
+            // release the reader before propagating the error unchanged
+            $xmlReader->close();
+            throw $exception;
+        }
+
+        $xmlReader->close();
+    }
+
+    /**
+     * @return string The path to the shared strings XML file
+     */
+    protected function getSharedStringsFilePath()
+    {
+        return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
+    }
+
+    /**
+     * Returns the shared strings unique count, as specified in the "uniqueCount" attribute of the <sst> tag.
+     *
+     * @param \XMLReader $xmlReader XMLReader instance
+     * @return int Number of unique shared strings in the sharedStrings.xml file
+     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
+     */
+    protected function getSharedStringsUniqueCount($xmlReader)
+    {
+        // Use internal errors to avoid displaying lots of warning messages in case of invalid file
+        // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
+        libxml_clear_errors();
+        libxml_use_internal_errors(true);
+
+        $xmlReader->next('sst');
+
+        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
+        while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) {
+            $xmlReader->read();
+        }
+
+        // The error must be fetched BEFORE resetting the setting: disabling internal errors clears them
+        $readError = libxml_get_last_error();
+
+        // reset the setting to display XML warnings/errors, whether the file could be read or not
+        libxml_use_internal_errors(false);
+
+        if ($readError !== false) {
+            throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readError->message}]");
+        }
+
+        return intval($xmlReader->getAttribute('uniqueCount'));
+    }
+
+    /**
+     * Returns the best shared strings caching strategy.
+     *
+     * @param int $sharedStringsUniqueCount
+     * @return CachingStrategyInterface
+     */
+    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
+    {
+        return CachingStrategyFactory::getInstance()
+            ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
+    }
+
+    /**
+     * Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
+     * This is to simplify the parsing of the subtree.
+     *
+     * @param \XMLReader $xmlReader
+     * @return \SimpleXMLElement
+     * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
+     */
+    protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
+    {
+        // Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node.
+        // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
+        libxml_clear_errors();
+        libxml_use_internal_errors(true);
+
+        $node = null;
+        try {
+            $node = new \SimpleXMLElement($xmlReader->readOuterXml());
+        } catch (\Exception $exception) {
+            $error = libxml_get_last_error();
+            libxml_use_internal_errors(false);
+
+            // libxml_get_last_error() returns false when libxml recorded nothing:
+            // fall back to the message of the caught exception in that case
+            $errorMessage = ($error !== false) ? $error->message : $exception->getMessage();
+
+            throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($errorMessage) . '].');
+        }
+
+        libxml_use_internal_errors(false);
+
+        return $node;
+    }
+
+    /**
+     * Removes nodes that should not be read, like the pronunciation of the Kanji characters.
+     * By keeping them, their text content would be added to the read string.
+     *
+     * @param \SimpleXMLElement $parentNode Parent node that may contain nodes to remove
+     * @return \SimpleXMLElement Cleaned parent node
+     */
+    protected function removeSuperfluousTextNodes($parentNode)
+    {
+        $tagsToRemove = [
+            'rPh', // Pronunciation of the text
+        ];
+
+        foreach ($tagsToRemove as $tagToRemove) {
+            $xpath = '//ns:' . $tagToRemove;
+            $nodesToRemove = $parentNode->xpath($xpath);
+
+            foreach ($nodesToRemove as $nodeToRemove) {
+                // This is how to remove a node from the XML
+                unset($nodeToRemove[0]);
+            }
+        }
+
+        return $parentNode;
+    }
+
+    /**
+     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
+     *
+     * @param \SimpleXMLElement $textNode The text node element (<t>) whose whitespace may be preserved
+     * @return bool Whether whitespace should be preserved
+     */
+    protected function shouldPreserveWhitespace($textNode)
+    {
+        $shouldPreserveWhitespace = false;
+
+        $attributes = $textNode->attributes('xml', true);
+        if ($attributes) {
+            foreach ($attributes as $attributeName => $attributeValue) {
+                if ($attributeName === 'space' && $attributeValue->__toString() === 'preserve') {
+                    $shouldPreserveWhitespace = true;
+                    break;
+                }
+            }
+        }
+
+        return $shouldPreserveWhitespace;
+    }
+
+    /**
+     * Returns the shared string at the given index, using the previously chosen caching strategy.
+     *
+     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
+     * @return string The shared string at the given index
+     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
+     */
+    public function getStringAtIndex($sharedStringIndex)
+    {
+        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
+    }
+
+    /**
+     * Destroys the cache, freeing memory and removing any created artifacts
+     *
+     * @return void
+     */
+    public function cleanup()
+    {
+        if ($this->cachingStrategy) {
+            $this->cachingStrategy->clearCache();
+        }
+    }
+}
diff --git a/src/Spout/Reader/XLSX/Helper/SheetHelper.php b/src/Spout/Reader/XLSX/Helper/SheetHelper.php
new file mode 100644
index 0000000..3cbe9cb
--- /dev/null
+++ b/src/Spout/Reader/XLSX/Helper/SheetHelper.php
@@ -0,0 +1,199 @@
+filePath = $filePath;
+        $this->sharedStringsHelper = $sharedStringsHelper;
+        $this->globalFunctionsHelper = $globalFunctionsHelper;
+    }
+
/**
+     * Returns the sheets metadata of the file located at the previously given file path.
+     * The paths to the sheets' data are read from the [Content_Types].xml file.
+     *
+     * @return Sheet[] Sheets within the XLSX file, in the order they are listed in [Content_Types].xml
+     */
+    public function getSheets()
+    {
+        $contentTypesAsXMLElement = $this->getFileAsXMLElementWithNamespace(
+            self::CONTENT_TYPES_XML_FILE_PATH,
+            self::MAIN_NAMESPACE_FOR_CONTENT_TYPES_XML
+        );
+
+        // find all nodes defining a sheet
+        $sheetNodes = $contentTypesAsXMLElement->xpath('//ns:Override[@ContentType="' . self::OVERRIDE_CONTENT_TYPES_ATTRIBUTE . '"]');
+        if (!is_array($sheetNodes)) {
+            // xpath() does not return an array when the query fails: no sheet can be listed
+            return [];
+        }
+
+        $sheets = [];
+        foreach ($sheetNodes as $sheetIndexZeroBased => $sheetNode) {
+            $sheetDataXMLFilePath = (string) $sheetNode->attributes()->PartName;
+
+            $sheets[] = $this->getSheetFromXML($sheetDataXMLFilePath, $sheetIndexZeroBased);
+        }
+
+        return $sheets;
+    }
+
+    /**
+     * Returns an instance of a sheet, given the path of its data XML file.
+     * We first look at "xl/_rels/workbook.xml.rels" to find the relationship ID of the sheet.
+     * Then we look at "xl/workbook.xml" to find the sheet entry associated to the found ID.
+     * The entry contains the ID and name of the sheet.
+     *
+     * If this piece of data can't be found by parsing the different XML files, the ID will default
+     * to the sheet index, based on order in [Content_Types].xml. Similarly, the sheet's name will
+     * default to the data sheet XML file name ("xl/worksheets/sheet2.xml" => "sheet2").
+     *
+     * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
+     * @param int $sheetIndexZeroBased Index of the sheet, based on order in [Content_Types].xml (zero-based)
+     * @return \Box\Spout\Reader\Sheet Sheet instance
+     */
+    protected function getSheetFromXML($sheetDataXMLFilePath, $sheetIndexZeroBased)
+    {
+        // default values, kept when the workbook XML files do not describe this sheet
+        $sheetId = $sheetIndexZeroBased + 1;
+        $sheetName = $this->getDefaultSheetName($sheetDataXMLFilePath);
+
+        /*
+         * In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml"
+         * In workbook.xml.rels, it is only "worksheets/sheet1.xml"
+         *
+         * Only the literal "xl/" prefix (with or without leading slash) is removed.
+         * ltrim() must not be used for this: its second argument is a list of characters, so it
+         * would also strip the first letters of any name starting with "x", "l" or "/"
+         * ("/xl/lists/sheet1.xml" would become "ists/sheet1.xml").
+         */
+        $sheetDataXMLFilePathInWorkbookXMLRels = preg_replace('#^/?xl/#', '', $sheetDataXMLFilePath);
+
+        // find the node associated to the given file path
+        $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement();
+        $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]');
+
+        if (is_array($relationshipNodes) && count($relationshipNodes) === 1) {
+            // The relationship ID is only a lookup key: it is kept in its own variable so that
+            // the default sheet ID is preserved when no matching sheet entry is found below
+            $relationshipId = (string) $relationshipNodes[0]->attributes()->Id;
+
+            $workbookXMLElement = $this->getWorkbookXMLAsXMLElement();
+            $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $relationshipId . '"]');
+
+            if (is_array($sheetNodes) && count($sheetNodes) === 1) {
+                $sheetNode = $sheetNodes[0];
+                $sheetId = (int) $sheetNode->attributes()->sheetId;
+                $escapedSheetName = (string) $sheetNode->attributes()->name;
+
+                $escaper = new \Box\Spout\Common\Escaper\XLSX();
+                $sheetName = $escaper->unescape($escapedSheetName);
+            }
+        }
+
+        return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $sheetId, $sheetIndexZeroBased, $sheetName);
+    }
+
+    /**
+     * Returns the default name of the sheet whose data is located
+     * at the given path.
+ * + * @param string $sheetDataXMLFilePath Path of the sheet data XML file + * @return string The default sheet name + */ + protected function getDefaultSheetName($sheetDataXMLFilePath) + { + return $this->globalFunctionsHelper->basename($sheetDataXMLFilePath, self::XML_EXTENSION); + } + + /** + * Returns a representation of the workbook.xml.rels file, ready to be parsed. + * The returned value is cached. + * + * @return \SimpleXMLElement XML element representing the workbook.xml.rels file + */ + protected function getWorkbookXMLRelsAsXMLElement() + { + if (!$this->workbookXMLRelsAsXMLElement) { + $this->workbookXMLRelsAsXMLElement = $this->getFileAsXMLElementWithNamespace( + self::WORKBOOK_XML_RELS_FILE_PATH, + self::MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS + ); + } + + return $this->workbookXMLRelsAsXMLElement; + } + + /** + * Returns a representation of the workbook.xml file, ready to be parsed. + * The returned value is cached. + * + * @return \SimpleXMLElement XML element representing the workbook.xml file + */ + protected function getWorkbookXMLAsXMLElement() + { + if (!$this->workbookXMLAsXMLElement) { + $this->workbookXMLAsXMLElement = $this->getFileAsXMLElementWithNamespace( + self::WORKBOOK_XML_FILE_PATH, + self::MAIN_NAMESPACE_FOR_WORKBOOK_XML + ); + } + + return $this->workbookXMLAsXMLElement; + } + + /** + * Loads the contents of the given file in an XML parser and registers the given XPath namespace. + * + * @param string $xmlFilePath The path of the XML file inside the XLSX file + * @param string $mainNamespace The main XPath namespace to register + * @return \SimpleXMLElement The XML element representing the file + */ + protected function getFileAsXMLElementWithNamespace($xmlFilePath, $mainNamespace) + { + $xmlContents = $this->globalFunctionsHelper->file_get_contents('zip://' . $this->filePath . '#' .
$xmlFilePath); + + $xmlElement = new \SimpleXMLElement($xmlContents); + $xmlElement->registerXPathNamespace('ns', $mainNamespace); + + return $xmlElement; + } +} diff --git a/src/Spout/Reader/XLSX/Reader.php b/src/Spout/Reader/XLSX/Reader.php new file mode 100644 index 0000000..68712cc --- /dev/null +++ b/src/Spout/Reader/XLSX/Reader.php @@ -0,0 +1,93 @@ +tempFolder = $tempFolder; + return $this; + } + + /** + * Opens the file at the given file path to make it ready to be read. + * It also parses the sharedStrings.xml file to get all the shared strings available in memory + * and fetches all the available sheets. + * + * @param string $filePath Path of the file to be read + * @return void + * @throws \Box\Spout\Common\Exception\IOException If the file at the given path or its content cannot be read + * @throws \Box\Spout\Reader\Exception\NoSheetsFoundException If there are no sheets in the file + */ + protected function openReader($filePath) + { + $this->zip = new \ZipArchive(); + + if ($this->zip->open($filePath) === true) { + $this->sharedStringsHelper = new SharedStringsHelper($filePath, $this->tempFolder); + + if ($this->sharedStringsHelper->hasSharedStrings()) { + // Extracts all the strings from the sheets for easy access in the future + $this->sharedStringsHelper->extractSharedStrings(); + } + + $this->sheetIterator = new SheetIterator($filePath, $this->sharedStringsHelper, $this->globalFunctionsHelper); + } else { + throw new IOException('Could not open ' . $filePath . ' for reading.'); + } + } + + /** + * Returns an iterator to iterate over sheets. + * + * @return SheetIterator To iterate over sheets + */ + public function getSheetIterator() + { + return $this->sheetIterator; + } + + /** + * Closes the reader. To be used after reading the file. 
+ * + * @return void + */ + protected function closeReader() + { + if ($this->zip) { + $this->zip->close(); + } + + if ($this->sharedStringsHelper) { + $this->sharedStringsHelper->cleanup(); + } + } +} diff --git a/src/Spout/Reader/XLSX/RowIterator.php b/src/Spout/Reader/XLSX/RowIterator.php new file mode 100644 index 0000000..e96898f --- /dev/null +++ b/src/Spout/Reader/XLSX/RowIterator.php @@ -0,0 +1,356 @@ +filePath = $filePath; + $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath); + $this->sharedStringsHelper = $sharedStringsHelper; + + $this->xmlReader = new \XMLReader(); + $this->escaper = new \Box\Spout\Common\Escaper\XLSX(); + } + + /** + * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml + * @return string Path of the XML file containing the sheet data, + * without the leading slash. + */ + protected function normalizeSheetDataXMLFilePath($sheetDataXMLFilePath) + { + return ltrim($sheetDataXMLFilePath, '/'); + } + + /** + * Rewind the Iterator to the first element. + * Initializes the XMLReader object that reads the associated sheet data. + * The XMLReader is configured to be safe from billion laughs attack. + * @link http://php.net/manual/en/iterator.rewind.php + * + * @return void + * @throws \Box\Spout\Common\Exception\IOException If the sheet data XML cannot be read + */ + public function rewind() + { + $this->xmlReader->close(); + + $sheetDataFilePath = 'zip://' . $this->filePath . '#' . $this->sheetDataXMLFilePath; + if ($this->xmlReader->open($sheetDataFilePath, null, LIBXML_NONET) === false) { + throw new IOException('Could not open "' . $this->sheetDataXMLFilePath . 
'".'); + } + + $this->numReadRows = 0; + $this->rowDataBuffer = null; + $this->hasReachedEndOfFile = false; + $this->numColumns = 0; + + $this->next(); + } + + /** + * Checks if current position is valid + * @link http://php.net/manual/en/iterator.valid.php + * + * @return boolean + */ + public function valid() + { + return (!$this->hasReachedEndOfFile); + } + + /** + * Move forward to next element. Empty rows will be skipped. + * @link http://php.net/manual/en/iterator.next.php + * + * @return void + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If a shared string was not found + */ + public function next() + { + $isInsideRowTag = false; + $rowData = []; + + while ($this->xmlReader->read()) { + if ($this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_DIMENSION) { + // Read dimensions of the sheet + $dimensionRef = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet) + if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) { + $lastCellIndex = $matches[1]; + $this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1; + } + + } else if ($this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) { + // Start of the row description + $isInsideRowTag = true; + + // Read spans info if present + $numberOfColumnsForRow = $this->numColumns; + $spans = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance + if ($spans) { + list(, $numberOfColumnsForRow) = explode(':', $spans); + $numberOfColumnsForRow = intval($numberOfColumnsForRow); + } + $rowData = ($numberOfColumnsForRow !== 0) ? 
array_fill(0, $numberOfColumnsForRow, '') : []; + + } else if ($isInsideRowTag && $this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_CELL) { + // Start of a cell description + $currentCellIndex = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX); + $currentColumnIndex = CellHelper::getColumnIndexFromCellIndex($currentCellIndex); + + $node = $this->xmlReader->expand(); + $rowData[$currentColumnIndex] = $this->getCellValue($node); + + } else if ($this->xmlReader->nodeType == \XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) { + // End of the row description + // If needed, we fill the empty cells + $rowData = ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData); + $this->numReadRows++; + break; + + } else if ($this->xmlReader->nodeType == \XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_WORKSHEET) { + // The closing "" marks the end of the file + $this->hasReachedEndOfFile = true; + } + } + + $this->rowDataBuffer = $rowData; + } + + /** + * Returns the cell's string value from a node's nested value node + * + * @param \DOMNode $node + * @return string The value associated with the cell + */ + protected function getVNodeValue($node) + { + // for cell types having a "v" tag containing the value. + // if not, the returned value should be empty string. + $vNode = $node->getElementsByTagName(self::XML_NODE_VALUE)->item(0); + if ($vNode !== null) { + return $vNode->nodeValue; + } + return ""; + } + + /** + * Returns the cell String value where string is inline. 
*
 * @param \DOMNode $node Cell node holding the inline string
 * @return string The trimmed, unescaped inline string (empty string when the cell has no inline text node)
 */
protected function formatInlineStringCellValue($node)
{
    // The text of an inline string is read from the first XML_NODE_INLINE_STRING_VALUE
    // descendant of the cell node.
    $tNode = $node->getElementsByTagName(self::XML_NODE_INLINE_STRING_VALUE)->item(0);
    if ($tNode === null) {
        // item(0) is null when the cell has no such descendant: report an empty cell
        // rather than reading a property on null.
        return '';
    }

    $escapedCellValue = trim($tNode->nodeValue);
    $cellValue = $this->escaper->unescape($escapedCellValue);
    return $cellValue;
}

/**
 * Returns the cell String value from shared-strings file using nodeValue index.
 *
 * @param string $nodeValue Content of the cell's value node, i.e. the index of the shared string
 * @return string The unescaped shared string found at that index
 */
protected function formatSharedStringCellValue($nodeValue)
{
    // For shared strings, the value node only contains the index of the string;
    // the string itself is fetched through the shared strings helper.
    $sharedStringIndex = intval($nodeValue);
    $escapedCellValue = $this->sharedStringsHelper->getStringAtIndex($sharedStringIndex);
    $cellValue = $this->escaper->unescape($escapedCellValue);
    return $cellValue;
}

/**
 * Returns the cell String value, where string is stored in value node.
 *
 * @param string $nodeValue Content of the cell's value node
 * @return string The trimmed, unescaped string
 */
protected function formatStrCellValue($nodeValue)
{
    $escapedCellValue = trim($nodeValue);
    $cellValue = $this->escaper->unescape($escapedCellValue);
    return $cellValue;
}

/**
 * Returns the cell Numeric value from string of nodeValue.
 *
 * @param string $nodeValue Content of the cell's value node
 * @return int|float An int when the value is written as a plain integer, a float otherwise
 */
protected function formatNumericCellValue($nodeValue)
{
    // The value is read from XML and is therefore a string: is_int() would always be
    // false for it and every number would come back as a float. Instead, check whether
    // the value survives a round trip through int unchanged.
    $nodeIntValue = (int) $nodeValue;
    if ((string) $nodeIntValue === trim((string) $nodeValue)) {
        return $nodeIntValue;
    }

    return (float) $nodeValue;
}

/**
 * Returns the cell Boolean value from a specific node's Value.
 *
 * @param string $nodeValue Content of the cell's value node
 * @return bool The value associated with the cell
 */
protected function formatBooleanCellValue($nodeValue)
{
    // !! is similar to boolval()
    $cellValue = !!$nodeValue;
    return $cellValue;
}

/**
 * Returns a cell's PHP Date value, associated to the given stored nodeValue.
 *
 * @param string $nodeValue Content of the cell's value node
 * @return \DateTime|null The value associated with the cell (null when the date cannot be parsed)
 */
protected function formatDateCellValue($nodeValue)
{
    // Mitigate thrown Exception on invalid date-time format (http://php.net/manual/en/datetime.construct.php)
    try {
        $cellValue = new \DateTime($nodeValue);
        return $cellValue;
    } catch (\Exception $e) {
        return null;
    }
}

/**
 * Returns the (unescaped) correctly marshalled, cell value associated to the given XML node.
 *
 * @param \DOMNode $node Cell node
 * @return string|int|float|bool|\DateTime|null The value associated with the cell
 *                                              (empty string for an empty cell, null for an unknown cell type or an invalid date)
 */
protected function getCellValue($node)
{
    // Default cell type is "n"
    $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE) ?: self::CELL_TYPE_NUMERIC;
    $vNodeValue = $this->getVNodeValue($node);

    // Inline strings do not use the value node, so an empty value node only means
    // "empty cell" for the other cell types.
    if (($vNodeValue === '') && ($cellType !== self::CELL_TYPE_INLINE_STRING)) {
        return $vNodeValue;
    }

    switch ($cellType) {
        case self::CELL_TYPE_INLINE_STRING:
            return $this->formatInlineStringCellValue($node);
        case self::CELL_TYPE_SHARED_STRING:
            return $this->formatSharedStringCellValue($vNodeValue);
        case self::CELL_TYPE_STR:
            return $this->formatStrCellValue($vNodeValue);
        case self::CELL_TYPE_BOOLEAN:
            return $this->formatBooleanCellValue($vNodeValue);
        case self::CELL_TYPE_NUMERIC:
            return $this->formatNumericCellValue($vNodeValue);
        case self::CELL_TYPE_DATE:
            return $this->formatDateCellValue($vNodeValue);
        default:
            return null;
    }
}

/**
 * Return the current element, from the buffer.
+ * @link http://php.net/manual/en/iterator.current.php + * + * @return array|null + */ + public function current() + { + return $this->rowDataBuffer; + } + + /** + * Return the key of the current element + * @link http://php.net/manual/en/iterator.key.php + * + * @return int + */ + public function key() + { + return $this->numReadRows; + } + + + /** + * Cleans up what was created to iterate over the object. + * + * @return void + */ + public function end() + { + $this->xmlReader->close(); + } +} diff --git a/src/Spout/Reader/XLSX/Sheet.php b/src/Spout/Reader/XLSX/Sheet.php new file mode 100644 index 0000000..e2eebec --- /dev/null +++ b/src/Spout/Reader/XLSX/Sheet.php @@ -0,0 +1,74 @@ +rowIterator = new RowIterator($filePath, $sheetDataXMLFilePath, $sharedStringsHelper); + $this->id = $sheetId; + $this->index = $sheetIndex; + $this->name = $sheetName; + } + + /** + * @return RowIterator + */ + public function getRowIterator() + { + return $this->rowIterator; + } + + /** + * @return int ID of the sheet + */ + public function getId() + { + return $this->id; + } + + /** + * @return int Index of the sheet, based on order of creation (zero-based) + */ + public function getIndex() + { + return $this->index; + } + + /** + * @return string Name of the sheet + */ + public function getName() + { + return $this->name; + } +} diff --git a/src/Spout/Reader/XLSX/SheetIterator.php b/src/Spout/Reader/XLSX/SheetIterator.php new file mode 100644 index 0000000..aae58c2 --- /dev/null +++ b/src/Spout/Reader/XLSX/SheetIterator.php @@ -0,0 +1,112 @@ +sheets = $sheetHelper->getSheets(); + + if (count($this->sheets) === 0) { + throw new NoSheetsFoundException('The file must contain at least one sheet.'); + } + } + + /** + * Rewind the Iterator to the first element + * @link http://php.net/manual/en/iterator.rewind.php + * + * @return void + */ + public function rewind() + { + $this->currentSheetIndex = 0; + } + + /** + * Checks if current position is valid + * @link 
http://php.net/manual/en/iterator.valid.php + * + * @return boolean + */ + public function valid() + { + return ($this->currentSheetIndex < count($this->sheets)); + } + + /** + * Move forward to next element + * @link http://php.net/manual/en/iterator.next.php + * + * @return void + */ + public function next() + { + if (array_key_exists($this->currentSheetIndex, $this->sheets)) { + $currentSheet = $this->sheets[$this->currentSheetIndex]; + $currentSheet->getRowIterator()->end(); + + $this->currentSheetIndex++; + } + } + + /** + * Return the current element + * @link http://php.net/manual/en/iterator.current.php + * + * @return Sheet + */ + public function current() + { + return $this->sheets[$this->currentSheetIndex]; + } + + /** + * Return the key of the current element + * @link http://php.net/manual/en/iterator.key.php + * + * @return int + */ + public function key() + { + return $this->currentSheetIndex + 1; + } + + /** + * Cleans up what was created to iterate over the object. + * + * @return void + */ + public function end() + { + // make sure we are not leaking memory in case the iteration stopped before the end + foreach ($this->sheets as $sheet) { + $sheet->getRowIterator()->end(); + } + } +} diff --git a/tests/Spout/Reader/CSV/ReaderTest.php b/tests/Spout/Reader/CSV/ReaderTest.php new file mode 100644 index 0000000..de55b94 --- /dev/null +++ b/tests/Spout/Reader/CSV/ReaderTest.php @@ -0,0 +1,181 @@ +open('/path/to/fake/file.csv'); + } + + /** + * @expectedException \Box\Spout\Common\Exception\IOException + * + * @return void + */ + public function testOpenShouldThrowExceptionIfFileNotReadable() + { + $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper') + ->setMethods(['is_readable']) + ->getMock(); + $helperStub->method('is_readable')->willReturn(false); + + $resourcePath = $this->getResourcePath('csv_standard.csv'); + + $reader = ReaderFactory2::create(Type::CSV); + $reader->setGlobalFunctionsHelper($helperStub); + 
$reader->open($resourcePath); + } + + + /** + * @return void + */ + public function testReadStandardCSV() + { + $allRows = $this->getAllRowsForFile('csv_standard.csv'); + + $expectedRows = [ + ['csv--11', 'csv--12', 'csv--13'], + ['csv--21', 'csv--22', 'csv--23'], + ['csv--31', 'csv--32', 'csv--33'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldNotStopAtCommaIfEnclosed() + { + $allRows = $this->getAllRowsForFile('csv_with_comma_enclosed.csv'); + $this->assertEquals('This is, a comma', $allRows[0][0]); + } + + /** + * @return void + */ + public function testReadShouldKeepEmptyCells() + { + $allRows = $this->getAllRowsForFile('csv_with_empty_cells.csv'); + + $expectedRows = [ + ['csv--11', 'csv--12', 'csv--13'], + ['csv--21', '', 'csv--23'], + ['csv--31', 'csv--32', ''], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSkipEmptyLines() + { + $allRows = $this->getAllRowsForFile('csv_with_empty_line.csv'); + + $expectedRows = [ + ['csv--11', 'csv--12', 'csv--13'], + ['csv--31', 'csv--32', 'csv--33'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldHaveTheRightNumberOfCells() + { + $allRows = $this->getAllRowsForFile('csv_with_different_cells_number.csv'); + + $expectedRows = [ + ['csv--11', 'csv--12', 'csv--13'], + ['csv--21', 'csv--22'], + ['csv--31'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSupportCustomFieldDelimiter() + { + $allRows = $this->getAllRowsForFile('csv_delimited_with_pipes.csv', '|'); + + $expectedRows = [ + ['csv--11', 'csv--12', 'csv--13'], + ['csv--21', 'csv--22', 'csv--23'], + ['csv--31', 'csv--32', 'csv--33'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSupportCustomFieldEnclosure() + { 
+ $allRows = $this->getAllRowsForFile('csv_text_enclosed_with_pound.csv', ',', '#'); + $this->assertEquals('This is, a comma', $allRows[0][0]); + } + + /** + * @return void + */ + public function testReadShouldSkipUtf8Bom() + { + $allRows = $this->getAllRowsForFile('csv_with_utf8_bom.csv'); + + $expectedRows = [ + ['csv--11', 'csv--12', 'csv--13'], + ['csv--21', 'csv--22', 'csv--23'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @param string $fileName + * @param string|void $fieldDelimiter + * @param string|void $fieldEnclosure + * @return array All the read rows the given file + */ + private function getAllRowsForFile($fileName, $fieldDelimiter = ",", $fieldEnclosure = '"') + { + $allRows = []; + $resourcePath = $this->getResourcePath($fileName); + + $reader = ReaderFactory2::create(Type::CSV); + $reader->setFieldDelimiter($fieldDelimiter); + $reader->setFieldEnclosure($fieldEnclosure); + + $reader->open($resourcePath); + + foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) { + foreach ($sheet->getRowIterator() as $rowIndex => $row) { + $allRows[] = $row; + } + } + + $reader->close(); + + return $allRows; + } +} diff --git a/tests/Spout/Reader/XLSX/Helper/CellHelperTest.php b/tests/Spout/Reader/XLSX/Helper/CellHelperTest.php new file mode 100644 index 0000000..ff417b9 --- /dev/null +++ b/tests/Spout/Reader/XLSX/Helper/CellHelperTest.php @@ -0,0 +1,60 @@ + 1, 3 => 3]; + $filledArray = CellHelper::fillMissingArrayIndexes($arrayToFill, 'FILL'); + + $expectedFilledArray = ['FILL', 1, 'FILL', 3]; + $this->assertEquals($expectedFilledArray, $filledArray); + } + + /** + * @return array + */ + public function dataProviderForTestGetColumnIndexFromCellIndex() + { + return [ + ['A1', 0], + ['Z3', 25], + ['AA5', 26], + ['AB24', 27], + ['BC5', 54], + ['BCZ99', 1455], + ]; + } + + /** + * @dataProvider dataProviderForTestGetColumnIndexFromCellIndex + * + * @param string $cellIndex + * @param int $expectedColumnIndex + * @return void + */ + 
public function testGetColumnIndexFromCellIndex($cellIndex, $expectedColumnIndex) + { + $this->assertEquals($expectedColumnIndex, CellHelper::getColumnIndexFromCellIndex($cellIndex)); + } + + /** + * @expectedException \Box\Spout\Common\Exception\InvalidArgumentException + * + * @return void + */ + public function testGetColumnIndexFromCellIndexShouldThrowIfInvalidCellIndex() + { + CellHelper::getColumnIndexFromCellIndex('InvalidCellIndex'); + } +} diff --git a/tests/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyFactoryTest.php b/tests/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyFactoryTest.php new file mode 100644 index 0000000..ea77b4f --- /dev/null +++ b/tests/Spout/Reader/XLSX/Helper/SharedStringsCaching/CachingStrategyFactoryTest.php @@ -0,0 +1,99 @@ +getMockBuilder('\Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitInKB']) + ->getMock(); + + $factoryStub->method('getMemoryLimitInKB')->willReturn($memoryLimitInKB); + + \ReflectionHelper::setStaticValue('\Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory', 'instance', $factoryStub); + + $strategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null); + + $fullExpectedStrategyClassName = 'Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\\' . 
$expectedStrategyClassName; + $this->assertEquals($fullExpectedStrategyClassName, get_class($strategy)); + + $strategy->clearCache(); + \ReflectionHelper::reset(); + } + + /** + * @return array + */ + public function dataProviderForTestGetMemoryLimitInKB() + { + return [ + ['-1', -1], + ['invalid', -1], + ['1024B', 1], + ['128K', 128], + ['256KB', 256], + ['512M', 512 * 1024], + ['2MB', 2 * 1024], + ['1G', 1 * 1024 * 1024], + ['10GB', 10 * 1024 * 1024], + ['2T', 2 * 1024 * 1024 * 1024], + ['5TB', 5 * 1024 * 1024 * 1024], + ]; + } + + /** + * @dataProvider dataProviderForTestGetMemoryLimitInKB + * + * @param string $memoryLimitFormatted + * @param float $expectedMemoryLimitInKB + * @return void + */ + public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB) + { + /** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */ + $factoryStub = $this + ->getMockBuilder('\Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory') + ->disableOriginalConstructor() + ->setMethods(['getMemoryLimitFromIni']) + ->getMock(); + + $factoryStub->method('getMemoryLimitFromIni')->willReturn($memoryLimitFormatted); + + $memoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB'); + + $this->assertEquals($expectedMemoryLimitInKB, $memoryLimitInKB); + } +} diff --git a/tests/Spout/Reader/XLSX/Helper/SharedStringsHelperTest.php b/tests/Spout/Reader/XLSX/Helper/SharedStringsHelperTest.php new file mode 100644 index 0000000..a72d19a --- /dev/null +++ b/tests/Spout/Reader/XLSX/Helper/SharedStringsHelperTest.php @@ -0,0 +1,112 @@ +getResourcePath('one_sheet_with_shared_strings.xlsx'); + $this->sharedStringsHelper = new SharedStringsHelper($resourcePath); + } + + /** + * @return void + */ + public function tearDown() + { + $this->sharedStringsHelper->cleanup(); + } + + /** + * @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException + * @return void + */ + public 
function testGetStringAtIndexShouldThrowExceptionIfStringNotFound() + { + $this->sharedStringsHelper->extractSharedStrings(); + $this->sharedStringsHelper->getStringAtIndex(PHP_INT_MAX); + } + + /** + * @return void + */ + public function testGetStringAtIndexShouldReturnTheCorrectStringIfFound() + { + $this->sharedStringsHelper->extractSharedStrings(); + + $sharedString = $this->sharedStringsHelper->getStringAtIndex(0); + $this->assertEquals('s1--A1', $sharedString); + + $sharedString = $this->sharedStringsHelper->getStringAtIndex(24); + $this->assertEquals('s1--E5', $sharedString); + + $usedCachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy'); + $this->assertTrue($usedCachingStrategy instanceof InMemoryStrategy); + } + + /** + * @return void + */ + public function testGetStringAtIndexShouldWorkWithMultilineStrings() + { + $resourcePath = $this->getResourcePath('one_sheet_with_shared_multiline_strings.xlsx'); + $sharedStringsHelper = new SharedStringsHelper($resourcePath); + + $sharedStringsHelper->extractSharedStrings(); + + $sharedString = $sharedStringsHelper->getStringAtIndex(0); + $this->assertEquals("s1\nA1", $sharedString); + + $sharedString = $sharedStringsHelper->getStringAtIndex(24); + $this->assertEquals("s1\nE5", $sharedString); + + $sharedStringsHelper->cleanup(); + } + + /** + * @return void + */ + public function testGetStringAtIndexWithFileBasedStrategy() + { + // force the file-based strategy by setting no memory limit + $originalMemoryLimit = ini_get('memory_limit'); + ini_set('memory_limit', '-1'); + + $resourcePath = $this->getResourcePath('sheet_with_lots_of_shared_strings.xlsx'); + $sharedStringsHelper = new SharedStringsHelper($resourcePath); + + $sharedStringsHelper->extractSharedStrings(); + + $sharedString = $sharedStringsHelper->getStringAtIndex(0); + $this->assertEquals('str', $sharedString); + + $sharedString = 
$sharedStringsHelper->getStringAtIndex(CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 1); + $this->assertEquals('str', $sharedString); + + $usedCachingStrategy = \ReflectionHelper::getValueOnObject($sharedStringsHelper, 'cachingStrategy'); + $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy); + + $sharedStringsHelper->cleanup(); + + ini_set('memory_limit', $originalMemoryLimit); + } +} diff --git a/tests/Spout/Reader/XLSX/ReaderTest.php b/tests/Spout/Reader/XLSX/ReaderTest.php new file mode 100644 index 0000000..c5fb583 --- /dev/null +++ b/tests/Spout/Reader/XLSX/ReaderTest.php @@ -0,0 +1,300 @@ +getAllRowsForFile($filePath); + } + + /** + * @return array + */ + public function dataProviderForTestReadForAllWorksheets() + { + return [ + ['one_sheet_with_shared_strings.xlsx', 5, 5], + ['one_sheet_with_inline_strings.xlsx', 5, 5], + ['two_sheets_with_shared_strings.xlsx', 10, 5], + ['two_sheets_with_inline_strings.xlsx', 10, 5] + ]; + } + + /** + * @dataProvider dataProviderForTestReadForAllWorksheets + * + * @param string $resourceName + * @param int $expectedNumOfRows + * @param int $expectedNumOfCellsPerRow + * @return void + */ + public function testReadForAllWorksheets($resourceName, $expectedNumOfRows, $expectedNumOfCellsPerRow) + { + $allRows = $this->getAllRowsForFile($resourceName); + + $this->assertEquals($expectedNumOfRows, count($allRows), "There should be $expectedNumOfRows rows"); + foreach ($allRows as $row) { + $this->assertEquals($expectedNumOfCellsPerRow, count($row), "There should be $expectedNumOfCellsPerRow cells for every row"); + } + } + + /** + * @return void + */ + public function testReadShouldSupportFilesWithoutSharedStringsFile() + { + $allRows = $this->getAllRowsForFile('sheet_with_no_shared_strings_file.xlsx'); + + $expectedRows = [ + [10, 11], + [20, 21], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSupportAllCellTypes() + { + $allRows = 
$this->getAllRowsForFile('sheet_with_all_cell_types.xlsx'); + + $expectedRows = [ + [ + 's1--A1', 's1--A2', + false, true, + \DateTime::createFromFormat('Y-m-d H:i:s', '2015-06-03 13:21:58'), + \DateTime::createFromFormat('Y-m-d H:i:s', '2015-06-01 00:00:00'), + 10, 10.43, + null, + 'weird string', // valid 'str' string + null, // invalid date + ], + ['', '', '', '', '', '', '', '', ''], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldKeepEmptyCellsAtTheEndIfDimensionsSpecified() + { + $allRows = $this->getAllRowsForFile('sheet_without_dimensions_but_spans_and_empty_cells.xlsx'); + + $this->assertEquals(2, count($allRows), 'There should be 2 rows'); + foreach ($allRows as $row) { + $this->assertEquals(5, count($row), 'There should be 5 cells for every row, because empty rows should be preserved'); + } + + $expectedRows = [ + ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'], + ['s1--A2', 's1--B2', 's1--C2', '', ''], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldKeepEmptyCellsAtTheEndIfNoDimensionsButSpansSpecified() + { + $allRows = $this->getAllRowsForFile('sheet_without_dimensions_and_empty_cells.xlsx'); + + $this->assertEquals(2, count($allRows), 'There should be 2 rows'); + $this->assertEquals(5, count($allRows[0]), 'There should be 5 cells in the first row'); + $this->assertEquals(3, count($allRows[1]), 'There should be only 3 cells in the second row, because empty rows at the end should be skip'); + + $expectedRows = [ + ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'], + ['s1--A2', 's1--B2', 's1--C2'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSkipEmptyCellsAtTheEndIfDimensionsNotSpecified() + { + $allRows = $this->getAllRowsForFile('sheet_without_dimensions_and_empty_cells.xlsx'); + + $this->assertEquals(2, count($allRows), 'There should 
be 2 rows'); + $this->assertEquals(5, count($allRows[0]), 'There should be 5 cells in the first row'); + $this->assertEquals(3, count($allRows[1]), 'There should be only 3 cells in the second row, because empty rows at the end should be skip'); + + $expectedRows = [ + ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'], + ['s1--A2', 's1--B2', 's1--C2'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSkipEmptyRows() + { + $allRows = $this->getAllRowsForFile('sheet_with_empty_rows.xlsx'); + + $this->assertEquals(2, count($allRows), 'There should be only 2 rows, because the empty row is skipped'); + + $expectedRows = [ + ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'], + ['s1--A3', 's1--B3', 's1--C3', 's1--D3', 's1--E3'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSupportEmptySharedString() + { + $allRows = $this->getAllRowsForFile('sheet_with_empty_shared_string.xlsx'); + + $expectedRows = [ + ['s1--A1', '', 's1--C1'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldPreserveSpaceIfSpecified() + { + $allRows = $this->getAllRowsForFile('sheet_with_preserve_space_shared_strings.xlsx'); + + $expectedRows = [ + [' s1--A1', 's1--B1 ', ' s1--C1 '], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSkipPronunciationData() + { + $allRows = $this->getAllRowsForFile('sheet_with_pronunciation.xlsx'); + + $expectedRow = ['名前', '一二三四']; + $this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.'); + } + + + /** + * @return array + */ + public function dataProviderForTestReadShouldBeProtectedAgainstAttacks() + { + return [ + ['attack_billion_laughs.xlsx'], + ['attack_quadratic_blowup.xlsx'], + ]; + } + + /** + * @dataProvider 
dataProviderForTestReadShouldBeProtectedAgainstAttacks + * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) + * + * @param string $fileName + * @return void + */ + public function testReadShouldBeProtectedAgainstAttacks($fileName) + { + $startTime = microtime(true); + + try { + $this->getAllRowsForFile($fileName); + $this->fail('An exception should have been thrown'); + } catch (IOException $exception) { + $duration = microtime(true) - $startTime; + $this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.'); + + $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB + $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.'); + } + } + + /** + * @return void + */ + public function testReadShouldBeAbleToProcessEmptySheets() + { + $allRows = $this->getAllRowsForFile('sheet_with_no_cells.xlsx'); + $this->assertEquals([], $allRows, 'Sheet with no cells should be correctly processed.'); + } + + /** + * @return void + */ + public function testReadShouldSkipFormulas() + { + $allRows = $this->getAllRowsForFile('sheet_with_formulas.xlsx'); + + $expectedRows = [ + ['val1', 'val2', 'total1', 'total2'], + [10, 20, 30, 21], + [11, 21, 32, 41], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @param string $fileName + * @return array All the read rows the given file + */ + private function getAllRowsForFile($fileName) + { + $allRows = []; + $resourcePath = $this->getResourcePath($fileName); + + $reader = ReaderFactory2::create(Type::XLSX); + $reader->open($resourcePath); + + foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) { + foreach ($sheet->getRowIterator() as $rowIndex => $row) { + $allRows[] = $row; + } + } + + $reader->close(); + + return $allRows; + } +} diff --git a/tests/Spout/Reader/XLSX/SheetTest.php 
b/tests/Spout/Reader/XLSX/SheetTest.php new file mode 100644 index 0000000..c9449f4 --- /dev/null +++ b/tests/Spout/Reader/XLSX/SheetTest.php @@ -0,0 +1,53 @@ +openFileAndReturnSheets('two_sheets_with_custom_names.xlsx'); + + $this->assertEquals('CustomName1', $sheets[0]->getName()); + $this->assertEquals(0, $sheets[0]->getIndex()); + $this->assertEquals(1, $sheets[0]->getId()); + + $this->assertEquals('CustomName2', $sheets[1]->getName()); + $this->assertEquals(1, $sheets[1]->getIndex()); + $this->assertEquals(2, $sheets[1]->getId()); + } + + /** + * @param string $fileName + * @return Sheet[] + */ + private function openFileAndReturnSheets($fileName) + { + $resourcePath = $this->getResourcePath($fileName); + $reader = ReaderFactory2::create(Type::XLSX); + $reader->open($resourcePath); + + $sheets = []; + foreach ($reader->getSheetIterator() as $sheet) { + $sheets[] = $sheet; + } + + $reader->close(); + + return $sheets; + } +}