Moved readers to iterators

Instead of the hasNext() / next() syntax, readers now implement the PHP iterator pattern.
It allows readers to be used with a foreach() loop.

All readers now share the same structure (CSV is treated as having exactly one sheet):
- one concrete Reader
- one SheetIterator, exposed by the Reader
- one or more Sheets, returned at every iteration
- one RowIterator, exposed by the Sheet

Introducing the concept of sheets for CSV may be kind of confusing but it makes Spout way more consistent.
Also, this confusion may be resolved by creating a wrapper around the readers if needed.

-- This commit does not delete the old files, nor does it change the folder structure for Writers. This will be done in another commit.
This commit is contained in:
Adrien Loison 2015-07-15 00:22:37 -07:00
parent 322c3d0738
commit ae3ee357ff
27 changed files with 3112 additions and 0 deletions

View File

@ -0,0 +1,111 @@
<?php
namespace Box\Spout\Reader;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\Exception\ReaderNotOpenedException;
use Box\Spout\Reader\Exception\EndOfFileReachedException;
/**
 * Class AbstractReader2
 * Base class for readers. It validates the path of the file to read, delegates
 * the actual opening/closing to the concrete reader and tracks whether the
 * stream is currently opened.
 *
 * @package Box\Spout\Reader
 * @abstract
 */
abstract class AbstractReader2 implements ReaderInterface2
{
    /** @var bool Indicates whether the stream is currently open */
    protected $isStreamOpened = false;

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /**
     * Opens the file at the given file path to make it ready to be read
     *
     * @param string $filePath Path of the file to be read
     * @return void
     */
    abstract protected function openReader($filePath);

    /**
     * Returns an iterator to iterate over sheets.
     *
     * @return \Iterator To iterate over sheets
     */
    abstract public function getSheetIterator();

    /**
     * Closes the reader. To be used after reading the file.
     *
     * @return void
     */
    abstract protected function closeReader();

    /**
     * Injects the helper used to call PHP global functions.
     *
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     * @return AbstractReader2 The reader itself, to allow chaining
     */
    public function setGlobalFunctionsHelper($globalFunctionsHelper)
    {
        $this->globalFunctionsHelper = $globalFunctionsHelper;
        return $this;
    }

    /**
     * Prepares the reader to read the given file. It also makes sure
     * that the file exists and is readable.
     *
     * @param string $filePath Path of the file to be read
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If the file at the given path does not exist, is not readable or is corrupted
     */
    public function open($filePath)
    {
        // The existence/readability checks are skipped when the path points to a PHP stream
        if (!$this->isPhpStream($filePath)) {
            if (!$this->globalFunctionsHelper->file_exists($filePath)) {
                throw new IOException('Could not open ' . $filePath . ' for reading! File does not exist.');
            } elseif (!$this->globalFunctionsHelper->is_readable($filePath)) {
                throw new IOException('Could not open ' . $filePath . ' for reading! File is not readable.');
            }
        }

        try {
            $this->openReader($filePath);
            $this->isStreamOpened = true;
        } catch (\Exception $exception) {
            // Chain the original exception so that its type and stack trace are not lost
            throw new IOException(
                'Could not open ' . $filePath . ' for reading! (' . $exception->getMessage() . ')',
                0,
                $exception
            );
        }
    }

    /**
     * Checks if a path is a PHP stream (like php://output, php://memory, ...)
     *
     * @param string $filePath Path of the file to be read
     * @return bool Whether the given path maps to a PHP stream
     */
    protected function isPhpStream($filePath)
    {
        return (strpos($filePath, 'php://') === 0);
    }

    /**
     * Closes the reader, preventing any additional reading.
     * Does nothing when the stream is not opened.
     *
     * @return void
     */
    public function close()
    {
        if ($this->isStreamOpened) {
            $this->closeReader();

            // Give the sheet iterator a chance to clean up what it created
            $sheetIterator = $this->getSheetIterator();
            if ($sheetIterator) {
                $sheetIterator->end();
            }

            $this->isStreamOpened = false;
        }
    }
}

View File

@ -0,0 +1,95 @@
<?php
namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\AbstractReader2;
use Box\Spout\Common\Exception\IOException;
/**
 * Class Reader
 * Reads data from a CSV file. The file is exposed through a SheetIterator
 * over the single "sheet" a CSV file is considered to contain.
 *
 * @package Box\Spout\Reader\CSV
 */
class Reader extends AbstractReader2
{
    /** @var resource Pointer to the CSV file being read */
    protected $filePointer;

    /** @var SheetIterator Iterator over the unique "sheet" of the CSV file */
    protected $sheetIterator;

    /** @var string Character used to delimit fields (one character only) */
    protected $fieldDelimiter = ',';

    /** @var string Character used to enclose fields (one character only) */
    protected $fieldEnclosure = '"';

    /**
     * Changes the character delimiting the fields.
     * Must be called before the reader is opened, since the value is
     * handed over to the sheet iterator when the file is opened.
     *
     * @param string $fieldDelimiter Character that delimits fields
     * @return Reader
     */
    public function setFieldDelimiter($fieldDelimiter)
    {
        $this->fieldDelimiter = $fieldDelimiter;

        return $this;
    }

    /**
     * Changes the character enclosing the fields.
     * Must be called before the reader is opened, since the value is
     * handed over to the sheet iterator when the file is opened.
     *
     * @param string $fieldEnclosure Character that enclose fields
     * @return Reader
     */
    public function setFieldEnclosure($fieldEnclosure)
    {
        $this->fieldEnclosure = $fieldEnclosure;

        return $this;
    }

    /**
     * Opens the CSV file in read mode and creates the sheet iterator on top of it.
     * The file must be UTF-8 encoded.
     * @TODO add encoding detection/conversion
     *
     * @param string $filePath Path of the CSV file to be read
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If the file cannot be opened
     */
    protected function openReader($filePath)
    {
        $pointer = $this->globalFunctionsHelper->fopen($filePath, 'r');
        $this->filePointer = $pointer;

        if (!$pointer) {
            throw new IOException('Could not open file ' . $filePath . ' for reading.');
        }

        $this->sheetIterator = new SheetIterator(
            $pointer,
            $this->fieldDelimiter,
            $this->fieldEnclosure,
            $this->globalFunctionsHelper
        );
    }

    /**
     * Gives access to the iterator over the sheets of the file.
     *
     * @return SheetIterator To iterate over sheets
     */
    public function getSheetIterator()
    {
        return $this->sheetIterator;
    }

    /**
     * Releases the file pointer, if any. To be used after reading the file.
     *
     * @return void
     */
    protected function closeReader()
    {
        if (!$this->filePointer) {
            return;
        }

        $this->globalFunctionsHelper->fclose($this->filePointer);
    }
}

View File

@ -0,0 +1,163 @@
<?php
namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\IteratorInterface;
/**
 * Class RowIterator
 * Iterate over CSV rows.
 *
 * The next row is read ahead and kept in a buffer: next() reads it,
 * current() returns it and valid() tells whether the end of file was reached.
 *
 * @package Box\Spout\Reader\CSV
 */
class RowIterator implements IteratorInterface
{
    const UTF8_BOM = "\xEF\xBB\xBF";

    /** @var resource Pointer to the CSV file to read */
    protected $filePointer;

    /** @var int Number of read rows */
    protected $numReadRows = 0;

    /** @var array|null Buffer used to store the row data, while checking if there are more rows to read */
    protected $rowDataBuffer = null;

    /** @var bool Indicates whether all rows have been read */
    protected $hasReachedEndOfFile = false;

    /** @var string Defines the character used to delimit fields (one character only) */
    protected $fieldDelimiter = ',';

    /** @var string Defines the character used to enclose fields (one character only) */
    protected $fieldEnclosure = '"';

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /**
     * @param resource $filePointer Pointer to the CSV file to read
     * @param string $fieldDelimiter Character that delimits fields
     * @param string $fieldEnclosure Character that enclose fields
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     */
    public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
    {
        $this->filePointer = $filePointer;
        $this->fieldDelimiter = $fieldDelimiter;
        $this->fieldEnclosure = $fieldEnclosure;
        $this->globalFunctionsHelper = $globalFunctionsHelper;
    }

    /**
     * Rewind the Iterator to the first element
     * @link http://php.net/manual/en/iterator.rewind.php
     *
     * @return void
     */
    public function rewind()
    {
        $this->rewindAndSkipUtf8Bom();

        $this->numReadRows = 0;
        $this->rowDataBuffer = null;

        // Read the first row ahead, so that valid()/current() can be used right away
        $this->next();
    }

    /**
     * This rewinds and skips the UTF-8 BOM if inserted at the beginning of the file
     * by moving the file pointer after it, so that it is not read.
     *
     * @return void
     */
    protected function rewindAndSkipUtf8Bom()
    {
        $this->globalFunctionsHelper->rewind($this->filePointer);

        // fgets() with a length of 4 reads at most 3 bytes, the size of the BOM
        $hasUtf8Bom = ($this->globalFunctionsHelper->fgets($this->filePointer, 4) === self::UTF8_BOM);

        if ($hasUtf8Bom) {
            // we skip the 3 bytes of the BOM (so reading resumes at offset 3)
            $this->globalFunctionsHelper->fseek($this->filePointer, 3);
        } else {
            // if no BOM, reset the pointer to read from the beginning
            $this->globalFunctionsHelper->fseek($this->filePointer, 0);
        }
    }

    /**
     * Checks if current position is valid
     * @link http://php.net/manual/en/iterator.valid.php
     *
     * @return boolean
     */
    public function valid()
    {
        return ($this->filePointer && !$this->hasReachedEndOfFile);
    }

    /**
     * Move forward to next element. Empty rows are skipped.
     * @link http://php.net/manual/en/iterator.next.php
     *
     * @return void
     */
    public function next()
    {
        // NOTE(review): feof() is called directly while the other file functions
        // go through the helper - confirm whether the helper should be used here too.
        $this->hasReachedEndOfFile = feof($this->filePointer);
        if ($this->hasReachedEndOfFile) {
            return;
        }

        do {
            $lineData = $this->globalFunctionsHelper->fgetcsv($this->filePointer, 0, $this->fieldDelimiter, $this->fieldEnclosure);
        } while ($lineData && $this->isEmptyLine($lineData));

        if ($lineData === false || $lineData === null) {
            // No row could be read although feof() was not true yet. This happens
            // when the file ends with a line break or with empty lines: the end of
            // file is only detected by the read itself. Without this, "false" would
            // be exposed as an extra row and counted in the number of read rows.
            $this->hasReachedEndOfFile = true;
            return;
        }

        $this->rowDataBuffer = $lineData;
        $this->numReadRows++;
    }

    /**
     * @param array $lineData Array containing the cells value for the line
     * @return bool Whether the given line is empty
     */
    protected function isEmptyLine($lineData)
    {
        // fgetcsv() reports a blank line as an array holding a single null value
        return (count($lineData) === 1 && $lineData[0] === null);
    }

    /**
     * Return the current element from the buffer
     * @link http://php.net/manual/en/iterator.current.php
     *
     * @return array
     */
    public function current()
    {
        return $this->rowDataBuffer;
    }

    /**
     * Return the key of the current element
     * @link http://php.net/manual/en/iterator.key.php
     *
     * @return int Number of rows read so far (the first row has key 1)
     */
    public function key()
    {
        return $this->numReadRows;
    }

    /**
     * Cleans up what was created to iterate over the object.
     *
     * @return void
     */
    public function end()
    {
        // do nothing
    }
}

View File

@ -0,0 +1,35 @@
<?php
namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\SheetInterface;
/**
 * Class Sheet
 * Represents the unique "sheet" of a CSV file. Its only role is to
 * expose the iterator over the rows of the file.
 *
 * @package Box\Spout\Reader\CSV
 */
class Sheet implements SheetInterface
{
    /** @var RowIterator Iterator over the rows of the CSV file */
    protected $rowIterator;

    /**
     * Builds the row iterator from the given file pointer and CSV options.
     *
     * @param resource $filePointer Pointer to the CSV file to read
     * @param string $fieldDelimiter Character that delimits fields
     * @param string $fieldEnclosure Character that enclose fields
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     */
    public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
    {
        $rowIterator = new RowIterator(
            $filePointer,
            $fieldDelimiter,
            $fieldEnclosure,
            $globalFunctionsHelper
        );

        $this->rowIterator = $rowIterator;
    }

    /**
     * Gives access to the iterator over the rows of the sheet.
     *
     * @return RowIterator
     */
    public function getRowIterator()
    {
        return $this->rowIterator;
    }
}

View File

@ -0,0 +1,96 @@
<?php
namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\IteratorInterface;
/**
 * Class SheetIterator
 * Iterate over CSV unique "sheet".
 *
 * The iteration yields exactly one element: the sheet created in the
 * constructor. A boolean flag records whether it was already consumed.
 *
 * @package Box\Spout\Reader\CSV
 */
class SheetIterator implements IteratorInterface
{
    /** @var Sheet The unique "sheet" of the CSV file */
    protected $sheet;

    /** @var bool Set to true once the unique "sheet" has been iterated over */
    protected $hasReadUniqueSheet = false;

    /**
     * Creates the unique sheet from the given file pointer and CSV options.
     *
     * @param resource $filePointer
     * @param string $fieldDelimiter Character that delimits fields
     * @param string $fieldEnclosure Character that enclose fields
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     */
    public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
    {
        $uniqueSheet = new Sheet(
            $filePointer,
            $fieldDelimiter,
            $fieldEnclosure,
            $globalFunctionsHelper
        );

        $this->sheet = $uniqueSheet;
    }

    /**
     * Restarts the iteration: the unique sheet becomes available again.
     * @link http://php.net/manual/en/iterator.rewind.php
     *
     * @return void
     */
    public function rewind()
    {
        $this->hasReadUniqueSheet = false;
    }

    /**
     * The position is valid as long as the unique sheet has not been consumed.
     * @link http://php.net/manual/en/iterator.valid.php
     *
     * @return boolean
     */
    public function valid()
    {
        $isSheetStillAvailable = !$this->hasReadUniqueSheet;

        return $isSheetStillAvailable;
    }

    /**
     * Moving forward consumes the unique sheet, which ends the iteration.
     * @link http://php.net/manual/en/iterator.next.php
     *
     * @return void
     */
    public function next()
    {
        $this->hasReadUniqueSheet = true;
    }

    /**
     * Gives the unique sheet.
     * @link http://php.net/manual/en/iterator.current.php
     *
     * @return Sheet
     */
    public function current()
    {
        return $this->sheet;
    }

    /**
     * The key is always 1, as there is only one sheet.
     * @link http://php.net/manual/en/iterator.key.php
     *
     * @return int
     */
    public function key()
    {
        return 1;
    }

    /**
     * Cleans up what was created to iterate over the object.
     * There is nothing to clean up for this iterator.
     *
     * @return void
     */
    public function end()
    {
        // do nothing
    }
}

View File

@ -0,0 +1,12 @@
<?php
namespace Box\Spout\Reader\Exception;
/**
* Class NoSheetsFoundException
* Dedicated exception type: it adds no member of its own on top of ReaderException,
* so it only serves to tell this error apart from the other reader errors.
* NOTE(review): presumably thrown when the file to read contains no sheet - the
* throwing code is not visible here, confirm against the readers.
*
* @package Box\Spout\Reader\Exception
*/
class NoSheetsFoundException extends ReaderException
{
}

View File

@ -0,0 +1,18 @@
<?php
namespace Box\Spout\Reader;
/**
* Interface IteratorInterface
* Extends the native PHP \Iterator (rewind, valid, current, key, next)
* with an explicit clean-up hook, end().
*
* @package Box\Spout\Reader
*/
interface IteratorInterface extends \Iterator
{
/**
* Cleans up what was created to iterate over the object.
* Implementations that hold nothing to release can leave it empty.
*
* @return void
*/
public function end();
}

View File

@ -0,0 +1,44 @@
<?php
namespace Box\Spout\Reader;
use Box\Spout\Common\Exception\UnsupportedTypeException;
use Box\Spout\Common\Helper\GlobalFunctionsHelper;
use Box\Spout\Common\Type;
/**
 * Class ReaderFactory2
 * Factory creating the reader matching the type of the file to be read.
 * It supports CSV and XLSX formats.
 *
 * @package Box\Spout\Reader
 */
class ReaderFactory2
{
    /**
     * Instantiates the reader matching the given type and provides it with
     * a helper to work with global functions.
     *
     * @param string $readerType Type of the reader to instantiate
     * @return \Box\Spout\Reader\CSV\Reader|\Box\Spout\Reader\XLSX\Reader
     * @throws \Box\Spout\Common\Exception\UnsupportedTypeException If no reader supports the given type
     */
    public static function create($readerType)
    {
        // Loose comparisons on purpose: they mirror what a switch statement does
        if ($readerType == Type::CSV) {
            $reader = new CSV\Reader();
        } elseif ($readerType == Type::XLSX) {
            $reader = new XLSX\Reader();
        } else {
            throw new UnsupportedTypeException('No readers supporting the given type: ' . $readerType);
        }

        $reader->setGlobalFunctionsHelper(new GlobalFunctionsHelper());

        return $reader;
    }
}

View File

@ -0,0 +1,35 @@
<?php
namespace Box\Spout\Reader;
/**
* Interface ReaderInterface2
* Contract shared by the readers: open a file, iterate over its sheets, close it.
*
* @package Box\Spout\Reader
*/
interface ReaderInterface2
{
/**
* Prepares the reader to read the given file. It also makes sure
* that the file exists and is readable.
*
* @param string $filePath Path of the file to be read
* @return void
* @throws \Box\Spout\Common\Exception\IOException
*/
public function open($filePath);
/**
* Returns an iterator to iterate over sheets.
* NOTE(review): presumably only meaningful once open() was called - confirm
* against the concrete readers.
*
* @return \Iterator To iterate over sheets
*/
public function getSheetIterator();
/**
* Closes the reader, preventing any additional reading
*
* @return void
*/
public function close();
}

View File

@ -0,0 +1,18 @@
<?php
namespace Box\Spout\Reader;
/**
* Interface SheetInterface
* Contract shared by the sheets returned while iterating over a reader's sheets:
* a sheet only has to expose an iterator over its rows.
*
* @package Box\Spout\Reader
*/
interface SheetInterface
{
/**
* Returns an iterator to iterate over the sheet's rows.
*
* @return \Iterator
*/
public function getRowIterator();
}

View File

@ -0,0 +1,97 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Common\Exception\InvalidArgumentException;
/**
 * Class CellHelper
 * This class provides helper functions when working with cells
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class CellHelper
{
    /**
     * Fills the missing indexes of an array with a given value.
     * For instance, $dataArray = []; $dataArray[1] = 1; $dataArray[3] = 3;
     * Calling fillMissingArrayIndexes($dataArray, 'FILL') will return this array: ['FILL', 1, 'FILL', 3]
     *
     * An empty array is returned as is: there is no index to fill up to.
     *
     * @param array $dataArray The array to fill
     * @param string $fillValue optional, value used for the missing indexes
     * @return array The array with all indexes from 0 to its highest index, sorted by index
     */
    public static function fillMissingArrayIndexes($dataArray, $fillValue = '')
    {
        // max() cannot be called on an empty array
        if (empty($dataArray)) {
            return [];
        }

        $existingIndexes = array_keys($dataArray);
        $newIndexes = array_fill_keys(range(0, max($existingIndexes)), $fillValue);

        // The union operator keeps the existing values and only adds the missing indexes
        $dataArray += $newIndexes;
        ksort($dataArray);

        return $dataArray;
    }

    /**
     * Returns the base 10 column index associated to the cell index (base 26).
     * Excel uses A to Z letters for column indexing, where A is the 1st column,
     * Z is the 26th and AA is the 27th.
     * The mapping is zero based, so that A1 maps to 0, B2 maps to 1, Z13 to 25 and AA4 to 26.
     *
     * @param string $cellIndex The Excel cell index ('A1', 'BC13', ...)
     * @return int
     * @throws \Box\Spout\Common\Exception\InvalidArgumentException When the given cell index is invalid
     */
    public static function getColumnIndexFromCellIndex($cellIndex)
    {
        if (!self::isValidCellIndex($cellIndex)) {
            throw new InvalidArgumentException('Cannot get column index from an invalid cell index.');
        }

        $columnIndex = 0;
        $capitalAAsciiValue = ord('A');
        $capitalZAsciiValue = ord('Z');
        $step = $capitalZAsciiValue - $capitalAAsciiValue + 1;

        // Remove row information
        $column = preg_replace('/\d/', '', $cellIndex);
        $columnLength = strlen($column);

        /*
         * This is how the following loop will process the data:
         * A   => 0
         * Z   => 25
         * AA  => 26   : (26^(2-1) * (0+1)) + 0
         * AB  => 27   : (26^(2-1) * (0+1)) + 1
         * BC  => 54   : (26^(2-1) * (1+1)) + 2
         * BCZ => 1455 : (26^(3-1) * (1+1)) + (26^(2-1) * (2+1)) + 25
         */
        foreach (str_split($column) as $columnLetter) {
            $currentColumnIndex = ord($columnLetter) - $capitalAAsciiValue;

            if ($columnLength == 1) {
                // Last letter: it counts for its zero-based value
                $columnIndex += $currentColumnIndex;
            } else {
                $columnIndex += pow($step, ($columnLength - 1)) * ($currentColumnIndex + 1);
            }

            $columnLength--;
        }

        return $columnIndex;
    }

    /**
     * Returns whether a cell index is valid, in an Excel world.
     * To be valid, the cell index should start with capital letters and be followed by numbers.
     *
     * @param string $cellIndex The Excel cell index ('A1', 'BC13', ...)
     * @return bool
     */
    protected static function isValidCellIndex($cellIndex)
    {
        return (preg_match('/^[A-Z]+\d+$/', $cellIndex) === 1);
    }
}

View File

@ -0,0 +1,154 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper\SharedStringsCaching;
/**
 * Class CachingStrategyFactory
 * Singleton choosing between the in-memory and the file-based caching
 * strategies for shared strings, based on the PHP "memory_limit" setting.
 *
 * @package Box\Spout\Reader\XLSX\Helper\SharedStringsCaching
 */
class CachingStrategyFactory
{
    /**
     * The memory amount needed to store a string was obtained empirically from this data:
     *
     *        ------------------------------------
     *        | Number of chars⁺ | Memory needed |
     *        ------------------------------------
     *        |           3,000  |         1 MB  |
     *        |          15,000  |         2 MB  |
     *        |          30,000  |         5 MB  |
     *        |          75,000  |        11 MB  |
     *        |         150,000  |        21 MB  |
     *        |         300,000  |        43 MB  |
     *        |         750,000  |       105 MB  |
     *        |       1,500,000  |       210 MB  |
     *        |       2,250,000  |       315 MB  |
     *        |       3,000,000  |       420 MB  |
     *        |       4,500,000  |       630 MB  |
     *        ------------------------------------
     *
     *        ⁺ All characters were 1 byte long
     *
     * This gives a linear graph where each 1-byte character requires about 150 bytes to be stored.
     * Given that some characters can take up to 4 bytes, we need 600 bytes per character to be safe.
     * Also, there is on average about 20 characters per cell (this is entirely empirical data...).
     *
     * This means that in order to store one shared string in memory, the memory amount needed is:
     *   => 20 * 600 ≈ 12KB
     */
    const AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB = 12;

    /**
     * To avoid running out of memory when extracting a huge number of shared strings, they can be saved to temporary files
     * instead of in memory. Then, when accessing a string, the corresponding file contents will be loaded in memory
     * and the string will be quickly retrieved.
     * The performance bottleneck is not when creating these temporary files, but rather when loading their content.
     * Because the contents of the last loaded file stays in memory until another file needs to be loaded, it works
     * best when the indexes of the shared strings are sorted in the sheet data.
     * 10,000 was chosen because it creates small files that are fast to be loaded in memory.
     */
    const MAX_NUM_STRINGS_PER_TEMP_FILE = 10000;

    /** @var CachingStrategyFactory|null Singleton instance */
    protected static $instance = null;

    /**
     * Private constructor for singleton
     */
    private function __construct()
    {
    }

    /**
     * Returns the singleton instance of the factory
     *
     * @return CachingStrategyFactory
     */
    public static function getInstance()
    {
        if (self::$instance === null) {
            self::$instance = new CachingStrategyFactory();
        }

        return self::$instance;
    }

    /**
     * Returns the best caching strategy, given the number of unique shared strings
     * and the amount of memory available.
     *
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @return CachingStrategyInterface The best caching strategy
     */
    public function getBestCachingStrategy($sharedStringsUniqueCount, $tempFolder = null)
    {
        if ($this->isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)) {
            return new InMemoryStrategy($sharedStringsUniqueCount);
        } else {
            return new FileBasedStrategy($tempFolder, self::MAX_NUM_STRINGS_PER_TEMP_FILE);
        }
    }

    /**
     * Returns whether it is safe to use in-memory caching, given the number of unique shared strings
     * and the amount of memory available.
     *
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     * @return bool
     */
    protected function isInMemoryStrategyUsageSafe($sharedStringsUniqueCount)
    {
        $memoryAvailable = $this->getMemoryLimitInKB();

        if ($memoryAvailable === -1) {
            // if cannot get memory limit or if memory limit set as unlimited, don't trust and play safe
            return ($sharedStringsUniqueCount < self::MAX_NUM_STRINGS_PER_TEMP_FILE);
        } else {
            $memoryNeeded = $sharedStringsUniqueCount * self::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB;
            return ($memoryAvailable > $memoryNeeded);
        }
    }

    /**
     * Returns the PHP "memory_limit" in Kilobytes
     *
     * @return int|float The limit in KB, or -1 when there is no limit or when it cannot be parsed
     */
    protected function getMemoryLimitInKB()
    {
        $memoryLimitFormatted = $this->getMemoryLimitFromIni();
        $memoryLimitFormatted = strtolower(trim($memoryLimitFormatted));

        // No memory limit
        if ($memoryLimitFormatted === '-1') {
            return -1;
        }

        // A value without any unit is an amount of bytes (for instance "134217728")
        if (preg_match('/^\d+$/', $memoryLimitFormatted)) {
            return (intval($memoryLimitFormatted) / 1024);
        }

        if (preg_match('/(\d+)([bkmgt])b?/', $memoryLimitFormatted, $matches)) {
            $amount = intval($matches[1]);
            $unit = $matches[2];

            switch ($unit) {
                case 'b': return ($amount / 1024);
                case 'k': return $amount;
                case 'm': return ($amount * 1024);
                case 'g': return ($amount * 1024 * 1024);
                case 't': return ($amount * 1024 * 1024 * 1024);
            }
        }

        return -1;
    }

    /**
     * Returns the formatted "memory_limit" value
     *
     * @return string
     */
    protected function getMemoryLimitFromIni()
    {
        return ini_get('memory_limit');
    }
}

View File

@ -0,0 +1,44 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper\SharedStringsCaching;
/**
* Interface CachingStrategyInterface
* Contract of a shared strings cache: strings are added by index,
* the cache is closed, strings are then retrieved by index and the cache
* is finally cleared.
*
* @package Box\Spout\Reader\XLSX\Helper\SharedStringsCaching
*/
interface CachingStrategyInterface
{
/**
* Adds the given string to the cache.
*
* @param string $sharedString The string to be added to the cache
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
* @return void
*/
public function addStringForIndex($sharedString, $sharedStringIndex);
/**
* Closes the cache after the last shared string was added.
* This prevents any additional string from being added to the cache.
*
* @return void
*/
public function closeCache();
/**
* Returns the string located at the given index from the cache.
*
* @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
* @return string The shared string at the given index
* @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
*/
public function getStringAtIndex($sharedStringIndex);
/**
* Destroys the cache, freeing memory and removing any created artifacts
*
* @return void
*/
public function clearCache();
}

View File

@ -0,0 +1,188 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper\SharedStringsCaching;
use Box\Spout\Common\Helper\FileSystemHelper;
use Box\Spout\Common\Helper\GlobalFunctionsHelper;
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
/**
 * Class FileBasedStrategy
 *
 * This class implements the file-based caching strategy for shared strings.
 * Shared strings are stored in small files (with a max number of strings per file).
 * This strategy is slower than an in-memory strategy but is used to avoid out of memory crashes.
 *
 * Each temp file contains one shared string per line, in index order.
 *
 * @package Box\Spout\Reader\XLSX\Helper\SharedStringsCaching
 */
class FileBasedStrategy implements CachingStrategyInterface
{
    /** Value to use to escape the line feed character ("\n") */
    const ESCAPED_LINE_FEED_CHARACTER = '_x000A_';

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var \Box\Spout\Common\Helper\FileSystemHelper Helper to perform file system operations */
    protected $fileSystemHelper;

    /** @var string Folder created by this strategy to hold the temporary files */
    protected $tempFolder;

    /**
     * @var int Maximum number of strings that can be stored in one temp file
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $maxNumStringsPerTempFile;

    /** @var resource|null Pointer to the last temp file a shared string was written to */
    protected $tempFilePointer;

    /**
     * @var string|null Path of the temporary file whose contents is currently stored in memory
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $inMemoryTempFilePath;

    /**
     * @var array|null Lines (escaped shared strings) of the temporary file that was last read
     * @see CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE
     */
    protected $inMemoryTempFileContents;

    /**
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @param int $maxNumStringsPerTempFile Maximum number of strings that can be stored in one temp file
     */
    public function __construct($tempFolder, $maxNumStringsPerTempFile)
    {
        // Fall back to the system temp folder when no folder is given
        $rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
        $this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
        $this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));

        $this->maxNumStringsPerTempFile = $maxNumStringsPerTempFile;

        $this->globalFunctionsHelper = new GlobalFunctionsHelper();
        $this->tempFilePointer = null;
    }

    /**
     * Adds the given string to the cache.
     * A new temp file is started as soon as the file that should contain
     * the given index does not exist yet.
     *
     * @param string $sharedString The string to be added to the cache
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return void
     */
    public function addStringForIndex($sharedString, $sharedStringIndex)
    {
        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);

        if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) {
            if ($this->tempFilePointer) {
                $this->globalFunctionsHelper->fclose($this->tempFilePointer);
            }
            $this->tempFilePointer = $this->globalFunctionsHelper->fopen($tempFilePath, 'w');
        }

        // The shared string retrieval logic expects each cell data to be on one line only
        // Encoding the line feed character allows to preserve this assumption
        $lineFeedEncodedSharedString = $this->escapeLineFeed($sharedString);

        $this->globalFunctionsHelper->fwrite($this->tempFilePointer, $lineFeedEncodedSharedString . PHP_EOL);
    }

    /**
     * Returns the path for the temp file that should contain the string for the given index
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The temp file path for the given index
     */
    protected function getSharedStringTempFilePath($sharedStringIndex)
    {
        $numTempFile = intval($sharedStringIndex / $this->maxNumStringsPerTempFile);
        return $this->tempFolder . '/sharedstrings' . $numTempFile;
    }

    /**
     * Closes the cache after the last shared string was added.
     * This prevents any additional string from being added to the cache.
     *
     * @return void
     */
    public function closeCache()
    {
        // close pointer to the last temp file that was written
        if ($this->tempFilePointer) {
            $this->globalFunctionsHelper->fclose($this->tempFilePointer);

            // forget the closed pointer, so that closing twice does not call fclose() on it again
            $this->tempFilePointer = null;
        }
    }

    /**
     * Returns the string located at the given index from the cache.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        $tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
        $indexInFile = $sharedStringIndex % $this->maxNumStringsPerTempFile;

        if (!$this->globalFunctionsHelper->file_exists($tempFilePath)) {
            throw new SharedStringNotFoundException("Shared string temp file not found: $tempFilePath ; for index: $sharedStringIndex");
        }

        if ($this->inMemoryTempFilePath !== $tempFilePath) {
            // free memory before loading the contents of another file
            $this->inMemoryTempFileContents = null;

            $lines = explode(PHP_EOL, $this->globalFunctionsHelper->file_get_contents($tempFilePath));

            // Every stored string is followed by PHP_EOL, so the chunk after the last
            // separator is not a shared string. Dropping it prevents the index right
            // after the last string from being reported as an empty string.
            if (end($lines) === '') {
                array_pop($lines);
            }

            $this->inMemoryTempFileContents = $lines;
            $this->inMemoryTempFilePath = $tempFilePath;
        }

        if (!array_key_exists($indexInFile, $this->inMemoryTempFileContents)) {
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        // The line separators were already removed by explode(). The unescaped string
        // must not be trimmed: trailing line feeds are part of the shared string.
        return $this->unescapeLineFeed($this->inMemoryTempFileContents[$indexInFile]);
    }

    /**
     * Escapes the line feed characters (\n)
     *
     * @param string $unescapedString
     * @return string
     */
    private function escapeLineFeed($unescapedString)
    {
        return str_replace("\n", self::ESCAPED_LINE_FEED_CHARACTER, $unescapedString);
    }

    /**
     * Unescapes the line feed characters (\n)
     *
     * @param string $escapedString
     * @return string
     */
    private function unescapeLineFeed($escapedString)
    {
        return str_replace(self::ESCAPED_LINE_FEED_CHARACTER, "\n", $escapedString);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts
     *
     * @return void
     */
    public function clearCache()
    {
        if ($this->tempFolder) {
            $this->fileSystemHelper->deleteFolderRecursively($this->tempFolder);
        }
    }
}

View File

@ -0,0 +1,83 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper\SharedStringsCaching;
use Box\Spout\Reader\Exception\SharedStringNotFoundException;
/**
 * Class InMemoryStrategy
 *
 * This class implements the in-memory caching strategy for shared strings.
 * This strategy is used when the number of unique strings is low, compared to the memory available.
 *
 * @package Box\Spout\Reader\XLSX\Helper\SharedStringsCaching
 */
class InMemoryStrategy implements CachingStrategyInterface
{
    /** @var \SplFixedArray Array used to cache the shared strings */
    protected $inMemoryCache;

    /** @var bool Whether the cache has been closed */
    protected $isCacheClosed;

    /**
     * @param int $sharedStringsUniqueCount Number of unique shared strings
     */
    public function __construct($sharedStringsUniqueCount)
    {
        // The cache is sized once and for all from the number of unique shared strings
        $this->inMemoryCache = new \SplFixedArray($sharedStringsUniqueCount);
        $this->isCacheClosed = false;
    }

    /**
     * Adds the given string to the cache.
     * The string is silently ignored once the cache has been closed.
     *
     * @param string $sharedString The string to be added to the cache
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return void
     */
    public function addStringForIndex($sharedString, $sharedStringIndex)
    {
        if (!$this->isCacheClosed) {
            $this->inMemoryCache->offsetSet($sharedStringIndex, $sharedString);
        }
    }

    /**
     * Closes the cache after the last shared string was added.
     * This prevents any additional string from being added to the cache.
     *
     * @return void
     */
    public function closeCache()
    {
        $this->isCacheClosed = true;
    }

    /**
     * Returns the string located at the given index from the cache.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        try {
            return $this->inMemoryCache->offsetGet($sharedStringIndex);
        } catch (\RuntimeException $e) {
            // The fixed array throws when the index is outside of its bounds
            throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts
     *
     * @return void
     */
    public function clearCache()
    {
        // Swap the cache for an empty one instead of unsetting the property: the strings
        // are released all the same, but the object stays usable. Asking for a string
        // afterwards throws SharedStringNotFoundException instead of failing with a
        // "call to a member function on null" error.
        $this->inMemoryCache = new \SplFixedArray(0);
        $this->isCacheClosed = false;
    }
}

View File

@ -0,0 +1,280 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;
/**
* Class SharedStringsHelper
* This class provides helper functions for reading sharedStrings XML file
*
* @package Box\Spout\Reader\XLSX\Helper
*/
class SharedStringsHelper
{
/** Path of sharedStrings XML file inside the XLSX file */
const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml';
/** Main namespace for the sharedStrings.xml file */
const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
/** @var string Path of the XLSX file being read */
protected $filePath;
/** @var string Temporary folder where the temporary files to store shared strings will be stored */
protected $tempFolder;
/** @var CachingStrategyInterface The best caching strategy for storing shared strings */
protected $cachingStrategy;
/**
 * Remembers where the XLSX file is and which folder may be used for temporary files.
 *
 * @param string $filePath Path of the XLSX file being read
 * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
 */
public function __construct($filePath, $tempFolder = null)
{
    $this->tempFolder = $tempFolder;
    $this->filePath = $filePath;
}
/**
 * Tells whether the XLSX archive embeds a "xl/sharedStrings.xml" entry.
 * An archive that cannot be opened is reported as having no shared strings.
 *
 * @return bool
 */
public function hasSharedStrings()
{
    $archive = new \ZipArchive();

    if ($archive->open($this->filePath) !== true) {
        return false;
    }

    $entryIndex = $archive->locateName(self::SHARED_STRINGS_XML_FILE_PATH);
    $archive->close();

    return ($entryIndex !== false);
}
/**
 * Reads all the shared strings of the XLSX file and stores them using the best caching strategy.
 * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
 * The sheet data then refers to them through their index (position of the "si" node in that file).
 *
 * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
 *
 * The XML file can be really big with sheets containing a lot of data. That is why
 * the file is streamed with XMLReader. Each "si" node is then converted to a SimpleXMLElement,
 * which is faster and more handy to parse a few XML nodes.
 *
 * @return void
 * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
 */
public function extractSharedStrings()
{
    $xmlReader = new \XMLReader();
    $sharedStringIndex = 0;
    $escaper = new \Box\Spout\Common\Escaper\XLSX();

    $sharedStringsFilePath = $this->getSharedStringsFilePath();
    if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) {
        throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
    }

    try {
        $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
        $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

        while ($xmlReader->read() && $xmlReader->name !== 'si') {
            // do nothing until a 'si' tag is reached
        }

        while ($xmlReader->name === 'si') {
            $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
            $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);

            // removes nodes that should not be read, like the pronunciation of the Kanji characters
            $cleanNode = $this->removeSuperfluousTextNodes($node);

            // find all text nodes 't'; there can be multiple if the cell contains formatting
            $textNodes = $cleanNode->xpath('//ns:t');

            $textValue = '';
            foreach ($textNodes as $textNode) {
                if ($this->shouldPreserveWhitespace($textNode)) {
                    $textValue .= $textNode->__toString();
                } else {
                    $textValue .= trim($textNode->__toString());
                }
            }

            $unescapedTextValue = $escaper->unescape($textValue);
            $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);

            $sharedStringIndex++;

            // jump to the next 'si' tag
            $xmlReader->next('si');
        }

        $this->cachingStrategy->closeCache();
    } catch (\Exception $exception) {
        // do not leak the stream opened on the zipped XML file when the extraction fails
        $xmlReader->close();
        throw $exception;
    }

    $xmlReader->close();
}
/**
 * Builds the "zip://" stream path pointing at the shared strings XML file inside the XLSX file.
 *
 * @return string The path to the shared strings XML file
 */
protected function getSharedStringsFilePath()
{
    $archivePath = $this->filePath;
    $entryPath = self::SHARED_STRINGS_XML_FILE_PATH;

    return "zip://{$archivePath}#{$entryPath}";
}
/**
 * Returns the shared strings unique count, as specified in <sst> tag.
 * The given reader is moved forward to the "sst" element.
 *
 * The libxml error handling mode that was active before the call is always restored,
 * whether the count could be read or not.
 *
 * @param \XMLReader $xmlReader XMLReader instance
 * @return int Number of unique shared strings in the sharedStrings.xml file (0 if the attribute is missing)
 * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
 */
protected function getSharedStringsUniqueCount($xmlReader)
{
    // Use internal errors to avoid displaying lots of warning messages in case of invalid file
    // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
    libxml_clear_errors();
    $previousUseInternalErrors = libxml_use_internal_errors(true);

    $xmlReader->next('sst');

    // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
    while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== \XMLReader::ELEMENT) {
        $xmlReader->read();
    }

    // the error must be fetched before the mode is restored, as disabling internal errors clears them
    $readError = libxml_get_last_error();
    libxml_use_internal_errors($previousUseInternalErrors);

    if ($readError !== false) {
        throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$readError->message}]");
    }

    return intval($xmlReader->getAttribute('uniqueCount'));
}
/**
 * Asks the caching strategy factory for the strategy best suited to the given number of strings.
 * The temporary folder configured on this helper is forwarded to the factory.
 *
 * @param int $sharedStringsUniqueCount
 * @return CachingStrategyInterface
 */
protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
{
    $strategyFactory = CachingStrategyFactory::getInstance();
    $bestStrategy = $strategyFactory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);

    return $bestStrategy;
}
/**
 * Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
 * This is to simplify the parsing of the subtree.
 *
 * The libxml error handling mode that was active before the call is always restored.
 *
 * @param \XMLReader $xmlReader
 * @return \SimpleXMLElement
 * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
 */
protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
{
    // Use internal errors to avoid displaying lots of warning messages in case of error found in the XML node.
    // For instance, if the file is used to perform a "Billion Laughs" or "Quadratic Blowup" attacks
    libxml_clear_errors();
    $previousUseInternalErrors = libxml_use_internal_errors(true);

    try {
        $node = new \SimpleXMLElement($xmlReader->readOuterXml());
    } catch (\Exception $exception) {
        $error = libxml_get_last_error();
        libxml_use_internal_errors($previousUseInternalErrors);

        // libxml may not have recorded anything: fall back on the message of the caught exception
        $errorMessage = ($error !== false) ? $error->message : $exception->getMessage();
        throw new IOException('The sharedStrings.xml file contains unreadable data [' . trim($errorMessage) . '].');
    }

    libxml_use_internal_errors($previousUseInternalErrors);

    return $node;
}
/**
 * Strips from the given node the child nodes whose text must not end up in the read string,
 * like the pronunciation of the Kanji characters.
 * The "ns" XPath prefix is used for the lookup, as registered by the caller.
 *
 * @param \SimpleXMLElement $parentNode Parent node that may contain nodes to remove
 * @return \SimpleXMLElement Cleaned parent node
 */
protected function removeSuperfluousTextNodes($parentNode)
{
    // 'rPh' holds the pronunciation of the text
    $superfluousTagNames = ['rPh'];

    foreach ($superfluousTagNames as $tagName) {
        $matchingNodes = $parentNode->xpath('//ns:' . $tagName);

        foreach ($matchingNodes as $matchingNode) {
            // SimpleXML idiom: unsetting the element's own first offset detaches it from the document
            unset($matchingNode[0]);
        }
    }

    return $parentNode;
}
/**
 * Tells whether the given text node carries the attribute 'xml:space="preserve"',
 * in which case its whitespace must be kept as is.
 *
 * @param \SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved
 * @return bool Whether whitespace should be preserved
 */
protected function shouldPreserveWhitespace($textNode)
{
    // only the attributes living in the "xml" prefix are of interest
    $xmlAttributes = $textNode->attributes('xml', true);

    if (!$xmlAttributes) {
        return false;
    }

    foreach ($xmlAttributes as $name => $value) {
        if ($name === 'space' && $value->__toString() === 'preserve') {
            return true;
        }
    }

    return false;
}
/**
 * Returns the shared string at the given index, using the previously chosen caching strategy.
 *
 * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 * @return string The shared string at the given index
 * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
 */
public function getStringAtIndex($sharedStringIndex)
{
    // No strategy exists when the shared strings were never extracted (e.g. the XLSX file has no
    // sharedStrings.xml): report the string as missing instead of failing on a null object.
    if (!$this->cachingStrategy) {
        throw new \Box\Spout\Reader\Exception\SharedStringNotFoundException(
            "Shared string not found for index: $sharedStringIndex"
        );
    }

    return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
}
/**
 * Clears the cache of the caching strategy, if one was chosen.
 * Does nothing when no strategy exists yet.
 *
 * @return void
 */
public function cleanup()
{
    if (!$this->cachingStrategy) {
        return;
    }

    $this->cachingStrategy->clearCache();
}
}

View File

@ -0,0 +1,199 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Reader\XLSX\Sheet;
/**
* Class SheetHelper
* This class provides helper functions related to XLSX sheets
*
* @package Box\Spout\Reader\XLSX\Helper
*/
class SheetHelper
{
/** Extension for XML files */
const XML_EXTENSION = '.xml';
/** Paths of XML files relative to the XLSX file root */
const CONTENT_TYPES_XML_FILE_PATH = '[Content_Types].xml';
const WORKBOOK_XML_RELS_FILE_PATH = 'xl/_rels/workbook.xml.rels';
const WORKBOOK_XML_FILE_PATH = 'xl/workbook.xml';
/** Namespaces for the XML files */
const MAIN_NAMESPACE_FOR_CONTENT_TYPES_XML = 'http://schemas.openxmlformats.org/package/2006/content-types';
const MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS = 'http://schemas.openxmlformats.org/package/2006/relationships';
const MAIN_NAMESPACE_FOR_WORKBOOK_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
/** Value of the Override attribute used in [Content_Types].xml to define sheets */
const OVERRIDE_CONTENT_TYPES_ATTRIBUTE = 'application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml';
/** @var string Path of the XLSX file being read */
protected $filePath;
/** @var \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper Helper to work with shared strings */
protected $sharedStringsHelper;
/** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
protected $globalFunctionsHelper;
/** @var \SimpleXMLElement XML element representing the workbook.xml.rels file */
protected $workbookXMLRelsAsXMLElement;
/** @var \SimpleXMLElement XML element representing the workbook.xml file */
protected $workbookXMLAsXMLElement;
/**
 * Stores the collaborators needed to later read the sheets' metadata.
 *
 * @param string $filePath Path of the XLSX file being read
 * @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper $sharedStringsHelper Helper to work with shared strings
 * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper Helper to work with global functions
 */
public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper)
{
    $this->globalFunctionsHelper = $globalFunctionsHelper;
    $this->sharedStringsHelper = $sharedStringsHelper;
    $this->filePath = $filePath;
}
/**
 * Builds the list of sheets contained in the XLSX file located at the previously given file path.
 * The paths to the sheets' data are read from the [Content_Types].xml file; the position of
 * each path in that file is used as the (zero-based) index of the sheet.
 *
 * @return Sheet[] Sheets within the XLSX file
 */
public function getSheets()
{
    $contentTypesXML = $this->getFileAsXMLElementWithNamespace(
        self::CONTENT_TYPES_XML_FILE_PATH,
        self::MAIN_NAMESPACE_FOR_CONTENT_TYPES_XML
    );

    // every "Override" node having the worksheet content type defines a sheet
    $sheetNodesXPath = '//ns:Override[@ContentType="' . self::OVERRIDE_CONTENT_TYPES_ATTRIBUTE . '"]';
    $sheetNodes = $contentTypesXML->xpath($sheetNodesXPath);

    $sheets = [];
    foreach ($sheetNodes as $sheetIndex => $sheetNode) {
        $partName = (string) $sheetNode->attributes()->PartName;
        $sheets[] = $this->getSheetFromXML($partName, $sheetIndex);
    }

    return $sheets;
}
/**
 * Returns an instance of a sheet, given the path of its data XML file.
 * We first look at "xl/_rels/workbook.xml.rels" to find the relationship ID of the sheet.
 * Then we look at "xl/workbook.xml" to find the sheet entry associated to the found ID.
 * The entry contains the ID and name of the sheet.
 *
 * If this piece of data can't be found by parsing the different XML files, the ID will default
 * to the sheet index, based on order in [Content_Types].xml. Similarly, the sheet's name will
 * default to the data sheet XML file name ("xl/worksheets/sheet2.xml" => "sheet2").
 *
 * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
 * @param int $sheetIndexZeroBased Index of the sheet, based on order in [Content_Types].xml (zero-based)
 * @return \Box\Spout\Reader\XLSX\Sheet Sheet instance
 */
protected function getSheetFromXML($sheetDataXMLFilePath, $sheetIndexZeroBased)
{
    // default values, used when the workbook XML files do not describe the sheet
    $sheetId = $sheetIndexZeroBased + 1;
    $sheetName = $this->getDefaultSheetName($sheetDataXMLFilePath);

    /*
     * In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml"
     * In workbook.xml.rels, it is only "worksheets/sheet1.xml"
     *
     * Only the leading "xl/" folder is removed. ltrim() must not be used here: its second
     * argument is a list of characters, so it would also eat the beginning of any name
     * starting with "x", "l" or "/".
     */
    $sheetDataXMLFilePathInWorkbookXMLRels = preg_replace('#^/?xl/#', '', $sheetDataXMLFilePath);

    // find the node associated to the given file path
    $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement();
    $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]');

    if (count($relationshipNodes) === 1) {
        $relationshipNode = $relationshipNodes[0];
        // kept in its own variable so that it never leaks out as the sheet ID
        $relationshipId = (string) $relationshipNode->attributes()->Id;

        $workbookXMLElement = $this->getWorkbookXMLAsXMLElement();
        $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $relationshipId . '"]');

        if (count($sheetNodes) === 1) {
            $sheetNode = $sheetNodes[0];
            $sheetId = (int) $sheetNode->attributes()->sheetId;
            $escapedSheetName = (string) $sheetNode->attributes()->name;

            $escaper = new \Box\Spout\Common\Escaper\XLSX();
            $sheetName = $escaper->unescape($escapedSheetName);
        }
    }

    return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $sheetId, $sheetIndexZeroBased, $sheetName);
}
/**
 * Computes the name given by default to the sheet whose data is located at the given path:
 * the base name of the data file, without its ".xml" extension.
 *
 * @param string $sheetDataXMLFilePath Path of the sheet data XML file
 * @return string The default sheet name
 */
protected function getDefaultSheetName($sheetDataXMLFilePath)
{
    $defaultSheetName = $this->globalFunctionsHelper->basename($sheetDataXMLFilePath, self::XML_EXTENSION);

    return $defaultSheetName;
}
/**
 * Gives access to the parsed workbook.xml.rels file.
 * The file is loaded on first use, then the same element is returned on later calls.
 *
 * @return \SimpleXMLElement XML element representing the workbook.xml.rels file
 */
protected function getWorkbookXMLRelsAsXMLElement()
{
    if ($this->workbookXMLRelsAsXMLElement) {
        return $this->workbookXMLRelsAsXMLElement;
    }

    $this->workbookXMLRelsAsXMLElement = $this->getFileAsXMLElementWithNamespace(
        self::WORKBOOK_XML_RELS_FILE_PATH,
        self::MAIN_NAMESPACE_FOR_WORKBOOK_XML_RELS
    );

    return $this->workbookXMLRelsAsXMLElement;
}
/**
 * Gives access to the parsed workbook.xml file.
 * The file is loaded on first use, then the same element is returned on later calls.
 *
 * @return \SimpleXMLElement XML element representing the workbook.xml file
 */
protected function getWorkbookXMLAsXMLElement()
{
    if ($this->workbookXMLAsXMLElement) {
        return $this->workbookXMLAsXMLElement;
    }

    $this->workbookXMLAsXMLElement = $this->getFileAsXMLElementWithNamespace(
        self::WORKBOOK_XML_FILE_PATH,
        self::MAIN_NAMESPACE_FOR_WORKBOOK_XML
    );

    return $this->workbookXMLAsXMLElement;
}
/**
 * Loads the contents of the given file in an XML parser and register the given XPath namespace.
 * The namespace is registered under the "ns" prefix.
 *
 * @param string $xmlFilePath The path of the XML file inside the XLSX file
 * @param string $mainNamespace The main XPath namespace to register
 * @return \SimpleXMLElement The XML element representing the file
 * @throws \Box\Spout\Common\Exception\IOException If the XML file cannot be read from the XLSX file
 */
protected function getFileAsXMLElementWithNamespace($xmlFilePath, $mainNamespace)
{
    $xmlContents = $this->globalFunctionsHelper->file_get_contents('zip://' . $this->filePath . '#' . $xmlFilePath);

    // a missing or unreadable entry must not be handed over to the XML parser
    if ($xmlContents === false) {
        throw new \Box\Spout\Common\Exception\IOException('Could not read "' . $xmlFilePath . '".');
    }

    // LIBXML_NONET: same protection against network access as for the other XML files being read
    $xmlElement = new \SimpleXMLElement($xmlContents, LIBXML_NONET);
    $xmlElement->registerXPathNamespace('ns', $mainNamespace);

    return $xmlElement;
}
}

View File

@ -0,0 +1,93 @@
<?php
namespace Box\Spout\Reader\XLSX;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\AbstractReader2;
use Box\Spout\Reader\XLSX\Helper\SharedStringsHelper;
/**
* Class Reader
* This class provides support to read data from a XLSX file
*
* @package Box\Spout\Reader\XLSX
*/
class Reader extends AbstractReader2
{
    /** @var string Temporary folder where the temporary files will be created */
    protected $tempFolder;

    /** @var \ZipArchive|null Handle on the XLSX archive (null when no archive is currently open) */
    protected $zip;

    /** @var \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper Helper to work with shared strings */
    protected $sharedStringsHelper;

    /** @var SheetIterator To iterate over the XLSX sheets */
    protected $sheetIterator;

    /**
     * @param string $tempFolder Temporary folder where the temporary files will be created
     * @return Reader
     */
    public function setTempFolder($tempFolder)
    {
        $this->tempFolder = $tempFolder;
        return $this;
    }

    /**
     * Opens the file at the given file path to make it ready to be read.
     * It also parses the sharedStrings.xml file to get all the shared strings available in memory
     * and fetches all the available sheets.
     *
     * Everything that was opened or created is released again if any of these steps fails.
     *
     * @param string $filePath Path of the file to be read
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If the file at the given path or its content cannot be read
     * @throws \Box\Spout\Reader\Exception\NoSheetsFoundException If there are no sheets in the file
     */
    protected function openReader($filePath)
    {
        $this->zip = new \ZipArchive();

        if ($this->zip->open($filePath) !== true) {
            // drop the unusable handle, so that closeReader() never tries to close it
            $this->zip = null;
            throw new IOException('Could not open ' . $filePath . ' for reading.');
        }

        try {
            $this->sharedStringsHelper = new SharedStringsHelper($filePath, $this->tempFolder);

            if ($this->sharedStringsHelper->hasSharedStrings()) {
                // Extracts all the strings from the sheets for easy access in the future
                $this->sharedStringsHelper->extractSharedStrings();
            }

            $this->sheetIterator = new SheetIterator($filePath, $this->sharedStringsHelper, $this->globalFunctionsHelper);
        } catch (\Exception $exception) {
            // the reader is unusable: release the archive and the shared strings cache right away
            $this->closeReader();
            throw $exception;
        }
    }

    /**
     * Returns an iterator to iterate over sheets.
     *
     * @return SheetIterator To iterate over sheets
     */
    public function getSheetIterator()
    {
        return $this->sheetIterator;
    }

    /**
     * Closes the reader. To be used after reading the file.
     *
     * @return void
     */
    protected function closeReader()
    {
        if ($this->zip) {
            $this->zip->close();
            // forget the handle, so that closing twice does not act on an already closed archive
            $this->zip = null;
        }

        if ($this->sharedStringsHelper) {
            $this->sharedStringsHelper->cleanup();
        }
    }
}

View File

@ -0,0 +1,356 @@
<?php
namespace Box\Spout\Reader\XLSX;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Reader\XLSX\Helper\CellHelper;
/**
* Class RowIterator
*
* @package Box\Spout\Reader\XLSX
*/
class RowIterator implements IteratorInterface
{
/** Definition of all possible cell types */
const CELL_TYPE_INLINE_STRING = 'inlineStr';
const CELL_TYPE_STR = 'str';
const CELL_TYPE_SHARED_STRING = 's';
const CELL_TYPE_BOOLEAN = 'b';
const CELL_TYPE_NUMERIC = 'n';
const CELL_TYPE_DATE = 'd';
const CELL_TYPE_ERROR = 'e';
/** Definition of XML nodes names used to parse data */
const XML_NODE_DIMENSION = 'dimension';
const XML_NODE_WORKSHEET = 'worksheet';
const XML_NODE_ROW = 'row';
const XML_NODE_CELL = 'c';
const XML_NODE_VALUE = 'v';
const XML_NODE_INLINE_STRING_VALUE = 't';
/** Definition of XML attributes used to parse data */
const XML_ATTRIBUTE_REF = 'ref';
const XML_ATTRIBUTE_SPANS = 'spans';
const XML_ATTRIBUTE_CELL_INDEX = 'r';
const XML_ATTRIBUTE_TYPE = 't';
/** @var string Path of the XLSX file being read */
protected $filePath;
/** @var string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml */
protected $sheetDataXMLFilePath;
/** @var Helper\SharedStringsHelper Helper to work with shared strings */
protected $sharedStringsHelper;
/** @var \XMLReader The XMLReader object that will help read sheet's XML data */
protected $xmlReader;
/** @var \Box\Spout\Common\Escaper\XLSX Used to unescape XML data */
protected $escaper;
/** @var int Number of read rows */
protected $numReadRows = 0;
/** @var array|null Buffer used to store the row data, while checking if there are more rows to read */
protected $rowDataBuffer = null;
/** @var bool Indicates whether all rows have been read */
protected $hasReachedEndOfFile = false;
/** @var int The number of columns the sheet has (0 meaning undefined) */
protected $numColumns = 0;
/**
 * Prepares everything needed to read the rows of a sheet; no file is opened at this point.
 *
 * @param string $filePath Path of the XLSX file being read
 * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
 * @param Helper\SharedStringsHelper $sharedStringsHelper Helper to work with shared strings
 */
public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper)
{
    $normalizedSheetDataPath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath);

    $this->filePath = $filePath;
    $this->sheetDataXMLFilePath = $normalizedSheetDataPath;
    $this->sharedStringsHelper = $sharedStringsHelper;

    $this->xmlReader = new \XMLReader();
    $this->escaper = new \Box\Spout\Common\Escaper\XLSX();
}
/**
 * Removes the leading slash(es) of the given path, to make it relative to the root of the XLSX file.
 *
 * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
 * @return string Path of the XML file containing the sheet data,
 *                without the leading slash.
 */
protected function normalizeSheetDataXMLFilePath($sheetDataXMLFilePath)
{
    $leadingCharactersToRemove = '/';
    $normalizedPath = ltrim($sheetDataXMLFilePath, $leadingCharactersToRemove);

    return $normalizedPath;
}
/**
 * Rewind the Iterator to the first element.
 * The XMLReader is closed then opened again on the sheet data (with the LIBXML_NONET option),
 * the reading state is reset and the first row is loaded.
 * @link http://php.net/manual/en/iterator.rewind.php
 *
 * @return void
 * @throws \Box\Spout\Common\Exception\IOException If the sheet data XML cannot be read
 */
public function rewind()
{
    // start over from a clean reader
    $this->xmlReader->close();

    $sheetDataFilePath = sprintf('zip://%s#%s', $this->filePath, $this->sheetDataXMLFilePath);
    $isOpened = $this->xmlReader->open($sheetDataFilePath, null, LIBXML_NONET);
    if ($isOpened === false) {
        throw new IOException('Could not open "' . $this->sheetDataXMLFilePath . '".');
    }

    // reset everything that was learnt while reading the sheet previously
    $this->numColumns = 0;
    $this->hasReachedEndOfFile = false;
    $this->rowDataBuffer = null;
    $this->numReadRows = 0;

    // load the first row
    $this->next();
}
/**
 * Checks if current position is valid, that is to say if the end of the file has not been reached yet.
 * @link http://php.net/manual/en/iterator.valid.php
 *
 * @return boolean
 */
public function valid()
{
    $isEndOfFileReached = $this->hasReachedEndOfFile;

    return !$isEndOfFileReached;
}
/**
 * Move forward to next element. Empty rows will be skipped.
 * The XML is read until the end of the next row, whose data is stored in the buffer.
 * When no complete row can be read anymore, the end of the file is considered reached.
 * @link http://php.net/manual/en/iterator.next.php
 *
 * @return void
 * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
 */
public function next()
{
    $isInsideRowTag = false;
    $hasReadCompleteRow = false;
    $rowData = [];

    while ($this->xmlReader->read()) {
        if ($this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_DIMENSION) {
            // Read dimensions of the sheet
            $dimensionRef = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet)
            if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) {
                $lastCellIndex = $matches[1];
                $this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1;
            }
        } else if ($this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) {
            // Start of the row description
            $isInsideRowTag = true;

            // Read spans info if present
            $numberOfColumnsForRow = $this->numColumns;
            $spans = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance
            if ($spans) {
                list(, $numberOfColumnsForRow) = explode(':', $spans);
                $numberOfColumnsForRow = intval($numberOfColumnsForRow);
            }
            $rowData = ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : [];
        } else if ($isInsideRowTag && $this->xmlReader->nodeType == \XMLReader::ELEMENT && $this->xmlReader->name === self::XML_NODE_CELL) {
            // Start of a cell description
            $currentCellIndex = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX);
            $currentColumnIndex = CellHelper::getColumnIndexFromCellIndex($currentCellIndex);

            $node = $this->xmlReader->expand();
            $rowData[$currentColumnIndex] = $this->getCellValue($node);
        } else if ($this->xmlReader->nodeType == \XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_ROW) {
            // End of the row description
            // If needed, we fill the empty cells
            $rowData = ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData);
            $this->numReadRows++;
            $hasReadCompleteRow = true;
            break;
        } else if ($this->xmlReader->nodeType == \XMLReader::END_ELEMENT && $this->xmlReader->name === self::XML_NODE_WORKSHEET) {
            // The closing "</worksheet>" marks the end of the file
            $this->hasReachedEndOfFile = true;
        }
    }

    if (!$hasReadCompleteRow) {
        // The reader ran out of nodes before the end of a row: nothing is left to be read.
        // Without this, a sheet whose XML ends without a closing "</worksheet>" (truncated or
        // invalid data, self-closed root node) would be iterated over endlessly.
        $this->hasReachedEndOfFile = true;
    }

    $this->rowDataBuffer = $rowData;
}
/**
 * Reads the content of the first "v" node nested in the given cell node.
 * Cells without such a node yield an empty string.
 *
 * @param \DOMNode $node
 * @return string The value associated with the cell
 */
protected function getVNodeValue($node)
{
    $valueNodes = $node->getElementsByTagName(self::XML_NODE_VALUE);
    $firstValueNode = $valueNodes->item(0);

    return ($firstValueNode !== null) ? $firstValueNode->nodeValue : "";
}
/**
 * Returns the cell String value where string is inline.
 * The value is read from the first "t" node of the cell, then trimmed and unescaped.
 * A cell without any "t" node is treated as holding an empty value.
 *
 * @param \DOMNode $node
 * @return string The value associated with the cell
 */
protected function formatInlineStringCellValue($node)
{
    // inline strings are formatted this way:
    // <c r="A1" t="inlineStr"><is><t>[INLINE_STRING]</t></is></c>
    $tNode = $node->getElementsByTagName(self::XML_NODE_INLINE_STRING_VALUE)->item(0);

    // item(0) is null when the cell has no "t" node: do not read a property of a non-object
    $escapedCellValue = ($tNode !== null) ? trim($tNode->nodeValue) : '';
    $cellValue = $this->escaper->unescape($escapedCellValue);

    return $cellValue;
}
/**
 * Resolves the value of a cell referring to a shared string.
 * Such cells are formatted this way: <c r="A1" t="s"><v>[SHARED_STRING_INDEX]</v></c>
 * The string found at that index is unescaped before being returned.
 *
 * @param string $nodeValue Index of the shared string, as found in the "v" node
 * @return string The value associated with the cell
 */
protected function formatSharedStringCellValue($nodeValue)
{
    $indexInSharedStrings = intval($nodeValue);
    $sharedString = $this->sharedStringsHelper->getStringAtIndex($indexInSharedStrings);

    return $this->escaper->unescape($sharedString);
}
/**
 * Computes the value of a cell of type "str", whose string is stored in its value node:
 * the stored string is trimmed, then unescaped.
 *
 * @param string $nodeValue
 * @return string The value associated with the cell
 */
protected function formatStrCellValue($nodeValue)
{
    $trimmedValue = trim($nodeValue);

    return $this->escaper->unescape($trimmedValue);
}
/**
 * Returns the cell Numeric value from string of nodeValue.
 * Whole numbers that fit in a PHP integer are returned as int, all other numbers as float.
 *
 * @param string $nodeValue
 * @return int|float The value associated with the cell
 */
protected function formatNumericCellValue($nodeValue)
{
    // The node value is always a string, so is_int() can't be used to detect whole numbers.
    // Instead, the number is an integer if no information is lost when converting it to int.
    $intValue = intval($nodeValue);
    $floatValue = floatval($nodeValue);

    // numbers outside of the integer range must stay floats (their int conversion is meaningless)
    $fitsInInteger = ($floatValue > -PHP_INT_MAX && $floatValue < PHP_INT_MAX);

    return ($fitsInInteger && (float) $intValue === $floatValue) ? $intValue : $floatValue;
}
/**
 * Converts the value stored for a boolean cell into a PHP boolean,
 * following PHP's usual string to boolean conversion rules.
 *
 * @param string $nodeValue
 * @return bool The value associated with the cell
 */
protected function formatBooleanCellValue($nodeValue)
{
    return (bool) $nodeValue;
}
/**
 * Builds the PHP date matching the value stored for a date cell.
 * The DateTime constructor throws an exception on invalid date-time formats
 * (http://php.net/manual/en/datetime.construct.php): such values result in null.
 *
 * @param string $nodeValue
 * @return \DateTime|null The value associated with the cell (null when the stored date is invalid)
 */
protected function formatDateCellValue($nodeValue)
{
    $cellValue = null;

    try {
        $cellValue = new \DateTime($nodeValue);
    } catch (\Exception $invalidDateException) {
        // invalid date: null is returned
    }

    return $cellValue;
}
/**
 * Computes the PHP value of the cell described by the given XML node, based on the type of the cell.
 * Cells without type are considered numeric. Except for inline strings, a cell whose "v" node
 * is empty or missing yields an empty string. Cell types that are not handled yield null.
 *
 * @param \DOMNode $node
 * @return string|int|float|bool|\DateTime|null The value associated with the cell (null for cell types that are not handled, like errors)
 */
protected function getCellValue($node)
{
    $typeAttribute = $node->getAttribute(self::XML_ATTRIBUTE_TYPE);
    $cellType = $typeAttribute ?: self::CELL_TYPE_NUMERIC;
    $rawValue = $this->getVNodeValue($node);

    // inline strings do not store their value in a "v" node
    if ($cellType === self::CELL_TYPE_INLINE_STRING) {
        return $this->formatInlineStringCellValue($node);
    }

    if ($rawValue === '') {
        return $rawValue;
    }

    if ($cellType === self::CELL_TYPE_SHARED_STRING) {
        return $this->formatSharedStringCellValue($rawValue);
    }
    if ($cellType === self::CELL_TYPE_STR) {
        return $this->formatStrCellValue($rawValue);
    }
    if ($cellType === self::CELL_TYPE_BOOLEAN) {
        return $this->formatBooleanCellValue($rawValue);
    }
    if ($cellType === self::CELL_TYPE_NUMERIC) {
        return $this->formatNumericCellValue($rawValue);
    }
    if ($cellType === self::CELL_TYPE_DATE) {
        return $this->formatDateCellValue($rawValue);
    }

    return null;
}
/**
 * Return the current element: the data of the row that was last stored in the buffer.
 * @link http://php.net/manual/en/iterator.current.php
 *
 * @return array|null
 */
public function current()
{
    $bufferedRowData = $this->rowDataBuffer;

    return $bufferedRowData;
}
/**
 * Return the key of the current element: the number of rows read so far.
 * @link http://php.net/manual/en/iterator.key.php
 *
 * @return int
 */
public function key()
{
    $numberOfRowsReadSoFar = $this->numReadRows;

    return $numberOfRowsReadSoFar;
}
/**
 * Cleans up what was created to iterate over the object: the XMLReader is closed.
 *
 * @return void
 */
public function end()
{
    $sheetDataReader = $this->xmlReader;
    $sheetDataReader->close();
}
}

View File

@ -0,0 +1,74 @@
<?php
namespace Box\Spout\Reader\XLSX;
use Box\Spout\Reader\SheetInterface;
/**
* Class Sheet
* Represents a sheet within a XLSX file
*
* @package Box\Spout\Reader\XLSX
*/
class Sheet implements SheetInterface
{
    /** @var RowIterator Iterator over the rows of the sheet */
    protected $rowIterator;

    /** @var int Identifier of the sheet */
    protected $id;

    /** @var int Position of the sheet, based on order of creation (zero-based) */
    protected $index;

    /** @var string Name given to the sheet */
    protected $name;

    /**
     * Stores the sheet's metadata and creates the iterator that will read the sheet's rows.
     *
     * @param string $filePath Path of the XLSX file being read
     * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
     * @param Helper\SharedStringsHelper $sharedStringsHelper Helper to work with shared strings
     * @param int $sheetId ID of the sheet
     * @param int $sheetIndex Index of the sheet, based on order of creation (zero-based)
     * @param string $sheetName Name of the sheet
     */
    public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $sheetId, $sheetIndex, $sheetName)
    {
        $this->id = $sheetId;
        $this->index = $sheetIndex;
        $this->name = $sheetName;

        $this->rowIterator = new RowIterator($filePath, $sheetDataXMLFilePath, $sharedStringsHelper);
    }

    /**
     * @return RowIterator Iterator over the rows of the sheet
     */
    public function getRowIterator()
    {
        return $this->rowIterator;
    }

    /**
     * @return int ID of the sheet
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * @return int Index of the sheet, based on order of creation (zero-based)
     */
    public function getIndex()
    {
        return $this->index;
    }

    /**
     * @return string Name of the sheet
     */
    public function getName()
    {
        return $this->name;
    }
}

View File

@ -0,0 +1,112 @@
<?php
namespace Box\Spout\Reader\XLSX;
use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Reader\XLSX\Helper\SheetHelper;
use Box\Spout\Reader\Exception\NoSheetsFoundException;
/**
* Class SheetIterator
* Iterate over XLSX sheet.
*
* @package Box\Spout\Reader\XLSX
*/
class SheetIterator implements IteratorInterface
{
    /** @var Sheet[] All the sheets found in the file */
    protected $sheets;

    /** @var int Zero-based index of the sheet the iterator currently points at */
    protected $currentSheetIndex;

    /**
     * Fetches all the sheets of the file; the file must contain at least one sheet.
     *
     * @param string $filePath Path of the file to be read
     * @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper $sharedStringsHelper
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     * @throws \Box\Spout\Reader\Exception\NoSheetsFoundException If there are no sheets in the file
     */
    public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper)
    {
        $sheetHelper = new SheetHelper($filePath, $sharedStringsHelper, $globalFunctionsHelper);
        $availableSheets = $sheetHelper->getSheets();

        $this->sheets = $availableSheets;

        if (count($availableSheets) === 0) {
            throw new NoSheetsFoundException('The file must contain at least one sheet.');
        }
    }

    /**
     * Rewind the Iterator to the first element: the iterator points at the first sheet again.
     * @link http://php.net/manual/en/iterator.rewind.php
     *
     * @return void
     */
    public function rewind()
    {
        $this->currentSheetIndex = 0;
    }

    /**
     * Checks if current position is valid, that is to say if it is before the end of the sheets list.
     * @link http://php.net/manual/en/iterator.valid.php
     *
     * @return boolean
     */
    public function valid()
    {
        $numSheets = count($this->sheets);

        return ($this->currentSheetIndex < $numSheets);
    }

    /**
     * Move forward to next element.
     * The row iterator of the sheet being left is ended before moving on.
     * Nothing happens if the iterator does not point at an existing sheet.
     * @link http://php.net/manual/en/iterator.next.php
     *
     * @return void
     */
    public function next()
    {
        if (!array_key_exists($this->currentSheetIndex, $this->sheets)) {
            return;
        }

        $sheetBeingLeft = $this->sheets[$this->currentSheetIndex];
        $sheetBeingLeft->getRowIterator()->end();

        $this->currentSheetIndex++;
    }

    /**
     * Return the current element: the sheet the iterator points at.
     * @link http://php.net/manual/en/iterator.current.php
     *
     * @return Sheet
     */
    public function current()
    {
        $sheetIndex = $this->currentSheetIndex;

        return $this->sheets[$sheetIndex];
    }

    /**
     * Return the key of the current element: the one-based position of the current sheet.
     * @link http://php.net/manual/en/iterator.key.php
     *
     * @return int
     */
    public function key()
    {
        return 1 + $this->currentSheetIndex;
    }

    /**
     * Cleans up what was created to iterate over the object.
     * The row iterators of all the sheets are ended, to make sure nothing is leaked
     * in case the iteration stopped before the end.
     *
     * @return void
     */
    public function end()
    {
        foreach ($this->sheets as $sheetToCleanUp) {
            $rowIterator = $sheetToCleanUp->getRowIterator();
            $rowIterator->end();
        }
    }
}

View File

@ -0,0 +1,181 @@
<?php
namespace Box\Spout\Reader\CSV;
use Box\Spout\Common\Type;
use Box\Spout\Reader\ReaderFactory2;
use Box\Spout\TestUsingResource;
/**
 * Class ReaderTest
 * Exercises the CSV reader created through ReaderFactory2 against CSV resource files.
 *
 * @package Box\Spout\Reader\CSV
 */
class ReaderTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /**
     * Opening a path that does not exist is expected to raise an IOException.
     *
     * @expectedException \Box\Spout\Common\Exception\IOException
     *
     * @return void
     */
    public function testOpenShouldThrowExceptionIfFileDoesNotExist()
    {
        $reader = ReaderFactory2::create(Type::CSV);
        $reader->open('/path/to/fake/file.csv');
    }

    /**
     * Opening an existing resource while the helper's is_readable() is stubbed
     * to return false is expected to raise an IOException.
     *
     * @expectedException \Box\Spout\Common\Exception\IOException
     *
     * @return void
     */
    public function testOpenShouldThrowExceptionIfFileNotReadable()
    {
        $unreadableHelper = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper')
            ->setMethods(['is_readable'])
            ->getMock();
        $unreadableHelper->method('is_readable')->willReturn(false);

        $pathToExistingFile = $this->getResourcePath('csv_standard.csv');

        $csvReader = ReaderFactory2::create(Type::CSV);
        $csvReader->setGlobalFunctionsHelper($unreadableHelper);
        $csvReader->open($pathToExistingFile);
    }

    /**
     * Reads csv_standard.csv and compares the result with a 3x3 grid of values.
     *
     * @return void
     */
    public function testReadStandardCSV()
    {
        $rows = $this->getAllRowsForFile('csv_standard.csv');

        $this->assertEquals(
            [
                ['csv--11', 'csv--12', 'csv--13'],
                ['csv--21', 'csv--22', 'csv--23'],
                ['csv--31', 'csv--32', 'csv--33'],
            ],
            $rows
        );
    }

    /**
     * The first cell of csv_with_comma_enclosed.csv must keep its embedded comma.
     *
     * @return void
     */
    public function testReadShouldNotStopAtCommaIfEnclosed()
    {
        $rows = $this->getAllRowsForFile('csv_with_comma_enclosed.csv');
        $firstCell = $rows[0][0];

        $this->assertEquals('This is, a comma', $firstCell);
    }

    /**
     * Empty cells of csv_with_empty_cells.csv must come back as empty strings.
     *
     * @return void
     */
    public function testReadShouldKeepEmptyCells()
    {
        $rows = $this->getAllRowsForFile('csv_with_empty_cells.csv');

        $this->assertEquals(
            [
                ['csv--11', 'csv--12', 'csv--13'],
                ['csv--21', '', 'csv--23'],
                ['csv--31', 'csv--32', ''],
            ],
            $rows
        );
    }

    /**
     * Only two rows are expected back from csv_with_empty_line.csv.
     *
     * @return void
     */
    public function testReadShouldSkipEmptyLines()
    {
        $rows = $this->getAllRowsForFile('csv_with_empty_line.csv');

        $this->assertEquals(
            [
                ['csv--11', 'csv--12', 'csv--13'],
                ['csv--31', 'csv--32', 'csv--33'],
            ],
            $rows
        );
    }

    /**
     * Rows of csv_with_different_cells_number.csv must keep their own number of cells (3, 2, 1).
     *
     * @return void
     */
    public function testReadShouldHaveTheRightNumberOfCells()
    {
        $rows = $this->getAllRowsForFile('csv_with_different_cells_number.csv');

        $this->assertEquals(
            [
                ['csv--11', 'csv--12', 'csv--13'],
                ['csv--21', 'csv--22'],
                ['csv--31'],
            ],
            $rows
        );
    }

    /**
     * Reads csv_delimited_with_pipes.csv using "|" as the field delimiter.
     *
     * @return void
     */
    public function testReadShouldSupportCustomFieldDelimiter()
    {
        $rows = $this->getAllRowsForFile('csv_delimited_with_pipes.csv', '|');

        $this->assertEquals(
            [
                ['csv--11', 'csv--12', 'csv--13'],
                ['csv--21', 'csv--22', 'csv--23'],
                ['csv--31', 'csv--32', 'csv--33'],
            ],
            $rows
        );
    }

    /**
     * Reads csv_text_enclosed_with_pound.csv using "#" as the field enclosure.
     *
     * @return void
     */
    public function testReadShouldSupportCustomFieldEnclosure()
    {
        $rows = $this->getAllRowsForFile('csv_text_enclosed_with_pound.csv', ',', '#');
        $firstCell = $rows[0][0];

        $this->assertEquals('This is, a comma', $firstCell);
    }

    /**
     * Reads csv_with_utf8_bom.csv and expects the two data rows only.
     *
     * @return void
     */
    public function testReadShouldSkipUtf8Bom()
    {
        $rows = $this->getAllRowsForFile('csv_with_utf8_bom.csv');

        $this->assertEquals(
            [
                ['csv--11', 'csv--12', 'csv--13'],
                ['csv--21', 'csv--22', 'csv--23'],
            ],
            $rows
        );
    }

    /**
     * Opens the given resource with a CSV reader and collects every row of every sheet.
     *
     * @param string $fileName Name of the resource file, resolved through getResourcePath()
     * @param string $fieldDelimiter Field delimiter handed to the reader (defaults to a comma)
     * @param string $fieldEnclosure Field enclosure handed to the reader (defaults to a double quote)
     * @return array All the rows read from the given file
     */
    private function getAllRowsForFile($fileName, $fieldDelimiter = ",", $fieldEnclosure = '"')
    {
        $collectedRows = [];
        $pathToResource = $this->getResourcePath($fileName);

        $csvReader = ReaderFactory2::create(Type::CSV);
        $csvReader->setFieldDelimiter($fieldDelimiter);
        $csvReader->setFieldEnclosure($fieldEnclosure);
        $csvReader->open($pathToResource);

        foreach ($csvReader->getSheetIterator() as $sheetKey => $currentSheet) {
            foreach ($currentSheet->getRowIterator() as $rowKey => $rowData) {
                array_push($collectedRows, $rowData);
            }
        }

        $csvReader->close();

        return $collectedRows;
    }
}

View File

@ -0,0 +1,60 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper;
/**
 * Class CellHelperTest
 * Covers the static helpers CellHelper::fillMissingArrayIndexes() and
 * CellHelper::getColumnIndexFromCellIndex().
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class CellHelperTest extends \PHPUnit_Framework_TestCase
{
    /**
     * An array holding only keys 1 and 3 is expected to come back with
     * positions 0 and 2 filled with the given value.
     *
     * @return void
     */
    public function testFillMissingArrayIndexes()
    {
        $sparseArray = [1 => 1, 3 => 3];

        $result = CellHelper::fillMissingArrayIndexes($sparseArray, 'FILL');

        $this->assertEquals(['FILL', 1, 'FILL', 3], $result);
    }

    /**
     * Provides pairs of [cell index, expected zero-based column index].
     *
     * @return array
     */
    public function dataProviderForTestGetColumnIndexFromCellIndex()
    {
        $cases = [];

        $cases[] = ['A1', 0];
        $cases[] = ['Z3', 25];
        $cases[] = ['AA5', 26];
        $cases[] = ['AB24', 27];
        $cases[] = ['BC5', 54];
        $cases[] = ['BCZ99', 1455];

        return $cases;
    }

    /**
     * Checks the column index computed for each provided cell index.
     *
     * @dataProvider dataProviderForTestGetColumnIndexFromCellIndex
     *
     * @param string $cellIndex
     * @param int $expectedColumnIndex
     * @return void
     */
    public function testGetColumnIndexFromCellIndex($cellIndex, $expectedColumnIndex)
    {
        $actualColumnIndex = CellHelper::getColumnIndexFromCellIndex($cellIndex);

        $this->assertEquals($expectedColumnIndex, $actualColumnIndex);
    }

    /**
     * A string that is not a cell index is expected to raise an InvalidArgumentException.
     *
     * @expectedException \Box\Spout\Common\Exception\InvalidArgumentException
     *
     * @return void
     */
    public function testGetColumnIndexFromCellIndexShouldThrowIfInvalidCellIndex()
    {
        $notACellIndex = 'InvalidCellIndex';

        CellHelper::getColumnIndexFromCellIndex($notACellIndex);
    }
}

View File

@ -0,0 +1,99 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper\SharedStringsCaching;
/**
 * Class CachingStrategyFactoryTest
 * Covers the strategy selection and the memory limit parsing of CachingStrategyFactory.
 *
 * @package Box\Spout\Reader\XLSX\Helper\SharedStringsCaching
 */
class CachingStrategyFactoryTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Provides triples of [number of unique shared strings, memory limit in KB
     * returned by the stubbed getMemoryLimitInKB(), expected strategy class name].
     *
     * @return array
     */
    public function dataProviderForTestGetBestCachingStrategy()
    {
        return [
            [CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE, -1, 'FileBasedStrategy'],
            [CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 10, -1, 'FileBasedStrategy'],
            [CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE - 10, -1, 'InMemoryStrategy'],
            [10, CachingStrategyFactory::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * 10, 'FileBasedStrategy'],
            [15, CachingStrategyFactory::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * 10, 'FileBasedStrategy'],
            [5, CachingStrategyFactory::AMOUNT_MEMORY_NEEDED_PER_STRING_IN_KB * 10, 'InMemoryStrategy'],
        ];
    }

    /**
     * Checks which strategy class the factory returns for a given number of
     * shared strings and a given (stubbed) memory limit.
     *
     * @dataProvider dataProviderForTestGetBestCachingStrategy
     *
     * @param int $sharedStringsUniqueCount
     * @param int $memoryLimitInKB
     * @param string $expectedStrategyClassName
     * @return void
     */
    public function testGetBestCachingStrategy($sharedStringsUniqueCount, $memoryLimitInKB, $expectedStrategyClassName)
    {
        /** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */
        $factoryStub = $this
            ->getMockBuilder('\Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory')
            ->disableOriginalConstructor()
            ->setMethods(['getMemoryLimitInKB'])
            ->getMock();

        $factoryStub->method('getMemoryLimitInKB')->willReturn($memoryLimitInKB);

        \ReflectionHelper::setStaticValue('\Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory', 'instance', $factoryStub);

        $strategy = $factoryStub->getBestCachingStrategy($sharedStringsUniqueCount, null);
        $actualStrategyClassName = get_class($strategy);

        // Clean up BEFORE asserting: a failing assertion throws, and would otherwise leave
        // the stubbed static "instance" and the strategy's cache behind for the next tests.
        $strategy->clearCache();
        \ReflectionHelper::reset();

        $fullExpectedStrategyClassName = 'Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\\' . $expectedStrategyClassName;
        $this->assertEquals($fullExpectedStrategyClassName, $actualStrategyClassName);
    }

    /**
     * Provides pairs of [value returned by the stubbed getMemoryLimitFromIni(),
     * expected memory limit in KB].
     *
     * @return array
     */
    public function dataProviderForTestGetMemoryLimitInKB()
    {
        return [
            ['-1', -1],
            ['invalid', -1],
            ['1024B', 1],
            ['128K', 128],
            ['256KB', 256],
            ['512M', 512 * 1024],
            ['2MB', 2 * 1024],
            ['1G', 1 * 1024 * 1024],
            ['10GB', 10 * 1024 * 1024],
            ['2T', 2 * 1024 * 1024 * 1024],
            ['5TB', 5 * 1024 * 1024 * 1024],
        ];
    }

    /**
     * Checks the conversion of a formatted memory limit into a number of KB.
     * getMemoryLimitInKB() is invoked through ReflectionHelper::callMethodOnObject().
     *
     * @dataProvider dataProviderForTestGetMemoryLimitInKB
     *
     * @param string $memoryLimitFormatted
     * @param float $expectedMemoryLimitInKB
     * @return void
     */
    public function testGetMemoryLimitInKB($memoryLimitFormatted, $expectedMemoryLimitInKB)
    {
        /** @var CachingStrategyFactory|\PHPUnit_Framework_MockObject_MockObject $factoryStub */
        $factoryStub = $this
            ->getMockBuilder('\Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory')
            ->disableOriginalConstructor()
            ->setMethods(['getMemoryLimitFromIni'])
            ->getMock();

        $factoryStub->method('getMemoryLimitFromIni')->willReturn($memoryLimitFormatted);

        $memoryLimitInKB = \ReflectionHelper::callMethodOnObject($factoryStub, 'getMemoryLimitInKB');

        $this->assertEquals($expectedMemoryLimitInKB, $memoryLimitInKB);
    }
}

View File

@ -0,0 +1,112 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\FileBasedStrategy;
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\InMemoryStrategy;
use Box\Spout\TestUsingResource;
/**
 * Class SharedStringsHelperTest
 * Covers the extraction and lookup of shared strings through SharedStringsHelper.
 *
 * Everything a test may alter (extra helpers, the memory_limit ini setting) is
 * restored in tearDown(), so that a failing assertion cannot leak state into
 * the following tests.
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class SharedStringsHelperTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /** @var SharedStringsHelper Helper built in setUp() from one_sheet_with_shared_strings.xlsx */
    private $sharedStringsHelper;

    /** @var SharedStringsHelper[] Helpers created by individual tests, cleaned up in tearDown() */
    private $additionalSharedStringsHelpers = [];

    /** @var string|false Value of the memory_limit ini setting captured in setUp() */
    private $originalMemoryLimit;

    /**
     * @return void
     */
    public function setUp()
    {
        $this->originalMemoryLimit = ini_get('memory_limit');
        $this->additionalSharedStringsHelpers = [];

        $resourcePath = $this->getResourcePath('one_sheet_with_shared_strings.xlsx');
        $this->sharedStringsHelper = new SharedStringsHelper($resourcePath);
    }

    /**
     * Cleans up every helper created for the test and restores the memory limit
     * if the test changed it.
     *
     * @return void
     */
    public function tearDown()
    {
        foreach ($this->additionalSharedStringsHelpers as $additionalHelper) {
            $additionalHelper->cleanup();
        }
        $this->additionalSharedStringsHelpers = [];

        $this->sharedStringsHelper->cleanup();

        // Only touch the setting when a test actually modified it
        if ($this->originalMemoryLimit !== false && ini_get('memory_limit') !== $this->originalMemoryLimit) {
            ini_set('memory_limit', $this->originalMemoryLimit);
        }
    }

    /**
     * Looking up an index that cannot exist is expected to raise a SharedStringNotFoundException.
     *
     * @expectedException \Box\Spout\Reader\Exception\SharedStringNotFoundException
     * @return void
     */
    public function testGetStringAtIndexShouldThrowExceptionIfStringNotFound()
    {
        $this->sharedStringsHelper->extractSharedStrings();
        $this->sharedStringsHelper->getStringAtIndex(PHP_INT_MAX);
    }

    /**
     * Checks the strings found at indexes 0 and 24, and that the in-memory strategy was used.
     *
     * @return void
     */
    public function testGetStringAtIndexShouldReturnTheCorrectStringIfFound()
    {
        $this->sharedStringsHelper->extractSharedStrings();

        $sharedString = $this->sharedStringsHelper->getStringAtIndex(0);
        $this->assertEquals('s1--A1', $sharedString);

        $sharedString = $this->sharedStringsHelper->getStringAtIndex(24);
        $this->assertEquals('s1--E5', $sharedString);

        $usedCachingStrategy = \ReflectionHelper::getValueOnObject($this->sharedStringsHelper, 'cachingStrategy');
        $this->assertTrue($usedCachingStrategy instanceof InMemoryStrategy);
    }

    /**
     * Checks that strings containing a line break are returned with the line break preserved.
     *
     * @return void
     */
    public function testGetStringAtIndexShouldWorkWithMultilineStrings()
    {
        $sharedStringsHelper = $this->createTrackedSharedStringsHelper('one_sheet_with_shared_multiline_strings.xlsx');
        $sharedStringsHelper->extractSharedStrings();

        $sharedString = $sharedStringsHelper->getStringAtIndex(0);
        $this->assertEquals("s1\nA1", $sharedString);

        $sharedString = $sharedStringsHelper->getStringAtIndex(24);
        $this->assertEquals("s1\nE5", $sharedString);
    }

    /**
     * Checks lookups at index 0 and just past MAX_NUM_STRINGS_PER_TEMP_FILE, and that
     * the file-based strategy was used.
     *
     * @return void
     */
    public function testGetStringAtIndexWithFileBasedStrategy()
    {
        // force the file-based strategy by setting no memory limit
        // (the original value is restored in tearDown(), even if an assertion below fails)
        ini_set('memory_limit', '-1');

        $sharedStringsHelper = $this->createTrackedSharedStringsHelper('sheet_with_lots_of_shared_strings.xlsx');
        $sharedStringsHelper->extractSharedStrings();

        $sharedString = $sharedStringsHelper->getStringAtIndex(0);
        $this->assertEquals('str', $sharedString);

        $sharedString = $sharedStringsHelper->getStringAtIndex(CachingStrategyFactory::MAX_NUM_STRINGS_PER_TEMP_FILE + 1);
        $this->assertEquals('str', $sharedString);

        $usedCachingStrategy = \ReflectionHelper::getValueOnObject($sharedStringsHelper, 'cachingStrategy');
        $this->assertTrue($usedCachingStrategy instanceof FileBasedStrategy);
    }

    /**
     * Creates a helper for the given resource and registers it for cleanup in tearDown().
     *
     * @param string $resourceName Name of the resource file, resolved through getResourcePath()
     * @return SharedStringsHelper
     */
    private function createTrackedSharedStringsHelper($resourceName)
    {
        $resourcePath = $this->getResourcePath($resourceName);
        $sharedStringsHelper = new SharedStringsHelper($resourcePath);

        $this->additionalSharedStringsHelpers[] = $sharedStringsHelper;

        return $sharedStringsHelper;
    }
}

View File

@ -0,0 +1,300 @@
<?php
namespace Box\Spout\Reader\XLSX;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Type;
use Box\Spout\Reader\ReaderFactory2;
use Box\Spout\TestUsingResource;
/**
 * Class ReaderTest
 * Exercises the XLSX reader created through ReaderFactory2 against XLSX resource files.
 *
 * @package Box\Spout\Reader\XLSX
 */
class ReaderTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /**
     * Provides file names whose reading is expected to fail with an IOException.
     *
     * @return array
     */
    public function dataProviderForTestReadShouldThrowException()
    {
        return [
            ['/path/to/fake/file.xlsx'],
            ['file_with_no_sheets_in_content_types.xlsx'],
            ['file_corrupted.xlsx'],
        ];
    }

    /**
     * @dataProvider dataProviderForTestReadShouldThrowException
     * @expectedException \Box\Spout\Common\Exception\IOException
     *
     * @param string $filePath
     * @return void
     */
    public function testReadShouldThrowException($filePath)
    {
        $this->getAllRowsForFile($filePath);
    }

    /**
     * Provides triples of [resource name, expected total number of rows across
     * all sheets, expected number of cells per row].
     *
     * @return array
     */
    public function dataProviderForTestReadForAllWorksheets()
    {
        return [
            ['one_sheet_with_shared_strings.xlsx', 5, 5],
            ['one_sheet_with_inline_strings.xlsx', 5, 5],
            ['two_sheets_with_shared_strings.xlsx', 10, 5],
            ['two_sheets_with_inline_strings.xlsx', 10, 5]
        ];
    }

    /**
     * @dataProvider dataProviderForTestReadForAllWorksheets
     *
     * @param string $resourceName
     * @param int $expectedNumOfRows
     * @param int $expectedNumOfCellsPerRow
     * @return void
     */
    public function testReadForAllWorksheets($resourceName, $expectedNumOfRows, $expectedNumOfCellsPerRow)
    {
        $allRows = $this->getAllRowsForFile($resourceName);

        $this->assertEquals($expectedNumOfRows, count($allRows), "There should be $expectedNumOfRows rows");
        foreach ($allRows as $row) {
            $this->assertEquals($expectedNumOfCellsPerRow, count($row), "There should be $expectedNumOfCellsPerRow cells for every row");
        }
    }

    /**
     * @return void
     */
    public function testReadShouldSupportFilesWithoutSharedStringsFile()
    {
        $allRows = $this->getAllRowsForFile('sheet_with_no_shared_strings_file.xlsx');

        $expectedRows = [
            [10, 11],
            [20, 21],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * @return void
     */
    public function testReadShouldSupportAllCellTypes()
    {
        $allRows = $this->getAllRowsForFile('sheet_with_all_cell_types.xlsx');

        $expectedRows = [
            [
                's1--A1', 's1--A2',
                false, true,
                \DateTime::createFromFormat('Y-m-d H:i:s', '2015-06-03 13:21:58'),
                \DateTime::createFromFormat('Y-m-d H:i:s', '2015-06-01 00:00:00'),
                10, 10.43,
                null,
                'weird string', // valid 'str' string
                null, // invalid date
            ],
            ['', '', '', '', '', '', '', '', ''],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * Expects trailing empty cells to be kept, so that every row has 5 cells.
     *
     * NOTE(review): the test name says "IfDimensionsSpecified" but the resource is named
     * "sheet_without_dimensions_but_spans_and_empty_cells.xlsx" - confirm which one is intended.
     *
     * @return void
     */
    public function testReadShouldKeepEmptyCellsAtTheEndIfDimensionsSpecified()
    {
        $allRows = $this->getAllRowsForFile('sheet_without_dimensions_but_spans_and_empty_cells.xlsx');

        $this->assertEquals(2, count($allRows), 'There should be 2 rows');
        foreach ($allRows as $row) {
            $this->assertEquals(5, count($row), 'There should be 5 cells for every row, because empty cells should be preserved');
        }

        $expectedRows = [
            ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'],
            ['s1--A2', 's1--B2', 's1--C2', '', ''],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * Expects the second row to come back with 3 cells only.
     *
     * NOTE(review): the test name says "ShouldKeepEmptyCells ... IfNoDimensionsButSpansSpecified",
     * yet the body reads the same resource and asserts the same expectations as
     * testReadShouldSkipEmptyCellsAtTheEndIfDimensionsNotSpecified() - confirm whether a
     * different resource/expectation was intended here.
     *
     * @return void
     */
    public function testReadShouldKeepEmptyCellsAtTheEndIfNoDimensionsButSpansSpecified()
    {
        $allRows = $this->getAllRowsForFile('sheet_without_dimensions_and_empty_cells.xlsx');

        $this->assertEquals(2, count($allRows), 'There should be 2 rows');
        $this->assertEquals(5, count($allRows[0]), 'There should be 5 cells in the first row');
        $this->assertEquals(3, count($allRows[1]), 'There should be only 3 cells in the second row, because empty cells at the end should be skipped');

        $expectedRows = [
            ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'],
            ['s1--A2', 's1--B2', 's1--C2'],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * Expects trailing empty cells to be dropped, so that the second row has 3 cells only.
     *
     * @return void
     */
    public function testReadShouldSkipEmptyCellsAtTheEndIfDimensionsNotSpecified()
    {
        $allRows = $this->getAllRowsForFile('sheet_without_dimensions_and_empty_cells.xlsx');

        $this->assertEquals(2, count($allRows), 'There should be 2 rows');
        $this->assertEquals(5, count($allRows[0]), 'There should be 5 cells in the first row');
        $this->assertEquals(3, count($allRows[1]), 'There should be only 3 cells in the second row, because empty cells at the end should be skipped');

        $expectedRows = [
            ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'],
            ['s1--A2', 's1--B2', 's1--C2'],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * @return void
     */
    public function testReadShouldSkipEmptyRows()
    {
        $allRows = $this->getAllRowsForFile('sheet_with_empty_rows.xlsx');

        $this->assertEquals(2, count($allRows), 'There should be only 2 rows, because the empty row is skipped');

        $expectedRows = [
            ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'],
            ['s1--A3', 's1--B3', 's1--C3', 's1--D3', 's1--E3'],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * @return void
     */
    public function testReadShouldSupportEmptySharedString()
    {
        $allRows = $this->getAllRowsForFile('sheet_with_empty_shared_string.xlsx');

        $expectedRows = [
            ['s1--A1', '', 's1--C1'],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * Expects leading and trailing spaces of the cell values to be kept.
     *
     * @return void
     */
    public function testReadShouldPreserveSpaceIfSpecified()
    {
        $allRows = $this->getAllRowsForFile('sheet_with_preserve_space_shared_strings.xlsx');

        $expectedRows = [
            [' s1--A1', 's1--B1 ', ' s1--C1 '],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * @return void
     */
    public function testReadShouldSkipPronunciationData()
    {
        $allRows = $this->getAllRowsForFile('sheet_with_pronunciation.xlsx');

        $expectedRow = ['名前', '一二三四'];
        $this->assertEquals($expectedRow, $allRows[0], 'Pronunciation data should be removed.');
    }

    /**
     * Provides the names of the resources carrying an entity expansion attack.
     *
     * @return array
     */
    public function dataProviderForTestReadShouldBeProtectedAgainstAttacks()
    {
        return [
            ['attack_billion_laughs.xlsx'],
            ['attack_quadratic_blowup.xlsx'],
        ];
    }

    /**
     * Reading an attack file must fail with an IOException, in less than 10 seconds
     * and with a peak memory usage below 30MB.
     *
     * @dataProvider dataProviderForTestReadShouldBeProtectedAgainstAttacks
     * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used)
     *
     * @param string $fileName
     * @return void
     */
    public function testReadShouldBeProtectedAgainstAttacks($fileName)
    {
        $startTime = microtime(true);

        try {
            $this->getAllRowsForFile($fileName);
            // Only reached when no exception was thrown; fail() is not an IOException,
            // so it is not swallowed by the catch block below.
            $this->fail('An exception should have been thrown');
        } catch (IOException $exception) {
            $duration = microtime(true) - $startTime;
            $this->assertLessThan(10, $duration, 'Entities should not be expanded and therefore take more than 10 seconds to be parsed.');

            $expectedMaxMemoryUsage = 30 * 1024 * 1024; // 30MB
            $this->assertLessThan($expectedMaxMemoryUsage, memory_get_peak_usage(true), 'Entities should not be expanded and therefore consume all the memory.');
        }
    }

    /**
     * @return void
     */
    public function testReadShouldBeAbleToProcessEmptySheets()
    {
        $allRows = $this->getAllRowsForFile('sheet_with_no_cells.xlsx');
        $this->assertEquals([], $allRows, 'Sheet with no cells should be correctly processed.');
    }

    /**
     * @return void
     */
    public function testReadShouldSkipFormulas()
    {
        $allRows = $this->getAllRowsForFile('sheet_with_formulas.xlsx');

        $expectedRows = [
            ['val1', 'val2', 'total1', 'total2'],
            [10, 20, 30, 21],
            [11, 21, 32, 41],
        ];
        $this->assertEquals($expectedRows, $allRows);
    }

    /**
     * Opens the given resource with an XLSX reader and collects every row of every sheet,
     * in iteration order, into a single flat list.
     *
     * @param string $fileName Name of the resource file, resolved through getResourcePath()
     * @return array All the rows read from the given file
     */
    private function getAllRowsForFile($fileName)
    {
        $allRows = [];
        $resourcePath = $this->getResourcePath($fileName);

        $reader = ReaderFactory2::create(Type::XLSX);
        $reader->open($resourcePath);

        foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) {
            foreach ($sheet->getRowIterator() as $rowIndex => $row) {
                $allRows[] = $row;
            }
        }

        $reader->close();

        return $allRows;
    }
}

View File

@ -0,0 +1,53 @@
<?php
namespace Box\Spout\Reader\XLSX;
use Box\Spout\Common\Type;
use Box\Spout\Reader\ReaderFactory2;
use Box\Spout\TestUsingResource;
/**
 * Class SheetTest
 * Checks the information exposed by the sheets returned by the XLSX reader.
 *
 * @package Box\Spout\Reader\XLSX
 */
class SheetTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /**
     * The two sheets of two_sheets_with_custom_names.xlsx must expose their
     * custom name, their zero-based index and their ID.
     *
     * @return void
     */
    public function testNextSheetShouldReturnCorrectSheetInfos()
    {
        $sheetList = $this->openFileAndReturnSheets('two_sheets_with_custom_names.xlsx');

        // Expected values, keyed by the position of the sheet in the returned list
        $expectedInfos = [
            0 => ['name' => 'CustomName1', 'index' => 0, 'id' => 1],
            1 => ['name' => 'CustomName2', 'index' => 1, 'id' => 2],
        ];

        foreach ($expectedInfos as $position => $expectedInfo) {
            $this->assertEquals($expectedInfo['name'], $sheetList[$position]->getName());
            $this->assertEquals($expectedInfo['index'], $sheetList[$position]->getIndex());
            $this->assertEquals($expectedInfo['id'], $sheetList[$position]->getId());
        }
    }

    /**
     * Opens the given resource with an XLSX reader and collects the sheets
     * yielded by its sheet iterator.
     *
     * @param string $fileName Name of the resource file, resolved through getResourcePath()
     * @return Sheet[]
     */
    private function openFileAndReturnSheets($fileName)
    {
        $pathToResource = $this->getResourcePath($fileName);

        $xlsxReader = ReaderFactory2::create(Type::XLSX);
        $xlsxReader->open($pathToResource);

        $collectedSheets = [];
        foreach ($xlsxReader->getSheetIterator() as $currentSheet) {
            array_push($collectedSheets, $currentSheet);
        }

        $xlsxReader->close();

        return $collectedSheets;
    }
}