diff --git a/src/Spout/Reader/AbstractReader.php b/src/Spout/Reader/AbstractReader.php index cb476ab..0c5849d 100644 --- a/src/Spout/Reader/AbstractReader.php +++ b/src/Spout/Reader/AbstractReader.php @@ -22,6 +22,9 @@ abstract class AbstractReader implements ReaderInterface /** @var bool Whether date/time values should be returned as PHP objects or be formatted as strings */ protected $shouldFormatDates = false; + /** @var bool Whether empty rows should be returned or skipped */ + protected $shouldPreserveEmptyRows = false; + /** * Returns whether stream wrappers are supported * @@ -64,6 +67,7 @@ abstract class AbstractReader implements ReaderInterface /** * Sets whether date/time values should be returned as PHP objects or be formatted as strings. * + * @api * @param bool $shouldFormatDates * @return AbstractReader */ @@ -73,6 +77,19 @@ abstract class AbstractReader implements ReaderInterface return $this; } + /** + * Sets whether empty rows should be returned or skipped. + * + * @api + * @param bool $shouldPreserveEmptyRows + * @return AbstractReader + */ + public function setShouldPreserveEmptyRows($shouldPreserveEmptyRows) + { + $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows; + return $this; + } + /** * Prepares the reader to read the given file. It also makes sure * that the file exists and is readable. 
diff --git a/src/Spout/Reader/XLSX/Helper/SheetHelper.php b/src/Spout/Reader/XLSX/Helper/SheetHelper.php index a6ff909..d69fef2 100644 --- a/src/Spout/Reader/XLSX/Helper/SheetHelper.php +++ b/src/Spout/Reader/XLSX/Helper/SheetHelper.php @@ -29,18 +29,23 @@ class SheetHelper /** @var bool Whether date/time values should be returned as PHP objects or be formatted as strings */ protected $shouldFormatDates; + /** @var bool Whether empty rows should be returned or skipped */ + protected $shouldPreserveEmptyRows; + /** * @param string $filePath Path of the XLSX file being read * @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper Helper to work with shared strings * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper * @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings + * @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped */ - public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates) + public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates, $shouldPreserveEmptyRows) { $this->filePath = $filePath; $this->sharedStringsHelper = $sharedStringsHelper; $this->globalFunctionsHelper = $globalFunctionsHelper; $this->shouldFormatDates = $shouldFormatDates; + $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows; } /** @@ -92,7 +97,7 @@ class SheetHelper $sheetDataXMLFilePath = $this->getSheetDataXMLFilePathForSheetId($sheetId); - return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $this->shouldFormatDates, $sheetIndexZeroBased, $sheetName); + return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $this->shouldFormatDates, $this->shouldPreserveEmptyRows, $sheetIndexZeroBased, $sheetName); } /** diff --git a/src/Spout/Reader/XLSX/Reader.php b/src/Spout/Reader/XLSX/Reader.php index 
bcf02cc..7532ee7 100644 --- a/src/Spout/Reader/XLSX/Reader.php +++ b/src/Spout/Reader/XLSX/Reader.php @@ -69,7 +69,7 @@ class Reader extends AbstractReader $this->sharedStringsHelper->extractSharedStrings(); } - $this->sheetIterator = new SheetIterator($filePath, $this->sharedStringsHelper, $this->globalFunctionsHelper, $this->shouldFormatDates); + $this->sheetIterator = new SheetIterator($filePath, $this->sharedStringsHelper, $this->globalFunctionsHelper, $this->shouldFormatDates, $this->shouldPreserveEmptyRows); } else { throw new IOException("Could not open $filePath for reading."); } diff --git a/src/Spout/Reader/XLSX/RowIterator.php b/src/Spout/Reader/XLSX/RowIterator.php index 896222e..e9ff507 100644 --- a/src/Spout/Reader/XLSX/RowIterator.php +++ b/src/Spout/Reader/XLSX/RowIterator.php @@ -26,6 +26,7 @@ class RowIterator implements IteratorInterface /** Definition of XML attributes used to parse data */ const XML_ATTRIBUTE_REF = 'ref'; const XML_ATTRIBUTE_SPANS = 'spans'; + const XML_ATTRIBUTE_ROW_INDEX = 'r'; const XML_ATTRIBUTE_CELL_INDEX = 'r'; /** @var string Path of the XLSX file being read */ @@ -43,7 +44,10 @@ class RowIterator implements IteratorInterface /** @var Helper\StyleHelper $styleHelper Helper to work with styles */ protected $styleHelper; - /** @var int Number of read rows */ + /** + * TODO: This variable can be deleted when row indices get preserved + * @var int Number of read rows + */ protected $numReadRows = 0; /** @var array|null Buffer used to store the row data, while checking if there are more rows to read */ @@ -55,6 +59,15 @@ class RowIterator implements IteratorInterface /** @var int The number of columns the sheet has (0 meaning undefined) */ protected $numColumns = 0; + /** @var bool Whether empty rows should be returned or skipped */ + protected $shouldPreserveEmptyRows; + + /** @var int Last row index processed (one-based) */ + protected $lastRowIndexProcessed = 0; + + /** @var int Row index to be processed next (one-based) 
*/ + protected $nextRowIndexToBeProcessed = 0; + /** @var int Last column index processed (zero-based) */ protected $lastColumnIndexProcessed = -1; @@ -63,8 +76,9 @@ class RowIterator implements IteratorInterface * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml * @param Helper\SharedStringsHelper $sharedStringsHelper Helper to work with shared strings * @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings + * @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped */ - public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates) + public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates, $shouldPreserveEmptyRows) { $this->filePath = $filePath; $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath); @@ -73,6 +87,8 @@ class RowIterator implements IteratorInterface $this->styleHelper = new StyleHelper($filePath); $this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper, $this->styleHelper, $shouldFormatDates); + + $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows; } /** @@ -104,6 +120,8 @@ class RowIterator implements IteratorInterface } $this->numReadRows = 0; + $this->lastRowIndexProcessed = 0; + $this->nextRowIndexToBeProcessed = 0; $this->rowDataBuffer = null; $this->hasReachedEndOfFile = false; $this->numColumns = 0; @@ -123,7 +141,7 @@ class RowIterator implements IteratorInterface } /** - * Move forward to next element. Empty rows will be skipped. + * Move forward to next element. Reads data describing the next unprocessed row. 
* @link http://php.net/manual/en/iterator.next.php * * @return void @@ -131,53 +149,73 @@ class RowIterator implements IteratorInterface * @throws \Box\Spout\Common\Exception\IOException If unable to read the sheet data XML */ public function next() + { + $this->nextRowIndexToBeProcessed++; + + if ($this->doesNeedDataForNextRowToBeProcessed()) { + $this->readDataForNextRow($this->xmlReader); + } + } + + /** + * Returns whether we need data for the next row to be processed. + * We don't need to read data if: + * we have already read at least one row + * AND + * we need to preserve empty rows + * AND + * the last row that was read is not the row that needs to be processed + * (i.e. if we need to return empty rows) + * + * @return bool Whether we need data for the next row to be processed. + */ + protected function doesNeedDataForNextRowToBeProcessed() + { + $hasReadAtLeastOneRow = ($this->lastRowIndexProcessed !== 0); + + return ( + !$hasReadAtLeastOneRow || + !$this->shouldPreserveEmptyRows || + $this->lastRowIndexProcessed < $this->nextRowIndexToBeProcessed + ); + } + + /** + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object + * @return void + * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If a shared string was not found + * @throws \Box\Spout\Common\Exception\IOException If unable to read the sheet data XML + */ + protected function readDataForNextRow($xmlReader) { $rowData = []; try { - while ($this->xmlReader->read()) { - if ($this->xmlReader->isPositionedOnStartingNode(self::XML_NODE_DIMENSION)) { - // Read dimensions of the sheet - $dimensionRef = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet) - if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) { - $lastCellIndex = $matches[1]; - $this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1; + while ($xmlReader->read()) { + if 
($xmlReader->isPositionedOnStartingNode(self::XML_NODE_DIMENSION)) { + $this->processDimensionStartingNode($xmlReader); + + } else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_ROW)) { + $rowData = $this->processRowStartingNode($xmlReader); + + } else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL)) { + $rowData = $this->processCellStartingNode($xmlReader, $rowData); + + } else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_ROW)) { + // if the fetched row is empty and we don't want to preserve it.., + if (!$this->shouldPreserveEmptyRows && $this->isEmptyRow($rowData)) { + // ... skip it + continue; } - } else if ($this->xmlReader->isPositionedOnStartingNode(self::XML_NODE_ROW)) { - // Start of the row description + $rowData = $this->processRowEndingNode($rowData); - // Reset index of the last processed column - $this->lastColumnIndexProcessed = -1; - - // Read spans info if present - $numberOfColumnsForRow = $this->numColumns; - $spans = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance - if ($spans) { - list(, $numberOfColumnsForRow) = explode(':', $spans); - $numberOfColumnsForRow = intval($numberOfColumnsForRow); - } - $rowData = ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : []; - - } else if ($this->xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL)) { - // Start of a cell description - $currentColumnIndex = $this->getCellIndex($this->xmlReader); - - $node = $this->xmlReader->expand(); - $rowData[$currentColumnIndex] = $this->getCellValue($node); - - $this->lastColumnIndexProcessed = $currentColumnIndex; - - } else if ($this->xmlReader->isPositionedOnEndingNode(self::XML_NODE_ROW)) { - // End of the row description - // If needed, we fill the empty cells - $rowData = ($this->numColumns !== 0) ? 
$rowData : CellHelper::fillMissingArrayIndexes($rowData); - $this->numReadRows++; + // at this point, we have all the data we need for the row + // so that we can populate the buffer break; - } else if ($this->xmlReader->isPositionedOnEndingNode(self::XML_NODE_WORKSHEET)) { - // The closing "</worksheet>" marks the end of the file - $this->hasReachedEndOfFile = true; + } else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_WORKSHEET)) { + $this->processWorksheetEndingNode(); break; } } @@ -190,11 +228,101 @@ class RowIterator implements IteratorInterface } /** - * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<c>" tag - * @return int + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<dimension>" starting node + * @return void + */ + protected function processDimensionStartingNode($xmlReader) + { + // Read dimensions of the sheet + $dimensionRef = $xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet) + if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) { + $lastCellIndex = $matches[1]; + $this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1; + } + } + + /** + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<row>" starting node + * @return array + */ + protected function processRowStartingNode($xmlReader) + { + // Reset index of the last processed column + $this->lastColumnIndexProcessed = -1; + + // Mark the last processed row as the one currently being read + $this->lastRowIndexProcessed = $this->getRowIndex($xmlReader); + + // Read spans info if present + $numberOfColumnsForRow = $this->numColumns; + $spans = $xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance + if ($spans) { + list(, $numberOfColumnsForRow) = explode(':', $spans); + $numberOfColumnsForRow = intval($numberOfColumnsForRow); + } + + return ($numberOfColumnsForRow !== 0) 
? array_fill(0, $numberOfColumnsForRow, '') : []; + } + + /** + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<c>" starting node + * @param array $rowData Data of all cells read so far (key = cell index, value = cell value) + * @return array Original row data + data for the cell that was just read (key = cell index, value = cell value) + */ + protected function processCellStartingNode($xmlReader, $rowData) + { + $currentColumnIndex = $this->getColumnIndex($xmlReader); + + $node = $xmlReader->expand(); + $rowData[$currentColumnIndex] = $this->getCellValue($node); + + $this->lastColumnIndexProcessed = $currentColumnIndex; + + return $rowData; + } + + /** + * @param array $rowData Data of all cells read so far (key = cell index, value = cell value) + * @return array + */ + protected function processRowEndingNode($rowData) + { + $this->numReadRows++; + + // If needed, we fill the empty cells + return ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData); + } + + /** + * @return void + */ + protected function processWorksheetEndingNode() + { + // The closing "</worksheet>" marks the end of the file + $this->hasReachedEndOfFile = true; + } + + /** + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<row>" node + * @return int Row index * @throws \Box\Spout\Common\Exception\InvalidArgumentException When the given cell index is invalid */ - protected function getCellIndex($xmlReader) + protected function getRowIndex($xmlReader) + { + // Get "r" attribute if present (from something like <row r="3" ...>) + $currentRowIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_ROW_INDEX); + + return ($currentRowIndex !== null) ? 
+ intval($currentRowIndex) : + $this->lastRowIndexProcessed + 1; + } + + /** + * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<c>" node + * @return int Column index + * @throws \Box\Spout\Common\Exception\InvalidArgumentException When the given cell index is invalid + */ + protected function getColumnIndex($xmlReader) { // Get "r" attribute if present (from something like <c r="A1" ...>) $currentCellIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX); @@ -216,25 +344,53 @@ class RowIterator implements IteratorInterface } /** - * Return the current element, from the buffer. + * @param array $rowData + * @return bool Whether the given row is empty + */ + protected function isEmptyRow($rowData) + { + return (count($rowData) === 1 && $rowData[0] === ''); + } + + /** + * Return the current element, either an empty row or from the buffer. * @link http://php.net/manual/en/iterator.current.php * * @return array|null */ public function current() { - return $this->rowDataBuffer; + $rowDataForRowToBeProcessed = $this->rowDataBuffer; + + if ($this->shouldPreserveEmptyRows) { + // when we need to preserve empty rows, we will either return + // an empty row or the last row read. This depends on whether the + // index of the last row that was read matches the index of the last + // row whose value should be returned. + if ($this->lastRowIndexProcessed !== $this->nextRowIndexToBeProcessed) { + // return empty row if mismatch between last processed row + // and the row that needs to be returned + $rowDataForRowToBeProcessed = ['']; + } + } + + return $rowDataForRowToBeProcessed; } /** - * Return the key of the current element + * Return the key of the current element. Here, the row index. 
* @link http://php.net/manual/en/iterator.key.php * * @return int */ public function key() { - return $this->numReadRows; + // TODO: This should return $this->nextRowIndexToBeProcessed + // but to avoid a breaking change, the return value for + // this function has been kept as the number of rows read. + return $this->shouldPreserveEmptyRows ? + $this->nextRowIndexToBeProcessed : + $this->numReadRows; } diff --git a/src/Spout/Reader/XLSX/Sheet.php b/src/Spout/Reader/XLSX/Sheet.php index a1c7d95..b2405ae 100644 --- a/src/Spout/Reader/XLSX/Sheet.php +++ b/src/Spout/Reader/XLSX/Sheet.php @@ -26,12 +26,13 @@ class Sheet implements SheetInterface * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml * @param Helper\SharedStringsHelper Helper to work with shared strings * @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings + * @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped * @param int $sheetIndex Index of the sheet, based on order in the workbook (zero-based) * @param string $sheetName Name of the sheet */ - public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates, $sheetIndex, $sheetName) + public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates, $shouldPreserveEmptyRows, $sheetIndex, $sheetName) { - $this->rowIterator = new RowIterator($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates); + $this->rowIterator = new RowIterator($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates, $shouldPreserveEmptyRows); $this->index = $sheetIndex; $this->name = $sheetName; } diff --git a/src/Spout/Reader/XLSX/SheetIterator.php b/src/Spout/Reader/XLSX/SheetIterator.php index f286cea..88cd350 100644 --- a/src/Spout/Reader/XLSX/SheetIterator.php +++ b/src/Spout/Reader/XLSX/SheetIterator.php @@ -25,12 
+25,13 @@ class SheetIterator implements IteratorInterface * @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper $sharedStringsHelper * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper * @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings + * @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped * @throws \Box\Spout\Reader\Exception\NoSheetsFoundException If there are no sheets in the file */ - public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates) + public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates, $shouldPreserveEmptyRows) { // Fetch all available sheets - $sheetHelper = new SheetHelper($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates); + $sheetHelper = new SheetHelper($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates, $shouldPreserveEmptyRows); $this->sheets = $sheetHelper->getSheets(); if (count($this->sheets) === 0) { diff --git a/tests/Spout/Reader/XLSX/ReaderTest.php b/tests/Spout/Reader/XLSX/ReaderTest.php index dffbc26..51b79d4 100644 --- a/tests/Spout/Reader/XLSX/ReaderTest.php +++ b/tests/Spout/Reader/XLSX/ReaderTest.php @@ -352,16 +352,39 @@ class ReaderTest extends \PHPUnit_Framework_TestCase /** * @return void */ - public function testReadShouldSkipEmptyRows() + public function testReadShouldSkipEmptyRowsIfShouldPreserveEmptyRowsNotSet() { - $allRows = $this->getAllRowsForFile('sheet_with_empty_row.xlsx'); + $allRows = $this->getAllRowsForFile('sheet_with_empty_rows_and_missing_row_index.xlsx'); - $this->assertEquals(2, count($allRows), 'There should be only 2 rows, because the empty row is skipped'); + $this->assertEquals(3, count($allRows), 'There should be only 3 rows, because the empty rows are skipped'); $expectedRows = [ - ['s1--A1', 's1--B1', 's1--C1', 's1--D1', 
's1--E1'], // skipped row here - ['s1--A3', 's1--B3', 's1--C3', 's1--D3', 's1--E3'], + ['s1--A2', 's1--B2', 's1--C2'], + // skipped row here + // skipped row here + ['s1--A5', 's1--B5', 's1--C5'], + ['s1--A6', 's1--B6', 's1--C6'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldReturnEmptyLinesIfShouldPreserveEmptyRowsSet() + { + $allRows = $this->getAllRowsForFile('sheet_with_empty_rows_and_missing_row_index.xlsx', false, true); + + $this->assertEquals(6, count($allRows), 'There should be 6 rows'); + + $expectedRows = [ + [''], + ['s1--A2', 's1--B2', 's1--C2'], + [''], + [''], + ['s1--A5', 's1--B5', 's1--C5'], + ['s1--A6', 's1--B6', 's1--C6'], ]; $this->assertEquals($expectedRows, $allRows); } @@ -595,15 +618,18 @@ class ReaderTest extends \PHPUnit_Framework_TestCase /** * @param string $fileName * @param bool|void $shouldFormatDates + * @param bool|void $shouldPreserveEmptyRows * @return array All the read rows the given file */ - private function getAllRowsForFile($fileName, $shouldFormatDates = false) + private function getAllRowsForFile($fileName, $shouldFormatDates = false, $shouldPreserveEmptyRows = false) { $allRows = []; $resourcePath = $this->getResourcePath($fileName); + /** @var \Box\Spout\Reader\XLSX\Reader $reader */ $reader = ReaderFactory::create(Type::XLSX); $reader->setShouldFormatDates($shouldFormatDates); + $reader->setShouldPreserveEmptyRows($shouldPreserveEmptyRows); $reader->open($resourcePath); foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) { diff --git a/tests/resources/xlsx/sheet_with_empty_row.xlsx b/tests/resources/xlsx/sheet_with_empty_row.xlsx deleted file mode 100644 index b9330b0..0000000 Binary files a/tests/resources/xlsx/sheet_with_empty_row.xlsx and /dev/null differ diff --git a/tests/resources/xlsx/sheet_with_empty_rows_and_missing_row_index.xlsx b/tests/resources/xlsx/sheet_with_empty_rows_and_missing_row_index.xlsx new file mode 100644 index 
0000000..2f5cf15 Binary files /dev/null and b/tests/resources/xlsx/sheet_with_empty_rows_and_missing_row_index.xlsx differ