Add option to preserve empty rows when reading an XLSX file

This commit is contained in:
Adrien Loison 2016-10-11 20:09:42 -07:00
parent 77178122c3
commit 503809eb53
9 changed files with 267 additions and 61 deletions

View File

@ -22,6 +22,9 @@ abstract class AbstractReader implements ReaderInterface
/** @var bool Whether date/time values should be returned as PHP objects or be formatted as strings */
protected $shouldFormatDates = false;
/** @var bool Whether empty rows should be returned or skipped */
protected $shouldPreserveEmptyRows = false;
/**
* Returns whether stream wrappers are supported
*
@ -64,6 +67,7 @@ abstract class AbstractReader implements ReaderInterface
/**
* Sets whether date/time values should be returned as PHP objects or be formatted as strings.
*
* @api
* @param bool $shouldFormatDates
* @return AbstractReader
*/
@ -73,6 +77,19 @@ abstract class AbstractReader implements ReaderInterface
return $this;
}
/**
 * Configures how empty rows are handled while reading:
 * when enabled they are returned, otherwise they are skipped.
 *
 * @api
 * @param bool $shouldPreserveEmptyRows Whether empty rows should be returned (true) or skipped (false)
 * @return AbstractReader The current reader, so that calls can be chained
 */
public function setShouldPreserveEmptyRows($shouldPreserveEmptyRows)
{
    $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows;

    return $this;
}
/**
* Prepares the reader to read the given file. It also makes sure
* that the file exists and is readable.

View File

@ -29,18 +29,23 @@ class SheetHelper
/** @var bool Whether date/time values should be returned as PHP objects or be formatted as strings */
protected $shouldFormatDates;
/** @var bool Whether empty rows should be returned or skipped */
protected $shouldPreserveEmptyRows;
/**
* @param string $filePath Path of the XLSX file being read
* @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper Helper to work with shared strings
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
* @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings
* @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped
*/
public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates)
public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates, $shouldPreserveEmptyRows)
{
    // Reading options, stored so they can be forwarded to the objects built by this helper
    $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows;
    $this->shouldFormatDates = $shouldFormatDates;

    // File being read and collaborators
    $this->globalFunctionsHelper = $globalFunctionsHelper;
    $this->sharedStringsHelper = $sharedStringsHelper;
    $this->filePath = $filePath;
}
/**
@ -92,7 +97,7 @@ class SheetHelper
$sheetDataXMLFilePath = $this->getSheetDataXMLFilePathForSheetId($sheetId);
return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $this->shouldFormatDates, $sheetIndexZeroBased, $sheetName);
return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $this->shouldFormatDates, $this->shouldPreserveEmptyRows, $sheetIndexZeroBased, $sheetName);
}
/**

View File

@ -69,7 +69,7 @@ class Reader extends AbstractReader
$this->sharedStringsHelper->extractSharedStrings();
}
$this->sheetIterator = new SheetIterator($filePath, $this->sharedStringsHelper, $this->globalFunctionsHelper, $this->shouldFormatDates);
$this->sheetIterator = new SheetIterator($filePath, $this->sharedStringsHelper, $this->globalFunctionsHelper, $this->shouldFormatDates, $this->shouldPreserveEmptyRows);
} else {
throw new IOException("Could not open $filePath for reading.");
}

View File

@ -26,6 +26,7 @@ class RowIterator implements IteratorInterface
/** Definition of XML attributes used to parse data */
const XML_ATTRIBUTE_REF = 'ref';
const XML_ATTRIBUTE_SPANS = 'spans';
const XML_ATTRIBUTE_ROW_INDEX = 'r';
const XML_ATTRIBUTE_CELL_INDEX = 'r';
/** @var string Path of the XLSX file being read */
@ -43,7 +44,10 @@ class RowIterator implements IteratorInterface
/** @var Helper\StyleHelper $styleHelper Helper to work with styles */
protected $styleHelper;
/** @var int Number of read rows */
/**
* TODO: This variable can be deleted when row indices get preserved
* @var int Number of read rows
*/
protected $numReadRows = 0;
/** @var array|null Buffer used to store the row data, while checking if there are more rows to read */
@ -55,6 +59,15 @@ class RowIterator implements IteratorInterface
/** @var int The number of columns the sheet has (0 meaning undefined) */
protected $numColumns = 0;
/** @var bool Whether empty rows should be returned or skipped */
protected $shouldPreserveEmptyRows;
/** @var int Last row index processed (one-based) */
protected $lastRowIndexProcessed = 0;
/** @var int Row index to be processed next (one-based) */
protected $nextRowIndexToBeProcessed = 0;
/** @var int Last column index processed (zero-based) */
protected $lastColumnIndexProcessed = -1;
@ -63,8 +76,9 @@ class RowIterator implements IteratorInterface
* @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
* @param Helper\SharedStringsHelper $sharedStringsHelper Helper to work with shared strings
* @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings
* @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped
*/
public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates)
public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates, $shouldPreserveEmptyRows)
{
$this->filePath = $filePath;
$this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath);
@ -73,6 +87,8 @@ class RowIterator implements IteratorInterface
$this->styleHelper = new StyleHelper($filePath);
$this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper, $this->styleHelper, $shouldFormatDates);
$this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows;
}
/**
@ -104,6 +120,8 @@ class RowIterator implements IteratorInterface
}
$this->numReadRows = 0;
$this->lastRowIndexProcessed = 0;
$this->nextRowIndexToBeProcessed = 0;
$this->rowDataBuffer = null;
$this->hasReachedEndOfFile = false;
$this->numColumns = 0;
@ -123,7 +141,7 @@ class RowIterator implements IteratorInterface
}
/**
* Move forward to next element. Empty rows will be skipped.
* Move forward to next element. Reads data describing the next unprocessed row.
* @link http://php.net/manual/en/iterator.next.php
*
* @return void
@ -131,53 +149,73 @@ class RowIterator implements IteratorInterface
* @throws \Box\Spout\Common\Exception\IOException If unable to read the sheet data XML
*/
public function next()
{
    $this->nextRowIndexToBeProcessed += 1;

    // No read is needed when the row to be processed can be served
    // without more data (see doesNeedDataForNextRowToBeProcessed())
    if (!$this->doesNeedDataForNextRowToBeProcessed()) {
        return;
    }

    $this->readDataForNextRow($this->xmlReader);
}
/**
 * Tells whether more data must be read to get the next row to be processed.
 * Reading is required unless all of the following hold:
 *   - at least one row has already been read
 *   - empty rows need to be preserved
 *   - the index of the last row that was read is not lower than
 *     the index of the row that needs to be processed
 *     (i.e. an empty row or the row already read needs to be returned)
 *
 * @return bool Whether we need data for the next row to be processed.
 */
protected function doesNeedDataForNextRowToBeProcessed()
{
    if ($this->lastRowIndexProcessed === 0) {
        // no row has been read yet
        return true;
    }

    if (!$this->shouldPreserveEmptyRows) {
        // empty rows are skipped: every move forward reads new data
        return true;
    }

    return ($this->lastRowIndexProcessed < $this->nextRowIndexToBeProcessed);
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object
* @return void
* @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
* @throws \Box\Spout\Common\Exception\IOException If unable to read the sheet data XML
*/
protected function readDataForNextRow($xmlReader)
{
$rowData = [];
try {
while ($this->xmlReader->read()) {
if ($this->xmlReader->isPositionedOnStartingNode(self::XML_NODE_DIMENSION)) {
// Read dimensions of the sheet
$dimensionRef = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet)
if (preg_match('/[A-Z\d]+:([A-Z\d]+)/', $dimensionRef, $matches)) {
$lastCellIndex = $matches[1];
$this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1;
while ($xmlReader->read()) {
if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_DIMENSION)) {
$this->processDimensionStartingNode($xmlReader);
} else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_ROW)) {
$rowData = $this->processRowStartingNode($xmlReader);
} else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL)) {
$rowData = $this->processCellStartingNode($xmlReader, $rowData);
} else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_ROW)) {
// if the fetched row is empty and we don't want to preserve it..,
if (!$this->shouldPreserveEmptyRows && $this->isEmptyRow($rowData)) {
// ... skip it
continue;
}
} else if ($this->xmlReader->isPositionedOnStartingNode(self::XML_NODE_ROW)) {
// Start of the row description
$rowData = $this->processRowEndingNode($rowData);
// Reset index of the last processed column
$this->lastColumnIndexProcessed = -1;
// Read spans info if present
$numberOfColumnsForRow = $this->numColumns;
$spans = $this->xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance
if ($spans) {
list(, $numberOfColumnsForRow) = explode(':', $spans);
$numberOfColumnsForRow = intval($numberOfColumnsForRow);
}
$rowData = ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : [];
} else if ($this->xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL)) {
// Start of a cell description
$currentColumnIndex = $this->getCellIndex($this->xmlReader);
$node = $this->xmlReader->expand();
$rowData[$currentColumnIndex] = $this->getCellValue($node);
$this->lastColumnIndexProcessed = $currentColumnIndex;
} else if ($this->xmlReader->isPositionedOnEndingNode(self::XML_NODE_ROW)) {
// End of the row description
// If needed, we fill the empty cells
$rowData = ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData);
$this->numReadRows++;
// at this point, we have all the data we need for the row
// so that we can populate the buffer
break;
} else if ($this->xmlReader->isPositionedOnEndingNode(self::XML_NODE_WORKSHEET)) {
// The closing "</worksheet>" marks the end of the file
$this->hasReachedEndOfFile = true;
} else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_WORKSHEET)) {
$this->processWorksheetEndingNode();
break;
}
}
@ -190,11 +228,101 @@ class RowIterator implements IteratorInterface
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<c>" tag
* @return int
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<dimension>" starting node
* @return void
*/
protected function processDimensionStartingNode($xmlReader)
{
    // The "ref" attribute describes the dimensions of the sheet:
    // 'A1:M13' for instance (or 'A1' for an empty sheet)
    $sheetRange = $xmlReader->getAttribute(self::XML_ATTRIBUTE_REF);

    $hasLastCellIndex = preg_match('/[A-Z\d]+:([A-Z\d]+)/', $sheetRange, $rangeParts);
    if (!$hasLastCellIndex) {
        // no "first:last" range found: the number of columns is left untouched
        return;
    }

    // The captured group is the cell index found after the colon
    $this->numColumns = CellHelper::getColumnIndexFromCellIndex($rangeParts[1]) + 1;
}
/**
 * Processes the opening "<row>" node: resets the per-row state and builds
 * the array that will receive the values of the cells of the row.
 *
 * The number of pre-filled cells comes from the "spans" attribute when it is present,
 * and from the number of columns of the sheet otherwise. The "spans" attribute may hold
 * several space-separated ranges (for instance '1:3 5:7'): the highest end column is used.
 *
 * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<row>" starting node
 * @return array Array of empty strings, one per expected column (empty array when the number of columns is unknown)
 */
protected function processRowStartingNode($xmlReader)
{
    // Reset index of the last processed column
    $this->lastColumnIndexProcessed = -1;

    // Mark the last processed row as the one currently being read
    $this->lastRowIndexProcessed = $this->getRowIndex($xmlReader);

    // Read spans info if present
    $numberOfColumnsForRow = $this->numColumns;
    $spans = $xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance
    if ($spans) {
        $numberOfColumnsForRow = 0;

        foreach (preg_split('/\s+/', trim($spans)) as $span) {
            $spanBounds = explode(':', $span);

            // a span without an end column is malformed and does not contribute any column
            $lastColumnOfSpan = isset($spanBounds[1]) ? intval($spanBounds[1]) : 0;

            // starting from 0 also keeps the count from ever being negative
            $numberOfColumnsForRow = max($numberOfColumnsForRow, $lastColumnOfSpan);
        }
    }

    return ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : [];
}
/**
 * Processes the opening node of a cell: reads the value of the cell
 * and stores it in the row data, at the index of the column of the cell.
 *
 * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<cell>" starting node
 * @param array $rowData Data of all cells read so far (key = cell index, value = cell value)
 * @return array Original row data + data for the cell that was just read (key = cell index, value = cell value)
 */
protected function processCellStartingNode($xmlReader, $rowData)
{
    $columnIndex = $this->getColumnIndex($xmlReader);

    // Expand the current node to be able to extract the value of the cell
    $cellNode = $xmlReader->expand();
    $cellValue = $this->getCellValue($cellNode);

    $rowData[$columnIndex] = $cellValue;
    $this->lastColumnIndexProcessed = $columnIndex;

    return $rowData;
}
/**
 * Processes the closing node of a row: counts the row as read
 * and, if needed, fills the missing cells of the row.
 *
 * @param array $rowData Data of all cells read so far (key = cell index, value = cell value)
 * @return array Row data, with missing indexes filled when the number of columns of the sheet is unknown
 */
protected function processRowEndingNode($rowData)
{
    $this->numReadRows += 1;

    if ($this->numColumns !== 0) {
        // the number of columns is known: the row data is returned as is
        return $rowData;
    }

    return CellHelper::fillMissingArrayIndexes($rowData);
}
/**
 * Processes the closing "</worksheet>" node, which marks the end of the file.
 *
 * @return void
 */
protected function processWorksheetEndingNode()
{
    // Nothing is left to be read once this node has been reached
    $this->hasReachedEndOfFile = true;
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<row>" node
* @return int Row index
* @throws \Box\Spout\Common\Exception\InvalidArgumentException When the given cell index is invalid
*/
protected function getCellIndex($xmlReader)
protected function getRowIndex($xmlReader)
{
    // Read the "r" attribute, if present (from something like <row r="3"...>)
    $rowIndexAttribute = $xmlReader->getAttribute(self::XML_ATTRIBUTE_ROW_INDEX);

    if ($rowIndexAttribute === null) {
        // No index given: fall back to the index following the last processed row
        return $this->lastRowIndexProcessed + 1;
    }

    return intval($rowIndexAttribute);
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<c>" node
* @return int Column index
* @throws \Box\Spout\Common\Exception\InvalidArgumentException When the given cell index is invalid
*/
protected function getColumnIndex($xmlReader)
{
// Get "r" attribute if present (from something like <c r="A1"...>
$currentCellIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX);
@ -216,25 +344,53 @@ class RowIterator implements IteratorInterface
}
/**
* Return the current element, from the buffer.
* @param array $rowData
* @return bool Whether the given row is empty
*/
protected function isEmptyRow($rowData)
{
    // A row is considered empty when it only holds one empty string, stored at index 0.
    // Row data can be sparse (a single cell stored at index 2 for instance), so the
    // existence of index 0 is checked before reading it, to avoid an "undefined offset" error.
    return (count($rowData) === 1 && isset($rowData[0]) && $rowData[0] === '');
}
/**
* Return the current element, either an empty row or from the buffer.
* @link http://php.net/manual/en/iterator.current.php
*
* @return array|null
*/
public function current()
{
return $this->rowDataBuffer;
$rowDataForRowToBeProcessed = $this->rowDataBuffer;
if ($this->shouldPreserveEmptyRows) {
// when we need to preserve empty rows, we will either return
// an empty row or the last row read. This depends whether the
// index of last row that was read matches the index of the last
// row whose value should be returned.
if ($this->lastRowIndexProcessed !== $this->nextRowIndexToBeProcessed) {
// return empty row if mismatch between last processed row
// and the row that needs to be returned
$rowDataForRowToBeProcessed = [''];
}
}
return $rowDataForRowToBeProcessed;
}
/**
* Return the key of the current element
* Return the key of the current element. Here, the row index.
* @link http://php.net/manual/en/iterator.key.php
*
* @return int
*/
public function key()
{
return $this->numReadRows;
// TODO: This should return $this->nextRowIndexToBeProcessed
// but to avoid a breaking change, the return value for
// this function has been kept as the number of rows read.
return $this->shouldPreserveEmptyRows ?
$this->nextRowIndexToBeProcessed :
$this->numReadRows;
}

View File

@ -26,12 +26,13 @@ class Sheet implements SheetInterface
* @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
* @param Helper\SharedStringsHelper Helper to work with shared strings
* @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings
* @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped
* @param int $sheetIndex Index of the sheet, based on order in the workbook (zero-based)
* @param string $sheetName Name of the sheet
*/
public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates, $sheetIndex, $sheetName)
public function __construct($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates, $shouldPreserveEmptyRows, $sheetIndex, $sheetName)
{
$this->rowIterator = new RowIterator($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates);
$this->rowIterator = new RowIterator($filePath, $sheetDataXMLFilePath, $sharedStringsHelper, $shouldFormatDates, $shouldPreserveEmptyRows);
$this->index = $sheetIndex;
$this->name = $sheetName;
}

View File

@ -25,12 +25,13 @@ class SheetIterator implements IteratorInterface
* @param \Box\Spout\Reader\XLSX\Helper\SharedStringsHelper $sharedStringsHelper
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
* @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings
* @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped
* @throws \Box\Spout\Reader\Exception\NoSheetsFoundException If there are no sheets in the file
*/
public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates)
public function __construct($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates, $shouldPreserveEmptyRows)
{
// Fetch all available sheets
$sheetHelper = new SheetHelper($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates);
$sheetHelper = new SheetHelper($filePath, $sharedStringsHelper, $globalFunctionsHelper, $shouldFormatDates, $shouldPreserveEmptyRows);
$this->sheets = $sheetHelper->getSheets();
if (count($this->sheets) === 0) {

View File

@ -352,16 +352,39 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
/**
* @return void
*/
public function testReadShouldSkipEmptyRows()
public function testReadShouldSkipEmptyRowsIfShouldPreserveEmptyRowsNotSet()
{
$allRows = $this->getAllRowsForFile('sheet_with_empty_row.xlsx');
$allRows = $this->getAllRowsForFile('sheet_with_empty_rows_and_missing_row_index.xlsx');
$this->assertEquals(2, count($allRows), 'There should be only 2 rows, because the empty row is skipped');
$this->assertEquals(3, count($allRows), 'There should be only 3 rows, because the empty rows are skipped');
$expectedRows = [
['s1--A1', 's1--B1', 's1--C1', 's1--D1', 's1--E1'],
// skipped row here
['s1--A3', 's1--B3', 's1--C3', 's1--D3', 's1--E3'],
['s1--A2', 's1--B2', 's1--C2'],
// skipped row here
// skipped row here
['s1--A5', 's1--B5', 's1--C5'],
['s1--A6', 's1--B6', 's1--C6'],
];
$this->assertEquals($expectedRows, $allRows);
}
/**
 * When empty rows are preserved, every row of the sheet must be returned,
 * including the empty ones located before and between the rows with data.
 *
 * @return void
 */
public function testReadShouldReturnEmptyLinesIfShouldPreserveEmptyRowsSet()
{
    $emptyRow = [''];
    $expectedRows = [
        $emptyRow,
        ['s1--A2', 's1--B2', 's1--C2'],
        $emptyRow,
        $emptyRow,
        ['s1--A5', 's1--B5', 's1--C5'],
        ['s1--A6', 's1--B6', 's1--C6'],
    ];

    $shouldFormatDates = false;
    $shouldPreserveEmptyRows = true;
    $allRows = $this->getAllRowsForFile(
        'sheet_with_empty_rows_and_missing_row_index.xlsx',
        $shouldFormatDates,
        $shouldPreserveEmptyRows
    );

    $this->assertEquals(6, count($allRows), 'There should be 6 rows');
    $this->assertEquals($expectedRows, $allRows);
}
@ -595,15 +618,18 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
/**
* @param string $fileName
* @param bool|void $shouldFormatDates
* @param bool|void $shouldPreserveEmptyRows
* @return array All the read rows the given file
*/
private function getAllRowsForFile($fileName, $shouldFormatDates = false)
private function getAllRowsForFile($fileName, $shouldFormatDates = false, $shouldPreserveEmptyRows = false)
{
$allRows = [];
$resourcePath = $this->getResourcePath($fileName);
/** @var \Box\Spout\Reader\XLSX\Reader $reader */
$reader = ReaderFactory::create(Type::XLSX);
$reader->setShouldFormatDates($shouldFormatDates);
$reader->setShouldPreserveEmptyRows($shouldPreserveEmptyRows);
$reader->open($resourcePath);
foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) {