Add option to preserve empty rows when reading a CSV file

This commit is contained in:
Adrien Loison 2016-10-12 00:11:18 -07:00
parent 503809eb53
commit dd7cb1b04e
8 changed files with 115 additions and 80 deletions

View File

@ -119,8 +119,9 @@ class Reader extends AbstractReader
$this->filePointer, $this->filePointer,
$this->fieldDelimiter, $this->fieldDelimiter,
$this->fieldEnclosure, $this->fieldEnclosure,
$this->encoding,
$this->endOfLineCharacter, $this->endOfLineCharacter,
$this->encoding,
$this->shouldPreserveEmptyRows,
$this->globalFunctionsHelper $this->globalFunctionsHelper
); );
} }

View File

@ -52,21 +52,26 @@ class RowIterator implements IteratorInterface
/** @var string End of line delimiter, given by the user as input. */ /** @var string End of line delimiter, given by the user as input. */
protected $inputEOLDelimiter; protected $inputEOLDelimiter;
/** @var bool Whether empty rows should be returned or skipped */
protected $shouldPreserveEmptyRows;
/** /**
* @param resource $filePointer Pointer to the CSV file to read * @param resource $filePointer Pointer to the CSV file to read
* @param string $fieldDelimiter Character that delimits fields * @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields * @param string $fieldEnclosure Character that enclose fields
* @param string $encoding Encoding of the CSV file to be read
* @param string $endOfLineDelimiter End of line delimiter * @param string $endOfLineDelimiter End of line delimiter
* @param string $encoding Encoding of the CSV file to be read
* @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/ */
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $endOfLineDelimiter, $globalFunctionsHelper) public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $endOfLineDelimiter, $encoding, $shouldPreserveEmptyRows, $globalFunctionsHelper)
{ {
$this->filePointer = $filePointer; $this->filePointer = $filePointer;
$this->fieldDelimiter = $fieldDelimiter; $this->fieldDelimiter = $fieldDelimiter;
$this->fieldEnclosure = $fieldEnclosure; $this->fieldEnclosure = $fieldEnclosure;
$this->encoding = $encoding; $this->encoding = $encoding;
$this->inputEOLDelimiter = $endOfLineDelimiter; $this->inputEOLDelimiter = $endOfLineDelimiter;
$this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows;
$this->globalFunctionsHelper = $globalFunctionsHelper; $this->globalFunctionsHelper = $globalFunctionsHelper;
$this->encodingHelper = new EncodingHelper($globalFunctionsHelper); $this->encodingHelper = new EncodingHelper($globalFunctionsHelper);
@ -114,7 +119,7 @@ class RowIterator implements IteratorInterface
} }
/** /**
* Move forward to next element. Empty rows are skipped. * Move forward to next element. Reads data for the next unprocessed row.
* @link http://php.net/manual/en/iterator.next.php * @link http://php.net/manual/en/iterator.next.php
* *
* @return void * @return void
@ -124,25 +129,48 @@ class RowIterator implements IteratorInterface
{ {
$this->hasReachedEndOfFile = $this->globalFunctionsHelper->feof($this->filePointer); $this->hasReachedEndOfFile = $this->globalFunctionsHelper->feof($this->filePointer);
if ($this->hasReachedEndOfFile) { if (!$this->hasReachedEndOfFile) {
return; $this->readDataForNextRow();
} }
}
/**
* @return void
* @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8
*/
protected function readDataForNextRow()
{
do { do {
$rowData = $this->getNextUTF8EncodedRow(); $rowData = $this->getNextUTF8EncodedRow();
$hasNowReachedEndOfFile = $this->globalFunctionsHelper->feof($this->filePointer); } while ($this->shouldReadNextRow($rowData));
} while (($rowData === false && !$hasNowReachedEndOfFile) || $this->isEmptyLine($rowData));
if ($rowData !== false) { if ($rowData !== false) {
$this->rowDataBuffer = $rowData; // str_replace will replace NULL values by empty strings
$this->rowDataBuffer = str_replace(null, null, $rowData);
$this->numReadRows++; $this->numReadRows++;
} else { } else {
// If we reach this point, it means end of file was reached. // If we reach this point, it means end of file was reached.
// This happens when the last lines are empty lines. // This happens when the last lines are empty lines.
$this->hasReachedEndOfFile = $hasNowReachedEndOfFile; $this->hasReachedEndOfFile = true;
} }
} }
/**
 * Decides whether the reading loop has to fetch another row instead of
 * using the row data that was just read.
 *
 * @param array|bool $currentRowData Data of the row that was just read, or FALSE if no data could be fetched
 * @return bool TRUE if another row must be read: either nothing could be fetched while the end of the file
 *              has not been reached yet, or the row is empty and empty rows are not preserved.
 *              FALSE if reading can stop with the current row data.
 */
protected function shouldReadNextRow($currentRowData)
{
    // Both checks are evaluated up front, in this order, regardless of the outcome.
    $endOfFileReached = $this->globalFunctionsHelper->feof($this->filePointer);
    $rowIsEmpty = $this->isEmptyLine($currentRowData);

    // A failed read before the end of the file means there may still be data to fetch.
    $fetchFailedBeforeEndOfFile = ($currentRowData === false && !$endOfFileReached);

    // Empty rows only trigger another read when they are not supposed to be preserved.
    $emptyRowMustBeSkipped = (!$this->shouldPreserveEmptyRows && $rowIsEmpty);

    return ($fetchFailedBeforeEndOfFile || $emptyRowMustBeSkipped);
}
/** /**
* Returns the next row, converted if necessary to UTF-8. * Returns the next row, converted if necessary to UTF-8.
* As fgetcsv() does not manage correctly encoding for non UTF-8 data, * As fgetcsv() does not manage correctly encoding for non UTF-8 data,
@ -154,7 +182,7 @@ class RowIterator implements IteratorInterface
protected function getNextUTF8EncodedRow() protected function getNextUTF8EncodedRow()
{ {
$encodedRowData = $this->globalFunctionsHelper->fgetcsv($this->filePointer, self::MAX_READ_BYTES_PER_LINE, $this->fieldDelimiter, $this->fieldEnclosure); $encodedRowData = $this->globalFunctionsHelper->fgetcsv($this->filePointer, self::MAX_READ_BYTES_PER_LINE, $this->fieldDelimiter, $this->fieldEnclosure);
if (false === $encodedRowData) { if ($encodedRowData === false) {
return false; return false;
} }
@ -195,7 +223,7 @@ class RowIterator implements IteratorInterface
} }
/** /**
* @param array $lineData Array containing the cells value for the line * @param array|bool $lineData Array containing the cells value for the line
* @return bool Whether the given line is empty * @return bool Whether the given line is empty
*/ */
protected function isEmptyLine($lineData) protected function isEmptyLine($lineData)

View File

@ -18,12 +18,21 @@ class Sheet implements SheetInterface
* @param resource $filePointer Pointer to the CSV file to read * @param resource $filePointer Pointer to the CSV file to read
* @param string $fieldDelimiter Character that delimits fields * @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields * @param string $fieldEnclosure Character that enclose fields
* @param string $endOfLineCharacter Character defining the end of a line
* @param string $encoding Encoding of the CSV file to be read * @param string $encoding Encoding of the CSV file to be read
* @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/ */
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $endOfLineCharacter, $globalFunctionsHelper) public function __construct(
$filePointer, $fieldDelimiter, $fieldEnclosure,
$endOfLineCharacter, $encoding, $shouldPreserveEmptyRows,
$globalFunctionsHelper)
{ {
$this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $endOfLineCharacter, $globalFunctionsHelper); $this->rowIterator = new RowIterator(
$filePointer, $fieldDelimiter, $fieldEnclosure,
$endOfLineCharacter, $encoding, $shouldPreserveEmptyRows,
$globalFunctionsHelper
);
} }
/** /**

View File

@ -22,12 +22,21 @@ class SheetIterator implements IteratorInterface
* @param resource $filePointer * @param resource $filePointer
* @param string $fieldDelimiter Character that delimits fields * @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields * @param string $fieldEnclosure Character that enclose fields
* @param string $endOfLineCharacter Character defining the end of a line
* @param string $encoding Encoding of the CSV file to be read * @param string $encoding Encoding of the CSV file to be read
* @param bool $shouldPreserveEmptyRows Whether empty rows should be returned or skipped
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/ */
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $endOfLineCharacter, $globalFunctionsHelper) public function __construct(
$filePointer, $fieldDelimiter, $fieldEnclosure,
$endOfLineCharacter, $encoding, $shouldPreserveEmptyRows,
$globalFunctionsHelper)
{ {
$this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $endOfLineCharacter, $globalFunctionsHelper); $this->sheet = new Sheet(
$filePointer, $fieldDelimiter, $fieldEnclosure,
$endOfLineCharacter, $encoding, $shouldPreserveEmptyRows,
$globalFunctionsHelper
);
} }
/** /**

View File

@ -115,29 +115,40 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
} }
/** /**
* @return array * @return void
*/ */
public function dataProviderForTestReadShouldSkipEmptyLines() public function testReadShouldSkipEmptyLinesIfShouldPreserveEmptyRowsNotSet()
{ {
return [ $allRows = $this->getAllRowsForFile('csv_with_multiple_empty_lines.csv');
['csv_with_empty_line.csv'],
['csv_with_empty_last_line.csv'], $expectedRows = [
// skipped row here
['csv--21', 'csv--22', 'csv--23'],
// skipped row here
['csv--41', 'csv--42', 'csv--43'],
// skipped row here
// last row empty
]; ];
$this->assertEquals($expectedRows, $allRows);
} }
/** /**
* @dataProvider dataProviderForTestReadShouldSkipEmptyLines
*
* @param string $fileName
* @return void * @return void
*/ */
public function testReadShouldSkipEmptyLines($fileName) public function testReadShouldReturnEmptyLinesIfShouldPreserveEmptyRowsSet()
{ {
$allRows = $this->getAllRowsForFile($fileName); $allRows = $this->getAllRowsForFile(
'csv_with_multiple_empty_lines.csv',
',', '"', "\n", EncodingHelper::ENCODING_UTF8,
$shouldPreserveEmptyRows = true
);
$expectedRows = [ $expectedRows = [
['csv--11', 'csv--12', 'csv--13'], [''],
['csv--31', 'csv--32', 'csv--33'], ['csv--21', 'csv--22', 'csv--23'],
[''],
['csv--41', 'csv--42', 'csv--43'],
[''],
]; ];
$this->assertEquals($expectedRows, $allRows); $this->assertEquals($expectedRows, $allRows);
} }
@ -204,6 +215,21 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
$this->assertEquals('This is, a comma', $allRows[0][0]); $this->assertEquals('This is, a comma', $allRows[0][0]);
} }
/**
 * Reads a CSV file whose lines end with a carriage return, passing "\r" as
 * the end of line character, and checks that all three rows are returned.
 *
 * @return void
 */
public function testReadCustomEOLs()
{
    $rowsReadFromFile = $this->getAllRowsForFile('csv_with_CR_EOL.csv', ',', '"', "\r");

    $this->assertEquals(
        [
            ['csv--11', 'csv--12', 'csv--13'],
            ['csv--21', 'csv--22', 'csv--23'],
            ['csv--31', 'csv--32', 'csv--33'],
        ],
        $rowsReadFromFile
    );
}
/** /**
* @return void * @return void
*/ */
@ -236,7 +262,7 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
*/ */
public function testReadShouldSkipBom($fileName, $fileEncoding) public function testReadShouldSkipBom($fileName, $fileEncoding)
{ {
$allRows = $this->getAllRowsForFile($fileName, ',', '"', $fileEncoding); $allRows = $this->getAllRowsForFile($fileName, ',', '"', "\n", $fileEncoding);
$expectedRows = [ $expectedRows = [
['csv--11', 'csv--12', 'csv--13'], ['csv--11', 'csv--12', 'csv--13'],
@ -275,6 +301,7 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
$allRows = []; $allRows = [];
$resourcePath = $this->getResourcePath($fileName); $resourcePath = $this->getResourcePath($fileName);
/** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper|\PHPUnit_Framework_MockObject_MockObject $helperStub */
$helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper') $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper')
->setMethods(['function_exists']) ->setMethods(['function_exists'])
->getMock(); ->getMock();
@ -405,14 +432,18 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
* @param string $fileName * @param string $fileName
* @param string|void $fieldDelimiter * @param string|void $fieldDelimiter
* @param string|void $fieldEnclosure * @param string|void $fieldEnclosure
* @param string|void $endOfLineCharacter
* @param string|void $encoding * @param string|void $encoding
* @param bool|void $shouldPreserveEmptyRows
* @return array All the read rows the given file * @return array All the read rows the given file
*/ */
private function getAllRowsForFile( private function getAllRowsForFile(
$fileName, $fileName,
$fieldDelimiter = ',', $fieldDelimiter = ',',
$fieldEnclosure = '"', $fieldEnclosure = '"',
$encoding = EncodingHelper::ENCODING_UTF8) $endOfLineCharacter = "\n",
$encoding = EncodingHelper::ENCODING_UTF8,
$shouldPreserveEmptyRows = false)
{ {
$allRows = []; $allRows = [];
$resourcePath = $this->getResourcePath($fileName); $resourcePath = $this->getResourcePath($fileName);
@ -422,7 +453,9 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
$reader $reader
->setFieldDelimiter($fieldDelimiter) ->setFieldDelimiter($fieldDelimiter)
->setFieldEnclosure($fieldEnclosure) ->setFieldEnclosure($fieldEnclosure)
->setEndOfLineCharacter($endOfLineCharacter)
->setEncoding($encoding) ->setEncoding($encoding)
->setShouldPreserveEmptyRows($shouldPreserveEmptyRows)
->open($resourcePath); ->open($resourcePath);
foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) { foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) {
@ -436,51 +469,6 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
return $allRows; return $allRows;
} }
/**
* @return array
*/
public function dataProviderForTestReadCustomEOL()
{
return [
['csv_with_CR_EOL.csv', "\r"],
['csv_standard.csv', "\n"],
];
}
/**
* @dataProvider dataProviderForTestReadCustomEOL
*
* @param string $fileName
* @param string $customEOL
* @return void
*/
public function testReadCustomEOLs($fileName, $customEOL)
{
$allRows = [];
$resourcePath = $this->getResourcePath($fileName);
/** @var \Box\Spout\Reader\CSV\Reader $reader */
$reader = ReaderFactory::create(Type::CSV);
$reader
->setEndOfLineCharacter($customEOL)
->open($resourcePath);
foreach ($reader->getSheetIterator() as $sheet) {
foreach ($sheet->getRowIterator() as $row) {
$allRows[] = $row;
}
}
$reader->close();
$expectedRows = [
['csv--11', 'csv--12', 'csv--13'],
['csv--21', 'csv--22', 'csv--23'],
['csv--31', 'csv--32', 'csv--33'],
];
$this->assertEquals($expectedRows, $allRows);
}
/** /**
* @return void * @return void
*/ */

View File

@ -1,2 +0,0 @@
csv--11,csv--12,csv--13
csv--31,csv--32,csv--33
1 csv--11 csv--12 csv--13
2 csv--31 csv--32 csv--33

View File

@ -1,3 +0,0 @@
csv--11,csv--12,csv--13
csv--31,csv--32,csv--33
1 csv--11 csv--12 csv--13
2 csv--31 csv--32 csv--33

View File

@ -0,0 +1,5 @@
csv--21,csv--22,csv--23
csv--41,csv--42,csv--43
1 csv--21 csv--22 csv--23
2 csv--41 csv--42 csv--43