Fix line breaks on CSV reader
This commit is contained in:
parent
e321f30c3b
commit
d6e8fe4b54
@ -291,35 +291,6 @@ class GlobalFunctionsHelper
|
|||||||
return stream_get_wrappers();
|
return stream_get_wrappers();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Wrapper around global function stream_get_line()
|
|
||||||
* @see stream_get_line()
|
|
||||||
*
|
|
||||||
* @param resource $handle
|
|
||||||
* @param int $length
|
|
||||||
* @param string|void $ending
|
|
||||||
* @return string|bool
|
|
||||||
*/
|
|
||||||
public function stream_get_line($handle, $length, $ending = null)
|
|
||||||
{
|
|
||||||
return stream_get_line($handle, $length, $ending);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Wrapper around global function str_getcsv()
|
|
||||||
* @see str_getcsv()
|
|
||||||
*
|
|
||||||
* @param string $input
|
|
||||||
* @param string|void $delimiter
|
|
||||||
* @param string|void $enclosure
|
|
||||||
* @param string|void $escape
|
|
||||||
* @return array
|
|
||||||
*/
|
|
||||||
public function str_getcsv($input, $delimiter = null, $enclosure = null, $escape = null)
|
|
||||||
{
|
|
||||||
return str_getcsv($input, $delimiter, $enclosure, $escape);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wrapper around global function function_exists()
|
* Wrapper around global function function_exists()
|
||||||
* @see function_exists()
|
* @see function_exists()
|
||||||
|
@ -32,6 +32,9 @@ class Reader extends AbstractReader
|
|||||||
/** @var string Defines the End of line */
|
/** @var string Defines the End of line */
|
||||||
protected $endOfLineCharacter = "\n";
|
protected $endOfLineCharacter = "\n";
|
||||||
|
|
||||||
|
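/** @var string|false Value of the auto_detect_line_endings ini setting read before the reader overrides it, kept so it can be restored afterwards */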
|
||||||
|
protected $autoDetectLineEndings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the field delimiter for the CSV.
|
* Sets the field delimiter for the CSV.
|
||||||
* Needs to be called before opening the reader.
|
* Needs to be called before opening the reader.
|
||||||
@ -104,6 +107,9 @@ class Reader extends AbstractReader
|
|||||||
*/
|
*/
|
||||||
protected function openReader($filePath)
|
protected function openReader($filePath)
|
||||||
{
|
{
|
||||||
|
$this->autoDetectLineEndings = ini_get('auto_detect_line_endings');
|
||||||
|
ini_set('auto_detect_line_endings', '1');
|
||||||
|
|
||||||
$this->filePointer = $this->globalFunctionsHelper->fopen($filePath, 'r');
|
$this->filePointer = $this->globalFunctionsHelper->fopen($filePath, 'r');
|
||||||
if (!$this->filePointer) {
|
if (!$this->filePointer) {
|
||||||
throw new IOException("Could not open file $filePath for reading.");
|
throw new IOException("Could not open file $filePath for reading.");
|
||||||
@ -140,5 +146,7 @@ class Reader extends AbstractReader
|
|||||||
if ($this->filePointer) {
|
if ($this->filePointer) {
|
||||||
$this->globalFunctionsHelper->fclose($this->filePointer);
|
$this->globalFunctionsHelper->fclose($this->filePointer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ini_set('auto_detect_line_endings', $this->autoDetectLineEndings);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -14,7 +14,7 @@ use Box\Spout\Common\Helper\EncodingHelper;
|
|||||||
class RowIterator implements IteratorInterface
|
class RowIterator implements IteratorInterface
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* If no value is given to stream_get_line(), it defaults to 8192 (which may be too low).
|
* If no value is given to fgetcsv(), it defaults to 8192 (which may be too low).
|
||||||
* Alignment with other functions like fgets() is discussed here: https://bugs.php.net/bug.php?id=48421
|
* Alignment with other functions like fgets() is discussed here: https://bugs.php.net/bug.php?id=48421
|
||||||
*/
|
*/
|
||||||
const MAX_READ_BYTES_PER_LINE = 32768;
|
const MAX_READ_BYTES_PER_LINE = 32768;
|
||||||
@ -128,16 +128,12 @@ class RowIterator implements IteratorInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
do {
|
do {
|
||||||
$lineData = false;
|
$rowData = $this->getNextUTF8EncodedRow();
|
||||||
$utf8EncodedLineData = $this->getNextUTF8EncodedLine();
|
|
||||||
if ($utf8EncodedLineData !== false) {
|
|
||||||
$lineData = $this->globalFunctionsHelper->str_getcsv($utf8EncodedLineData, $this->fieldDelimiter, $this->fieldEnclosure);
|
|
||||||
}
|
|
||||||
$hasNowReachedEndOfFile = $this->globalFunctionsHelper->feof($this->filePointer);
|
$hasNowReachedEndOfFile = $this->globalFunctionsHelper->feof($this->filePointer);
|
||||||
} while (($lineData === false && !$hasNowReachedEndOfFile) || $this->isEmptyLine($lineData));
|
} while (($rowData === false && !$hasNowReachedEndOfFile) || $this->isEmptyLine($rowData));
|
||||||
|
|
||||||
if ($lineData !== false) {
|
if ($rowData !== false) {
|
||||||
$this->rowDataBuffer = $lineData;
|
$this->rowDataBuffer = $rowData;
|
||||||
$this->numReadRows++;
|
$this->numReadRows++;
|
||||||
} else {
|
} else {
|
||||||
// If we reach this point, it means end of file was reached.
|
// If we reach this point, it means end of file was reached.
|
||||||
@ -147,24 +143,39 @@ class RowIterator implements IteratorInterface
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the next line, converted if necessary to UTF-8.
|
* Returns the next row, converted if necessary to UTF-8.
|
||||||
* Neither fgets nor fgetcsv works with non UTF-8 data... so we need to do some things manually.
|
* As fgetcsv() does not correctly handle the encoding of non UTF-8 data,
|
||||||
|
* we manually remove whitespace with ltrim or rtrim (depending on the byte order)
|
||||||
*
|
*
|
||||||
* @return string|false The next line for the current file pointer, encoded in UTF-8 or FALSE if nothing to read
|
* @return array|false The row for the current file pointer, encoded in UTF-8 or FALSE if nothing to read
|
||||||
* @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8
|
* @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8
|
||||||
*/
|
*/
|
||||||
protected function getNextUTF8EncodedLine()
|
protected function getNextUTF8EncodedRow()
|
||||||
{
|
{
|
||||||
// Read until the EOL delimiter or EOF is reached. The delimiter's encoding needs to match the CSV's encoding.
|
$encodedRowData = fgetcsv($this->filePointer, self::MAX_READ_BYTES_PER_LINE, $this->fieldDelimiter, $this->fieldEnclosure);
|
||||||
$encodedEOLDelimiter = $this->getEncodedEOLDelimiter();
|
if (false === $encodedRowData) {
|
||||||
$encodedLineData = $this->globalFunctionsHelper->stream_get_line($this->filePointer, self::MAX_READ_BYTES_PER_LINE, $encodedEOLDelimiter);
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// If the line could have been read, it can be converted to UTF-8
|
foreach ($encodedRowData as $cellIndex => $cellValue) {
|
||||||
$utf8EncodedLineData = ($encodedLineData !== false) ?
|
switch($this->encoding) {
|
||||||
$this->encodingHelper->attemptConversionToUTF8($encodedLineData, $this->encoding) :
|
case EncodingHelper::ENCODING_UTF16_LE:
|
||||||
false;
|
case EncodingHelper::ENCODING_UTF32_LE:
|
||||||
|
// remove whitespace from the beginning of the string, as fgetcsv() adds extra whitespace when it tries to split non UTF-8 data
|
||||||
|
$cellValue = ltrim($cellValue);
|
||||||
|
break;
|
||||||
|
|
||||||
return $utf8EncodedLineData;
|
case EncodingHelper::ENCODING_UTF16_BE:
|
||||||
|
case EncodingHelper::ENCODING_UTF32_BE:
|
||||||
|
// remove whitespace from the end of the string, as fgetcsv() adds extra whitespace when it tries to split non UTF-8 data
|
||||||
|
$cellValue = rtrim($cellValue);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
$encodedRowData[$cellIndex] = $this->encodingHelper->attemptConversionToUTF8($cellValue, $this->encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
return $encodedRowData;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -204,6 +204,15 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
|
|||||||
$this->assertEquals('This is, a comma', $allRows[0][0]);
|
$this->assertEquals('This is, a comma', $allRows[0][0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
public function testReadShouldNotTruncateLineBreak()
|
||||||
|
{
|
||||||
|
$allRows = $this->getAllRowsForFile('csv_with_line_breaks.csv', ',');
|
||||||
|
$this->assertEquals("This is,\na comma", $allRows[0][0]);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return array
|
* @return array
|
||||||
*/
|
*/
|
||||||
|
2
tests/resources/csv/csv_with_line_breaks.csv
Normal file
2
tests/resources/csv/csv_with_line_breaks.csv
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
"This is,
|
||||||
|
a comma",csv--12
|
|
Loading…
x
Reference in New Issue
Block a user