Support for multiple BOMs depending on the selected encoding

This commit is contained in:
Adrien Loison 2015-07-25 14:06:02 -07:00
parent 03d1917080
commit d946f12951
7 changed files with 142 additions and 29 deletions

View File

@ -0,0 +1,92 @@
<?php
namespace Box\Spout\Reader\CSV\Helper;
/**
 * Class EncodingHelper
 * This class provides helper functions to work with encodings: it knows the
 * Byte Order Mark (BOM) associated with each supported Unicode encoding and
 * can tell how many bytes must be skipped at the start of a file to avoid
 * reading the BOM as data.
 *
 * @package Box\Spout\Reader\CSV\Helper
 */
class EncodingHelper
{
    /** Definition of the encodings that can have a BOM */
    const ENCODING_UTF8     = 'UTF-8';
    const ENCODING_UTF16_LE = 'UTF-16LE';
    const ENCODING_UTF16_BE = 'UTF-16BE';
    const ENCODING_UTF32_LE = 'UTF-32LE';
    const ENCODING_UTF32_BE = 'UTF-32BE';

    /** Definition of the BOMs for the different encodings */
    const BOM_UTF8     = "\xEF\xBB\xBF";
    const BOM_UTF16_LE = "\xFF\xFE";
    const BOM_UTF16_BE = "\xFE\xFF";
    const BOM_UTF32_LE = "\xFF\xFE\x00\x00";
    const BOM_UTF32_BE = "\x00\x00\xFE\xFF";

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var array Map representing the encodings supporting BOMs (key, upper-case) and their associated BOM (value) */
    protected $supportedEncodingsWithBom;

    /**
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     */
    public function __construct($globalFunctionsHelper)
    {
        $this->globalFunctionsHelper = $globalFunctionsHelper;

        // All keys are upper-case; lookups go through getBomForEncoding(),
        // which upper-cases the requested encoding before matching.
        $this->supportedEncodingsWithBom = [
            self::ENCODING_UTF8     => self::BOM_UTF8,
            self::ENCODING_UTF16_LE => self::BOM_UTF16_LE,
            self::ENCODING_UTF16_BE => self::BOM_UTF16_BE,
            self::ENCODING_UTF32_LE => self::BOM_UTF32_LE,
            self::ENCODING_UTF32_BE => self::BOM_UTF32_BE,
        ];
    }

    /**
     * Returns the number of bytes to use as offset in order to skip the BOM.
     * The file pointer is rewound and read from as part of the check, so its
     * position is not preserved: callers are expected to seek afterwards.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check (matched case-insensitively, e.g. "utf-8" or "UTF-8")
     * @return int Bytes offset to apply to skip the BOM (0 means no BOM)
     */
    public function getBytesOffsetToSkipBOM($filePointer, $encoding)
    {
        $byteOffsetToSkipBom = 0;

        if ($this->hasBOM($filePointer, $encoding)) {
            $bomUsed = $this->getBomForEncoding($encoding);

            // we skip the N first bytes, N being the length of the BOM.
            // The null guard only matters if a subclass overrides hasBOM().
            if ($bomUsed !== null) {
                $byteOffsetToSkipBom = strlen($bomUsed);
            }
        }

        return $byteOffsetToSkipBom;
    }

    /**
     * Returns whether the file identified by the given pointer has a BOM.
     * The pointer is always rewound first, even when the encoding has no
     * associated BOM.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check (matched case-insensitively)
     * @return bool TRUE if the file has a BOM, FALSE otherwise
     */
    protected function hasBOM($filePointer, $encoding)
    {
        $this->globalFunctionsHelper->rewind($filePointer);

        $potentialBom = $this->getBomForEncoding($encoding);
        if ($potentialBom === null) {
            // encoding without a known BOM: nothing to detect
            return false;
        }

        $numBytesInBom = strlen($potentialBom);

        // Presumably the helper forwards to PHP's native fgets(), which reads
        // at most (length - 1) bytes - hence the "+ 1" to get the whole BOM.
        // A failed/empty read yields a non-string value, which never strictly
        // equals the BOM.
        $firstBytes = $this->globalFunctionsHelper->fgets($filePointer, $numBytesInBom + 1);

        return ($firstBytes === $potentialBom);
    }

    /**
     * Returns the BOM associated with the given encoding, if any.
     * Encoding names are compared case-insensitively so that "utf-8" and
     * "UTF-8" are treated as the same encoding.
     *
     * @param string $encoding Encoding to look up
     * @return string|null The BOM for the encoding, or NULL if the encoding has no known BOM
     */
    protected function getBomForEncoding($encoding)
    {
        $normalizedEncoding = strtoupper((string) $encoding);

        return isset($this->supportedEncodingsWithBom[$normalizedEncoding])
            ? $this->supportedEncodingsWithBom[$normalizedEncoding]
            : null;
    }
}

View File

@ -25,6 +25,9 @@ class Reader extends AbstractReader
/** @var string Defines the character used to enclose fields (one character only) */ /** @var string Defines the character used to enclose fields (one character only) */
protected $fieldEnclosure = '"'; protected $fieldEnclosure = '"';
/** @var string Encoding of the CSV file to be read */
protected $encoding = 'UTF-8';
/** /**
* Sets the field delimiter for the CSV. * Sets the field delimiter for the CSV.
* Needs to be called before opening the reader. * Needs to be called before opening the reader.
@ -51,10 +54,21 @@ class Reader extends AbstractReader
return $this; return $this;
} }
/**
* Sets the encoding of the CSV file to be read.
* Needs to be called before opening the reader.
*
* @param string $encoding Encoding of the CSV file to be read
* @return Reader
*/
public function setEncoding($encoding)
{
$this->encoding = $encoding;
return $this;
}
/** /**
* Opens the file at the given path to make it ready to be read. * Opens the file at the given path to make it ready to be read.
* The file must be UTF-8 encoded.
* @TODO add encoding detection/conversion
* *
* @param string $filePath Path of the CSV file to be read * @param string $filePath Path of the CSV file to be read
* @return void * @return void
@ -67,7 +81,13 @@ class Reader extends AbstractReader
throw new IOException("Could not open file $filePath for reading."); throw new IOException("Could not open file $filePath for reading.");
} }
$this->sheetIterator = new SheetIterator($this->filePointer, $this->fieldDelimiter, $this->fieldEnclosure, $this->globalFunctionsHelper); $this->sheetIterator = new SheetIterator(
$this->filePointer,
$this->fieldDelimiter,
$this->fieldEnclosure,
$this->encoding,
$this->globalFunctionsHelper
);
} }
/** /**

View File

@ -2,6 +2,7 @@
namespace Box\Spout\Reader\CSV; namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\CSV\Helper\EncodingHelper;
use Box\Spout\Reader\IteratorInterface; use Box\Spout\Reader\IteratorInterface;
/** /**
@ -12,8 +13,6 @@ use Box\Spout\Reader\IteratorInterface;
*/ */
class RowIterator implements IteratorInterface class RowIterator implements IteratorInterface
{ {
const UTF8_BOM = "\xEF\xBB\xBF";
/** @var resource Pointer to the CSV file to read */ /** @var resource Pointer to the CSV file to read */
protected $filePointer; protected $filePointer;
@ -27,10 +26,13 @@ class RowIterator implements IteratorInterface
protected $hasReachedEndOfFile = false; protected $hasReachedEndOfFile = false;
/** @var string Defines the character used to delimit fields (one character only) */ /** @var string Defines the character used to delimit fields (one character only) */
protected $fieldDelimiter = ','; protected $fieldDelimiter;
/** @var string Defines the character used to enclose fields (one character only) */ /** @var string Defines the character used to enclose fields (one character only) */
protected $fieldEnclosure = '"'; protected $fieldEnclosure;
/** @var string Encoding of the CSV file to be read */
protected $encoding;
/** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */ /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
protected $globalFunctionsHelper; protected $globalFunctionsHelper;
@ -39,14 +41,18 @@ class RowIterator implements IteratorInterface
* @param resource $filePointer Pointer to the CSV file to read * @param resource $filePointer Pointer to the CSV file to read
* @param string $fieldDelimiter Character that delimits fields * @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields * @param string $fieldEnclosure Character that enclose fields
* @param string $encoding Encoding of the CSV file to be read
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/ */
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
{ {
$this->filePointer = $filePointer; $this->filePointer = $filePointer;
$this->fieldDelimiter = $fieldDelimiter; $this->fieldDelimiter = $fieldDelimiter;
$this->fieldEnclosure = $fieldEnclosure; $this->fieldEnclosure = $fieldEnclosure;
$this->encoding = $encoding;
$this->globalFunctionsHelper = $globalFunctionsHelper; $this->globalFunctionsHelper = $globalFunctionsHelper;
$this->encodingHelper = new EncodingHelper($globalFunctionsHelper);
} }
/** /**
@ -57,7 +63,7 @@ class RowIterator implements IteratorInterface
*/ */
public function rewind() public function rewind()
{ {
$this->rewindAndSkipUtf8Bom(); $this->rewindAndSkipBom();
$this->numReadRows = 0; $this->numReadRows = 0;
$this->rowDataBuffer = null; $this->rowDataBuffer = null;
@ -66,24 +72,17 @@ class RowIterator implements IteratorInterface
} }
/** /**
* This rewinds and skips the UTF-8 BOM if inserted at the beginning of the file * This rewinds and skips the BOM if inserted at the beginning of the file
* by moving the file pointer after it, so that it is not read. * by moving the file pointer after it, so that it is not read.
* *
* @return void * @return void
*/ */
protected function rewindAndSkipUtf8Bom() protected function rewindAndSkipBom()
{ {
$this->globalFunctionsHelper->rewind($this->filePointer); $byteOffsetToSkipBom = $this->encodingHelper->getBytesOffsetToSkipBOM($this->filePointer, $this->encoding);
$hasUtf8Bom = ($this->globalFunctionsHelper->fgets($this->filePointer, 4) === self::UTF8_BOM); // sets the cursor after the BOM (0 means no BOM, so rewind it)
$this->globalFunctionsHelper->fseek($this->filePointer, $byteOffsetToSkipBom);
if ($hasUtf8Bom) {
// we skip the 2 first bytes (so start from the 3rd byte)
$this->globalFunctionsHelper->fseek($this->filePointer, 3);
} else {
// if no BOM, reset the pointer to read from the beginning
$this->globalFunctionsHelper->fseek($this->filePointer, 0);
}
} }
/** /**

View File

@ -18,11 +18,12 @@ class Sheet implements SheetInterface
* @param resource $filePointer Pointer to the CSV file to read * @param resource $filePointer Pointer to the CSV file to read
* @param string $fieldDelimiter Character that delimits fields * @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields * @param string $fieldEnclosure Character that enclose fields
* @param string $encoding Encoding of the CSV file to be read
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/ */
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
{ {
$this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper); $this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper);
} }
/** /**

View File

@ -22,11 +22,12 @@ class SheetIterator implements IteratorInterface
* @param resource $filePointer * @param resource $filePointer
* @param string $fieldDelimiter Character that delimits fields * @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields * @param string $fieldEnclosure Character that enclose fields
* @param string $encoding Encoding of the CSV file to be read
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/ */
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
{ {
$this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper); $this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper);
} }
/** /**

View File

@ -15,7 +15,7 @@ class Writer extends AbstractWriter
{ {
/** Number of rows to write before flushing */ /** Number of rows to write before flushing */
const FLUSH_THRESHOLD = 500; const FLUSH_THRESHOLD = 500;
const UTF8_BOM = "\xEF\xBB\xBF"; const BOM_UTF8 = "\xEF\xBB\xBF";
/** @var string Content-Type value for the header */ /** @var string Content-Type value for the header */
protected static $headerContentType = 'text/csv; charset=UTF-8'; protected static $headerContentType = 'text/csv; charset=UTF-8';
@ -61,7 +61,7 @@ class Writer extends AbstractWriter
protected function openWriter() protected function openWriter()
{ {
// Adds UTF-8 BOM for Unicode compatibility // Adds UTF-8 BOM for Unicode compatibility
$this->globalFunctionsHelper->fputs($this->filePointer, self::UTF8_BOM); $this->globalFunctionsHelper->fputs($this->filePointer, self::BOM_UTF8);
} }
/** /**

View File

@ -70,7 +70,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase
]; ];
$writtenContent = $this->writeToCsvFileAndReturnWrittenContent($allRows, 'csv_with_utf8_bom.csv'); $writtenContent = $this->writeToCsvFileAndReturnWrittenContent($allRows, 'csv_with_utf8_bom.csv');
$this->assertContains(Writer::UTF8_BOM, $writtenContent, 'The CSV file should contain a UTF-8 BOM'); $this->assertContains(Writer::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM');
} }
/** /**
@ -162,6 +162,6 @@ class WriterTest extends \PHPUnit_Framework_TestCase
private function trimWrittenContent($writtenContent) private function trimWrittenContent($writtenContent)
{ {
// remove line feeds and UTF-8 BOM // remove line feeds and UTF-8 BOM
return trim($writtenContent, PHP_EOL . Writer::UTF8_BOM); return trim($writtenContent, PHP_EOL . Writer::BOM_UTF8);
} }
} }