Support for multiple BOMs depending on the selected encoding
This commit is contained in:
parent
03d1917080
commit
d946f12951
92
src/Spout/Reader/CSV/Helper/EncodingHelper.php
Normal file
92
src/Spout/Reader/CSV/Helper/EncodingHelper.php
Normal file
@ -0,0 +1,92 @@
|
||||
<?php
|
||||
|
||||
namespace Box\Spout\Reader\CSV\Helper;
|
||||
|
||||
/**
 * Class EncodingHelper
 * This class provides helper functions to work with encodings, and more
 * specifically with the Byte Order Mark (BOM) that may prefix a file.
 *
 * @package Box\Spout\Reader\CSV\Helper
 */
class EncodingHelper
{
    /** Definition of the encodings that can have a BOM */
    const ENCODING_UTF8     = 'UTF-8';
    const ENCODING_UTF16_LE = 'UTF-16LE';
    const ENCODING_UTF16_BE = 'UTF-16BE';
    const ENCODING_UTF32_LE = 'UTF-32LE';
    const ENCODING_UTF32_BE = 'UTF-32BE';

    /** Definition of the BOMs for the different encodings */
    const BOM_UTF8     = "\xEF\xBB\xBF";
    const BOM_UTF16_LE = "\xFF\xFE";
    const BOM_UTF16_BE = "\xFE\xFF";
    const BOM_UTF32_LE = "\xFF\xFE\x00\x00";
    const BOM_UTF32_BE = "\x00\x00\xFE\xFF";

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var array Map representing the encodings supporting BOMs (key) and their associated BOM (value) */
    protected $supportedEncodingsWithBom;

    /**
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     */
    public function __construct($globalFunctionsHelper)
    {
        $this->globalFunctionsHelper = $globalFunctionsHelper;

        $this->supportedEncodingsWithBom = [
            self::ENCODING_UTF8     => self::BOM_UTF8,
            self::ENCODING_UTF16_LE => self::BOM_UTF16_LE,
            self::ENCODING_UTF16_BE => self::BOM_UTF16_BE,
            self::ENCODING_UTF32_LE => self::BOM_UTF32_LE,
            self::ENCODING_UTF32_BE => self::BOM_UTF32_BE,
        ];
    }

    /**
     * Returns the number of bytes to use as offset in order to skip the BOM.
     *
     * The file pointer is rewound and possibly advanced while checking for the BOM,
     * so the caller is expected to reposition it afterwards (typically by seeking
     * to the returned offset).
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check (matched case-insensitively)
     * @return int Bytes offset to apply to skip the BOM (0 means no BOM)
     */
    public function getBytesOffsetToSkipBOM($filePointer, $encoding)
    {
        if (!$this->hasBOM($filePointer, $encoding)) {
            return 0;
        }

        $bomUsed = $this->getBomForEncoding($encoding);

        // we skip the N first bytes, N being the length of the BOM
        return ($bomUsed !== null) ? strlen($bomUsed) : 0;
    }

    /**
     * Returns whether the file identified by the given pointer has a BOM.
     * The file pointer is always rewound first, even when the encoding has no known BOM.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check (matched case-insensitively)
     * @return bool TRUE if the file has a BOM, FALSE otherwise
     */
    protected function hasBOM($filePointer, $encoding)
    {
        $this->globalFunctionsHelper->rewind($filePointer);

        $potentialBom = $this->getBomForEncoding($encoding);
        if ($potentialBom === null) {
            return false;
        }

        // NOTE(review): the helper is presumably a thin wrapper around PHP's fgets(),
        // which reads at most (length - 1) bytes, hence the "+ 1" - confirm.
        $numBytesInBom = strlen($potentialBom);

        return ($this->globalFunctionsHelper->fgets($filePointer, $numBytesInBom + 1) === $potentialBom);
    }

    /**
     * Returns the BOM associated with the given encoding, if any.
     * The exact encoding name is tried first; if it is unknown, its upper-cased
     * form is tried, so that "utf-8" is handled the same way as "UTF-8".
     *
     * @param string $encoding Name of the encoding
     * @return string|null The BOM for the encoding or NULL if the encoding has no known BOM
     */
    protected function getBomForEncoding($encoding)
    {
        $encodingName = (string) $encoding;

        if (isset($this->supportedEncodingsWithBom[$encodingName])) {
            return $this->supportedEncodingsWithBom[$encodingName];
        }

        $normalizedEncodingName = strtoupper($encodingName);
        if (isset($this->supportedEncodingsWithBom[$normalizedEncodingName])) {
            return $this->supportedEncodingsWithBom[$normalizedEncodingName];
        }

        return null;
    }
}
|
@ -25,6 +25,9 @@ class Reader extends AbstractReader
|
||||
/** @var string Defines the character used to enclose fields (one character only) */
|
||||
protected $fieldEnclosure = '"';
|
||||
|
||||
/** @var string Encoding of the CSV file to be read */
|
||||
protected $encoding = 'UTF-8';
|
||||
|
||||
/**
|
||||
* Sets the field delimiter for the CSV.
|
||||
* Needs to be called before opening the reader.
|
||||
@ -51,10 +54,21 @@ class Reader extends AbstractReader
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
 * Defines the encoding the CSV file to be read is expected to use.
 * Needs to be called before opening the reader.
 *
 * @param string $encoding Encoding of the CSV file to be read
 * @return Reader The current reader, so that calls can be chained
 */
public function setEncoding($encoding)
{
    // stored on the reader; it is handed over to the sheet iterator when the reader is opened
    $this->encoding = $encoding;

    return $this;
}
|
||||
|
||||
/**
|
||||
* Opens the file at the given path to make it ready to be read.
|
||||
* The file must be UTF-8 encoded.
|
||||
* @TODO add encoding detection/conversion
|
||||
*
|
||||
* @param string $filePath Path of the CSV file to be read
|
||||
* @return void
|
||||
@ -67,7 +81,13 @@ class Reader extends AbstractReader
|
||||
throw new IOException("Could not open file $filePath for reading.");
|
||||
}
|
||||
|
||||
$this->sheetIterator = new SheetIterator($this->filePointer, $this->fieldDelimiter, $this->fieldEnclosure, $this->globalFunctionsHelper);
|
||||
$this->sheetIterator = new SheetIterator(
|
||||
$this->filePointer,
|
||||
$this->fieldDelimiter,
|
||||
$this->fieldEnclosure,
|
||||
$this->encoding,
|
||||
$this->globalFunctionsHelper
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
namespace Box\Spout\Reader\CSV;
|
||||
|
||||
use Box\Spout\Reader\CSV\Helper\EncodingHelper;
|
||||
use Box\Spout\Reader\IteratorInterface;
|
||||
|
||||
/**
|
||||
@ -12,8 +13,6 @@ use Box\Spout\Reader\IteratorInterface;
|
||||
*/
|
||||
class RowIterator implements IteratorInterface
|
||||
{
|
||||
const UTF8_BOM = "\xEF\xBB\xBF";
|
||||
|
||||
/** @var resource Pointer to the CSV file to read */
|
||||
protected $filePointer;
|
||||
|
||||
@ -27,10 +26,13 @@ class RowIterator implements IteratorInterface
|
||||
protected $hasReachedEndOfFile = false;
|
||||
|
||||
/** @var string Defines the character used to delimit fields (one character only) */
|
||||
protected $fieldDelimiter = ',';
|
||||
protected $fieldDelimiter;
|
||||
|
||||
/** @var string Defines the character used to enclose fields (one character only) */
|
||||
protected $fieldEnclosure = '"';
|
||||
protected $fieldEnclosure;
|
||||
|
||||
/** @var string Encoding of the CSV file to be read */
|
||||
protected $encoding;
|
||||
|
||||
/** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
|
||||
protected $globalFunctionsHelper;
|
||||
@ -39,14 +41,18 @@ class RowIterator implements IteratorInterface
|
||||
* @param resource $filePointer Pointer to the CSV file to read
|
||||
* @param string $fieldDelimiter Character that delimits fields
|
||||
* @param string $fieldEnclosure Character that encloses fields
|
||||
* @param string $encoding Encoding of the CSV file to be read
|
||||
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
|
||||
*/
|
||||
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
|
||||
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
|
||||
{
|
||||
$this->filePointer = $filePointer;
|
||||
$this->fieldDelimiter = $fieldDelimiter;
|
||||
$this->fieldEnclosure = $fieldEnclosure;
|
||||
$this->encoding = $encoding;
|
||||
$this->globalFunctionsHelper = $globalFunctionsHelper;
|
||||
|
||||
$this->encodingHelper = new EncodingHelper($globalFunctionsHelper);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -57,7 +63,7 @@ class RowIterator implements IteratorInterface
|
||||
*/
|
||||
public function rewind()
|
||||
{
|
||||
$this->rewindAndSkipUtf8Bom();
|
||||
$this->rewindAndSkipBom();
|
||||
|
||||
$this->numReadRows = 0;
|
||||
$this->rowDataBuffer = null;
|
||||
@ -66,24 +72,17 @@ class RowIterator implements IteratorInterface
|
||||
}
|
||||
|
||||
/**
|
||||
* This rewinds and skips the UTF-8 BOM if inserted at the beginning of the file
|
||||
* This rewinds and skips the BOM if inserted at the beginning of the file
|
||||
* by moving the file pointer after it, so that it is not read.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
protected function rewindAndSkipUtf8Bom()
|
||||
protected function rewindAndSkipBom()
|
||||
{
|
||||
$this->globalFunctionsHelper->rewind($this->filePointer);
|
||||
$byteOffsetToSkipBom = $this->encodingHelper->getBytesOffsetToSkipBOM($this->filePointer, $this->encoding);
|
||||
|
||||
$hasUtf8Bom = ($this->globalFunctionsHelper->fgets($this->filePointer, 4) === self::UTF8_BOM);
|
||||
|
||||
if ($hasUtf8Bom) {
|
||||
// we skip the 2 first bytes (so start from the 3rd byte)
|
||||
$this->globalFunctionsHelper->fseek($this->filePointer, 3);
|
||||
} else {
|
||||
// if no BOM, reset the pointer to read from the beginning
|
||||
$this->globalFunctionsHelper->fseek($this->filePointer, 0);
|
||||
}
|
||||
// sets the cursor after the BOM (0 means no BOM, so rewind it)
|
||||
$this->globalFunctionsHelper->fseek($this->filePointer, $byteOffsetToSkipBom);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -18,11 +18,12 @@ class Sheet implements SheetInterface
|
||||
* @param resource $filePointer Pointer to the CSV file to read
|
||||
* @param string $fieldDelimiter Character that delimits fields
|
||||
* @param string $fieldEnclosure Character that encloses fields
|
||||
* @param string $encoding Encoding of the CSV file to be read
|
||||
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
|
||||
*/
|
||||
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
|
||||
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
|
||||
{
|
||||
$this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper);
|
||||
$this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -22,11 +22,12 @@ class SheetIterator implements IteratorInterface
|
||||
* @param resource $filePointer
|
||||
* @param string $fieldDelimiter Character that delimits fields
|
||||
* @param string $fieldEnclosure Character that encloses fields
|
||||
* @param string $encoding Encoding of the CSV file to be read
|
||||
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
|
||||
*/
|
||||
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
|
||||
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
|
||||
{
|
||||
$this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper);
|
||||
$this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -15,7 +15,7 @@ class Writer extends AbstractWriter
|
||||
{
|
||||
/** Number of rows to write before flushing */
|
||||
const FLUSH_THRESHOLD = 500;
|
||||
const UTF8_BOM = "\xEF\xBB\xBF";
|
||||
const BOM_UTF8 = "\xEF\xBB\xBF";
|
||||
|
||||
/** @var string Content-Type value for the header */
|
||||
protected static $headerContentType = 'text/csv; charset=UTF-8';
|
||||
@ -61,7 +61,7 @@ class Writer extends AbstractWriter
|
||||
protected function openWriter()
|
||||
{
|
||||
// Adds UTF-8 BOM for Unicode compatibility
|
||||
$this->globalFunctionsHelper->fputs($this->filePointer, self::UTF8_BOM);
|
||||
$this->globalFunctionsHelper->fputs($this->filePointer, self::BOM_UTF8);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -70,7 +70,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase
|
||||
];
|
||||
$writtenContent = $this->writeToCsvFileAndReturnWrittenContent($allRows, 'csv_with_utf8_bom.csv');
|
||||
|
||||
$this->assertContains(Writer::UTF8_BOM, $writtenContent, 'The CSV file should contain a UTF-8 BOM');
|
||||
$this->assertContains(Writer::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM');
|
||||
}
|
||||
|
||||
/**
|
||||
@ -162,6 +162,6 @@ class WriterTest extends \PHPUnit_Framework_TestCase
|
||||
private function trimWrittenContent($writtenContent)
|
||||
{
|
||||
// remove line feeds and UTF-8 BOM
|
||||
return trim($writtenContent, PHP_EOL . Writer::UTF8_BOM);
|
||||
return trim($writtenContent, PHP_EOL . Writer::BOM_UTF8);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user