diff --git a/src/Spout/Reader/CSV/Helper/EncodingHelper.php b/src/Spout/Reader/CSV/Helper/EncodingHelper.php new file mode 100644 index 0000000..1987548 --- /dev/null +++ b/src/Spout/Reader/CSV/Helper/EncodingHelper.php @@ -0,0 +1,92 @@ +globalFunctionsHelper = $globalFunctionsHelper; + + $this->supportedEncodingsWithBom = [ + self::ENCODING_UTF8 => self::BOM_UTF8, + self::ENCODING_UTF16_LE => self::BOM_UTF16_LE, + self::ENCODING_UTF16_BE => self::BOM_UTF16_BE, + self::ENCODING_UTF32_LE => self::BOM_UTF32_LE, + self::ENCODING_UTF32_BE => self::BOM_UTF32_BE, + ]; + } + + /** + * Returns the number of bytes to use as offset in order to skip the BOM. + * + * @param resource $filePointer Pointer to the file to check + * @param string $encoding Encoding of the file to check + * @return int Bytes offset to apply to skip the BOM (0 means no BOM) + */ + public function getBytesOffsetToSkipBOM($filePointer, $encoding) + { + $byteOffsetToSkipBom = 0; + + if ($this->hasBom($filePointer, $encoding)) { + $bomUsed = $this->supportedEncodingsWithBom[$encoding]; + + // we skip the N first bytes + $byteOffsetToSkipBom = strlen($bomUsed); + } + + return $byteOffsetToSkipBom; + } + + /** + * Returns whether the file identified by the given pointer has a BOM. + * + * @param resource $filePointer Pointer to the file to check + * @param string $encoding Encoding of the file to check + * @return bool TRUE if the file has a BOM, FALSE otherwise + */ + protected function hasBOM($filePointer, $encoding) + { + $hasBOM = false; + + $this->globalFunctionsHelper->rewind($filePointer); + + if (array_key_exists($encoding, $this->supportedEncodingsWithBom)) { + $potentialBom = $this->supportedEncodingsWithBom[$encoding]; + $numBytesInBom = strlen($potentialBom); + + $hasBOM = ($this->globalFunctionsHelper->fgets($filePointer, $numBytesInBom + 1) === $potentialBom); + } + + return $hasBOM; + } +} diff --git a/src/Spout/Reader/CSV/Reader.php b/src/Spout/Reader/CSV/Reader.php index 523eaca..10bb8d4 100644 --- a/src/Spout/Reader/CSV/Reader.php +++ b/src/Spout/Reader/CSV/Reader.php @@ -25,6 +25,9 @@ class Reader extends AbstractReader /** @var string Defines the character used to enclose fields (one character only) */ protected $fieldEnclosure = '"'; + /** @var string Encoding of the CSV file to be read */ + protected $encoding = 'UTF-8'; + /** * Sets the field delimiter for the CSV. * Needs to be called before opening the reader. @@ -51,10 +54,21 @@ class Reader extends AbstractReader return $this; } + /** + * Sets the encoding of the CSV file to be read. + * Needs to be called before opening the reader. + * + * @param string $encoding Encoding of the CSV file to be read + * @return Reader + */ + public function setEncoding($encoding) + { + $this->encoding = $encoding; + return $this; + } + /** * Opens the file at the given path to make it ready to be read. - * The file must be UTF-8 encoded. - * @TODO add encoding detection/conversion * * @param string $filePath Path of the CSV file to be read * @return void @@ -67,7 +81,13 @@ class Reader extends AbstractReader throw new IOException("Could not open file $filePath for reading."); } - $this->sheetIterator = new SheetIterator($this->filePointer, $this->fieldDelimiter, $this->fieldEnclosure, $this->globalFunctionsHelper); + $this->sheetIterator = new SheetIterator( + $this->filePointer, + $this->fieldDelimiter, + $this->fieldEnclosure, + $this->encoding, + $this->globalFunctionsHelper + ); } /** diff --git a/src/Spout/Reader/CSV/RowIterator.php b/src/Spout/Reader/CSV/RowIterator.php index ffb533f..d941ad1 100644 --- a/src/Spout/Reader/CSV/RowIterator.php +++ b/src/Spout/Reader/CSV/RowIterator.php @@ -2,6 +2,7 @@ namespace Box\Spout\Reader\CSV; +use Box\Spout\Reader\CSV\Helper\EncodingHelper; use Box\Spout\Reader\IteratorInterface; /** @@ -12,8 +13,6 @@ use Box\Spout\Reader\IteratorInterface; */ class RowIterator implements IteratorInterface { - const UTF8_BOM = "\xEF\xBB\xBF"; - /** @var resource Pointer to the CSV file to read */ protected $filePointer; @@ -27,10 +26,13 @@ class RowIterator implements IteratorInterface protected $hasReachedEndOfFile = false; /** @var string Defines the character used to delimit fields (one character only) */ - protected $fieldDelimiter = ','; + protected $fieldDelimiter; /** @var string Defines the character used to enclose fields (one character only) */ - protected $fieldEnclosure = '"'; + protected $fieldEnclosure; + + /** @var string Encoding of the CSV file to be read */ + protected $encoding; /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */ protected $globalFunctionsHelper; @@ -39,14 +41,18 @@ class RowIterator implements IteratorInterface * @param resource $filePointer Pointer to the CSV file to read * @param string $fieldDelimiter Character that delimits fields * @param string $fieldEnclosure Character that enclose fields + * @param string $encoding Encoding of the CSV file to be read * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper */ - public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) + public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper) { $this->filePointer = $filePointer; $this->fieldDelimiter = $fieldDelimiter; $this->fieldEnclosure = $fieldEnclosure; + $this->encoding = $encoding; $this->globalFunctionsHelper = $globalFunctionsHelper; + + $this->encodingHelper = new EncodingHelper($globalFunctionsHelper); } /** @@ -57,7 +63,7 @@ class RowIterator implements IteratorInterface */ public function rewind() { - $this->rewindAndSkipUtf8Bom(); + $this->rewindAndSkipBom(); $this->numReadRows = 0; $this->rowDataBuffer = null; @@ -66,24 +72,17 @@ class RowIterator implements IteratorInterface } /** - * This rewinds and skips the UTF-8 BOM if inserted at the beginning of the file + * This rewinds and skips the BOM if inserted at the beginning of the file * by moving the file pointer after it, so that it is not read. * * @return void */ - protected function rewindAndSkipUtf8Bom() + protected function rewindAndSkipBom() { - $this->globalFunctionsHelper->rewind($this->filePointer); + $byteOffsetToSkipBom = $this->encodingHelper->getBytesOffsetToSkipBOM($this->filePointer, $this->encoding); - $hasUtf8Bom = ($this->globalFunctionsHelper->fgets($this->filePointer, 4) === self::UTF8_BOM); - - if ($hasUtf8Bom) { - // we skip the 2 first bytes (so start from the 3rd byte) - $this->globalFunctionsHelper->fseek($this->filePointer, 3); - } else { - // if no BOM, reset the pointer to read from the beginning - $this->globalFunctionsHelper->fseek($this->filePointer, 0); - } + // sets the cursor after the BOM (0 means no BOM, so rewind it) + $this->globalFunctionsHelper->fseek($this->filePointer, $byteOffsetToSkipBom); } /** diff --git a/src/Spout/Reader/CSV/Sheet.php b/src/Spout/Reader/CSV/Sheet.php index 207fcae..fd8d214 100644 --- a/src/Spout/Reader/CSV/Sheet.php +++ b/src/Spout/Reader/CSV/Sheet.php @@ -18,11 +18,12 @@ class Sheet implements SheetInterface * @param resource $filePointer Pointer to the CSV file to read * @param string $fieldDelimiter Character that delimits fields * @param string $fieldEnclosure Character that enclose fields + * @param string $encoding Encoding of the CSV file to be read * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper */ - public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) + public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper) { - $this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper); + $this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper); } /** diff --git a/src/Spout/Reader/CSV/SheetIterator.php b/src/Spout/Reader/CSV/SheetIterator.php index f424cd8..7e7af38 100644 --- a/src/Spout/Reader/CSV/SheetIterator.php +++ b/src/Spout/Reader/CSV/SheetIterator.php @@ -22,11 +22,12 @@ class SheetIterator implements IteratorInterface * @param resource $filePointer * @param string $fieldDelimiter Character that delimits fields * @param string $fieldEnclosure Character that enclose fields + * @param string $encoding Encoding of the CSV file to be read * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper */ - public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) + public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper) { - $this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper); + $this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper); } /** diff --git a/src/Spout/Writer/CSV/Writer.php b/src/Spout/Writer/CSV/Writer.php index d37ad5b..57c554c 100644 --- a/src/Spout/Writer/CSV/Writer.php +++ b/src/Spout/Writer/CSV/Writer.php @@ -15,7 +15,7 @@ class Writer extends AbstractWriter { /** Number of rows to write before flushing */ const FLUSH_THRESHOLD = 500; - const UTF8_BOM = "\xEF\xBB\xBF"; + const BOM_UTF8 = "\xEF\xBB\xBF"; /** @var string Content-Type value for the header */ protected static $headerContentType = 'text/csv; charset=UTF-8'; @@ -61,7 +61,7 @@ class Writer extends AbstractWriter protected function openWriter() { // Adds UTF-8 BOM for Unicode compatibility - $this->globalFunctionsHelper->fputs($this->filePointer, self::UTF8_BOM); + $this->globalFunctionsHelper->fputs($this->filePointer, self::BOM_UTF8); } /** diff --git a/tests/Spout/Writer/CSV/WriterTest.php b/tests/Spout/Writer/CSV/WriterTest.php index 83e2e03..fd7a3bb 100644 --- a/tests/Spout/Writer/CSV/WriterTest.php +++ b/tests/Spout/Writer/CSV/WriterTest.php @@ -70,7 +70,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase ]; $writtenContent = $this->writeToCsvFileAndReturnWrittenContent($allRows, 'csv_with_utf8_bom.csv'); - $this->assertContains(Writer::UTF8_BOM, $writtenContent, 'The CSV file should contain a UTF-8 BOM'); + $this->assertContains(Writer::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM'); } /** @@ -162,6 +162,6 @@ class WriterTest extends \PHPUnit_Framework_TestCase private function trimWrittenContent($writtenContent) { // remove line feeds and UTF-8 BOM - return trim($writtenContent, PHP_EOL . Writer::UTF8_BOM); + return trim($writtenContent, PHP_EOL . Writer::BOM_UTF8); } }