diff --git a/README.md b/README.md index 5bf9ed1..039acef 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ For XLSX files, the number of rows per sheet is limited to 1,048,576 (see [Offic ### Configuring the CSV reader and writer -It is possible to configure the both the CSV reader and writer to specify the field separator as well as the field enclosure: +It is possible to configure both the CSV reader and writer to specify the field separator as well as the field enclosure: ```php use Box\Spout\Reader\ReaderFactory; use Box\Spout\Common\Type; @@ -136,6 +136,13 @@ $reader->setFieldDelimiter('|'); $reader->setFieldEnclosure('@'); ``` +Additionally, if you need to read non UTF-8 files, you can specify the encoding of your file this way: +```php +$reader->setEncoding('UTF-16LE'); +``` + +The writer always generates CSV files encoded in UTF-8, with a BOM. + ### Configuring the XLSX writer #### Strings storage diff --git a/composer.json b/composer.json index a08d6bf..7918254 100644 --- a/composer.json +++ b/composer.json @@ -21,6 +21,10 @@ "phpunit/phpunit": ">=3.7", "scrutinizer/ocular": "~1.1" }, + "suggest": { + "ext-iconv": "To handle non UTF-8 CSV files (if \"mbstring\" is not already installed or is too limited)", + "ext-mbstring": "To handle non UTF-8 CSV files (if \"iconv\" is not already installed)" + }, "autoload": { "psr-4": { "Box\\Spout\\": "src/Spout" diff --git a/src/Spout/Common/Exception/EncodingConversionException.php b/src/Spout/Common/Exception/EncodingConversionException.php new file mode 100644 index 0000000..ff5e243 --- /dev/null +++ b/src/Spout/Common/Exception/EncodingConversionException.php @@ -0,0 +1,12 @@ +globalFunctionsHelper = $globalFunctionsHelper; + + $this->supportedEncodingsWithBom = [ + self::ENCODING_UTF8 => self::BOM_UTF8, + self::ENCODING_UTF16_LE => self::BOM_UTF16_LE, + self::ENCODING_UTF16_BE => self::BOM_UTF16_BE, + self::ENCODING_UTF32_LE => self::BOM_UTF32_LE, + self::ENCODING_UTF32_BE => self::BOM_UTF32_BE, + ]; + } + 
+ /** + * Returns the number of bytes to use as offset in order to skip the BOM. + * + * @param resource $filePointer Pointer to the file to check + * @param string $encoding Encoding of the file to check + * @return int Bytes offset to apply to skip the BOM (0 means no BOM) + */ + public function getBytesOffsetToSkipBOM($filePointer, $encoding) + { + $byteOffsetToSkipBom = 0; + + if ($this->hasBom($filePointer, $encoding)) { + $bomUsed = $this->supportedEncodingsWithBom[$encoding]; + + // we skip the N first bytes + $byteOffsetToSkipBom = strlen($bomUsed); + } + + return $byteOffsetToSkipBom; + } + + /** + * Returns whether the file identified by the given pointer has a BOM. + * + * @param resource $filePointer Pointer to the file to check + * @param string $encoding Encoding of the file to check + * @return bool TRUE if the file has a BOM, FALSE otherwise + */ + protected function hasBOM($filePointer, $encoding) + { + $hasBOM = false; + + $this->globalFunctionsHelper->rewind($filePointer); + + if (array_key_exists($encoding, $this->supportedEncodingsWithBom)) { + $potentialBom = $this->supportedEncodingsWithBom[$encoding]; + $numBytesInBom = strlen($potentialBom); + + $hasBOM = ($this->globalFunctionsHelper->fgets($filePointer, $numBytesInBom + 1) === $potentialBom); + } + + return $hasBOM; + } + + /** + * Attempts to convert a non UTF-8 string into UTF-8. + * + * @param string $string Non UTF-8 string to be converted + * @param string $sourceEncoding The encoding used to encode the source string + * @return string The converted, UTF-8 string + * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed + */ + public function attemptConversionToUTF8($string, $sourceEncoding) + { + return $this->attemptConversion($string, $sourceEncoding, self::ENCODING_UTF8); + } + + /** + * Attempts to convert a UTF-8 string into the given encoding. 
+ * + * @param string $string UTF-8 string to be converted + * @param string $targetEncoding The encoding the string should be re-encoded into + * @return string The converted string, encoded with the given encoding + * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed + */ + public function attemptConversionFromUTF8($string, $targetEncoding) + { + return $this->attemptConversion($string, self::ENCODING_UTF8, $targetEncoding); + } + + /** + * Attempts to convert the given string to the given encoding. + * Depending on what is installed on the server, we will try to use iconv or mbstring. + * + * @param string $string string to be converted + * @param string $sourceEncoding The encoding used to encode the source string + * @param string $targetEncoding The encoding the string should be re-encoded into + * @return string The converted string, encoded with the given encoding + * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed + */ + protected function attemptConversion($string, $sourceEncoding, $targetEncoding) + { + // if source and target encodings are the same, it's a no-op + if ($sourceEncoding === $targetEncoding) { + return $string; + } + + $convertedString = null; + + if ($this->canUseIconv()) { + $convertedString = $this->globalFunctionsHelper->iconv($string, $sourceEncoding, $targetEncoding); + } else if ($this->canUseMbString()) { + $convertedString = $this->globalFunctionsHelper->mb_convert_encoding($string, $sourceEncoding, $targetEncoding); + } else { + throw new EncodingConversionException("The conversion from $sourceEncoding to $targetEncoding is not supported. 
Please install \"iconv\" or \"mbstring\"."); + } + + if ($convertedString === false) { + throw new EncodingConversionException("The conversion from $sourceEncoding to $targetEncoding failed."); + } + + return $convertedString; + } + + /** + * Returns whether "iconv" can be used. + * + * @return bool TRUE if "iconv" is available and can be used, FALSE otherwise + */ + protected function canUseIconv() + { + return $this->globalFunctionsHelper->function_exists('iconv'); + } + + /** + * Returns whether "mb_string" functions can be used. + * These functions come with the PHP "mbstring" extension. + * + * @return bool TRUE if "mb_string" functions are available and can be used, FALSE otherwise + */ + protected function canUseMbString() + { + return $this->globalFunctionsHelper->function_exists('mb_convert_encoding'); + } +} diff --git a/src/Spout/Common/Helper/GlobalFunctionsHelper.php b/src/Spout/Common/Helper/GlobalFunctionsHelper.php index 47ed052..7cd8de5 100644 --- a/src/Spout/Common/Helper/GlobalFunctionsHelper.php +++ b/src/Spout/Common/Helper/GlobalFunctionsHelper.php @@ -203,4 +203,73 @@ class GlobalFunctionsHelper { header($string); } + + /** + * Wrapper around global function iconv() + * @see iconv() + * + * @param string $string The string to be converted + * @param string $sourceEncoding The encoding of the source string + * @param string $targetEncoding The encoding the source string should be converted to + * @return string|bool the converted string or FALSE on failure. 
+ */ + public function iconv($string, $sourceEncoding, $targetEncoding) + { + return iconv($sourceEncoding, $targetEncoding, $string); + } + + /** + * Wrapper around global function mb_convert_encoding() + * @see mb_convert_encoding() + * + * @param string $string The string to be converted + * @param string $sourceEncoding The encoding of the source string + * @param string $targetEncoding The encoding the source string should be converted to + * @return string|bool the converted string or FALSE on failure. + */ + public function mb_convert_encoding($string, $sourceEncoding, $targetEncoding) + { + return mb_convert_encoding($string, $targetEncoding, $sourceEncoding); + } + + /** + * Wrapper around global function stream_get_line() + * @see stream_get_line() + * + * @param resource $handle + * @param int $length + * @param string|void $ending + * @return string|bool + */ + public function stream_get_line($handle, $length, $ending = null) + { + return stream_get_line($handle, $length, $ending); + } + + /** + * Wrapper around global function str_getcsv() + * @see str_getcsv() + * + * @param string $input + * @param string|void $delimiter + * @param string|void $enclosure + * @param string|void $escape + * @return array + */ + public function str_getcsv($input, $delimiter = null, $enclosure = null, $escape = null) + { + return str_getcsv($input, $delimiter, $enclosure, $escape); + } + + /** + * Wrapper around global function function_exists() + * @see function_exists() + * + * @param string $functionName + * @return bool + */ + public function function_exists($functionName) + { + return function_exists($functionName); + } } diff --git a/src/Spout/Reader/CSV/Reader.php b/src/Spout/Reader/CSV/Reader.php index 523eaca..45a13ef 100644 --- a/src/Spout/Reader/CSV/Reader.php +++ b/src/Spout/Reader/CSV/Reader.php @@ -4,6 +4,7 @@ namespace Box\Spout\Reader\CSV; use Box\Spout\Reader\AbstractReader; use Box\Spout\Common\Exception\IOException; +use 
Box\Spout\Common\Helper\EncodingHelper; /** * Class Reader @@ -25,6 +26,9 @@ class Reader extends AbstractReader /** @var string Defines the character used to enclose fields (one character only) */ protected $fieldEnclosure = '"'; + /** @var string Encoding of the CSV file to be read */ + protected $encoding = EncodingHelper::ENCODING_UTF8; + /** * Sets the field delimiter for the CSV. * Needs to be called before opening the reader. @@ -51,10 +55,22 @@ class Reader extends AbstractReader return $this; } + /** + * Sets the encoding of the CSV file to be read. + * Needs to be called before opening the reader. + * + * @param string $encoding Encoding of the CSV file to be read + * @return Reader + */ + public function setEncoding($encoding) + { + $this->encoding = $encoding; + return $this; + } + /** * Opens the file at the given path to make it ready to be read. - * The file must be UTF-8 encoded. - * @TODO add encoding detection/conversion + * If setEncoding() was not called, it assumes that the file is encoded in UTF-8. 
* * @param string $filePath Path of the CSV file to be read * @return void @@ -67,7 +83,13 @@ class Reader extends AbstractReader throw new IOException("Could not open file $filePath for reading."); } - $this->sheetIterator = new SheetIterator($this->filePointer, $this->fieldDelimiter, $this->fieldEnclosure, $this->globalFunctionsHelper); + $this->sheetIterator = new SheetIterator( + $this->filePointer, + $this->fieldDelimiter, + $this->fieldEnclosure, + $this->encoding, + $this->globalFunctionsHelper + ); } /** diff --git a/src/Spout/Reader/CSV/RowIterator.php b/src/Spout/Reader/CSV/RowIterator.php index ffb533f..3de1da7 100644 --- a/src/Spout/Reader/CSV/RowIterator.php +++ b/src/Spout/Reader/CSV/RowIterator.php @@ -3,6 +3,7 @@ namespace Box\Spout\Reader\CSV; use Box\Spout\Reader\IteratorInterface; +use Box\Spout\Common\Helper\EncodingHelper; /** * Class RowIterator @@ -12,8 +13,6 @@ use Box\Spout\Reader\IteratorInterface; */ class RowIterator implements IteratorInterface { - const UTF8_BOM = "\xEF\xBB\xBF"; - /** @var resource Pointer to the CSV file to read */ protected $filePointer; @@ -27,26 +26,36 @@ class RowIterator implements IteratorInterface protected $hasReachedEndOfFile = false; /** @var string Defines the character used to delimit fields (one character only) */ - protected $fieldDelimiter = ','; + protected $fieldDelimiter; /** @var string Defines the character used to enclose fields (one character only) */ - protected $fieldEnclosure = '"'; + protected $fieldEnclosure; + + /** @var string Encoding of the CSV file to be read */ + protected $encoding; /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */ protected $globalFunctionsHelper; + /** @var \Box\Spout\Common\Helper\EncodingHelper Helper to work with different encodings */ + protected $encodingHelper; + /** * @param resource $filePointer Pointer to the CSV file to read * @param string $fieldDelimiter Character that delimits fields * @param string 
$fieldEnclosure Character that enclose fields + * @param string $encoding Encoding of the CSV file to be read * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper */ - public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) + public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper) { $this->filePointer = $filePointer; $this->fieldDelimiter = $fieldDelimiter; $this->fieldEnclosure = $fieldEnclosure; + $this->encoding = $encoding; $this->globalFunctionsHelper = $globalFunctionsHelper; + + $this->encodingHelper = new EncodingHelper($globalFunctionsHelper); } /** @@ -57,7 +66,7 @@ class RowIterator implements IteratorInterface */ public function rewind() { - $this->rewindAndSkipUtf8Bom(); + $this->rewindAndSkipBom(); $this->numReadRows = 0; $this->rowDataBuffer = null; @@ -66,24 +75,17 @@ class RowIterator implements IteratorInterface } /** - * This rewinds and skips the UTF-8 BOM if inserted at the beginning of the file + * This rewinds and skips the BOM if inserted at the beginning of the file * by moving the file pointer after it, so that it is not read. 
* * @return void */ - protected function rewindAndSkipUtf8Bom() + protected function rewindAndSkipBom() { - $this->globalFunctionsHelper->rewind($this->filePointer); + $byteOffsetToSkipBom = $this->encodingHelper->getBytesOffsetToSkipBOM($this->filePointer, $this->encoding); - $hasUtf8Bom = ($this->globalFunctionsHelper->fgets($this->filePointer, 4) === self::UTF8_BOM); - - if ($hasUtf8Bom) { - // we skip the 2 first bytes (so start from the 3rd byte) - $this->globalFunctionsHelper->fseek($this->filePointer, 3); - } else { - // if no BOM, reset the pointer to read from the beginning - $this->globalFunctionsHelper->fseek($this->filePointer, 0); - } + // sets the cursor after the BOM (0 means no BOM, so rewind it) + $this->globalFunctionsHelper->fseek($this->filePointer, $byteOffsetToSkipBom); } /** @@ -102,6 +104,7 @@ class RowIterator implements IteratorInterface * @link http://php.net/manual/en/iterator.next.php * * @return void + * @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8 */ public function next() { @@ -110,7 +113,8 @@ class RowIterator implements IteratorInterface if (!$this->hasReachedEndOfFile) { do { - $lineData = $this->globalFunctionsHelper->fgetcsv($this->filePointer, 0, $this->fieldDelimiter, $this->fieldEnclosure); + $utf8EncodedLineData = $this->getNextUTF8EncodedLine(); + $lineData = $this->globalFunctionsHelper->str_getcsv($utf8EncodedLineData, $this->fieldDelimiter, $this->fieldEnclosure); } while ($lineData === false || ($lineData !== null && $this->isEmptyLine($lineData))); if ($lineData !== false && $lineData !== null) { @@ -120,6 +124,25 @@ class RowIterator implements IteratorInterface } } + /** + * Returns the next line, converted if necessary to UTF-8. + * Neither fgets nor fgetcsv works with non UTF-8 data... so we need to do some things manually. 
+ * + * @return string The next line for the current file pointer, encoded in UTF-8 + * @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8 + */ + protected function getNextUTF8EncodedLine() + { + // Read until the EOL delimiter or EOF is reached. The delimiter's encoding needs to match the CSV's encoding. + $encodedEOLDelimiter = $this->encodingHelper->attemptConversionFromUTF8("\n", $this->encoding); + $encodedLineData = $this->globalFunctionsHelper->stream_get_line($this->filePointer, 0, $encodedEOLDelimiter); + + // Once the line has been read, it can be converted to UTF-8 + $utf8EncodedLineData = $this->encodingHelper->attemptConversionToUTF8($encodedLineData, $this->encoding); + + return $utf8EncodedLineData; + } + /** * @param array $lineData Array containing the cells value for the line * @return bool Whether the given line is empty diff --git a/src/Spout/Reader/CSV/Sheet.php b/src/Spout/Reader/CSV/Sheet.php index 207fcae..fd8d214 100644 --- a/src/Spout/Reader/CSV/Sheet.php +++ b/src/Spout/Reader/CSV/Sheet.php @@ -18,11 +18,12 @@ class Sheet implements SheetInterface * @param resource $filePointer Pointer to the CSV file to read * @param string $fieldDelimiter Character that delimits fields * @param string $fieldEnclosure Character that enclose fields + * @param string $encoding Encoding of the CSV file to be read * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper */ - public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) + public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper) { - $this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper); + $this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper); } /** diff --git a/src/Spout/Reader/CSV/SheetIterator.php 
b/src/Spout/Reader/CSV/SheetIterator.php index f424cd8..7e7af38 100644 --- a/src/Spout/Reader/CSV/SheetIterator.php +++ b/src/Spout/Reader/CSV/SheetIterator.php @@ -22,11 +22,12 @@ class SheetIterator implements IteratorInterface * @param resource $filePointer * @param string $fieldDelimiter Character that delimits fields * @param string $fieldEnclosure Character that enclose fields + * @param string $encoding Encoding of the CSV file to be read * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper */ - public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper) + public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper) { - $this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper); + $this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper); } /** diff --git a/src/Spout/Writer/CSV/Writer.php b/src/Spout/Writer/CSV/Writer.php index d37ad5b..d008096 100644 --- a/src/Spout/Writer/CSV/Writer.php +++ b/src/Spout/Writer/CSV/Writer.php @@ -4,6 +4,7 @@ namespace Box\Spout\Writer\CSV; use Box\Spout\Writer\AbstractWriter; use Box\Spout\Common\Exception\IOException; +use Box\Spout\Common\Helper\EncodingHelper; /** * Class Writer @@ -15,7 +16,6 @@ class Writer extends AbstractWriter { /** Number of rows to write before flushing */ const FLUSH_THRESHOLD = 500; - const UTF8_BOM = "\xEF\xBB\xBF"; /** @var string Content-Type value for the header */ protected static $headerContentType = 'text/csv; charset=UTF-8'; @@ -61,7 +61,7 @@ class Writer extends AbstractWriter protected function openWriter() { // Adds UTF-8 BOM for Unicode compatibility - $this->globalFunctionsHelper->fputs($this->filePointer, self::UTF8_BOM); + $this->globalFunctionsHelper->fputs($this->filePointer, EncodingHelper::BOM_UTF8); } /** diff --git a/tests/Spout/Common/Helper/EncodingHelperTest.php 
b/tests/Spout/Common/Helper/EncodingHelperTest.php new file mode 100644 index 0000000..d142d4e --- /dev/null +++ b/tests/Spout/Common/Helper/EncodingHelperTest.php @@ -0,0 +1,223 @@ +getResourcePath($fileName); + $filePointer = fopen($resourcePath, 'r'); + + $encodingHelper = new EncodingHelper(new GlobalFunctionsHelper()); + $bytesOffset = $encodingHelper->getBytesOffsetToSkipBOM($filePointer, $encoding); + + $this->assertEquals($expectedBytesOffset, $bytesOffset); + } + + /** + * @return array + */ + public function dataProviderForIconvOrMbstringUsage() + { + return [ + [$shouldUseIconv = true], + [$shouldNotUseIconv = false], + ]; + } + + /** + * @dataProvider dataProviderForIconvOrMbstringUsage + * @expectedException \Box\Spout\Common\Exception\EncodingConversionException + * + * @param bool $shouldUseIconv + * @return void + */ + public function testAttemptConversionToUTF8ShouldThrowIfConversionFailed($shouldUseIconv) + { + $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper') + ->setMethods(['iconv', 'mb_convert_encoding']) + ->getMock(); + $helperStub->method('iconv')->willReturn(false); + $helperStub->method('mb_convert_encoding')->willReturn(false); + + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->setConstructorArgs([$helperStub]) + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv); + $encodingHelperStub->method('canUseMbString')->willReturn(true); + + $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + } + + /** + * @expectedException \Box\Spout\Common\Exception\EncodingConversionException + * + * @return void + */ + public function testAttemptConversionToUTF8ShouldThrowIfConversionNotSupported() + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = 
$this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->disableOriginalConstructor() + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn(false); + $encodingHelperStub->method('canUseMbString')->willReturn(false); + + $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + } + + /** + * @dataProvider dataProviderForIconvOrMbstringUsage + * + * @param bool $shouldUseIconv + * @return void + */ + public function testAttemptConversionToUTF8ShouldReturnReencodedString($shouldUseIconv) + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->setConstructorArgs([new GlobalFunctionsHelper()]) + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv); + $encodingHelperStub->method('canUseMbString')->willReturn(true); + + $encodedString = iconv(EncodingHelper::ENCODING_UTF8, EncodingHelper::ENCODING_UTF16_LE, 'input'); + $decodedString = $encodingHelperStub->attemptConversionToUTF8($encodedString, EncodingHelper::ENCODING_UTF16_LE); + + $this->assertEquals('input', $decodedString); + } + + /** + * @return void + */ + public function testAttemptConversionToUTF8ShouldBeNoopWhenTargetIsUTF8() + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->disableOriginalConstructor() + ->setMethods(['canUseIconv']) + ->getMock(); + $encodingHelperStub->expects($this->never())->method('canUseIconv'); + + $decodedString = $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF8); + $this->assertEquals('input', $decodedString); + } + + /** + * @dataProvider dataProviderForIconvOrMbstringUsage + * @expectedException \Box\Spout\Common\Exception\EncodingConversionException + * + * @param bool 
$shouldUseIconv + * @return void + */ + public function testAttemptConversionFromUTF8ShouldThrowIfConversionFailed($shouldUseIconv) + { + $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper') + ->setMethods(['iconv', 'mb_convert_encoding']) + ->getMock(); + $helperStub->method('iconv')->willReturn(false); + $helperStub->method('mb_convert_encoding')->willReturn(false); + + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->setConstructorArgs([$helperStub]) + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv); + $encodingHelperStub->method('canUseMbString')->willReturn(true); + + $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + } + + /** + * @expectedException \Box\Spout\Common\Exception\EncodingConversionException + * + * @return void + */ + public function testAttemptConversionFromUTF8ShouldThrowIfConversionNotSupported() + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->disableOriginalConstructor() + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn(false); + $encodingHelperStub->method('canUseMbString')->willReturn(false); + + $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + } + + /** + * @dataProvider dataProviderForIconvOrMbstringUsage + * + * @param bool $shouldUseIconv + * @return void + */ + public function testAttemptConversionFromUTF8ShouldReturnReencodedString($shouldUseIconv) + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->setConstructorArgs([new GlobalFunctionsHelper()]) + ->setMethods(['canUseIconv', 
'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv); + $encodingHelperStub->method('canUseMbString')->willReturn(true); + + $encodedString = $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + $encodedStringWithIconv = iconv(EncodingHelper::ENCODING_UTF8, EncodingHelper::ENCODING_UTF16_LE, 'input'); + + $this->assertEquals($encodedStringWithIconv, $encodedString); + } + + /** + * @return void + */ + public function testAttemptConversionFromUTF8ShouldBeNoopWhenTargetIsUTF8() + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->disableOriginalConstructor() + ->setMethods(['canUseIconv']) + ->getMock(); + $encodingHelperStub->expects($this->never())->method('canUseIconv'); + + $encodedString = $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF8); + $this->assertEquals('input', $encodedString); + } +} diff --git a/tests/Spout/Reader/CSV/ReaderTest.php b/tests/Spout/Reader/CSV/ReaderTest.php index 932633f..180441e 100644 --- a/tests/Spout/Reader/CSV/ReaderTest.php +++ b/tests/Spout/Reader/CSV/ReaderTest.php @@ -2,8 +2,9 @@ namespace Box\Spout\Reader\CSV; -use Box\Spout\Common\Type; use Box\Spout\Reader\ReaderFactory; +use Box\Spout\Common\Type; +use Box\Spout\Common\Helper\EncodingHelper; use Box\Spout\TestUsingResource; /** @@ -167,15 +168,96 @@ class ReaderTest extends \PHPUnit_Framework_TestCase } /** + * @return array + */ + public function dataProviderForTestReadShouldSkipBom() + { + return [ + ['csv_with_utf8_bom.csv', EncodingHelper::ENCODING_UTF8], + ['csv_with_utf16le_bom.csv', EncodingHelper::ENCODING_UTF16_LE], + ['csv_with_utf16be_bom.csv', EncodingHelper::ENCODING_UTF16_BE], + ['csv_with_utf32le_bom.csv', EncodingHelper::ENCODING_UTF32_LE], + ['csv_with_utf32be_bom.csv', EncodingHelper::ENCODING_UTF32_BE], + ]; + } + + /** + * 
@dataProvider dataProviderForTestReadShouldSkipBom + * + * @param string $fileName + * @param string $fileEncoding * @return void */ - public function testReadShouldSkipUtf8Bom() + public function testReadShouldSkipBom($fileName, $fileEncoding) { - $allRows = $this->getAllRowsForFile('csv_with_utf8_bom.csv'); + $allRows = $this->getAllRowsForFile($fileName, ',', '"', $fileEncoding); $expectedRows = [ ['csv--11', 'csv--12', 'csv--13'], ['csv--21', 'csv--22', 'csv--23'], + ['csv--31', 'csv--32', 'csv--33'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return array + */ + public function dataProviderForTestReadShouldSupportNonUTF8FilesWithoutBOMs() + { + $shouldUseIconv = true; + $shouldNotUseIconv = false; + + return [ + ['csv_with_encoding_utf16le_no_bom.csv', EncodingHelper::ENCODING_UTF16_LE, $shouldUseIconv], + ['csv_with_encoding_utf16le_no_bom.csv', EncodingHelper::ENCODING_UTF16_LE, $shouldNotUseIconv], + ['csv_with_encoding_cp1252.csv', 'CP1252', $shouldUseIconv], + ['csv_with_encoding_cp1252.csv', 'CP1252', $shouldNotUseIconv], + ]; + } + + /** + * @dataProvider dataProviderForTestReadShouldSupportNonUTF8FilesWithoutBOMs + * + * @param string $fileName + * @param string $fileEncoding + * @param bool $shouldUseIconv + * @return void + */ + public function testReadShouldSupportNonUTF8FilesWithoutBOMs($fileName, $fileEncoding, $shouldUseIconv) + { + $allRows = []; + $resourcePath = $this->getResourcePath($fileName); + + $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper') + ->setMethods(['function_exists']) + ->getMock(); + + $returnValueMap = [ + ['iconv', $shouldUseIconv], + ['mb_convert_encoding', true], + ]; + $helperStub->method('function_exists')->will($this->returnValueMap($returnValueMap)); + + /** @var \Box\Spout\Reader\CSV\Reader $reader */ + $reader = ReaderFactory::create(Type::CSV); + $reader + ->setGlobalFunctionsHelper($helperStub) + ->setEncoding($fileEncoding) + ->open($resourcePath); + 
+ foreach ($reader->getSheetIterator() as $sheet) { + foreach ($sheet->getRowIterator() as $row) { + $allRows[] = $row; + } + } + + $reader->close(); + + $expectedRows = [ + ['csv--11', 'csv--12', 'csv--13'], + ['csv--21', 'csv--22', 'csv--23'], + ['csv--31', 'csv--32', 'csv--33'], ]; $this->assertEquals($expectedRows, $allRows); } @@ -228,18 +310,25 @@ class ReaderTest extends \PHPUnit_Framework_TestCase * @param string $fileName * @param string|void $fieldDelimiter * @param string|void $fieldEnclosure + * @param string|void $encoding * @return array All the read rows the given file */ - private function getAllRowsForFile($fileName, $fieldDelimiter = ",", $fieldEnclosure = '"') + private function getAllRowsForFile( + $fileName, + $fieldDelimiter = ',', + $fieldEnclosure = '"', + $encoding = EncodingHelper::ENCODING_UTF8) { $allRows = []; $resourcePath = $this->getResourcePath($fileName); + /** @var \Box\Spout\Reader\CSV\Reader $reader */ $reader = ReaderFactory::create(Type::CSV); - $reader->setFieldDelimiter($fieldDelimiter); - $reader->setFieldEnclosure($fieldEnclosure); - - $reader->open($resourcePath); + $reader + ->setFieldDelimiter($fieldDelimiter) + ->setFieldEnclosure($fieldEnclosure) + ->setEncoding($encoding) + ->open($resourcePath); foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) { foreach ($sheet->getRowIterator() as $rowIndex => $row) { diff --git a/tests/Spout/Writer/CSV/WriterTest.php b/tests/Spout/Writer/CSV/WriterTest.php index 83e2e03..fce430f 100644 --- a/tests/Spout/Writer/CSV/WriterTest.php +++ b/tests/Spout/Writer/CSV/WriterTest.php @@ -2,8 +2,9 @@ namespace Box\Spout\Writer\CSV; -use Box\Spout\Common\Type; use Box\Spout\TestUsingResource; +use Box\Spout\Common\Type; +use Box\Spout\Common\Helper\EncodingHelper; use Box\Spout\Writer\WriterFactory; /** @@ -70,7 +71,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase ]; $writtenContent = $this->writeToCsvFileAndReturnWrittenContent($allRows, 'csv_with_utf8_bom.csv'); - 
$this->assertContains(Writer::UTF8_BOM, $writtenContent, 'The CSV file should contain a UTF-8 BOM'); + $this->assertContains(EncodingHelper::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM'); } /** @@ -162,6 +163,6 @@ class WriterTest extends \PHPUnit_Framework_TestCase private function trimWrittenContent($writtenContent) { // remove line feeds and UTF-8 BOM - return trim($writtenContent, PHP_EOL . Writer::UTF8_BOM); + return trim($writtenContent, PHP_EOL . EncodingHelper::BOM_UTF8); } } diff --git a/tests/resources/csv/csv_with_encoding_cp1252.csv b/tests/resources/csv/csv_with_encoding_cp1252.csv new file mode 100644 index 0000000..552edd3 --- /dev/null +++ b/tests/resources/csv/csv_with_encoding_cp1252.csv @@ -0,0 +1,3 @@ +csv--11,csv--12,csv--13 +csv--21,csv--22,csv--23 +csv--31,csv--32,csv--33 \ No newline at end of file diff --git a/tests/resources/csv/csv_with_encoding_utf16le_no_bom.csv b/tests/resources/csv/csv_with_encoding_utf16le_no_bom.csv new file mode 100644 index 0000000..4e0a014 Binary files /dev/null and b/tests/resources/csv/csv_with_encoding_utf16le_no_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf16be_bom.csv b/tests/resources/csv/csv_with_utf16be_bom.csv new file mode 100644 index 0000000..6c33ecf Binary files /dev/null and b/tests/resources/csv/csv_with_utf16be_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf16le_bom.csv b/tests/resources/csv/csv_with_utf16le_bom.csv new file mode 100644 index 0000000..e881bfa Binary files /dev/null and b/tests/resources/csv/csv_with_utf16le_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf32be_bom.csv b/tests/resources/csv/csv_with_utf32be_bom.csv new file mode 100644 index 0000000..3a14956 Binary files /dev/null and b/tests/resources/csv/csv_with_utf32be_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf32le_bom.csv b/tests/resources/csv/csv_with_utf32le_bom.csv new file mode 100644 index 0000000..dd67f85 Binary files /dev/null and 
b/tests/resources/csv/csv_with_utf32le_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf8_bom.csv b/tests/resources/csv/csv_with_utf8_bom.csv index fd7a3c4..216a017 100644 --- a/tests/resources/csv/csv_with_utf8_bom.csv +++ b/tests/resources/csv/csv_with_utf8_bom.csv @@ -1,2 +1,3 @@ csv--11,csv--12,csv--13 -csv--21,csv--22,csv--23 \ No newline at end of file +csv--21,csv--22,csv--23 +csv--31,csv--32,csv--33 \ No newline at end of file