diff --git a/composer.json b/composer.json index a08d6bf..7918254 100644 --- a/composer.json +++ b/composer.json @@ -21,6 +21,10 @@ "phpunit/phpunit": ">=3.7", "scrutinizer/ocular": "~1.1" }, + "suggest": { + "ext-iconv": "To handle non UTF-8 CSV files (if \"php-intl\" is not already installed or is too limited)", + "ext-intl": "To handle non UTF-8 CSV files (if \"iconv\" is not already installed)" + }, "autoload": { "psr-4": { "Box\\Spout\\": "src/Spout" diff --git a/src/Spout/Common/Exception/EncodingConversionException.php b/src/Spout/Common/Exception/EncodingConversionException.php new file mode 100644 index 0000000..ff5e243 --- /dev/null +++ b/src/Spout/Common/Exception/EncodingConversionException.php @@ -0,0 +1,12 @@ +globalFunctionsHelper = $globalFunctionsHelper; + + $this->supportedEncodingsWithBom = [ + self::ENCODING_UTF8 => self::BOM_UTF8, + self::ENCODING_UTF16_LE => self::BOM_UTF16_LE, + self::ENCODING_UTF16_BE => self::BOM_UTF16_BE, + self::ENCODING_UTF32_LE => self::BOM_UTF32_LE, + self::ENCODING_UTF32_BE => self::BOM_UTF32_BE, + ]; + } + + /** + * Returns the number of bytes to use as offset in order to skip the BOM. + * + * @param resource $filePointer Pointer to the file to check + * @param string $encoding Encoding of the file to check + * @return int Bytes offset to apply to skip the BOM (0 means no BOM) + */ + public function getBytesOffsetToSkipBOM($filePointer, $encoding) + { + $byteOffsetToSkipBom = 0; + + if ($this->hasBom($filePointer, $encoding)) { + $bomUsed = $this->supportedEncodingsWithBom[$encoding]; + + // we skip the N first bytes + $byteOffsetToSkipBom = strlen($bomUsed); + } + + return $byteOffsetToSkipBom; + } + + /** + * Returns whether the file identified by the given pointer has a BOM. + * + * @param resource $filePointer Pointer to the file to check + * @param string $encoding Encoding of the file to check + * @return bool TRUE if the file has a BOM, FALSE otherwise + */ + protected function hasBOM($filePointer, $encoding) + { + $hasBOM = false; + + $this->globalFunctionsHelper->rewind($filePointer); + + if (array_key_exists($encoding, $this->supportedEncodingsWithBom)) { + $potentialBom = $this->supportedEncodingsWithBom[$encoding]; + $numBytesInBom = strlen($potentialBom); + + $hasBOM = ($this->globalFunctionsHelper->fgets($filePointer, $numBytesInBom + 1) === $potentialBom); + } + + return $hasBOM; + } + + /** + * Attempts to convert a non UTF-8 string into UTF-8. + * + * @param string $string Non UTF-8 string to be converted + * @param string $sourceEncoding The encoding used to encode the source string + * @return string The converted, UTF-8 string + * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed + */ + public function attemptConversionToUTF8($string, $sourceEncoding) + { + return $this->attemptConversion($string, $sourceEncoding, self::ENCODING_UTF8); + } + + /** + * Attempts to convert a UTF-8 string into the given encoding. + * + * @param string $string UTF-8 string to be converted + * @param string $targetEncoding The encoding the string should be re-encoded into + * @return string The converted string, encoded with the given encoding + * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed + */ + public function attemptConversionFromUTF8($string, $targetEncoding) + { + return $this->attemptConversion($string, self::ENCODING_UTF8, $targetEncoding); + } + + /** + * Attempts to convert the given string to the given encoding. + * Depending on what is installed on the server, we will try to iconv or mbstring. + * + * @param string $string string to be converted + * @param string $sourceEncoding The encoding used to encode the source string + * @param string $targetEncoding The encoding the string should be re-encoded into + * @return string The converted string, encoded with the given encoding + * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed + */ + protected function attemptConversion($string, $sourceEncoding, $targetEncoding) + { + // if source and target encodings are the same, it's a no-op + if ($sourceEncoding === $targetEncoding) { + return $string; + } + + $convertedString = null; + + if ($this->canUseIconv()) { + $convertedString = $this->globalFunctionsHelper->iconv($string, $sourceEncoding, $targetEncoding); + } else if ($this->canUseMbString()) { + $convertedString = $this->globalFunctionsHelper->mb_convert_encoding($string, $sourceEncoding, $targetEncoding); + } else { + throw new EncodingConversionException("The conversion from $sourceEncoding to $targetEncoding is not supported. Please install \"iconv\" or \"PHP Intl\"."); + } + + if ($convertedString === false) { + throw new EncodingConversionException("The conversion from $sourceEncoding to $targetEncoding failed."); + } + + return $convertedString; + } + + /** + * Returns whether "iconv" can be used. + * + * @return bool TRUE if "iconv" is available and can be used, FALSE otherwise + */ + protected function canUseIconv() + { + return $this->globalFunctionsHelper->function_exists('iconv'); + } + + /** + * Returns whether "mb_string" functions can be used. + * These functions come with the PHP Intl package. + * + * @return bool TRUE if "mb_string" functions are available and can be used, FALSE otherwise + */ + protected function canUseMbString() + { + return $this->globalFunctionsHelper->function_exists('mb_convert_encoding'); + } +} diff --git a/src/Spout/Common/Helper/GlobalFunctionsHelper.php b/src/Spout/Common/Helper/GlobalFunctionsHelper.php index 47ed052..7cd8de5 100644 --- a/src/Spout/Common/Helper/GlobalFunctionsHelper.php +++ b/src/Spout/Common/Helper/GlobalFunctionsHelper.php @@ -203,4 +203,73 @@ class GlobalFunctionsHelper { header($string); } + + /** + * Wrapper around global function iconv() + * @see iconv() + * + * @param string $string The string to be converted + * @param string $sourceEncoding The encoding of the source string + * @param string $targetEncoding The encoding the source string should be converted to + * @return string|bool the converted string or FALSE on failure. + */ + public function iconv($string, $sourceEncoding, $targetEncoding) + { + return iconv($sourceEncoding, $targetEncoding, $string); + } + + /** + * Wrapper around global function mb_convert_encoding() + * @see mb_convert_encoding() + * + * @param string $string The string to be converted + * @param string $sourceEncoding The encoding of the source string + * @param string $targetEncoding The encoding the source string should be converted to + * @return string|bool the converted string or FALSE on failure. + */ + public function mb_convert_encoding($string, $sourceEncoding, $targetEncoding) + { + return mb_convert_encoding($string, $targetEncoding, $sourceEncoding); + } + + /** + * Wrapper around global function stream_get_line() + * @see stream_get_line() + * + * @param resource $handle + * @param int $length + * @param string|void $ending + * @return string|bool + */ + public function stream_get_line($handle, $length, $ending = null) + { + return stream_get_line($handle, $length, $ending); + } + + /** + * Wrapper around global function str_getcsv() + * @see str_getcsv() + * + * @param string $input + * @param string|void $delimiter + * @param string|void $enclosure + * @param string|void $escape + * @return array + */ + public function str_getcsv($input, $delimiter = null, $enclosure = null, $escape = null) + { + return str_getcsv($input, $delimiter, $enclosure, $escape); + } + + /** + * Wrapper around global function function_exists() + * @see function_exists() + * + * @param string $functionName + * @return bool + */ + public function function_exists($functionName) + { + return function_exists($functionName); + } } diff --git a/src/Spout/Reader/CSV/Helper/EncodingHelper.php b/src/Spout/Reader/CSV/Helper/EncodingHelper.php deleted file mode 100644 index 1987548..0000000 --- a/src/Spout/Reader/CSV/Helper/EncodingHelper.php +++ /dev/null @@ -1,92 +0,0 @@ -globalFunctionsHelper = $globalFunctionsHelper; - - $this->supportedEncodingsWithBom = [ - self::ENCODING_UTF8 => self::BOM_UTF8, - self::ENCODING_UTF16_LE => self::BOM_UTF16_LE, - self::ENCODING_UTF16_BE => self::BOM_UTF16_BE, - self::ENCODING_UTF32_LE => self::BOM_UTF32_LE, - self::ENCODING_UTF32_BE => self::BOM_UTF32_BE, - ]; - } - - /** - * Returns the number of bytes to use as offset in order to skip the BOM. - * - * @param resource $filePointer Pointer to the file to check - * @param string $encoding Encoding of the file to check - * @return int Bytes offset to apply to skip the BOM (0 means no BOM) - */ - public function getBytesOffsetToSkipBOM($filePointer, $encoding) - { - $byteOffsetToSkipBom = 0; - - if ($this->hasBom($filePointer, $encoding)) { - $bomUsed = $this->supportedEncodingsWithBom[$encoding]; - - // we skip the N first bytes - $byteOffsetToSkipBom = strlen($bomUsed); - } - - return $byteOffsetToSkipBom; - } - - /** - * Returns whether the file identified by the given pointer has a BOM. - * - * @param resource $filePointer Pointer to the file to check - * @param string $encoding Encoding of the file to check - * @return bool TRUE if the file has a BOM, FALSE otherwise - */ - protected function hasBOM($filePointer, $encoding) - { - $hasBOM = false; - - $this->globalFunctionsHelper->rewind($filePointer); - - if (array_key_exists($encoding, $this->supportedEncodingsWithBom)) { - $potentialBom = $this->supportedEncodingsWithBom[$encoding]; - $numBytesInBom = strlen($potentialBom); - - $hasBOM = ($this->globalFunctionsHelper->fgets($filePointer, $numBytesInBom + 1) === $potentialBom); - } - - return $hasBOM; - } -} diff --git a/src/Spout/Reader/CSV/Reader.php b/src/Spout/Reader/CSV/Reader.php index 10bb8d4..45a13ef 100644 --- a/src/Spout/Reader/CSV/Reader.php +++ b/src/Spout/Reader/CSV/Reader.php @@ -4,6 +4,7 @@ namespace Box\Spout\Reader\CSV; use Box\Spout\Reader\AbstractReader; use Box\Spout\Common\Exception\IOException; +use Box\Spout\Common\Helper\EncodingHelper; /** * Class Reader @@ -26,7 +27,7 @@ class Reader extends AbstractReader protected $fieldEnclosure = '"'; /** @var string Encoding of the CSV file to be read */ - protected $encoding = 'UTF-8'; + protected $encoding = EncodingHelper::ENCODING_UTF8; /** * Sets the field delimiter for the CSV. @@ -69,6 +70,7 @@ class Reader extends AbstractReader /** * Opens the file at the given path to make it ready to be read. + * If setEncoding() was not called, it assumes that the file is encoded in UTF-8. * * @param string $filePath Path of the CSV file to be read * @return void diff --git a/src/Spout/Reader/CSV/RowIterator.php b/src/Spout/Reader/CSV/RowIterator.php index d941ad1..3de1da7 100644 --- a/src/Spout/Reader/CSV/RowIterator.php +++ b/src/Spout/Reader/CSV/RowIterator.php @@ -2,8 +2,8 @@ namespace Box\Spout\Reader\CSV; -use Box\Spout\Reader\CSV\Helper\EncodingHelper; use Box\Spout\Reader\IteratorInterface; +use Box\Spout\Common\Helper\EncodingHelper; /** * Class RowIterator @@ -37,6 +37,9 @@ class RowIterator implements IteratorInterface /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */ protected $globalFunctionsHelper; + /** @var \Box\Spout\Common\Helper\EncodingHelper Helper to work with different encodings */ + protected $encodingHelper; + /** * @param resource $filePointer Pointer to the CSV file to read * @param string $fieldDelimiter Character that delimits fields @@ -101,6 +104,7 @@ class RowIterator implements IteratorInterface * @link http://php.net/manual/en/iterator.next.php * * @return void + * @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8 */ public function next() { @@ -109,7 +113,8 @@ class RowIterator implements IteratorInterface if (!$this->hasReachedEndOfFile) { do { - $lineData = $this->globalFunctionsHelper->fgetcsv($this->filePointer, 0, $this->fieldDelimiter, $this->fieldEnclosure); + $utf8EncodedLineData = $this->getNextUTF8EncodedLine(); + $lineData = $this->globalFunctionsHelper->str_getcsv($utf8EncodedLineData, $this->fieldDelimiter, $this->fieldEnclosure); } while ($lineData === false || ($lineData !== null && $this->isEmptyLine($lineData))); if ($lineData !== false && $lineData !== null) { @@ -119,6 +124,25 @@ class RowIterator implements IteratorInterface } } + /** + * Returns the next line, converted if necessary to UTF-8. + * Neither fgets nor fgetcsv don't work with non UTF-8 data... so we need to do some things manually. + * + * @return string The next line for the current file pointer, encoded in UTF-8 + * @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8 + */ + protected function getNextUTF8EncodedLine() + { + // Read until the EOL delimiter or EOF is reached. The delimiter's encoding needs to match the CSV's encoding. + $encodedEOLDelimiter = $this->encodingHelper->attemptConversionFromUTF8("\n", $this->encoding); + $encodedLineData = $this->globalFunctionsHelper->stream_get_line($this->filePointer, 0, $encodedEOLDelimiter); + + // Once the line has been read, it can be converted to UTF-8 + $utf8EncodedLineData = $this->encodingHelper->attemptConversionToUTF8($encodedLineData, $this->encoding); + + return $utf8EncodedLineData; + } + /** * @param array $lineData Array containing the cells value for the line * @return bool Whether the given line is empty diff --git a/src/Spout/Writer/CSV/Writer.php b/src/Spout/Writer/CSV/Writer.php index 57c554c..d008096 100644 --- a/src/Spout/Writer/CSV/Writer.php +++ b/src/Spout/Writer/CSV/Writer.php @@ -4,6 +4,7 @@ namespace Box\Spout\Writer\CSV; use Box\Spout\Writer\AbstractWriter; use Box\Spout\Common\Exception\IOException; +use Box\Spout\Common\Helper\EncodingHelper; /** * Class Writer @@ -15,7 +16,6 @@ class Writer extends AbstractWriter { /** Number of rows to write before flushing */ const FLUSH_THRESHOLD = 500; - const BOM_UTF8 = "\xEF\xBB\xBF"; /** @var string Content-Type value for the header */ protected static $headerContentType = 'text/csv; charset=UTF-8'; @@ -61,7 +61,7 @@ class Writer extends AbstractWriter protected function openWriter() { // Adds UTF-8 BOM for Unicode compatibility - $this->globalFunctionsHelper->fputs($this->filePointer, self::BOM_UTF8); + $this->globalFunctionsHelper->fputs($this->filePointer, EncodingHelper::BOM_UTF8); } /** diff --git a/tests/Spout/Common/Helper/EncodingHelperTest.php b/tests/Spout/Common/Helper/EncodingHelperTest.php new file mode 100644 index 0000000..d142d4e --- /dev/null +++ b/tests/Spout/Common/Helper/EncodingHelperTest.php @@ -0,0 +1,223 @@ +getResourcePath($fileName); + $filePointer = fopen($resourcePath, 'r'); + + $encodingHelper = new EncodingHelper(new GlobalFunctionsHelper()); + $bytesOffset = $encodingHelper->getBytesOffsetToSkipBOM($filePointer, $encoding); + + $this->assertEquals($expectedBytesOffset, $bytesOffset); + } + + /** + * @return array + */ + public function dataProviderForIconvOrMbstringUsage() + { + return [ + [$shouldUseIconv = true], + [$shouldNotUseIconv = false], + ]; + } + + /** + * @dataProvider dataProviderForIconvOrMbstringUsage + * @expectedException \Box\Spout\Common\Exception\EncodingConversionException + * + * @param bool $shouldUseIconv + * @return void + */ + public function testAttemptConversionToUTF8ShouldThrowIfConversionFailed($shouldUseIconv) + { + $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper') + ->setMethods(['iconv', 'mb_convert_encoding']) + ->getMock(); + $helperStub->method('iconv')->willReturn(false); + $helperStub->method('mb_convert_encoding')->willReturn(false); + + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->setConstructorArgs([$helperStub]) + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv); + $encodingHelperStub->method('canUseMbString')->willReturn(true); + + $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + } + + /** + * @expectedException \Box\Spout\Common\Exception\EncodingConversionException + * + * @return void + */ + public function testAttemptConversionToUTF8ShouldThrowIfConversionNotSupported() + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->disableOriginalConstructor() + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn(false); + $encodingHelperStub->method('canUseMbString')->willReturn(false); + + $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + } + + /** + * @dataProvider dataProviderForIconvOrMbstringUsage + * + * @param bool $shouldUseIconv + * @return void + */ + public function testAttemptConversionToUTF8ShouldReturnReencodedString($shouldUseIconv) + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->setConstructorArgs([new GlobalFunctionsHelper()]) + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv); + $encodingHelperStub->method('canUseMbString')->willReturn(true); + + $encodedString = iconv(EncodingHelper::ENCODING_UTF8, EncodingHelper::ENCODING_UTF16_LE, 'input'); + $decodedString = $encodingHelperStub->attemptConversionToUTF8($encodedString, EncodingHelper::ENCODING_UTF16_LE); + + $this->assertEquals('input', $decodedString); + } + + /** + * @return void + */ + public function testAttemptConversionToUTF8ShouldBeNoopWhenTargetIsUTF8() + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->disableOriginalConstructor() + ->setMethods(['canUseIconv']) + ->getMock(); + $encodingHelperStub->expects($this->never())->method('canUseIconv'); + + $decodedString = $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF8); + $this->assertEquals('input', $decodedString); + } + + /** + * @dataProvider dataProviderForIconvOrMbstringUsage + * @expectedException \Box\Spout\Common\Exception\EncodingConversionException + * + * @param bool $shouldUseIconv + * @return void + */ + public function testAttemptConversionFromUTF8ShouldThrowIfConversionFailed($shouldUseIconv) + { + $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper') + ->setMethods(['iconv', 'mb_convert_encoding']) + ->getMock(); + $helperStub->method('iconv')->willReturn(false); + $helperStub->method('mb_convert_encoding')->willReturn(false); + + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->setConstructorArgs([$helperStub]) + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv); + $encodingHelperStub->method('canUseMbString')->willReturn(true); + + $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + } + + /** + * @expectedException \Box\Spout\Common\Exception\EncodingConversionException + * + * @return void + */ + public function testAttemptConversionFromUTF8ShouldThrowIfConversionNotSupported() + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->disableOriginalConstructor() + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn(false); + $encodingHelperStub->method('canUseMbString')->willReturn(false); + + $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + } + + /** + * @dataProvider dataProviderForIconvOrMbstringUsage + * + * @param bool $shouldUseIconv + * @return void + */ + public function testAttemptConversionFromUTF8ShouldReturnReencodedString($shouldUseIconv) + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->setConstructorArgs([new GlobalFunctionsHelper()]) + ->setMethods(['canUseIconv', 'canUseMbString']) + ->getMock(); + $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv); + $encodingHelperStub->method('canUseMbString')->willReturn(true); + + $encodedString = $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE); + $encodedStringWithIconv = iconv(EncodingHelper::ENCODING_UTF8, EncodingHelper::ENCODING_UTF16_LE, 'input'); + + $this->assertEquals($encodedStringWithIconv, $encodedString); + } + + /** + * @return void + */ + public function testAttemptConversionFromUTF8ShouldBeNoopWhenTargetIsUTF8() + { + /** @var EncodingHelper $encodingHelperStub */ + $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper') + ->disableOriginalConstructor() + ->setMethods(['canUseIconv']) + ->getMock(); + $encodingHelperStub->expects($this->never())->method('canUseIconv'); + + $encodedString = $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF8); + $this->assertEquals('input', $encodedString); + } +} diff --git a/tests/Spout/Reader/CSV/ReaderTest.php b/tests/Spout/Reader/CSV/ReaderTest.php index 932633f..180441e 100644 --- a/tests/Spout/Reader/CSV/ReaderTest.php +++ b/tests/Spout/Reader/CSV/ReaderTest.php @@ -2,8 +2,9 @@ namespace Box\Spout\Reader\CSV; -use Box\Spout\Common\Type; use Box\Spout\Reader\ReaderFactory; +use Box\Spout\Common\Type; +use Box\Spout\Common\Helper\EncodingHelper; use Box\Spout\TestUsingResource; /** @@ -167,15 +168,96 @@ class ReaderTest extends \PHPUnit_Framework_TestCase } /** + * @return array + */ + public function dataProviderForTestReadShouldSkipBom() + { + return [ + ['csv_with_utf8_bom.csv', EncodingHelper::ENCODING_UTF8], + ['csv_with_utf16le_bom.csv', EncodingHelper::ENCODING_UTF16_LE], + ['csv_with_utf16be_bom.csv', EncodingHelper::ENCODING_UTF16_BE], + ['csv_with_utf32le_bom.csv', EncodingHelper::ENCODING_UTF32_LE], + ['csv_with_utf32be_bom.csv', EncodingHelper::ENCODING_UTF32_BE], + ]; + } + + /** + * @dataProvider dataProviderForTestReadShouldSkipBom + * + * @param string $fileName + * @param string $fileEncoding * @return void */ - public function testReadShouldSkipUtf8Bom() + public function testReadShouldSkipBom($fileName, $fileEncoding) { - $allRows = $this->getAllRowsForFile('csv_with_utf8_bom.csv'); + $allRows = $this->getAllRowsForFile($fileName, ',', '"', $fileEncoding); $expectedRows = [ ['csv--11', 'csv--12', 'csv--13'], ['csv--21', 'csv--22', 'csv--23'], + ['csv--31', 'csv--32', 'csv--33'], + ]; + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return array + */ + public function dataProviderForTestReadShouldSupportNonUTF8FilesWithoutBOMs() + { + $shouldUseIconv = true; + $shouldNotUseIconv = false; + + return [ + ['csv_with_encoding_utf16le_no_bom.csv', EncodingHelper::ENCODING_UTF16_LE, $shouldUseIconv], + ['csv_with_encoding_utf16le_no_bom.csv', EncodingHelper::ENCODING_UTF16_LE, $shouldNotUseIconv], + ['csv_with_encoding_cp1252.csv', 'CP1252', $shouldUseIconv], + ['csv_with_encoding_cp1252.csv', 'CP1252', $shouldNotUseIconv], + ]; + } + + /** + * @dataProvider dataProviderForTestReadShouldSupportNonUTF8FilesWithoutBOMs + * + * @param string $fileName + * @param string $fileEncoding + * @param bool $shouldUseIconv + * @return void + */ + public function testReadShouldSupportNonUTF8FilesWithoutBOMs($fileName, $fileEncoding, $shouldUseIconv) + { + $allRows = []; + $resourcePath = $this->getResourcePath($fileName); + + $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper') + ->setMethods(['function_exists']) + ->getMock(); + + $returnValueMap = [ + ['iconv', $shouldUseIconv], + ['mb_convert_encoding', true], + ]; + $helperStub->method('function_exists')->will($this->returnValueMap($returnValueMap)); + + /** @var \Box\Spout\Reader\CSV\Reader $reader */ + $reader = ReaderFactory::create(Type::CSV); + $reader + ->setGlobalFunctionsHelper($helperStub) + ->setEncoding($fileEncoding) + ->open($resourcePath); + + foreach ($reader->getSheetIterator() as $sheet) { + foreach ($sheet->getRowIterator() as $row) { + $allRows[] = $row; + } + } + + $reader->close(); + + $expectedRows = [ + ['csv--11', 'csv--12', 'csv--13'], + ['csv--21', 'csv--22', 'csv--23'], + ['csv--31', 'csv--32', 'csv--33'], ]; $this->assertEquals($expectedRows, $allRows); } @@ -228,18 +310,25 @@ class ReaderTest extends \PHPUnit_Framework_TestCase * @param string $fileName * @param string|void $fieldDelimiter * @param string|void $fieldEnclosure + * @param string|void $encoding * @return array All the read rows the given file */ - private function getAllRowsForFile($fileName, $fieldDelimiter = ",", $fieldEnclosure = '"') + private function getAllRowsForFile( + $fileName, + $fieldDelimiter = ',', + $fieldEnclosure = '"', + $encoding = EncodingHelper::ENCODING_UTF8) { $allRows = []; $resourcePath = $this->getResourcePath($fileName); + /** @var \Box\Spout\Reader\CSV\Reader $reader */ $reader = ReaderFactory::create(Type::CSV); - $reader->setFieldDelimiter($fieldDelimiter); - $reader->setFieldEnclosure($fieldEnclosure); - - $reader->open($resourcePath); + $reader + ->setFieldDelimiter($fieldDelimiter) + ->setFieldEnclosure($fieldEnclosure) + ->setEncoding($encoding) + ->open($resourcePath); foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) { foreach ($sheet->getRowIterator() as $rowIndex => $row) { diff --git a/tests/Spout/Writer/CSV/WriterTest.php b/tests/Spout/Writer/CSV/WriterTest.php index fd7a3bb..fce430f 100644 --- a/tests/Spout/Writer/CSV/WriterTest.php +++ b/tests/Spout/Writer/CSV/WriterTest.php @@ -2,8 +2,9 @@ namespace Box\Spout\Writer\CSV; -use Box\Spout\Common\Type; use Box\Spout\TestUsingResource; +use Box\Spout\Common\Type; +use Box\Spout\Common\Helper\EncodingHelper; use Box\Spout\Writer\WriterFactory; /** @@ -70,7 +71,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase ]; $writtenContent = $this->writeToCsvFileAndReturnWrittenContent($allRows, 'csv_with_utf8_bom.csv'); - $this->assertContains(Writer::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM'); + $this->assertContains(EncodingHelper::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM'); } /** @@ -162,6 +163,6 @@ class WriterTest extends \PHPUnit_Framework_TestCase private function trimWrittenContent($writtenContent) { // remove line feeds and UTF-8 BOM - return trim($writtenContent, PHP_EOL . Writer::BOM_UTF8); + return trim($writtenContent, PHP_EOL . EncodingHelper::BOM_UTF8); } } diff --git a/tests/resources/csv/csv_with_encoding_cp1252.csv b/tests/resources/csv/csv_with_encoding_cp1252.csv new file mode 100644 index 0000000..552edd3 --- /dev/null +++ b/tests/resources/csv/csv_with_encoding_cp1252.csv @@ -0,0 +1,3 @@ +csv--11,csv--12,csv--13 +csv--21,csv--22,csv--23 +csv--31,csv--32,csv--33 \ No newline at end of file diff --git a/tests/resources/csv/csv_with_encoding_utf16le_no_bom.csv b/tests/resources/csv/csv_with_encoding_utf16le_no_bom.csv new file mode 100644 index 0000000..4e0a014 Binary files /dev/null and b/tests/resources/csv/csv_with_encoding_utf16le_no_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf16be_bom.csv b/tests/resources/csv/csv_with_utf16be_bom.csv new file mode 100644 index 0000000..6c33ecf Binary files /dev/null and b/tests/resources/csv/csv_with_utf16be_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf16le_bom.csv b/tests/resources/csv/csv_with_utf16le_bom.csv new file mode 100644 index 0000000..e881bfa Binary files /dev/null and b/tests/resources/csv/csv_with_utf16le_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf32be_bom.csv b/tests/resources/csv/csv_with_utf32be_bom.csv new file mode 100644 index 0000000..3a14956 Binary files /dev/null and b/tests/resources/csv/csv_with_utf32be_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf32le_bom.csv b/tests/resources/csv/csv_with_utf32le_bom.csv new file mode 100644 index 0000000..dd67f85 Binary files /dev/null and b/tests/resources/csv/csv_with_utf32le_bom.csv differ diff --git a/tests/resources/csv/csv_with_utf8_bom.csv b/tests/resources/csv/csv_with_utf8_bom.csv index fd7a3c4..216a017 100644 --- a/tests/resources/csv/csv_with_utf8_bom.csv +++ b/tests/resources/csv/csv_with_utf8_bom.csv @@ -1,2 +1,3 @@ csv--11,csv--12,csv--13 -csv--21,csv--22,csv--23 \ No newline at end of file +csv--21,csv--22,csv--23 +csv--31,csv--32,csv--33 \ No newline at end of file