Merge pull request #76 from box/csv_multiple_encodings

Csv multiple encodings
This commit is contained in:
Adrien Loison 2015-07-27 21:02:52 -07:00
commit 2ec12dd16b
20 changed files with 672 additions and 41 deletions

View File

@ -126,7 +126,7 @@ For XLSX files, the number of rows per sheet is limited to 1,048,576 (see [Offic
### Configuring the CSV reader and writer
It is possible to configure the both the CSV reader and writer to specify the field separator as well as the field enclosure:
It is possible to configure both the CSV reader and writer to specify the field separator as well as the field enclosure:
```php
use Box\Spout\Reader\ReaderFactory;
use Box\Spout\Common\Type;
@ -136,6 +136,13 @@ $reader->setFieldDelimiter('|');
$reader->setFieldEnclosure('@');
```
Additionally, if you need to read non UTF-8 files, you can specify the encoding of your file this way:
```php
$reader->setEncoding('UTF-16LE');
```
The writer always generates CSV files encoded in UTF-8, with a BOM.
### Configuring the XLSX writer
#### Strings storage

View File

@ -21,6 +21,10 @@
"phpunit/phpunit": ">=3.7",
"scrutinizer/ocular": "~1.1"
},
"suggest": {
"ext-iconv": "To handle non UTF-8 CSV files (if \"php-intl\" is not already installed or is too limited)",
"ext-intl": "To handle non UTF-8 CSV files (if \"iconv\" is not already installed)"
},
"autoload": {
"psr-4": {
"Box\\Spout\\": "src/Spout"

View File

@ -0,0 +1,12 @@
<?php
namespace Box\Spout\Common\Exception;
/**
* Class EncodingConversionException
* Signals that a string could not be converted from one encoding to another,
* either because no conversion function is available or because the conversion itself failed.
*
* @package Box\Spout\Common\Exception
*/
class EncodingConversionException extends SpoutException
{
}

View File

@ -0,0 +1,175 @@
<?php
namespace Box\Spout\Common\Helper;
use Box\Spout\Common\Exception\EncodingConversionException;
/**
 * Class EncodingHelper
 * This class provides helper functions to work with encodings:
 * BOM detection for the Unicode encodings and conversion of strings from/to UTF-8.
 *
 * Encoding names are matched case-insensitively ("utf-8" and "UTF-8" are the same encoding).
 *
 * @package Box\Spout\Common\Helper
 */
class EncodingHelper
{
    /** Definition of the encodings that can have a BOM */
    const ENCODING_UTF8     = 'UTF-8';
    const ENCODING_UTF16_LE = 'UTF-16LE';
    const ENCODING_UTF16_BE = 'UTF-16BE';
    const ENCODING_UTF32_LE = 'UTF-32LE';
    const ENCODING_UTF32_BE = 'UTF-32BE';

    /** Definition of the BOMs for the different encodings */
    const BOM_UTF8     = "\xEF\xBB\xBF";
    const BOM_UTF16_LE = "\xFF\xFE";
    const BOM_UTF16_BE = "\xFE\xFF";
    const BOM_UTF32_LE = "\xFF\xFE\x00\x00";
    const BOM_UTF32_BE = "\x00\x00\xFE\xFF";

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var array Map representing the encodings supporting BOMs (key) and their associated BOM (value) */
    protected $supportedEncodingsWithBom;

    /**
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     */
    public function __construct($globalFunctionsHelper)
    {
        $this->globalFunctionsHelper = $globalFunctionsHelper;

        $this->supportedEncodingsWithBom = [
            self::ENCODING_UTF8     => self::BOM_UTF8,
            self::ENCODING_UTF16_LE => self::BOM_UTF16_LE,
            self::ENCODING_UTF16_BE => self::BOM_UTF16_BE,
            self::ENCODING_UTF32_LE => self::BOM_UTF32_LE,
            self::ENCODING_UTF32_BE => self::BOM_UTF32_BE,
        ];
    }

    /**
     * Returns the number of bytes to use as offset in order to skip the BOM.
     * The file pointer is rewound and read from during the check, so the caller
     * is expected to reposition it afterwards.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check (case-insensitive)
     * @return int Bytes offset to apply to skip the BOM (0 means no BOM)
     */
    public function getBytesOffsetToSkipBOM($filePointer, $encoding)
    {
        if (!$this->hasBOM($filePointer, $encoding)) {
            return 0;
        }

        // the BOM has been found: the N first bytes of the file have to be skipped
        $bomUsed = $this->getBomForEncoding($encoding);

        return ($bomUsed === null) ? 0 : strlen($bomUsed);
    }

    /**
     * Returns whether the file identified by the given pointer has a BOM.
     * The pointer is always rewound first; when the encoding can carry a BOM,
     * the first bytes of the file are then consumed to be compared with it.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check (case-insensitive)
     * @return bool TRUE if the file has a BOM, FALSE otherwise
     */
    protected function hasBOM($filePointer, $encoding)
    {
        $this->globalFunctionsHelper->rewind($filePointer);

        $potentialBom = $this->getBomForEncoding($encoding);
        if ($potentialBom === null) {
            // encodings without a known BOM cannot have one
            return false;
        }

        // fgets() returns at most "length - 1" bytes, hence the "+ 1"
        $numBytesInBom = strlen($potentialBom);
        $firstBytes = $this->globalFunctionsHelper->fgets($filePointer, $numBytesInBom + 1);

        return ($firstBytes === $potentialBom);
    }

    /**
     * Returns the BOM associated to the given encoding.
     * The lookup ignores the case of the encoding name, the same way iconv and mbstring do.
     *
     * @param string $encoding Name of the encoding
     * @return string|null The BOM for the encoding, NULL if the encoding is not known to support BOMs
     */
    protected function getBomForEncoding($encoding)
    {
        foreach ($this->supportedEncodingsWithBom as $supportedEncoding => $bom) {
            if (strcasecmp($supportedEncoding, (string) $encoding) === 0) {
                return $bom;
            }
        }

        return null;
    }

    /**
     * Attempts to convert a non UTF-8 string into UTF-8.
     *
     * @param string $string Non UTF-8 string to be converted
     * @param string $sourceEncoding The encoding used to encode the source string
     * @return string The converted, UTF-8 string
     * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed
     */
    public function attemptConversionToUTF8($string, $sourceEncoding)
    {
        return $this->attemptConversion($string, $sourceEncoding, self::ENCODING_UTF8);
    }

    /**
     * Attempts to convert a UTF-8 string into the given encoding.
     *
     * @param string $string UTF-8 string to be converted
     * @param string $targetEncoding The encoding the string should be re-encoded into
     * @return string The converted string, encoded with the given encoding
     * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed
     */
    public function attemptConversionFromUTF8($string, $targetEncoding)
    {
        return $this->attemptConversion($string, self::ENCODING_UTF8, $targetEncoding);
    }

    /**
     * Attempts to convert the given string to the given encoding.
     * Depending on what is installed on the server, "iconv" is tried first, then "mbstring".
     *
     * @param string $string string to be converted
     * @param string $sourceEncoding The encoding used to encode the source string
     * @param string $targetEncoding The encoding the string should be re-encoded into
     * @return string The converted string, encoded with the given encoding
     * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed
     */
    protected function attemptConversion($string, $sourceEncoding, $targetEncoding)
    {
        // if source and target encodings are the same (whatever the case used), it's a no-op
        if (strcasecmp((string) $sourceEncoding, (string) $targetEncoding) === 0) {
            return $string;
        }

        if ($this->canUseIconv()) {
            $convertedString = $this->globalFunctionsHelper->iconv($string, $sourceEncoding, $targetEncoding);
        } elseif ($this->canUseMbString()) {
            $convertedString = $this->globalFunctionsHelper->mb_convert_encoding($string, $sourceEncoding, $targetEncoding);
        } else {
            throw new EncodingConversionException("The conversion from $sourceEncoding to $targetEncoding is not supported. Please install \"iconv\" or \"mbstring\".");
        }

        // both conversion wrappers are documented to return FALSE on failure
        if ($convertedString === false) {
            throw new EncodingConversionException("The conversion from $sourceEncoding to $targetEncoding failed.");
        }

        return $convertedString;
    }

    /**
     * Returns whether "iconv" can be used.
     *
     * @return bool TRUE if "iconv" is available and can be used, FALSE otherwise
     */
    protected function canUseIconv()
    {
        return $this->globalFunctionsHelper->function_exists('iconv');
    }

    /**
     * Returns whether "mb_string" functions can be used.
     * These functions are provided by the "mbstring" PHP extension.
     *
     * @return bool TRUE if "mb_string" functions are available and can be used, FALSE otherwise
     */
    protected function canUseMbString()
    {
        return $this->globalFunctionsHelper->function_exists('mb_convert_encoding');
    }
}

View File

@ -203,4 +203,73 @@ class GlobalFunctionsHelper
{
header($string);
}
/**
* Wrapper around global function iconv()
* Note that the arguments are not in the same order as the native function:
* the string to convert comes first here, whereas iconv() expects it last.
* @see iconv()
*
* @param string $string The string to be converted
* @param string $sourceEncoding The encoding of the source string
* @param string $targetEncoding The encoding the source string should be converted to
* @return string|bool the converted string or FALSE on failure.
*/
public function iconv($string, $sourceEncoding, $targetEncoding)
{
return iconv($sourceEncoding, $targetEncoding, $string);
}
/**
* Wrapper around global function mb_convert_encoding()
* Note that the arguments are not in the same order as the native function:
* here the source encoding comes before the target encoding, whereas
* mb_convert_encoding() expects the target encoding first.
* @see mb_convert_encoding()
*
* @param string $string The string to be converted
* @param string $sourceEncoding The encoding of the source string
* @param string $targetEncoding The encoding the source string should be converted to
* @return string|bool the converted string or FALSE on failure.
*/
public function mb_convert_encoding($string, $sourceEncoding, $targetEncoding)
{
return mb_convert_encoding($string, $targetEncoding, $sourceEncoding);
}
/**
* Wrapper around global function stream_get_line()
* @see stream_get_line()
*
* @param resource $handle Stream to read from
* @param int $length Maximum number of bytes to read
* @param string|void $ending Optional delimiter marking the end of a line. When omitted, no delimiter is used.
* @return string|bool The data read (without the delimiter) or FALSE on failure
*/
public function stream_get_line($handle, $length, $ending = null)
{
    // The native parameter is a non-nullable string defaulting to "".
    // Forwarding NULL is deprecated as of PHP 8.1, so the native default is passed instead.
    $lineEnding = ($ending === null) ? '' : $ending;

    return stream_get_line($handle, $length, $lineEnding);
}
/**
* Wrapper around global function str_getcsv()
* Arguments left to NULL are replaced by explicit values before calling the native function,
* since it does not accept NULL for them (deprecated as of PHP 8.1, ValueError for
* the delimiter and enclosure as of PHP 8.4).
* @see str_getcsv()
*
* @param string $input The string to parse
* @param string|void $delimiter Field delimiter (one character only). Defaults to ","
* @param string|void $enclosure Field enclosure (one character only). Defaults to '"'
* @param string|void $escape Escape character. When omitted, an empty string is passed, which is what NULL used to be coerced to
* @return array
*/
public function str_getcsv($input, $delimiter = null, $enclosure = null, $escape = null)
{
    $fieldDelimiter = ($delimiter === null) ? ',' : $delimiter;
    $fieldEnclosure = ($enclosure === null) ? '"' : $enclosure;
    // an empty string is what NULL was implicitly converted to, so the parsing stays the same
    $escapeCharacter = ($escape === null) ? '' : $escape;

    return str_getcsv($input, $fieldDelimiter, $fieldEnclosure, $escapeCharacter);
}
/**
* Wrapper around global function function_exists()
* Going through this wrapper lets the availability of a function be stubbed.
* @see function_exists()
*
* @param string $functionName Name of the function to look for
* @return bool TRUE if the function is defined, FALSE otherwise
*/
public function function_exists($functionName)
{
return function_exists($functionName);
}
}

View File

@ -4,6 +4,7 @@ namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\AbstractReader;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Helper\EncodingHelper;
/**
* Class Reader
@ -25,6 +26,9 @@ class Reader extends AbstractReader
/** @var string Defines the character used to enclose fields (one character only) */
protected $fieldEnclosure = '"';
/** @var string Encoding of the CSV file to be read */
protected $encoding = EncodingHelper::ENCODING_UTF8;
/**
* Sets the field delimiter for the CSV.
* Needs to be called before opening the reader.
@ -51,10 +55,22 @@ class Reader extends AbstractReader
return $this;
}
/**
* Sets the encoding of the CSV file to be read.
* Needs to be called before opening the reader.
* The given value is stored as is (it is not validated here) and is handed over
* to the sheet iterator when the file gets opened.
* When this method is not called, the reader keeps its default encoding, UTF-8.
*
* @param string $encoding Encoding of the CSV file to be read
* @return Reader The current reader, so that calls can be chained
*/
public function setEncoding($encoding)
{
$this->encoding = $encoding;
return $this;
}
/**
* Opens the file at the given path to make it ready to be read.
* The file must be UTF-8 encoded.
* @TODO add encoding detection/conversion
* If setEncoding() was not called, it assumes that the file is encoded in UTF-8.
*
* @param string $filePath Path of the CSV file to be read
* @return void
@ -67,7 +83,13 @@ class Reader extends AbstractReader
throw new IOException("Could not open file $filePath for reading.");
}
$this->sheetIterator = new SheetIterator($this->filePointer, $this->fieldDelimiter, $this->fieldEnclosure, $this->globalFunctionsHelper);
$this->sheetIterator = new SheetIterator(
$this->filePointer,
$this->fieldDelimiter,
$this->fieldEnclosure,
$this->encoding,
$this->globalFunctionsHelper
);
}
/**

View File

@ -3,6 +3,7 @@
namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Common\Helper\EncodingHelper;
/**
* Class RowIterator
@ -12,8 +13,6 @@ use Box\Spout\Reader\IteratorInterface;
*/
class RowIterator implements IteratorInterface
{
const UTF8_BOM = "\xEF\xBB\xBF";
/** @var resource Pointer to the CSV file to read */
protected $filePointer;
@ -27,26 +26,36 @@ class RowIterator implements IteratorInterface
protected $hasReachedEndOfFile = false;
/** @var string Defines the character used to delimit fields (one character only) */
protected $fieldDelimiter = ',';
protected $fieldDelimiter;
/** @var string Defines the character used to enclose fields (one character only) */
protected $fieldEnclosure = '"';
protected $fieldEnclosure;
/** @var string Encoding of the CSV file to be read */
protected $encoding;
/** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
protected $globalFunctionsHelper;
/** @var \Box\Spout\Common\Helper\EncodingHelper Helper to work with different encodings */
protected $encodingHelper;
/**
* @param resource $filePointer Pointer to the CSV file to read
* @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields
* @param string $encoding Encoding of the CSV file to be read
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
{
$this->filePointer = $filePointer;
$this->fieldDelimiter = $fieldDelimiter;
$this->fieldEnclosure = $fieldEnclosure;
$this->encoding = $encoding;
$this->globalFunctionsHelper = $globalFunctionsHelper;
$this->encodingHelper = new EncodingHelper($globalFunctionsHelper);
}
/**
@ -57,7 +66,7 @@ class RowIterator implements IteratorInterface
*/
public function rewind()
{
$this->rewindAndSkipUtf8Bom();
$this->rewindAndSkipBom();
$this->numReadRows = 0;
$this->rowDataBuffer = null;
@ -66,24 +75,17 @@ class RowIterator implements IteratorInterface
}
/**
* This rewinds and skips the UTF-8 BOM if inserted at the beginning of the file
* This rewinds and skips the BOM if inserted at the beginning of the file
* by moving the file pointer after it, so that it is not read.
*
* @return void
*/
protected function rewindAndSkipUtf8Bom()
protected function rewindAndSkipBom()
{
$this->globalFunctionsHelper->rewind($this->filePointer);
$byteOffsetToSkipBom = $this->encodingHelper->getBytesOffsetToSkipBOM($this->filePointer, $this->encoding);
$hasUtf8Bom = ($this->globalFunctionsHelper->fgets($this->filePointer, 4) === self::UTF8_BOM);
if ($hasUtf8Bom) {
// we skip the 2 first bytes (so start from the 3rd byte)
$this->globalFunctionsHelper->fseek($this->filePointer, 3);
} else {
// if no BOM, reset the pointer to read from the beginning
$this->globalFunctionsHelper->fseek($this->filePointer, 0);
}
// sets the cursor after the BOM (0 means no BOM, so rewind it)
$this->globalFunctionsHelper->fseek($this->filePointer, $byteOffsetToSkipBom);
}
/**
@ -102,6 +104,7 @@ class RowIterator implements IteratorInterface
* @link http://php.net/manual/en/iterator.next.php
*
* @return void
* @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8
*/
public function next()
{
@ -110,7 +113,8 @@ class RowIterator implements IteratorInterface
if (!$this->hasReachedEndOfFile) {
do {
$lineData = $this->globalFunctionsHelper->fgetcsv($this->filePointer, 0, $this->fieldDelimiter, $this->fieldEnclosure);
$utf8EncodedLineData = $this->getNextUTF8EncodedLine();
$lineData = $this->globalFunctionsHelper->str_getcsv($utf8EncodedLineData, $this->fieldDelimiter, $this->fieldEnclosure);
} while ($lineData === false || ($lineData !== null && $this->isEmptyLine($lineData)));
if ($lineData !== false && $lineData !== null) {
@ -120,6 +124,25 @@ class RowIterator implements IteratorInterface
}
}
/**
* Reads the next line of the file and returns it converted to UTF-8.
* The line is extracted while still in the file's own encoding, since neither fgets nor fgetcsv
* works with non UTF-8 data. The conversion to UTF-8 only happens once the raw line has been read.
*
* NOTE(review): a length of 0 is passed to stream_get_line(); the PHP manual describes this as
* "default chunk size", so very long lines may be returned in several pieces - to be confirmed.
* NOTE(review): whatever stream_get_line() returns is converted as is, including a possible
* FALSE once nothing is left to read - callers should be checked against that case.
*
* @return string The next line for the current file pointer, encoded in UTF-8
* @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8
*/
protected function getNextUTF8EncodedLine()
{
    $fileEncoding = $this->encoding;

    // The EOL delimiter has to be expressed in the encoding of the file for it to be found in the raw bytes
    $rawLine = $this->globalFunctionsHelper->stream_get_line(
        $this->filePointer,
        0,
        $this->encodingHelper->attemptConversionFromUTF8("\n", $fileEncoding)
    );

    return $this->encodingHelper->attemptConversionToUTF8($rawLine, $fileEncoding);
}
/**
* @param array $lineData Array containing the cells value for the line
* @return bool Whether the given line is empty

View File

@ -18,11 +18,12 @@ class Sheet implements SheetInterface
* @param resource $filePointer Pointer to the CSV file to read
* @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields
* @param string $encoding Encoding of the CSV file to be read
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
{
$this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper);
$this->rowIterator = new RowIterator($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper);
}
/**

View File

@ -22,11 +22,12 @@ class SheetIterator implements IteratorInterface
* @param resource $filePointer
* @param string $fieldDelimiter Character that delimits fields
* @param string $fieldEnclosure Character that enclose fields
* @param string $encoding Encoding of the CSV file to be read
* @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
*/
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper)
public function __construct($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper)
{
$this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $globalFunctionsHelper);
$this->sheet = new Sheet($filePointer, $fieldDelimiter, $fieldEnclosure, $encoding, $globalFunctionsHelper);
}
/**

View File

@ -4,6 +4,7 @@ namespace Box\Spout\Writer\CSV;
use Box\Spout\Writer\AbstractWriter;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Helper\EncodingHelper;
/**
* Class Writer
@ -15,7 +16,6 @@ class Writer extends AbstractWriter
{
/** Number of rows to write before flushing */
const FLUSH_THRESHOLD = 500;
const UTF8_BOM = "\xEF\xBB\xBF";
/** @var string Content-Type value for the header */
protected static $headerContentType = 'text/csv; charset=UTF-8';
@ -61,7 +61,7 @@ class Writer extends AbstractWriter
protected function openWriter()
{
// Adds UTF-8 BOM for Unicode compatibility
$this->globalFunctionsHelper->fputs($this->filePointer, self::UTF8_BOM);
$this->globalFunctionsHelper->fputs($this->filePointer, EncodingHelper::BOM_UTF8);
}
/**

View File

@ -0,0 +1,223 @@
<?php
namespace Box\Spout\Common\Helper;
use Box\Spout\TestUsingResource;
/**
 * Class EncodingHelperTest
 * Covers the BOM detection and the conversions from/to UTF-8 done by EncodingHelper,
 * with the conversions exercised through both "iconv" and "mb_convert_encoding".
 *
 * @package Box\Spout\Common\Helper
 */
class EncodingHelperTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /**
     * Data sets: [resource file name, encoding to check the BOM for, expected offset in bytes]
     *
     * @return array
     */
    public function dataProviderForTestGetBytesOffsetToSkipBOM()
    {
        return [
            ['csv_with_utf8_bom.csv', EncodingHelper::ENCODING_UTF8, 3],
            ['csv_with_utf16be_bom.csv', EncodingHelper::ENCODING_UTF16_BE, 2],
            ['csv_with_utf32le_bom.csv', EncodingHelper::ENCODING_UTF32_LE, 4],
            ['csv_with_encoding_utf16le_no_bom.csv', EncodingHelper::ENCODING_UTF16_LE, 0],
            ['csv_standard.csv', EncodingHelper::ENCODING_UTF8, 0],
        ];
    }

    /**
     * @dataProvider dataProviderForTestGetBytesOffsetToSkipBOM
     *
     * @param string $fileName
     * @param string $encoding
     * @param int $expectedBytesOffset
     * @return void
     */
    public function testGetBytesOffsetToSkipBOM($fileName, $encoding, $expectedBytesOffset)
    {
        $resourcePath = $this->getResourcePath($fileName);
        $filePointer = fopen($resourcePath, 'r');

        $encodingHelper = new EncodingHelper(new GlobalFunctionsHelper());
        $bytesOffset = $encodingHelper->getBytesOffsetToSkipBOM($filePointer, $encoding);

        // release the handle before asserting, so that it does not leak when the assertion fails
        fclose($filePointer);

        $this->assertEquals($expectedBytesOffset, $bytesOffset);
    }

    /**
     * Data sets: [whether "iconv" should be reported as usable]
     * When it is not, the helper falls back to "mb_convert_encoding".
     *
     * @return array
     */
    public function dataProviderForIconvOrMbstringUsage()
    {
        return [
            [$shouldUseIconv = true],
            [$shouldNotUseIconv = false],
        ];
    }

    /**
     * @dataProvider dataProviderForIconvOrMbstringUsage
     * @expectedException \Box\Spout\Common\Exception\EncodingConversionException
     *
     * @param bool $shouldUseIconv
     * @return void
     */
    public function testAttemptConversionToUTF8ShouldThrowIfConversionFailed($shouldUseIconv)
    {
        // both conversion functions report a failure
        $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper')
                        ->setMethods(['iconv', 'mb_convert_encoding'])
                        ->getMock();
        $helperStub->method('iconv')->willReturn(false);
        $helperStub->method('mb_convert_encoding')->willReturn(false);

        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
                        ->setConstructorArgs([$helperStub])
                        ->setMethods(['canUseIconv', 'canUseMbString'])
                        ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv);
        $encodingHelperStub->method('canUseMbString')->willReturn(true);

        $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
    }

    /**
     * @expectedException \Box\Spout\Common\Exception\EncodingConversionException
     *
     * @return void
     */
    public function testAttemptConversionToUTF8ShouldThrowIfConversionNotSupported()
    {
        // neither conversion function is available
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
                        ->disableOriginalConstructor()
                        ->setMethods(['canUseIconv', 'canUseMbString'])
                        ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn(false);
        $encodingHelperStub->method('canUseMbString')->willReturn(false);

        $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
    }

    /**
     * @dataProvider dataProviderForIconvOrMbstringUsage
     *
     * @param bool $shouldUseIconv
     * @return void
     */
    public function testAttemptConversionToUTF8ShouldReturnReencodedString($shouldUseIconv)
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
                        ->setConstructorArgs([new GlobalFunctionsHelper()])
                        ->setMethods(['canUseIconv', 'canUseMbString'])
                        ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv);
        $encodingHelperStub->method('canUseMbString')->willReturn(true);

        $encodedString = iconv(EncodingHelper::ENCODING_UTF8, EncodingHelper::ENCODING_UTF16_LE, 'input');
        $decodedString = $encodingHelperStub->attemptConversionToUTF8($encodedString, EncodingHelper::ENCODING_UTF16_LE);

        $this->assertEquals('input', $decodedString);
    }

    /**
     * @return void
     */
    public function testAttemptConversionToUTF8ShouldBeNoopWhenTargetIsUTF8()
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
                        ->disableOriginalConstructor()
                        ->setMethods(['canUseIconv'])
                        ->getMock();
        // no conversion function should even be looked for
        $encodingHelperStub->expects($this->never())->method('canUseIconv');

        $decodedString = $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF8);

        $this->assertEquals('input', $decodedString);
    }

    /**
     * @dataProvider dataProviderForIconvOrMbstringUsage
     * @expectedException \Box\Spout\Common\Exception\EncodingConversionException
     *
     * @param bool $shouldUseIconv
     * @return void
     */
    public function testAttemptConversionFromUTF8ShouldThrowIfConversionFailed($shouldUseIconv)
    {
        // both conversion functions report a failure
        $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper')
                        ->setMethods(['iconv', 'mb_convert_encoding'])
                        ->getMock();
        $helperStub->method('iconv')->willReturn(false);
        $helperStub->method('mb_convert_encoding')->willReturn(false);

        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
                        ->setConstructorArgs([$helperStub])
                        ->setMethods(['canUseIconv', 'canUseMbString'])
                        ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv);
        $encodingHelperStub->method('canUseMbString')->willReturn(true);

        $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
    }

    /**
     * @expectedException \Box\Spout\Common\Exception\EncodingConversionException
     *
     * @return void
     */
    public function testAttemptConversionFromUTF8ShouldThrowIfConversionNotSupported()
    {
        // neither conversion function is available
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
                        ->disableOriginalConstructor()
                        ->setMethods(['canUseIconv', 'canUseMbString'])
                        ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn(false);
        $encodingHelperStub->method('canUseMbString')->willReturn(false);

        $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
    }

    /**
     * @dataProvider dataProviderForIconvOrMbstringUsage
     *
     * @param bool $shouldUseIconv
     * @return void
     */
    public function testAttemptConversionFromUTF8ShouldReturnReencodedString($shouldUseIconv)
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
                        ->setConstructorArgs([new GlobalFunctionsHelper()])
                        ->setMethods(['canUseIconv', 'canUseMbString'])
                        ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv);
        $encodingHelperStub->method('canUseMbString')->willReturn(true);

        $encodedString = $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
        $encodedStringWithIconv = iconv(EncodingHelper::ENCODING_UTF8, EncodingHelper::ENCODING_UTF16_LE, 'input');

        $this->assertEquals($encodedStringWithIconv, $encodedString);
    }

    /**
     * @return void
     */
    public function testAttemptConversionFromUTF8ShouldBeNoopWhenTargetIsUTF8()
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
                        ->disableOriginalConstructor()
                        ->setMethods(['canUseIconv'])
                        ->getMock();
        // no conversion function should even be looked for
        $encodingHelperStub->expects($this->never())->method('canUseIconv');

        $encodedString = $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF8);

        $this->assertEquals('input', $encodedString);
    }
}

View File

@ -2,8 +2,9 @@
namespace Box\Spout\Reader\CSV;
use Box\Spout\Common\Type;
use Box\Spout\Reader\ReaderFactory;
use Box\Spout\Common\Type;
use Box\Spout\Common\Helper\EncodingHelper;
use Box\Spout\TestUsingResource;
/**
@ -167,15 +168,96 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
}
/**
* Data sets for testReadShouldSkipBom(): [resource file name, encoding to configure the reader with]
* There is one data set per encoding constant exposed by EncodingHelper.
*
* @return array
*/
public function dataProviderForTestReadShouldSkipBom()
{
return [
['csv_with_utf8_bom.csv', EncodingHelper::ENCODING_UTF8],
['csv_with_utf16le_bom.csv', EncodingHelper::ENCODING_UTF16_LE],
['csv_with_utf16be_bom.csv', EncodingHelper::ENCODING_UTF16_BE],
['csv_with_utf32le_bom.csv', EncodingHelper::ENCODING_UTF32_LE],
['csv_with_utf32be_bom.csv', EncodingHelper::ENCODING_UTF32_BE],
];
}
/**
* @dataProvider dataProviderForTestReadShouldSkipBom
*
* @param string $fileName
* @param string $fileEncoding
* @return void
*/
public function testReadShouldSkipUtf8Bom()
public function testReadShouldSkipBom($fileName, $fileEncoding)
{
$allRows = $this->getAllRowsForFile('csv_with_utf8_bom.csv');
$allRows = $this->getAllRowsForFile($fileName, ',', '"', $fileEncoding);
$expectedRows = [
['csv--11', 'csv--12', 'csv--13'],
['csv--21', 'csv--22', 'csv--23'],
['csv--31', 'csv--32', 'csv--33'],
];
$this->assertEquals($expectedRows, $allRows);
}
/**
* Data sets: [resource file name, encoding of the file, whether "iconv" should be reported as available]
* Every file is listed twice in a row: first with "iconv" available, then without it.
*
* @return array
*/
public function dataProviderForTestReadShouldSupportNonUTF8FilesWithoutBOMs()
{
    $filesWithEncoding = [
        ['csv_with_encoding_utf16le_no_bom.csv', EncodingHelper::ENCODING_UTF16_LE],
        ['csv_with_encoding_cp1252.csv', 'CP1252'],
    ];

    $dataSets = [];
    foreach ($filesWithEncoding as $fileWithEncoding) {
        foreach ([true, false] as $shouldUseIconv) {
            $dataSets[] = [$fileWithEncoding[0], $fileWithEncoding[1], $shouldUseIconv];
        }
    }

    return $dataSets;
}
/**
* Reads a non UTF-8 file through a reader whose global functions helper reports
* "iconv" as available or not, "mb_convert_encoding" being always reported as available.
* Whatever function ends up being used, the rows must come back as the expected UTF-8 values.
*
* @dataProvider dataProviderForTestReadShouldSupportNonUTF8FilesWithoutBOMs
*
* @param string $fileName
* @param string $fileEncoding
* @param bool $shouldUseIconv
* @return void
*/
public function testReadShouldSupportNonUTF8FilesWithoutBOMs($fileName, $fileEncoding, $shouldUseIconv)
{
    $filePath = $this->getResourcePath($fileName);

    // only function_exists() is stubbed: all the other global functions stay the real ones
    $functionsHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper')
                    ->setMethods(['function_exists'])
                    ->getMock();
    $functionsHelperStub->method('function_exists')->will($this->returnValueMap([
        ['iconv', $shouldUseIconv],
        ['mb_convert_encoding', true],
    ]));

    /** @var \Box\Spout\Reader\CSV\Reader $reader */
    $reader = ReaderFactory::create(Type::CSV);
    $reader
        ->setGlobalFunctionsHelper($functionsHelperStub)
        ->setEncoding($fileEncoding)
        ->open($filePath);

    $readRows = [];
    foreach ($reader->getSheetIterator() as $sheet) {
        foreach ($sheet->getRowIterator() as $row) {
            $readRows[] = $row;
        }
    }

    $reader->close();

    $this->assertEquals(
        [
            ['csv--11', 'csv--12', 'csv--13'],
            ['csv--21', 'csv--22', 'csv--23'],
            ['csv--31', 'csv--32', 'csv--33'],
        ],
        $readRows
    );
}
@ -228,18 +310,25 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
* @param string $fileName
* @param string|void $fieldDelimiter
* @param string|void $fieldEnclosure
* @param string|void $encoding
* @return array All the read rows the given file
*/
private function getAllRowsForFile($fileName, $fieldDelimiter = ",", $fieldEnclosure = '"')
private function getAllRowsForFile(
$fileName,
$fieldDelimiter = ',',
$fieldEnclosure = '"',
$encoding = EncodingHelper::ENCODING_UTF8)
{
$allRows = [];
$resourcePath = $this->getResourcePath($fileName);
/** @var \Box\Spout\Reader\CSV\Reader $reader */
$reader = ReaderFactory::create(Type::CSV);
$reader->setFieldDelimiter($fieldDelimiter);
$reader->setFieldEnclosure($fieldEnclosure);
$reader->open($resourcePath);
$reader
->setFieldDelimiter($fieldDelimiter)
->setFieldEnclosure($fieldEnclosure)
->setEncoding($encoding)
->open($resourcePath);
foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) {
foreach ($sheet->getRowIterator() as $rowIndex => $row) {

View File

@ -2,8 +2,9 @@
namespace Box\Spout\Writer\CSV;
use Box\Spout\Common\Type;
use Box\Spout\TestUsingResource;
use Box\Spout\Common\Type;
use Box\Spout\Common\Helper\EncodingHelper;
use Box\Spout\Writer\WriterFactory;
/**
@ -70,7 +71,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase
];
$writtenContent = $this->writeToCsvFileAndReturnWrittenContent($allRows, 'csv_with_utf8_bom.csv');
$this->assertContains(Writer::UTF8_BOM, $writtenContent, 'The CSV file should contain a UTF-8 BOM');
$this->assertContains(EncodingHelper::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM');
}
/**
@ -162,6 +163,6 @@ class WriterTest extends \PHPUnit_Framework_TestCase
private function trimWrittenContent($writtenContent)
{
// remove line feeds and UTF-8 BOM
return trim($writtenContent, PHP_EOL . Writer::UTF8_BOM);
return trim($writtenContent, PHP_EOL . EncodingHelper::BOM_UTF8);
}
}

View File

@ -0,0 +1,3 @@
csv--11,csv--12,csv--13
csv--21,csv--22,csv--23
csv--31,csv--32,csv--33
1 csv--11 csv--12 csv--13
2 csv--21 csv--22 csv--23
3 csv--31 csv--32 csv--33

Binary file not shown.
1 c�s�v�-�-�1�1� �c�s�v�-�-�1�2� �c�s�v�-�-�1�3�
2 �c�s�v�-�-�2�1� �c�s�v�-�-�2�2� �c�s�v�-�-�2�3�
3 �c�s�v�-�-�3�1� �c�s�v�-�-�3�2� �c�s�v�-�-�3�3�

Binary file not shown.
1 csv--11 csv--12 csv--13
2 csv--21 csv--22 csv--23
3 csv--31 csv--32 csv--33

Binary file not shown.
1 csv--11 csv--12 csv--13
2 csv--21 csv--22 csv--23
3 csv--31 csv--32 csv--33

Binary file not shown.
1 �����c���s���v���-���-���1���1��� ���c���s���v���-���-���1���2��� ���c���s���v���-���-���1���3���
2 ���c���s���v���-���-���2���1��� ���c���s���v���-���-���2���2��� ���c���s���v���-���-���2���3���
3 ���c���s���v���-���-���3���1��� ���c���s���v���-���-���3���2��� ���c���s���v���-���-���3���3

Binary file not shown.
1 ��c���s���v���-���-���1���1��� ���c���s���v���-���-���1���2��� ���c���s���v���-���-���1���3���
2 ���c���s���v���-���-���2���1��� ���c���s���v���-���-���2���2��� ���c���s���v���-���-���2���3���
3 ���c���s���v���-���-���3���1��� ���c���s���v���-���-���3���2��� ���c���s���v���-���-���3���3���

View File

@ -1,2 +1,3 @@
csv--11,csv--12,csv--13
csv--21,csv--22,csv--23
csv--21,csv--22,csv--23
csv--31,csv--32,csv--33
1 csv--11 csv--12 csv--13
2 csv--21 csv--22 csv--23
3 csv--31 csv--32 csv--33