Attempt to convert the non UTF-8 strings to UTF-8

This commit is contained in:
Adrien Loison 2015-07-26 15:00:31 -07:00
parent d946f12951
commit 5e1cfbfdbd
18 changed files with 620 additions and 109 deletions

View File

@ -21,6 +21,10 @@
"phpunit/phpunit": ">=3.7",
"scrutinizer/ocular": "~1.1"
},
"suggest": {
"ext-iconv": "To handle non UTF-8 CSV files (if \"php-intl\" is not already installed or is too limited)",
"ext-intl": "To handle non UTF-8 CSV files (if \"iconv\" is not already installed)"
},
"autoload": {
"psr-4": {
"Box\\Spout\\": "src/Spout"

View File

@ -0,0 +1,12 @@
<?php
namespace Box\Spout\Common\Exception;
/**
 * Class EncodingConversionException
 * Signals that a string could not be re-encoded: EncodingHelper throws it when neither
 * conversion function is available, or when the conversion function reports a failure.
 *
 * @package Box\Spout\Common\Exception
 */
class EncodingConversionException extends SpoutException
{
}

View File

@ -0,0 +1,175 @@
<?php
namespace Box\Spout\Common\Helper;
use Box\Spout\Common\Exception\EncodingConversionException;
/**
* Class EncodingHelper
* This class provides helper functions to work with encodings.
*
* @package Box\Spout\Common\Helper
*/
class EncodingHelper
{
    /** Definition of the encodings that can have a BOM */
    const ENCODING_UTF8     = 'UTF-8';
    const ENCODING_UTF16_LE = 'UTF-16LE';
    const ENCODING_UTF16_BE = 'UTF-16BE';
    const ENCODING_UTF32_LE = 'UTF-32LE';
    const ENCODING_UTF32_BE = 'UTF-32BE';

    /** Definition of the BOMs for the different encodings */
    const BOM_UTF8     = "\xEF\xBB\xBF";
    const BOM_UTF16_LE = "\xFF\xFE";
    const BOM_UTF16_BE = "\xFE\xFF";
    const BOM_UTF32_LE = "\xFF\xFE\x00\x00";
    const BOM_UTF32_BE = "\x00\x00\xFE\xFF";

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var array Map representing the encodings supporting BOMs (key) and their associated BOM (value) */
    protected $supportedEncodingsWithBom;

    /**
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     */
    public function __construct($globalFunctionsHelper)
    {
        $this->globalFunctionsHelper = $globalFunctionsHelper;

        $this->supportedEncodingsWithBom = [
            self::ENCODING_UTF8     => self::BOM_UTF8,
            self::ENCODING_UTF16_LE => self::BOM_UTF16_LE,
            self::ENCODING_UTF16_BE => self::BOM_UTF16_BE,
            self::ENCODING_UTF32_LE => self::BOM_UTF32_LE,
            self::ENCODING_UTF32_BE => self::BOM_UTF32_BE,
        ];
    }

    /**
     * Returns the number of bytes to use as offset in order to skip the BOM.
     * The file pointer is rewound and then read from as part of the check, so its
     * position is modified by this call.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check
     * @return int Bytes offset to apply to skip the BOM (0 means no BOM)
     */
    public function getBytesOffsetToSkipBOM($filePointer, $encoding)
    {
        if (!$this->hasBOM($filePointer, $encoding)) {
            return 0;
        }

        // the BOM is present: the offset is simply the length (in bytes) of that BOM
        return strlen($this->supportedEncodingsWithBom[$encoding]);
    }

    /**
     * Returns whether the file identified by the given pointer has a BOM.
     * Only the BOM associated with the given encoding is looked for; encodings
     * that are not part of the supported map never have a BOM.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check
     * @return bool TRUE if the file has a BOM, FALSE otherwise
     */
    protected function hasBOM($filePointer, $encoding)
    {
        $hasBOM = false;

        // always start from the beginning of the file, even for unsupported encodings
        $this->globalFunctionsHelper->rewind($filePointer);

        if (array_key_exists($encoding, $this->supportedEncodingsWithBom)) {
            $potentialBom = $this->supportedEncodingsWithBom[$encoding];
            $numBytesInBom = strlen($potentialBom);

            // fgets() reads at most (length - 1) bytes, hence the "+ 1"
            $firstBytes = $this->globalFunctionsHelper->fgets($filePointer, $numBytesInBom + 1);
            $hasBOM = ($firstBytes === $potentialBom);
        }

        return $hasBOM;
    }

    /**
     * Attempts to convert a non UTF-8 string into UTF-8.
     *
     * @param string $string Non UTF-8 string to be converted
     * @param string $sourceEncoding The encoding used to encode the source string
     * @return string The converted, UTF-8 string
     * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed
     */
    public function attemptConversionToUTF8($string, $sourceEncoding)
    {
        return $this->attemptConversion($string, $sourceEncoding, self::ENCODING_UTF8);
    }

    /**
     * Attempts to convert a UTF-8 string into the given encoding.
     *
     * @param string $string UTF-8 string to be converted
     * @param string $targetEncoding The encoding the string should be re-encoded into
     * @return string The converted string, encoded with the given encoding
     * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed
     */
    public function attemptConversionFromUTF8($string, $targetEncoding)
    {
        return $this->attemptConversion($string, self::ENCODING_UTF8, $targetEncoding);
    }

    /**
     * Attempts to convert the given string to the given encoding.
     * Depending on what is installed on the server, "iconv" is tried first,
     * then the "mbstring" extension (mb_convert_encoding).
     *
     * @param string $string string to be converted
     * @param string $sourceEncoding The encoding used to encode the source string
     * @param string $targetEncoding The encoding the string should be re-encoded into
     * @return string The converted string, encoded with the given encoding
     * @throws \Box\Spout\Common\Exception\EncodingConversionException If conversion is not supported or if the conversion failed
     */
    protected function attemptConversion($string, $sourceEncoding, $targetEncoding)
    {
        // if source and target encodings are the same, it's a no-op
        if ($sourceEncoding === $targetEncoding) {
            return $string;
        }

        if ($this->canUseIconv()) {
            $convertedString = $this->globalFunctionsHelper->iconv($string, $sourceEncoding, $targetEncoding);
        } elseif ($this->canUseMbString()) {
            $convertedString = $this->globalFunctionsHelper->mb_convert_encoding($string, $sourceEncoding, $targetEncoding);
        } else {
            // mb_convert_encoding() ships with the "mbstring" extension, not with "intl"
            throw new EncodingConversionException("The conversion from $sourceEncoding to $targetEncoding is not supported. Please install \"iconv\" or \"mbstring\".");
        }

        if ($convertedString === false) {
            throw new EncodingConversionException("The conversion from $sourceEncoding to $targetEncoding failed.");
        }

        return $convertedString;
    }

    /**
     * Returns whether "iconv" can be used.
     *
     * @return bool TRUE if "iconv" is available and can be used, FALSE otherwise
     */
    protected function canUseIconv()
    {
        return $this->globalFunctionsHelper->function_exists('iconv');
    }

    /**
     * Returns whether "mb_string" functions can be used.
     * These functions come with the PHP "mbstring" extension.
     *
     * @return bool TRUE if "mb_string" functions are available and can be used, FALSE otherwise
     */
    protected function canUseMbString()
    {
        return $this->globalFunctionsHelper->function_exists('mb_convert_encoding');
    }
}

View File

@ -203,4 +203,73 @@ class GlobalFunctionsHelper
{
header($string);
}
/**
 * Delegates to the global iconv() function.
 * The string comes first here, whereas the native function expects it last.
 * @see iconv()
 *
 * @param string $string The string to be converted
 * @param string $sourceEncoding The encoding of the source string
 * @param string $targetEncoding The encoding the source string should be converted to
 * @return string|bool the converted string or FALSE on failure.
 */
public function iconv($string, $sourceEncoding, $targetEncoding)
{
    $convertedString = iconv($sourceEncoding, $targetEncoding, $string);

    return $convertedString;
}
/**
 * Delegates to the global mb_convert_encoding() function.
 * The source encoding is given before the target encoding here, whereas the
 * native function expects the target encoding first.
 * @see mb_convert_encoding()
 *
 * @param string $string The string to be converted
 * @param string $sourceEncoding The encoding of the source string
 * @param string $targetEncoding The encoding the source string should be converted to
 * @return string|bool the converted string or FALSE on failure.
 */
public function mb_convert_encoding($string, $sourceEncoding, $targetEncoding)
{
    $convertedString = mb_convert_encoding($string, $targetEncoding, $sourceEncoding);

    return $convertedString;
}
/**
 * Wrapper around global function stream_get_line()
 * @see stream_get_line()
 *
 * @param resource $handle Stream to read from
 * @param int $length Maximum number of bytes to read
 * @param string|void $ending Optional line delimiter; when omitted, no delimiter is used
 * @return string|bool The data read from the stream or FALSE on failure
 */
public function stream_get_line($handle, $length, $ending = null)
{
    // The native parameter is a non-nullable string whose default is the empty string.
    // Forwarding NULL is deprecated on recent PHP versions, so it is normalized here.
    $delimiter = ($ending === null) ? '' : $ending;

    return stream_get_line($handle, $length, $delimiter);
}
/**
 * Wrapper around global function str_getcsv()
 * Any control character left to NULL falls back to the native default
 * (comma delimiter, double-quote enclosure, backslash escape).
 * @see str_getcsv()
 *
 * @param string $input The CSV string to parse
 * @param string|void $delimiter Field delimiter (defaults to ",")
 * @param string|void $enclosure Field enclosure (defaults to '"')
 * @param string|void $escape Escape character (defaults to "\")
 * @return array The parsed fields
 */
public function str_getcsv($input, $delimiter = null, $enclosure = null, $escape = null)
{
    // The native parameters are non-nullable strings: forwarding NULL is deprecated on
    // recent PHP versions, so the documented native defaults are substituted explicitly.
    $delimiter = ($delimiter === null) ? ',' : $delimiter;
    $enclosure = ($enclosure === null) ? '"' : $enclosure;
    $escape = ($escape === null) ? '\\' : $escape;

    return str_getcsv($input, $delimiter, $enclosure, $escape);
}
/**
 * Delegates to the global function_exists() function.
 * @see function_exists()
 *
 * @param string $functionName Name of the function to look up
 * @return bool Result of the native function_exists() call
 */
public function function_exists($functionName)
{
    $isDefined = function_exists($functionName);

    return $isDefined;
}
}

View File

@ -1,92 +0,0 @@
<?php
namespace Box\Spout\Reader\CSV\Helper;
/**
* Class EncodingHelper
* This class provides helper functions to work with encodings.
*
* @package Box\Spout\Reader\CSV\Helper
*/
class EncodingHelper
{
    /** Encodings for which a BOM is known */
    const ENCODING_UTF8     = 'UTF-8';
    const ENCODING_UTF16_LE = 'UTF-16LE';
    const ENCODING_UTF16_BE = 'UTF-16BE';
    const ENCODING_UTF32_LE = 'UTF-32LE';
    const ENCODING_UTF32_BE = 'UTF-32BE';

    /** Byte sequences of the BOMs, one per encoding above */
    const BOM_UTF8     = "\xEF\xBB\xBF";
    const BOM_UTF16_LE = "\xFF\xFE";
    const BOM_UTF16_BE = "\xFE\xFF";
    const BOM_UTF32_LE = "\xFF\xFE\x00\x00";
    const BOM_UTF32_BE = "\x00\x00\xFE\xFF";

    /** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
    protected $globalFunctionsHelper;

    /** @var array Encoding name (key) => BOM byte sequence (value) */
    protected $supportedEncodingsWithBom;

    /**
     * @param \Box\Spout\Common\Helper\GlobalFunctionsHelper $globalFunctionsHelper
     */
    public function __construct($globalFunctionsHelper)
    {
        $this->supportedEncodingsWithBom = [
            self::ENCODING_UTF8     => self::BOM_UTF8,
            self::ENCODING_UTF16_LE => self::BOM_UTF16_LE,
            self::ENCODING_UTF16_BE => self::BOM_UTF16_BE,
            self::ENCODING_UTF32_LE => self::BOM_UTF32_LE,
            self::ENCODING_UTF32_BE => self::BOM_UTF32_BE,
        ];
        $this->globalFunctionsHelper = $globalFunctionsHelper;
    }

    /**
     * Computes how many leading bytes must be skipped to get past the BOM.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check
     * @return int Length of the BOM in bytes, or 0 when the file has no BOM
     */
    public function getBytesOffsetToSkipBOM($filePointer, $encoding)
    {
        if (!$this->hasBOM($filePointer, $encoding)) {
            return 0;
        }

        return strlen($this->supportedEncodingsWithBom[$encoding]);
    }

    /**
     * Tells whether the file starts with the BOM associated with the given encoding.
     * The pointer is rewound before the first bytes are read.
     *
     * @param resource $filePointer Pointer to the file to check
     * @param string $encoding Encoding of the file to check
     * @return bool TRUE if the file has a BOM, FALSE otherwise
     */
    protected function hasBOM($filePointer, $encoding)
    {
        $this->globalFunctionsHelper->rewind($filePointer);

        if (!array_key_exists($encoding, $this->supportedEncodingsWithBom)) {
            return false;
        }

        $expectedBom = $this->supportedEncodingsWithBom[$encoding];
        $firstBytes = $this->globalFunctionsHelper->fgets($filePointer, strlen($expectedBom) + 1);

        return $firstBytes === $expectedBom;
    }
}

View File

@ -4,6 +4,7 @@ namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\AbstractReader;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Helper\EncodingHelper;
/**
* Class Reader
@ -26,7 +27,7 @@ class Reader extends AbstractReader
protected $fieldEnclosure = '"';
/** @var string Encoding of the CSV file to be read */
protected $encoding = 'UTF-8';
protected $encoding = EncodingHelper::ENCODING_UTF8;
/**
* Sets the field delimiter for the CSV.
@ -69,6 +70,7 @@ class Reader extends AbstractReader
/**
* Opens the file at the given path to make it ready to be read.
* If setEncoding() was not called, it assumes that the file is encoded in UTF-8.
*
* @param string $filePath Path of the CSV file to be read
* @return void

View File

@ -2,8 +2,8 @@
namespace Box\Spout\Reader\CSV;
use Box\Spout\Reader\CSV\Helper\EncodingHelper;
use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Common\Helper\EncodingHelper;
/**
* Class RowIterator
@ -37,6 +37,9 @@ class RowIterator implements IteratorInterface
/** @var \Box\Spout\Common\Helper\GlobalFunctionsHelper Helper to work with global functions */
protected $globalFunctionsHelper;
/** @var \Box\Spout\Common\Helper\EncodingHelper Helper to work with different encodings */
protected $encodingHelper;
/**
* @param resource $filePointer Pointer to the CSV file to read
* @param string $fieldDelimiter Character that delimits fields
@ -101,6 +104,7 @@ class RowIterator implements IteratorInterface
* @link http://php.net/manual/en/iterator.next.php
*
* @return void
* @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8
*/
public function next()
{
@ -109,7 +113,8 @@ class RowIterator implements IteratorInterface
if (!$this->hasReachedEndOfFile) {
do {
$lineData = $this->globalFunctionsHelper->fgetcsv($this->filePointer, 0, $this->fieldDelimiter, $this->fieldEnclosure);
$utf8EncodedLineData = $this->getNextUTF8EncodedLine();
$lineData = $this->globalFunctionsHelper->str_getcsv($utf8EncodedLineData, $this->fieldDelimiter, $this->fieldEnclosure);
} while ($lineData === false || ($lineData !== null && $this->isEmptyLine($lineData)));
if ($lineData !== false && $lineData !== null) {
@ -119,6 +124,25 @@ class RowIterator implements IteratorInterface
}
}
/**
 * Returns the next line, converted if necessary to UTF-8.
 * The line is read manually (stream_get_line + conversion) so that the end-of-line
 * delimiter can be expressed in the file's own encoding.
 *
 * NOTE(review): a length of 0 is passed to stream_get_line(); the PHP manual describes 0 as
 * "default chunk size" rather than "unlimited" - confirm very long lines are read in full.
 *
 * @return string|false The next line for the current file pointer, encoded in UTF-8, or FALSE if no line could be read
 * @throws \Box\Spout\Common\Exception\EncodingConversionException If unable to convert data to UTF-8
 */
protected function getNextUTF8EncodedLine()
{
    // Read until the EOL delimiter or EOF is reached. The delimiter's encoding needs to match the CSV's encoding.
    $encodedEOLDelimiter = $this->encodingHelper->attemptConversionFromUTF8("\n", $this->encoding);
    $encodedLineData = $this->globalFunctionsHelper->stream_get_line($this->filePointer, 0, $encodedEOLDelimiter);

    // Nothing could be read (EOF or failure): there is nothing to convert, so the
    // failure marker is returned as is instead of being fed to the converter.
    if ($encodedLineData === false) {
        return false;
    }

    // Once the line has been read, it can be converted to UTF-8
    return $this->encodingHelper->attemptConversionToUTF8($encodedLineData, $this->encoding);
}
/**
* @param array $lineData Array containing the cells value for the line
* @return bool Whether the given line is empty

View File

@ -4,6 +4,7 @@ namespace Box\Spout\Writer\CSV;
use Box\Spout\Writer\AbstractWriter;
use Box\Spout\Common\Exception\IOException;
use Box\Spout\Common\Helper\EncodingHelper;
/**
* Class Writer
@ -15,7 +16,6 @@ class Writer extends AbstractWriter
{
/** Number of rows to write before flushing */
const FLUSH_THRESHOLD = 500;
const BOM_UTF8 = "\xEF\xBB\xBF";
/** @var string Content-Type value for the header */
protected static $headerContentType = 'text/csv; charset=UTF-8';
@ -61,7 +61,7 @@ class Writer extends AbstractWriter
protected function openWriter()
{
// Adds UTF-8 BOM for Unicode compatibility
$this->globalFunctionsHelper->fputs($this->filePointer, self::BOM_UTF8);
$this->globalFunctionsHelper->fputs($this->filePointer, EncodingHelper::BOM_UTF8);
}
/**

View File

@ -0,0 +1,223 @@
<?php
namespace Box\Spout\Common\Helper;
use Box\Spout\TestUsingResource;
/**
* Class EncodingHelperTest
*
* @package Box\Spout\Common\Helper
*/
class EncodingHelperTest extends \PHPUnit_Framework_TestCase
{
    use TestUsingResource;

    /**
     * Data sets: resource file name, encoding to check, expected BOM length in bytes.
     *
     * @return array
     */
    public function dataProviderForTestGetBytesOffsetToSkipBOM()
    {
        return [
            ['csv_with_utf8_bom.csv', EncodingHelper::ENCODING_UTF8, 3],
            ['csv_with_utf16be_bom.csv', EncodingHelper::ENCODING_UTF16_BE, 2],
            ['csv_with_utf32le_bom.csv', EncodingHelper::ENCODING_UTF32_LE, 4],
            ['csv_with_encoding_utf16le_no_bom.csv', EncodingHelper::ENCODING_UTF16_LE, 0],
            ['csv_standard.csv', EncodingHelper::ENCODING_UTF8, 0],
        ];
    }

    /**
     * @dataProvider dataProviderForTestGetBytesOffsetToSkipBOM
     *
     * @param string $fileName
     * @param string $encoding
     * @param int $expectedBytesOffset
     * @return void
     */
    public function testGetBytesOffsetToSkipBOM($fileName, $encoding, $expectedBytesOffset)
    {
        $resourcePath = $this->getResourcePath($fileName);
        $filePointer = fopen($resourcePath, 'r');

        $encodingHelper = new EncodingHelper(new GlobalFunctionsHelper());
        $bytesOffset = $encodingHelper->getBytesOffsetToSkipBOM($filePointer, $encoding);

        // release the handle before asserting, so it is not leaked when the assertion fails
        fclose($filePointer);

        $this->assertEquals($expectedBytesOffset, $bytesOffset);
    }

    /**
     * Each data set tells whether "iconv" should be reported as available
     * (mbstring is always reported as available by the tests using this provider).
     *
     * @return array
     */
    public function dataProviderForIconvOrMbstringUsage()
    {
        return [
            [$shouldUseIconv = true],
            [$shouldNotUseIconv = false],
        ];
    }

    /**
     * @dataProvider dataProviderForIconvOrMbstringUsage
     * @expectedException \Box\Spout\Common\Exception\EncodingConversionException
     *
     * @param bool $shouldUseIconv
     * @return void
     */
    public function testAttemptConversionToUTF8ShouldThrowIfConversionFailed($shouldUseIconv)
    {
        // both conversion functions report a failure
        $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper')
            ->setMethods(['iconv', 'mb_convert_encoding'])
            ->getMock();
        $helperStub->method('iconv')->willReturn(false);
        $helperStub->method('mb_convert_encoding')->willReturn(false);

        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
            ->setConstructorArgs([$helperStub])
            ->setMethods(['canUseIconv', 'canUseMbString'])
            ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv);
        $encodingHelperStub->method('canUseMbString')->willReturn(true);

        $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
    }

    /**
     * @expectedException \Box\Spout\Common\Exception\EncodingConversionException
     *
     * @return void
     */
    public function testAttemptConversionToUTF8ShouldThrowIfConversionNotSupported()
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
            ->disableOriginalConstructor()
            ->setMethods(['canUseIconv', 'canUseMbString'])
            ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn(false);
        $encodingHelperStub->method('canUseMbString')->willReturn(false);

        $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
    }

    /**
     * @dataProvider dataProviderForIconvOrMbstringUsage
     *
     * @param bool $shouldUseIconv
     * @return void
     */
    public function testAttemptConversionToUTF8ShouldReturnReencodedString($shouldUseIconv)
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
            ->setConstructorArgs([new GlobalFunctionsHelper()])
            ->setMethods(['canUseIconv', 'canUseMbString'])
            ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv);
        $encodingHelperStub->method('canUseMbString')->willReturn(true);

        $encodedString = iconv(EncodingHelper::ENCODING_UTF8, EncodingHelper::ENCODING_UTF16_LE, 'input');
        $decodedString = $encodingHelperStub->attemptConversionToUTF8($encodedString, EncodingHelper::ENCODING_UTF16_LE);

        $this->assertEquals('input', $decodedString);
    }

    /**
     * @return void
     */
    public function testAttemptConversionToUTF8ShouldBeNoopWhenTargetIsUTF8()
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
            ->disableOriginalConstructor()
            ->setMethods(['canUseIconv'])
            ->getMock();
        // a no-op must return before any conversion function is even looked up
        $encodingHelperStub->expects($this->never())->method('canUseIconv');

        $decodedString = $encodingHelperStub->attemptConversionToUTF8('input', EncodingHelper::ENCODING_UTF8);

        $this->assertEquals('input', $decodedString);
    }

    /**
     * @dataProvider dataProviderForIconvOrMbstringUsage
     * @expectedException \Box\Spout\Common\Exception\EncodingConversionException
     *
     * @param bool $shouldUseIconv
     * @return void
     */
    public function testAttemptConversionFromUTF8ShouldThrowIfConversionFailed($shouldUseIconv)
    {
        // both conversion functions report a failure
        $helperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper')
            ->setMethods(['iconv', 'mb_convert_encoding'])
            ->getMock();
        $helperStub->method('iconv')->willReturn(false);
        $helperStub->method('mb_convert_encoding')->willReturn(false);

        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
            ->setConstructorArgs([$helperStub])
            ->setMethods(['canUseIconv', 'canUseMbString'])
            ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv);
        $encodingHelperStub->method('canUseMbString')->willReturn(true);

        $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
    }

    /**
     * @expectedException \Box\Spout\Common\Exception\EncodingConversionException
     *
     * @return void
     */
    public function testAttemptConversionFromUTF8ShouldThrowIfConversionNotSupported()
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
            ->disableOriginalConstructor()
            ->setMethods(['canUseIconv', 'canUseMbString'])
            ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn(false);
        $encodingHelperStub->method('canUseMbString')->willReturn(false);

        $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
    }

    /**
     * @dataProvider dataProviderForIconvOrMbstringUsage
     *
     * @param bool $shouldUseIconv
     * @return void
     */
    public function testAttemptConversionFromUTF8ShouldReturnReencodedString($shouldUseIconv)
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
            ->setConstructorArgs([new GlobalFunctionsHelper()])
            ->setMethods(['canUseIconv', 'canUseMbString'])
            ->getMock();
        $encodingHelperStub->method('canUseIconv')->willReturn($shouldUseIconv);
        $encodingHelperStub->method('canUseMbString')->willReturn(true);

        $encodedString = $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF16_LE);
        $encodedStringWithIconv = iconv(EncodingHelper::ENCODING_UTF8, EncodingHelper::ENCODING_UTF16_LE, 'input');

        $this->assertEquals($encodedStringWithIconv, $encodedString);
    }

    /**
     * @return void
     */
    public function testAttemptConversionFromUTF8ShouldBeNoopWhenTargetIsUTF8()
    {
        /** @var EncodingHelper $encodingHelperStub */
        $encodingHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\EncodingHelper')
            ->disableOriginalConstructor()
            ->setMethods(['canUseIconv'])
            ->getMock();
        // a no-op must return before any conversion function is even looked up
        $encodingHelperStub->expects($this->never())->method('canUseIconv');

        $encodedString = $encodingHelperStub->attemptConversionFromUTF8('input', EncodingHelper::ENCODING_UTF8);

        $this->assertEquals('input', $encodedString);
    }
}

View File

@ -2,8 +2,9 @@
namespace Box\Spout\Reader\CSV;
use Box\Spout\Common\Type;
use Box\Spout\Reader\ReaderFactory;
use Box\Spout\Common\Type;
use Box\Spout\Common\Helper\EncodingHelper;
use Box\Spout\TestUsingResource;
/**
@ -167,15 +168,96 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
}
/**
 * Data sets: resource file name, encoding the file is declared to use.
 *
 * @return array
 */
public function dataProviderForTestReadShouldSkipBom()
{
    $dataSets = [];

    $dataSets[] = ['csv_with_utf8_bom.csv', EncodingHelper::ENCODING_UTF8];
    $dataSets[] = ['csv_with_utf16le_bom.csv', EncodingHelper::ENCODING_UTF16_LE];
    $dataSets[] = ['csv_with_utf16be_bom.csv', EncodingHelper::ENCODING_UTF16_BE];
    $dataSets[] = ['csv_with_utf32le_bom.csv', EncodingHelper::ENCODING_UTF32_LE];
    $dataSets[] = ['csv_with_utf32be_bom.csv', EncodingHelper::ENCODING_UTF32_BE];

    return $dataSets;
}
/**
* @dataProvider dataProviderForTestReadShouldSkipBom
*
* @param string $fileName
* @param string $fileEncoding
* @return void
*/
public function testReadShouldSkipUtf8Bom()
public function testReadShouldSkipBom($fileName, $fileEncoding)
{
$allRows = $this->getAllRowsForFile('csv_with_utf8_bom.csv');
$allRows = $this->getAllRowsForFile($fileName, ',', '"', $fileEncoding);
$expectedRows = [
['csv--11', 'csv--12', 'csv--13'],
['csv--21', 'csv--22', 'csv--23'],
['csv--31', 'csv--32', 'csv--33'],
];
$this->assertEquals($expectedRows, $allRows);
}
/**
 * Data sets: resource file name, file encoding, whether "iconv" should be reported as available.
 * Every file is listed twice, first with iconv enabled and then with iconv disabled.
 *
 * @return array
 */
public function dataProviderForTestReadShouldSupportNonUTF8FilesWithoutBOMs()
{
    $filesAndEncodings = [
        ['csv_with_encoding_utf16le_no_bom.csv', EncodingHelper::ENCODING_UTF16_LE],
        ['csv_with_encoding_cp1252.csv', 'CP1252'],
    ];

    $dataSets = [];
    foreach ($filesAndEncodings as $fileAndEncoding) {
        foreach ([true, false] as $shouldUseIconv) {
            $dataSets[] = [$fileAndEncoding[0], $fileAndEncoding[1], $shouldUseIconv];
        }
    }

    return $dataSets;
}
/**
 * Reads a non UTF-8 file that has no BOM and checks that the rows come back as expected,
 * with function_exists() stubbed so that "iconv" availability follows the data set
 * while "mb_convert_encoding" is always reported as available.
 *
 * @dataProvider dataProviderForTestReadShouldSupportNonUTF8FilesWithoutBOMs
 *
 * @param string $fileName
 * @param string $fileEncoding
 * @param bool $shouldUseIconv
 * @return void
 */
public function testReadShouldSupportNonUTF8FilesWithoutBOMs($fileName, $fileEncoding, $shouldUseIconv)
{
    $csvPath = $this->getResourcePath($fileName);

    // function name => value reported by the stubbed function_exists()
    $functionAvailability = [
        ['iconv', $shouldUseIconv],
        ['mb_convert_encoding', true],
    ];
    $globalFunctionsHelperStub = $this->getMockBuilder('\Box\Spout\Common\Helper\GlobalFunctionsHelper')
        ->setMethods(['function_exists'])
        ->getMock();
    $globalFunctionsHelperStub
        ->method('function_exists')
        ->will($this->returnValueMap($functionAvailability));

    /** @var \Box\Spout\Reader\CSV\Reader $reader */
    $reader = ReaderFactory::create(Type::CSV);
    $reader
        ->setGlobalFunctionsHelper($globalFunctionsHelperStub)
        ->setEncoding($fileEncoding)
        ->open($csvPath);

    $readRows = [];
    foreach ($reader->getSheetIterator() as $sheet) {
        foreach ($sheet->getRowIterator() as $row) {
            $readRows[] = $row;
        }
    }

    $reader->close();

    $this->assertEquals(
        [
            ['csv--11', 'csv--12', 'csv--13'],
            ['csv--21', 'csv--22', 'csv--23'],
            ['csv--31', 'csv--32', 'csv--33'],
        ],
        $readRows
    );
}
@ -228,18 +310,25 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
* @param string $fileName
* @param string|void $fieldDelimiter
* @param string|void $fieldEnclosure
* @param string|void $encoding
* @return array All the read rows the given file
*/
private function getAllRowsForFile($fileName, $fieldDelimiter = ",", $fieldEnclosure = '"')
private function getAllRowsForFile(
$fileName,
$fieldDelimiter = ',',
$fieldEnclosure = '"',
$encoding = EncodingHelper::ENCODING_UTF8)
{
$allRows = [];
$resourcePath = $this->getResourcePath($fileName);
/** @var \Box\Spout\Reader\CSV\Reader $reader */
$reader = ReaderFactory::create(Type::CSV);
$reader->setFieldDelimiter($fieldDelimiter);
$reader->setFieldEnclosure($fieldEnclosure);
$reader->open($resourcePath);
$reader
->setFieldDelimiter($fieldDelimiter)
->setFieldEnclosure($fieldEnclosure)
->setEncoding($encoding)
->open($resourcePath);
foreach ($reader->getSheetIterator() as $sheetIndex => $sheet) {
foreach ($sheet->getRowIterator() as $rowIndex => $row) {

View File

@ -2,8 +2,9 @@
namespace Box\Spout\Writer\CSV;
use Box\Spout\Common\Type;
use Box\Spout\TestUsingResource;
use Box\Spout\Common\Type;
use Box\Spout\Common\Helper\EncodingHelper;
use Box\Spout\Writer\WriterFactory;
/**
@ -70,7 +71,7 @@ class WriterTest extends \PHPUnit_Framework_TestCase
];
$writtenContent = $this->writeToCsvFileAndReturnWrittenContent($allRows, 'csv_with_utf8_bom.csv');
$this->assertContains(Writer::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM');
$this->assertContains(EncodingHelper::BOM_UTF8, $writtenContent, 'The CSV file should contain a UTF-8 BOM');
}
/**
@ -162,6 +163,6 @@ class WriterTest extends \PHPUnit_Framework_TestCase
private function trimWrittenContent($writtenContent)
{
// remove line feeds and UTF-8 BOM
return trim($writtenContent, PHP_EOL . Writer::BOM_UTF8);
return trim($writtenContent, PHP_EOL . EncodingHelper::BOM_UTF8);
}
}

View File

@ -0,0 +1,3 @@
csv--11,csv--12,csv--13
csv--21,csv--22,csv--23
csv--31,csv--32,csv--33
1 csv--11 csv--12 csv--13
2 csv--21 csv--22 csv--23
3 csv--31 csv--32 csv--33

Binary file not shown.
1 c�s�v�-�-�1�1� �c�s�v�-�-�1�2� �c�s�v�-�-�1�3�
2 �c�s�v�-�-�2�1� �c�s�v�-�-�2�2� �c�s�v�-�-�2�3�
3 �c�s�v�-�-�3�1� �c�s�v�-�-�3�2� �c�s�v�-�-�3�3�

Binary file not shown.
1 csv--11 csv--12 csv--13
2 csv--21 csv--22 csv--23
3 csv--31 csv--32 csv--33

Binary file not shown.
1 csv--11 csv--12 csv--13
2 csv--21 csv--22 csv--23
3 csv--31 csv--32 csv--33

Binary file not shown.
1 �����c���s���v���-���-���1���1��� ���c���s���v���-���-���1���2��� ���c���s���v���-���-���1���3���
2 ���c���s���v���-���-���2���1��� ���c���s���v���-���-���2���2��� ���c���s���v���-���-���2���3���
3 ���c���s���v���-���-���3���1��� ���c���s���v���-���-���3���2��� ���c���s���v���-���-���3���3

Binary file not shown.
1 ��c���s���v���-���-���1���1��� ���c���s���v���-���-���1���2��� ���c���s���v���-���-���1���3���
2 ���c���s���v���-���-���2���1��� ���c���s���v���-���-���2���2��� ���c���s���v���-���-���2���3���
3 ���c���s���v���-���-���3���1��� ���c���s���v���-���-���3���2��� ���c���s���v���-���-���3���3���

View File

@ -1,2 +1,3 @@
csv--11,csv--12,csv--13
csv--21,csv--22,csv--23
csv--31,csv--32,csv--33
1 csv--11 csv--12 csv--13
2 csv--21 csv--22 csv--23
3 csv--31 csv--32 csv--33