Merge pull request #131 from box/better_date_support_xlsx

Better date support
This commit is contained in:
Adrien Loison 2015-10-23 16:26:00 -07:00
commit 2a9400dfca
8 changed files with 485 additions and 7 deletions

View File

@ -151,6 +151,22 @@ class SimpleXMLElement
return $doesElementExist ? $this->wrapSimpleXMLElement($realElement) : null;
}
/**
 * Collects the direct children of the underlying XML element.
 * Each child is passed through wrapSimpleXMLElement() before being returned.
 *
 * @return array The wrapped children, in the order they were iterated (empty array when there are none)
 */
public function children()
{
    $wrappedChildren = [];

    foreach ($this->simpleXMLElement->children() as $rawChild) {
        array_push($wrappedChildren, $this->wrapSimpleXMLElement($rawChild));
    }

    return $wrappedChildren;
}
/**
* @return string
*/

View File

@ -25,19 +25,35 @@ class CellValueFormatter
/** Definition of XML attributes used to parse data */
const XML_ATTRIBUTE_TYPE = 't';
const XML_ATTRIBUTE_STYLE_ID = 's';
/** Constants used for date formatting */
const NUM_DAYS_FROM_JAN_1_1900_TO_JAN_1_1970 = 25569; // actually 25567, but padded to accommodate Excel's erroneous handling of the year 1900 as a leap year
const NUM_SECONDS_IN_ONE_DAY = 86400;
/**
* Excel day number of February 29th, 1900.
* 1900 is NOT a leap year, so that day does not exist, but Excel thinks it does...
* @see https://en.wikipedia.org/wiki/Year_1900_problem#Microsoft_Excel
*/
const ERRONEOUS_EXCEL_LEAP_YEAR_DAY = 60;
/** @var SharedStringsHelper Helper to work with shared strings */
protected $sharedStringsHelper;
/** @var StyleHelper Helper to work with styles */
protected $styleHelper;
/** @var \Box\Spout\Common\Escaper\XLSX Used to unescape XML data */
protected $escaper;
/**
* @param SharedStringsHelper $sharedStringsHelper Helper to work with shared strings
* @param StyleHelper $styleHelper Helper to work with styles
*/
public function __construct($sharedStringsHelper)
public function __construct($sharedStringsHelper, $styleHelper)
{
$this->sharedStringsHelper = $sharedStringsHelper;
$this->styleHelper = $styleHelper;
/** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
$this->escaper = new \Box\Spout\Common\Escaper\XLSX();
@ -53,6 +69,7 @@ class CellValueFormatter
{
// Default cell type is "n"
$cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE) ?: self::CELL_TYPE_NUMERIC;
$cellStyleId = intval($node->getAttribute(self::XML_ATTRIBUTE_STYLE_ID));
$vNodeValue = $this->getVNodeValue($node);
if (($vNodeValue === '') && ($cellType !== self::CELL_TYPE_INLINE_STRING)) {
@ -69,7 +86,7 @@ class CellValueFormatter
case self::CELL_TYPE_BOOLEAN:
return $this->formatBooleanCellValue($vNodeValue);
case self::CELL_TYPE_NUMERIC:
return $this->formatNumericCellValue($vNodeValue);
return $this->formatNumericCellValue($vNodeValue, $cellStyleId);
case self::CELL_TYPE_DATE:
return $this->formatDateCellValue($vNodeValue);
default:
@ -138,14 +155,49 @@ class CellValueFormatter
/**
 * Returns the cell Numeric value from string of nodeValue.
 * The value can also represent a timestamp and a DateTime will be returned.
 *
 * @param string $nodeValue The raw text of the cell's value node
 * @param int $cellStyleId 0 being the default style
 * @return int|float|\DateTime|null The value associated with the cell
 */
protected function formatNumericCellValue($nodeValue, $cellStyleId)
{
    // Numeric values can represent numbers as well as timestamps.
    // We need to look at the style of the cell to determine whether it is one or the other.
    $shouldFormatAsDate = $this->styleHelper->shouldFormatNumericValueAsDate($cellStyleId);

    if ($shouldFormatAsDate) {
        return $this->formatExcelTimestampValue(floatval($nodeValue));
    }

    // The node value is a string, so is_int() can never be true for it.
    // FILTER_VALIDATE_INT yields an int only when the text is a whole number that
    // fits in a PHP integer; anything else (decimals, exponents, overflow) stays a float.
    $intValue = filter_var($nodeValue, FILTER_VALIDATE_INT);

    return ($intValue !== false) ? $intValue : floatval($nodeValue);
}
/**
 * Converts an Excel timestamp into a PHP DateTime.
 * NOTE: The timestamp is a float representing the number of days since January 1st, 1900.
 *
 * @param float $nodeValue The Excel timestamp, expressed in days
 * @return \DateTime|null The matching date, or NULL if building it raised an exception
 */
protected function formatExcelTimestampValue($nodeValue)
{
    // Values located before Excel's erroneous leap day are shifted by one extra day
    $leapYearCorrection = ($nodeValue < self::ERRONEOUS_EXCEL_LEAP_YEAR_DAY) ? 1 : 0;

    $daysSinceUnixEpoch = $nodeValue - self::NUM_DAYS_FROM_JAN_1_1900_TO_JAN_1_1970 + $leapYearCorrection;
    $secondsSinceUnixEpoch = round($daysSinceUnixEpoch * self::NUM_SECONDS_IN_ONE_DAY);

    try {
        $dateTime = new \DateTime();

        return $dateTime->setTimestamp($secondsSinceUnixEpoch);
    } catch (\Exception $exception) {
        return null;
    }
}
/**

View File

@ -0,0 +1,224 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper;
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
use Box\Spout\Reader\Wrapper\XMLReader;
/**
 * Class StyleHelper
 * This class provides helper functions related to XLSX styles
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class StyleHelper
{
    /** Paths of XML files relative to the XLSX file root */
    const STYLES_XML_FILE_PATH = 'xl/styles.xml';

    /** Nodes used to find relevant information in the styles XML file */
    const XML_NODE_NUM_FMTS = 'numFmts';
    const XML_NODE_NUM_FMT = 'numFmt';
    const XML_NODE_CELL_XFS = 'cellXfs';
    const XML_NODE_XF = 'xf';

    /** Attributes used to find relevant information in the styles XML file */
    const XML_ATTRIBUTE_NUM_FMT_ID = 'numFmtId';
    const XML_ATTRIBUTE_FORMAT_CODE = 'formatCode';
    const XML_ATTRIBUTE_APPLY_NUMBER_FORMAT = 'applyNumberFormat';

    /** By convention, default style ID is 0 */
    const DEFAULT_STYLE_ID = 0;

    /** Keyword of the general number format; it contains an "e" but does not describe a date */
    const NUMBER_FORMAT_GENERAL = 'General';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var array Array containing a mapping NUM_FMT_ID => FORMAT_CODE */
    protected $customNumberFormats;

    /** @var array Array containing a mapping STYLE_ID => [STYLE_ATTRIBUTES] */
    protected $stylesAttributes;

    /**
     * @param string $filePath Path of the XLSX file being read
     */
    public function __construct($filePath)
    {
        $this->filePath = $filePath;
    }

    /**
     * Reads the styles.xml file and extract the relevant information from the file.
     * Both caches are reset to empty arrays first, so they stay empty when the
     * styles file cannot be opened.
     *
     * @return void
     */
    protected function extractRelevantInfo()
    {
        $this->customNumberFormats = [];
        $this->stylesAttributes = [];

        $stylesXmlFilePath = $this->filePath .'#' . self::STYLES_XML_FILE_PATH;
        $xmlReader = new XMLReader();

        if ($xmlReader->open('zip://' . $stylesXmlFilePath)) {
            while ($xmlReader->read()) {
                if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_NUM_FMTS)) {
                    $numFmtsNode = new SimpleXMLElement($xmlReader->readOuterXml());
                    $this->extractNumberFormats($numFmtsNode);
                } elseif ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL_XFS)) {
                    $cellXfsNode = new SimpleXMLElement($xmlReader->readOuterXml());
                    $this->extractStyleAttributes($cellXfsNode);
                }
            }

            $xmlReader->close();
        }
    }

    /**
     * Extracts number formats from the "numFmt" nodes.
     * For simplicity, the styles attributes are kept in memory. This is possible thanks
     * to the reuse of formats. So 1 million cells should not use 1 million formats.
     *
     * @param SimpleXMLElement $numFmtsNode The "numFmts" node
     * @return void
     */
    protected function extractNumberFormats($numFmtsNode)
    {
        foreach ($numFmtsNode->children() as $numFmtNode) {
            $numFmtId = intval($numFmtNode->getAttribute(self::XML_ATTRIBUTE_NUM_FMT_ID));
            $formatCode = $numFmtNode->getAttribute(self::XML_ATTRIBUTE_FORMAT_CODE);
            $this->customNumberFormats[$numFmtId] = $formatCode;
        }
    }

    /**
     * Extracts style attributes from the "xf" nodes, inside the "cellXfs" section.
     * For simplicity, the styles attributes are kept in memory. This is possible thanks
     * to the reuse of styles. So 1 million cells should not use 1 million styles.
     *
     * Entries are appended in document order, so the position of an "xf" node
     * is the style ID it is later looked up with.
     *
     * @param SimpleXMLElement $cellXfsNode The "cellXfs" node
     * @return void
     */
    protected function extractStyleAttributes($cellXfsNode)
    {
        foreach ($cellXfsNode->children() as $xfNode) {
            $this->stylesAttributes[] = [
                self::XML_ATTRIBUTE_NUM_FMT_ID => intval($xfNode->getAttribute(self::XML_ATTRIBUTE_NUM_FMT_ID)),
                self::XML_ATTRIBUTE_APPLY_NUMBER_FORMAT => $this->isXmlBooleanTrue(
                    $xfNode->getAttribute(self::XML_ATTRIBUTE_APPLY_NUMBER_FORMAT)
                ),
            ];
        }
    }

    /**
     * Interprets an XML boolean attribute value.
     * A plain truthiness check would treat the valid value "false" as true,
     * so only the two "true" spellings of an XML boolean are accepted.
     *
     * @param mixed $attributeValue The raw attribute value (a missing attribute counts as false)
     * @return bool Whether the value is "1" or "true"
     */
    protected function isXmlBooleanTrue($attributeValue)
    {
        $normalizedValue = strtolower(trim((string) $attributeValue));

        return in_array($normalizedValue, ['1', 'true'], true);
    }

    /**
     * Lazily loads the styles file on first access.
     *
     * @return array The custom number formats
     */
    protected function getCustomNumberFormats()
    {
        if (!isset($this->customNumberFormats)) {
            $this->extractRelevantInfo();
        }

        return $this->customNumberFormats;
    }

    /**
     * Lazily loads the styles file on first access.
     *
     * @return array The styles attributes
     */
    protected function getStylesAttributes()
    {
        if (!isset($this->stylesAttributes)) {
            $this->extractRelevantInfo();
        }

        return $this->stylesAttributes;
    }

    /**
     * Returns whether the style with the given ID should consider
     * numeric values as timestamps and format the cell as a date.
     *
     * @param int $styleId Zero-based style ID
     * @return bool Whether the cell with the given style should display a date instead of a numeric value
     */
    public function shouldFormatNumericValueAsDate($styleId)
    {
        $stylesAttributes = $this->getStylesAttributes();

        // Default style (0) does not format numeric values as timestamps. Only custom styles do.
        // Also if the style ID does not exist in the styles.xml file, format as numeric value.
        if ($styleId === self::DEFAULT_STYLE_ID || !array_key_exists($styleId, $stylesAttributes)) {
            return false;
        }

        $styleAttributes = $stylesAttributes[$styleId];

        $applyNumberFormat = $styleAttributes[self::XML_ATTRIBUTE_APPLY_NUMBER_FORMAT];
        if (!$applyNumberFormat) {
            return false;
        }

        $numFmtId = $styleAttributes[self::XML_ATTRIBUTE_NUM_FMT_ID];

        return $this->doesNumFmtIdIndicateDate($numFmtId);
    }

    /**
     * @param int $numFmtId
     * @return bool Whether the number format ID indicates that the number is a timestamp
     */
    protected function doesNumFmtIdIndicateDate($numFmtId)
    {
        return (
            $this->isNumFmtIdBuiltInDateFormat($numFmtId) ||
            $this->isNumFmtIdCustomDateFormat($numFmtId)
        );
    }

    /**
     * @param int $numFmtId
     * @return bool Whether the number format ID is one of the built-in date format IDs
     */
    protected function isNumFmtIdBuiltInDateFormat($numFmtId)
    {
        $builtInDateFormatIds = [14, 15, 16, 17, 18, 19, 20, 21, 22, 45, 46, 47];

        // Normalize to int so that the strict comparison still accepts numeric strings
        return in_array(intval($numFmtId), $builtInDateFormatIds, true);
    }

    /**
     * Checks whether the custom format registered under the given ID contains date characters.
     * Text that is displayed literally (quoted strings, bracketed sections, the "General"
     * keyword) is removed first so that letters inside it are not mistaken for date characters.
     *
     * @param int $numFmtId
     * @return bool Whether the number format ID indicates that the number is a timestamp
     */
    protected function isNumFmtIdCustomDateFormat($numFmtId)
    {
        $customNumberFormats = $this->getCustomNumberFormats();

        if (!array_key_exists($numFmtId, $customNumberFormats)) {
            return false;
        }

        $formatCode = $customNumberFormats[$numFmtId];

        // Remove literal text (what's between double quotes, the opening quote should not be preceded by a "\")
        $formatCode = (string) preg_replace('/(?<!\\\)"[^"]*"/', '', $formatCode);

        // Remove extra formatting (what's between [ ], the brackets should not be preceded by a "\")
        $formatCode = (string) preg_replace('((?<!\\\)\[.+?(?<!\\\)\])', '', $formatCode);

        // "General" is a keyword, not a sequence of date characters
        $formatCode = str_ireplace(self::NUMBER_FORMAT_GENERAL, '', $formatCode);

        // custom date formats contain specific characters to represent the date:
        // e - yy - m - d - h - s
        // and all of their variants (yyyy - mm - dd...)
        $dateFormatCharacters = ['e', 'yy', 'm', 'd', 'h', 's'];

        // any of these characters, as long as it is not preceded by "\"
        $pattern = '/(?<!\\\)(?:' . implode('|', $dateFormatCharacters) . ')/';

        return (preg_match($pattern, $formatCode) === 1);
    }
}

View File

@ -8,6 +8,7 @@ use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Helper\CellHelper;
use Box\Spout\Reader\XLSX\Helper\CellValueFormatter;
use Box\Spout\Reader\XLSX\Helper\StyleHelper;
/**
* Class RowIterator
@ -39,6 +40,9 @@ class RowIterator implements IteratorInterface
/** @var Helper\CellValueFormatter Helper to format cell values */
protected $cellValueFormatter;
/** @var Helper\StyleHelper $styleHelper Helper to work with styles */
protected $styleHelper;
/** @var int Number of read rows */
protected $numReadRows = 0;
@ -62,7 +66,9 @@ class RowIterator implements IteratorInterface
$this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath);
$this->xmlReader = new XMLReader();
$this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper);
$this->styleHelper = new StyleHelper($filePath);
$this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper, $this->styleHelper);
}
/**

View File

@ -0,0 +1,134 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper;
/**
 * Class StyleHelperTest
 * Exercises StyleHelper::shouldFormatNumericValueAsDate() against stubbed style data.
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class StyleHelperTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Builds a StyleHelper whose two data getters are stubbed, so no styles file is read.
     *
     * @param array $styleAttributes Value returned by getStylesAttributes()
     * @param array $customNumberFormats Value returned by getCustomNumberFormats()
     * @return StyleHelper
     */
    private function getStyleHelperMock($styleAttributes, $customNumberFormats = [])
    {
        $mockBuilder = $this->getMockBuilder('\Box\Spout\Reader\XLSX\Helper\StyleHelper');
        $mockBuilder->setMethods(['getCustomNumberFormats', 'getStylesAttributes']);
        $mockBuilder->disableOriginalConstructor();

        /** @var StyleHelper $stubbedHelper */
        $stubbedHelper = $mockBuilder->getMock();
        $stubbedHelper->method('getStylesAttributes')->willReturn($styleAttributes);
        $stubbedHelper->method('getCustomNumberFormats')->willReturn($customNumberFormats);

        return $stubbedHelper;
    }

    /**
     * @return void
     */
    public function testShouldFormatNumericValueAsDateWithDefaultStyle()
    {
        $helper = $this->getStyleHelperMock([]);

        $this->assertFalse($helper->shouldFormatNumericValueAsDate(0));
    }

    /**
     * @return void
     */
    public function testShouldFormatNumericValueAsDateWhenStyleIdNotListed()
    {
        $onlyDefaultStyle = [['applyNumberFormat' => true]];
        $helper = $this->getStyleHelperMock($onlyDefaultStyle);

        $this->assertFalse($helper->shouldFormatNumericValueAsDate(1));
    }

    /**
     * @return void
     */
    public function testShouldFormatNumericValueAsDateWhenShouldNotApplyNumberFormat()
    {
        $styles = [[], ['applyNumberFormat' => false]];
        $helper = $this->getStyleHelperMock($styles);

        $this->assertFalse($helper->shouldFormatNumericValueAsDate(1));
    }

    /**
     * @return void
     */
    public function testShouldFormatNumericValueAsDateWithBuiltinDateFormats()
    {
        $dateNumFmtIds = [14, 15, 16, 17, 18, 19, 20, 21, 22, 45, 46, 47];

        foreach ($dateNumFmtIds as $dateNumFmtId) {
            $styles = [[], ['applyNumberFormat' => true, 'numFmtId' => $dateNumFmtId]];
            $helper = $this->getStyleHelperMock($styles);

            $this->assertTrue($helper->shouldFormatNumericValueAsDate(1));
        }
    }

    /**
     * @return void
     */
    public function testShouldFormatNumericValueAsDateWhenCustomNumberFormatNotFound()
    {
        $styles = [[], ['applyNumberFormat' => true, 'numFmtId' => 165]];
        $unrelatedCustomFormats = [166 => []];
        $helper = $this->getStyleHelperMock($styles, $unrelatedCustomFormats);

        $this->assertFalse($helper->shouldFormatNumericValueAsDate(1));
    }

    /**
     * @return array Rows of [number format, expected result]
     */
    public function dataProviderForCustomDateFormats()
    {
        $dateFormats = [
            '[$-409]dddd\,\ mmmm\ d\,\ yy',
            '[$-409]d\-mmm\-yy;@',
            '[$-409]d\-mmm\-yyyy;@',
            'mm/dd/yy;@',
            '[$-F800]dddd\,\ mmmm\ dd\,\ yyyy',
            'm/d;@',
            'm/d/yy;@',
            '[$-409]d\-mmm;@',
            '[$-409]dd\-mmm\-yy;@',
            '[$-409]mmm\-yy;@',
            '[$-409]mmmm\-yy;@',
            '[$-409]mmmm\ d\,\ yyyy;@',
            '[$-409]m/d/yy\ h:mm\ AM/PM;@',
            'm/d/yy\ h:mm;@',
            '[$-409]mmmmm;@',
            '[$-409]mmmmm\-yy;@',
            'm/d/yyyy;@',
            '[$-409]m/d/yy\--h:mm;@',
        ];
        $nonDateFormats = [
            'GENERAL',
            '\ma\yb\e',
            '[Red]foo;',
        ];

        $dataSets = [];
        foreach ($dateFormats as $dateFormat) {
            $dataSets[] = [$dateFormat, true];
        }
        foreach ($nonDateFormats as $nonDateFormat) {
            $dataSets[] = [$nonDateFormat, false];
        }

        return $dataSets;
    }

    /**
     * @dataProvider dataProviderForCustomDateFormats
     *
     * @param string $numberFormat
     * @param bool $expectedResult
     * @return void
     */
    public function testShouldFormatNumericValueAsDateWithCustomDateFormats($numberFormat, $expectedResult)
    {
        $customNumFmtId = 165;
        $styles = [[], ['applyNumberFormat' => true, 'numFmtId' => $customNumFmtId]];
        $customFormats = [$customNumFmtId => $numberFormat];

        $helper = $this->getStyleHelperMock($styles, $customFormats);

        $this->assertEquals($expectedResult, $helper->shouldFormatNumericValueAsDate(1));
    }
}

View File

@ -113,6 +113,52 @@ class ReaderTest extends \PHPUnit_Framework_TestCase
$this->assertEquals($expectedRows, $allRows);
}
/**
 * The same numeric value, displayed through different date formats,
 * must always be read back as the same date.
 *
 * @return void
 */
public function testReadShouldSupportNumericTimestampFormattedDifferentlyAsDate()
{
    // make sure dates are always created with the same timezone
    date_default_timezone_set('UTC');

    $expectedDate = \DateTime::createFromFormat('Y-m-d H:i:s', '2015-01-01 00:00:00');
    $fullRow = array_fill(0, 10, $expectedDate);
    $lastRow = array_merge(array_fill(0, 7, $expectedDate), array_fill(0, 3, ''));

    $actualRows = $this->getAllRowsForFile('sheet_with_same_numeric_value_date_formatted_differently.xlsx');

    $this->assertEquals([$fullRow, $fullRow, $fullRow, $lastRow], $actualRows);
}
/**
 * Different numeric timestamps, including values around Excel's
 * erroneous leap day, must be read back as the expected dates.
 *
 * @return void
 */
public function testReadShouldSupportDifferentDatesAsNumericTimestamp()
{
    // make sure dates are always created with the same timezone
    date_default_timezone_set('UTC');

    $toDateTime = function ($formattedDate) {
        return \DateTime::createFromFormat('Y-m-d H:i:s', $formattedDate);
    };

    $expectedRows = [
        array_map($toDateTime, [
            '2015-09-01 00:00:00',
            '2015-09-02 00:00:00',
            '2015-09-01 22:23:00',
        ]),
        array_map($toDateTime, [
            '1900-02-28 23:59:59',
            '1900-03-01 00:00:00',
            '1900-02-28 11:00:00', // 1900-02-29 should be converted to 1900-02-28
        ]),
    ];

    $actualRows = $this->getAllRowsForFile('sheet_with_different_numeric_value_dates.xlsx');

    $this->assertEquals($expectedRows, $actualRows);
}
/**
* @return void
*/