diff --git a/src/Spout/Reader/Wrapper/SimpleXMLElement.php b/src/Spout/Reader/Wrapper/SimpleXMLElement.php index 4892aca..0e3d758 100644 --- a/src/Spout/Reader/Wrapper/SimpleXMLElement.php +++ b/src/Spout/Reader/Wrapper/SimpleXMLElement.php @@ -151,6 +151,22 @@ class SimpleXMLElement return $doesElementExist ? $this->wrapSimpleXMLElement($realElement) : null; } + /** + * Returns the immediate children. + * + * @return array The children + */ + public function children() + { + $children = []; + + foreach ($this->simpleXMLElement->children() as $child) { + $children[] = $this->wrapSimpleXMLElement($child); + } + + return $children; + } + /** * @return string */ diff --git a/src/Spout/Reader/XLSX/Helper/CellValueFormatter.php b/src/Spout/Reader/XLSX/Helper/CellValueFormatter.php index 79f92e7..3c417ca 100644 --- a/src/Spout/Reader/XLSX/Helper/CellValueFormatter.php +++ b/src/Spout/Reader/XLSX/Helper/CellValueFormatter.php @@ -25,19 +25,35 @@ class CellValueFormatter /** Definition of XML attributes used to parse data */ const XML_ATTRIBUTE_TYPE = 't'; + const XML_ATTRIBUTE_STYLE_ID = 's'; + + /** Constants used for date formatting */ + const NUM_DAYS_FROM_JAN_1_1900_TO_JAN_1_1970 = 25569; // actually 25567 but accommodating for an Excel bug with some bissextile (leap) years + const NUM_SECONDS_IN_ONE_DAY = 86400; + + /** + * 1900 is NOT a leap year (February 29th, 1900 does not exist) but Excel thinks it is... 
+ * @see https://en.wikipedia.org/wiki/Year_1900_problem#Microsoft_Excel + */ + const ERRONEOUS_EXCEL_LEAP_YEAR_DAY = 60; /** @var SharedStringsHelper Helper to work with shared strings */ protected $sharedStringsHelper; + /** @var StyleHelper Helper to work with styles */ + protected $styleHelper; + /** @var \Box\Spout\Common\Escaper\XLSX Used to unescape XML data */ protected $escaper; /** * @param SharedStringsHelper $sharedStringsHelper Helper to work with shared strings + * @param StyleHelper $styleHelper Helper to work with styles */ - public function __construct($sharedStringsHelper) + public function __construct($sharedStringsHelper, $styleHelper) { $this->sharedStringsHelper = $sharedStringsHelper; + $this->styleHelper = $styleHelper; /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ $this->escaper = new \Box\Spout\Common\Escaper\XLSX(); @@ -53,6 +69,7 @@ class CellValueFormatter { // Default cell type is "n" $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE) ?: self::CELL_TYPE_NUMERIC; + $cellStyleId = intval($node->getAttribute(self::XML_ATTRIBUTE_STYLE_ID)); $vNodeValue = $this->getVNodeValue($node); if (($vNodeValue === '') && ($cellType !== self::CELL_TYPE_INLINE_STRING)) { @@ -69,7 +86,7 @@ class CellValueFormatter case self::CELL_TYPE_BOOLEAN: return $this->formatBooleanCellValue($vNodeValue); case self::CELL_TYPE_NUMERIC: - return $this->formatNumericCellValue($vNodeValue); + return $this->formatNumericCellValue($vNodeValue, $cellStyleId); case self::CELL_TYPE_DATE: return $this->formatDateCellValue($vNodeValue); default: @@ -138,14 +155,49 @@ class CellValueFormatter /** * Returns the cell Numeric value from string of nodeValue. + * The value can also represent a timestamp and a DateTime will be returned. 
* * @param string $nodeValue - * @return int|float The value associated with the cell + * @param int $cellStyleId 0 being the default style + * @return int|float|\DateTime|null The value associated with the cell */ - protected function formatNumericCellValue($nodeValue) + protected function formatNumericCellValue($nodeValue, $cellStyleId) { - $cellValue = is_int($nodeValue) ? intval($nodeValue) : floatval($nodeValue); - return $cellValue; + // Numeric values can represent numbers as well as timestamps. + // We need to look at the style of the cell to determine whether it is one or the other. + $shouldFormatAsDate = $this->styleHelper->shouldFormatNumericValueAsDate($cellStyleId); + + if ($shouldFormatAsDate) { + return $this->formatExcelTimestampValue(floatval($nodeValue)); + } else { + return is_int($nodeValue) ? intval($nodeValue) : floatval($nodeValue); + } + } + + /** + * Returns a cell's PHP Date value, associated to the given timestamp. + * NOTE: The timestamp is a float representing the number of days since January 1st, 1900. 
+ * + * @param float $nodeValue + * @return \DateTime|null The value associated with the cell or NULL if invalid date value + */ + protected function formatExcelTimestampValue($nodeValue) + { + $numDaysSince1970Jan1 = $nodeValue - self::NUM_DAYS_FROM_JAN_1_1900_TO_JAN_1_1970; + + // Fix for the erroneous leap year in Excel: values before the nonexistent February 29th, 1900 are shifted by one day + if ($nodeValue < self::ERRONEOUS_EXCEL_LEAP_YEAR_DAY) { + $numDaysSince1970Jan1++; + } + + $unixTimestamp = round($numDaysSince1970Jan1 * self::NUM_SECONDS_IN_ONE_DAY); + + try { + $cellValue = (new \DateTime())->setTimestamp($unixTimestamp); + return $cellValue; + } catch (\Exception $e) { + return null; + } } /** diff --git a/src/Spout/Reader/XLSX/Helper/StyleHelper.php b/src/Spout/Reader/XLSX/Helper/StyleHelper.php new file mode 100644 index 0000000..52d1844 --- /dev/null +++ b/src/Spout/Reader/XLSX/Helper/StyleHelper.php @@ -0,0 +1,224 @@ + FORMAT_CODE */ + protected $customNumberFormats; + + /** @var array Array containing a mapping STYLE_ID => [STYLE_ATTRIBUTES] */ + protected $stylesAttributes; + + /** + * @param string $filePath Path of the XLSX file being read + */ + public function __construct($filePath) + { + $this->filePath = $filePath; + } + + /** + * Reads the styles.xml file and extracts the relevant information from the file. + * + * @return void + */ + protected function extractRelevantInfo() + { + $this->customNumberFormats = []; + $this->stylesAttributes = []; + + $stylesXmlFilePath = $this->filePath .'#' . self::STYLES_XML_FILE_PATH; + $xmlReader = new XMLReader(); + + if ($xmlReader->open('zip://' . 
$stylesXmlFilePath)) { + while ($xmlReader->read()) { + if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_NUM_FMTS)) { + $numFmtsNode = new SimpleXMLElement($xmlReader->readOuterXml()); + $this->extractNumberFormats($numFmtsNode); + + } else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL_XFS)) { + $cellXfsNode = new SimpleXMLElement($xmlReader->readOuterXml()); + $this->extractStyleAttributes($cellXfsNode); + } + } + + $xmlReader->close(); + } + } + + /** + * Extracts number formats from the "numFmt" nodes, inside the "numFmts" section. + * For simplicity, the number formats are kept in memory. This is possible thanks + * to the reuse of formats. So 1 million cells should not use 1 million formats. + * + * @param SimpleXMLElement $numFmtsNode The "numFmts" node + * @return void + */ + protected function extractNumberFormats($numFmtsNode) + { + foreach ($numFmtsNode->children() as $numFmtNode) { + $numFmtId = intval($numFmtNode->getAttribute(self::XML_ATTRIBUTE_NUM_FMT_ID)); + $formatCode = $numFmtNode->getAttribute(self::XML_ATTRIBUTE_FORMAT_CODE); + $this->customNumberFormats[$numFmtId] = $formatCode; + } + } + + /** + * Extracts style attributes from the "xf" nodes, inside the "cellXfs" section. + * For simplicity, the styles attributes are kept in memory. This is possible thanks + * to the reuse of styles. So 1 million cells should not use 1 million styles. 
+ * + * @param SimpleXMLElement $cellXfsNode The "cellXfs" node + * @return void + */ + protected function extractStyleAttributes($cellXfsNode) + { + foreach ($cellXfsNode->children() as $xfNode) { + $this->stylesAttributes[] = [ + self::XML_ATTRIBUTE_NUM_FMT_ID => intval($xfNode->getAttribute(self::XML_ATTRIBUTE_NUM_FMT_ID)), + self::XML_ATTRIBUTE_APPLY_NUMBER_FORMAT => !!($xfNode->getAttribute(self::XML_ATTRIBUTE_APPLY_NUMBER_FORMAT)), + ]; + } + } + + /** + * @return array The custom number formats + */ + protected function getCustomNumberFormats() + { + if (!isset($this->customNumberFormats)) { + $this->extractRelevantInfo(); + } + + return $this->customNumberFormats; + } + + /** + * @return array The styles attributes + */ + protected function getStylesAttributes() + { + if (!isset($this->stylesAttributes)) { + $this->extractRelevantInfo(); + } + + return $this->stylesAttributes; + } + + /** + * Returns whether the style with the given ID should consider + * numeric values as timestamps and format the cell as a date. + * + * @param int $styleId Zero-based style ID + * @return bool Whether a cell with the given style should display a date instead of a numeric value + */ + public function shouldFormatNumericValueAsDate($styleId) + { + $stylesAttributes = $this->getStylesAttributes(); + + // Default style (0) does not format numeric values as timestamps. Only custom styles do. + // Also if the style ID does not exist in the styles.xml file, format as numeric value. 
+ if ($styleId === self::DEFAULT_STYLE_ID || !array_key_exists($styleId, $stylesAttributes)) { + return false; + } + + $styleAttributes = $stylesAttributes[$styleId]; + + $applyNumberFormat = $styleAttributes[self::XML_ATTRIBUTE_APPLY_NUMBER_FORMAT]; + if (!$applyNumberFormat) { + return false; + } + + $numFmtId = $styleAttributes[self::XML_ATTRIBUTE_NUM_FMT_ID]; + return $this->doesNumFmtIdIndicateDate($numFmtId); + } + + /** + * @param int $numFmtId + * @return bool Whether the number format ID indicates that the number is a timestamp + */ + protected function doesNumFmtIdIndicateDate($numFmtId) + { + return ( + $this->isNumFmtIdBuiltInDateFormat($numFmtId) || + $this->isNumFmtIdCustomDateFormat($numFmtId) + ); + } + + /** + * @param int $numFmtId + * @return bool Whether the number format ID is one of the built-in date format IDs + */ + protected function isNumFmtIdBuiltInDateFormat($numFmtId) + { + $builtInDateFormatIds = [14, 15, 16, 17, 18, 19, 20, 21, 22, 45, 46, 47]; + return in_array($numFmtId, $builtInDateFormatIds); + } + + /** + * @param int $numFmtId + * @return bool Whether the number format ID indicates that the number is a timestamp + */ + protected function isNumFmtIdCustomDateFormat($numFmtId) + { + $customNumberFormats = $this->getCustomNumberFormats(); + + if (!array_key_exists($numFmtId, $customNumberFormats)) { + return false; + } + + $customNumberFormat = $customNumberFormats[$numFmtId]; + + // Remove extra formatting (what's between [ ], the brackets should not be preceded by a "\") + $pattern = '((?sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath); $this->xmlReader = new XMLReader(); - $this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper); + + $this->styleHelper = new StyleHelper($filePath); + $this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper, $this->styleHelper); } /** diff --git a/tests/Spout/Reader/XLSX/Helper/StyleHelperTest.php 
b/tests/Spout/Reader/XLSX/Helper/StyleHelperTest.php new file mode 100644 index 0000000..3b8edff --- /dev/null +++ b/tests/Spout/Reader/XLSX/Helper/StyleHelperTest.php @@ -0,0 +1,134 @@ +getMockBuilder('\Box\Spout\Reader\XLSX\Helper\StyleHelper') + ->setMethods(['getCustomNumberFormats', 'getStylesAttributes']) + ->disableOriginalConstructor() + ->getMock(); + + $styleHelper->method('getStylesAttributes')->willReturn($styleAttributes); + $styleHelper->method('getCustomNumberFormats')->willReturn($customNumberFormats); + + return $styleHelper; + } + + /** + * @return void + */ + public function testShouldFormatNumericValueAsDateWithDefaultStyle() + { + $styleHelper = $this->getStyleHelperMock([]); + $shouldFormatAsDate = $styleHelper->shouldFormatNumericValueAsDate(0); + $this->assertFalse($shouldFormatAsDate); + } + + /** + * @return void + */ + public function testShouldFormatNumericValueAsDateWhenStyleIdNotListed() + { + $styleHelper = $this->getStyleHelperMock([['applyNumberFormat' => true]]); + $shouldFormatAsDate = $styleHelper->shouldFormatNumericValueAsDate(1); + $this->assertFalse($shouldFormatAsDate); + } + + /** + * @return void + */ + public function testShouldFormatNumericValueAsDateWhenShouldNotApplyNumberFormat() + { + $styleHelper = $this->getStyleHelperMock([[], ['applyNumberFormat' => false]]); + $shouldFormatAsDate = $styleHelper->shouldFormatNumericValueAsDate(1); + $this->assertFalse($shouldFormatAsDate); + } + + /** + * @return void + */ + public function testShouldFormatNumericValueAsDateWithBuiltinDateFormats() + { + $builtinNumFmtIdsForDate = [14, 15, 16, 17, 18, 19, 20, 21, 22, 45, 46, 47]; + + foreach ($builtinNumFmtIdsForDate as $builtinNumFmtIdForDate) { + $styleHelper = $this->getStyleHelperMock([[], ['applyNumberFormat' => true, 'numFmtId' => $builtinNumFmtIdForDate]]); + $shouldFormatAsDate = $styleHelper->shouldFormatNumericValueAsDate(1); + + $this->assertTrue($shouldFormatAsDate); + } + } + + /** + * @return void + */ + public 
function testShouldFormatNumericValueAsDateWhenCustomNumberFormatNotFound() + { + $styleHelper = $this->getStyleHelperMock([[], ['applyNumberFormat' => true, 'numFmtId' => 165]], [166 => []]); + $shouldFormatAsDate = $styleHelper->shouldFormatNumericValueAsDate(1); + + $this->assertFalse($shouldFormatAsDate); + } + + /** + * @return array + */ + public function dataProviderForCustomDateFormats() + { + return [ + // number format, expectedResult + ['[$-409]dddd\,\ mmmm\ d\,\ yy', true], + ['[$-409]d\-mmm\-yy;@', true], + ['[$-409]d\-mmm\-yyyy;@', true], + ['mm/dd/yy;@', true], + ['[$-F800]dddd\,\ mmmm\ dd\,\ yyyy', true], + ['m/d;@', true], + ['m/d/yy;@', true], + ['[$-409]d\-mmm;@', true], + ['[$-409]dd\-mmm\-yy;@', true], + ['[$-409]mmm\-yy;@', true], + ['[$-409]mmmm\-yy;@', true], + ['[$-409]mmmm\ d\,\ yyyy;@', true], + ['[$-409]m/d/yy\ h:mm\ AM/PM;@', true], + ['m/d/yy\ h:mm;@', true], + ['[$-409]mmmmm;@', true], + ['[$-409]mmmmm\-yy;@', true], + ['m/d/yyyy;@', true], + ['[$-409]m/d/yy\--h:mm;@', true], + ['GENERAL', false], + ['\ma\yb\e', false], + ['[Red]foo;', false], + ]; + } + + /** + * @dataProvider dataProviderForCustomDateFormats + * + * @param string $numberFormat + * @param bool $expectedResult + * @return void + */ + public function testShouldFormatNumericValueAsDateWithCustomDateFormats($numberFormat, $expectedResult) + { + $numFmtId = 165; + $styleHelper = $this->getStyleHelperMock([[], ['applyNumberFormat' => true, 'numFmtId' => $numFmtId]], [$numFmtId => $numberFormat]); + $shouldFormatAsDate = $styleHelper->shouldFormatNumericValueAsDate(1); + + $this->assertEquals($expectedResult, $shouldFormatAsDate); + } +} diff --git a/tests/Spout/Reader/XLSX/ReaderTest.php b/tests/Spout/Reader/XLSX/ReaderTest.php index eb42b84..c774749 100644 --- a/tests/Spout/Reader/XLSX/ReaderTest.php +++ b/tests/Spout/Reader/XLSX/ReaderTest.php @@ -113,6 +113,52 @@ class ReaderTest extends \PHPUnit_Framework_TestCase $this->assertEquals($expectedRows, $allRows); } + /** + 
* @return void + */ + public function testReadShouldSupportNumericTimestampFormattedDifferentlyAsDate() + { + // make sure dates are always created with the same timezone + date_default_timezone_set('UTC'); + + $allRows = $this->getAllRowsForFile('sheet_with_same_numeric_value_date_formatted_differently.xlsx'); + + $expectedDate = \DateTime::createFromFormat('Y-m-d H:i:s', '2015-01-01 00:00:00'); + $expectedRows = [ + array_fill(0, 10, $expectedDate), + array_fill(0, 10, $expectedDate), + array_fill(0, 10, $expectedDate), + array_merge(array_fill(0, 7, $expectedDate), ['', '', '']), + ]; + + $this->assertEquals($expectedRows, $allRows); + } + + /** + * @return void + */ + public function testReadShouldSupportDifferentDatesAsNumericTimestamp() + { + // make sure dates are always created with the same timezone + date_default_timezone_set('UTC'); + + $allRows = $this->getAllRowsForFile('sheet_with_different_numeric_value_dates.xlsx'); + + $expectedRows = [ + [ + \DateTime::createFromFormat('Y-m-d H:i:s', '2015-09-01 00:00:00'), + \DateTime::createFromFormat('Y-m-d H:i:s', '2015-09-02 00:00:00'), + \DateTime::createFromFormat('Y-m-d H:i:s', '2015-09-01 22:23:00'), + ], + [ + \DateTime::createFromFormat('Y-m-d H:i:s', '1900-02-28 23:59:59'), + \DateTime::createFromFormat('Y-m-d H:i:s', '1900-03-01 00:00:00'), + \DateTime::createFromFormat('Y-m-d H:i:s', '1900-02-28 11:00:00'), // 1900-02-29 should be converted to 1900-02-28 + ] + ]; + $this->assertEquals($expectedRows, $allRows); + } + /** * @return void */ diff --git a/tests/resources/xlsx/sheet_with_different_numeric_value_dates.xlsx b/tests/resources/xlsx/sheet_with_different_numeric_value_dates.xlsx new file mode 100644 index 0000000..1f565c1 Binary files /dev/null and b/tests/resources/xlsx/sheet_with_different_numeric_value_dates.xlsx differ diff --git a/tests/resources/xlsx/sheet_with_same_numeric_value_date_formatted_differently.xlsx 
b/tests/resources/xlsx/sheet_with_same_numeric_value_date_formatted_differently.xlsx new file mode 100644 index 0000000..8f5a20c Binary files /dev/null and b/tests/resources/xlsx/sheet_with_same_numeric_value_date_formatted_differently.xlsx differ