diff --git a/src/Spout/Reader/ODS/Helper/CellValueFormatter.php b/src/Spout/Reader/ODS/Helper/CellValueFormatter.php new file mode 100644 index 0000000..15a8cad --- /dev/null +++ b/src/Spout/Reader/ODS/Helper/CellValueFormatter.php @@ -0,0 +1,136 @@ +escaper = new \Box\Spout\Common\Escaper\ODS(); + } + + /** + * Returns the (unescaped) correctly marshalled, cell value associated to the given XML node. + * @TODO Add other types !! + * + * @param \DOMNode $node + * @return string|int|float|bool The value associated with the cell (or empty string if cell's type is undefined) + */ + public function extractAndFormatNodeValue($node) + { + $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE); + $pNodeValue = $this->getFirstPNodeValue($node); + + switch ($cellType) { + case self::CELL_TYPE_STRING: + return $this->formatStringCellValue($node); + case self::CELL_TYPE_FLOAT: + return $this->formatFloatCellValue($pNodeValue); + case self::CELL_TYPE_BOOLEAN: + return $this->formatBooleanCellValue($pNodeValue); + default: + return ''; + } + } + + /** + * Returns the value of the first "<p>" node within the given node. + * + * @param \DOMNode $node + * @return string Value for the first "<p>" node or empty string if no "<p>" found + */ + protected function getFirstPNodeValue($node) + { + $nodeValue = ''; + $pNodes = $node->getElementsByTagName(self::XML_NODE_P); + + if ($pNodes->length > 0) { + $nodeValue = $pNodes->item(0)->nodeValue; + } + + return $nodeValue; + } + + /** + * Returns the cell String value. 
+ * + * @param \DOMNode $node + * @return string The value associated with the cell + */ + protected function formatStringCellValue($node) + { + $pNodeValues = []; + $pNodes = $node->getElementsByTagName(self::XML_NODE_P); + + foreach ($pNodes as $pNode) { + $currentPValue = ''; + + foreach ($pNode->childNodes as $childNode) { + if ($childNode instanceof \DOMText) { + $currentPValue .= $childNode->nodeValue; + } else if ($childNode->nodeName === self::XML_NODE_S) { + $spaceAttribute = $childNode->getAttribute(self::XML_ATTRIBUTE_C); + $numSpaces = (!empty($spaceAttribute)) ? intval($spaceAttribute) : 1; + $currentPValue .= str_repeat(' ', $numSpaces); + } + } + + $pNodeValues[] = $currentPValue; + } + + $escapedCellValue = implode("\n", $pNodeValues); + $cellValue = $this->escaper->unescape($escapedCellValue); + return $cellValue; + } + + /** + * Returns the cell Numeric value from string of nodeValue. + * + * @param string $pNodeValue + * @return int|float The value associated with the cell + */ + protected function formatFloatCellValue($pNodeValue) + { + $cellValue = is_int($pNodeValue) ? intval($pNodeValue) : floatval($pNodeValue); + return $cellValue; + } + + /** + * Returns the cell Boolean value from a specific node's Value. + * + * @param string $pNodeValue + * @return bool The value associated with the cell + */ + protected function formatBooleanCellValue($pNodeValue) + { + // !! 
is similar to boolval() + $cellValue = !!$pNodeValue; + return $cellValue; + } +} diff --git a/src/Spout/Reader/ODS/RowIterator.php b/src/Spout/Reader/ODS/RowIterator.php index 1130226..7a3745f 100644 --- a/src/Spout/Reader/ODS/RowIterator.php +++ b/src/Spout/Reader/ODS/RowIterator.php @@ -6,6 +6,7 @@ use Box\Spout\Common\Exception\IOException; use Box\Spout\Reader\Exception\IteratorNotRewindableException; use Box\Spout\Reader\Exception\XMLProcessingException; use Box\Spout\Reader\IteratorInterface; +use Box\Spout\Reader\ODS\Helper\CellValueFormatter; use Box\Spout\Reader\Wrapper\XMLReader; /** @@ -15,32 +16,23 @@ use Box\Spout\Reader\Wrapper\XMLReader; */ class RowIterator implements IteratorInterface { - /** Definition of all possible cell types */ - const CELL_TYPE_STRING = 'string'; - const CELL_TYPE_BOOLEAN = 'boolean'; - const CELL_TYPE_FLOAT = 'float'; - /** Definition of XML nodes names used to parse data */ const XML_NODE_TABLE = 'table:table'; const XML_NODE_ROW = 'table:table-row'; const XML_NODE_CELL = 'table:table-cell'; - const XML_NODE_P = 'p'; - const XML_NODE_S = 'text:s'; /** Definition of XML attribute used to parse data */ - const XML_ATTRIBUTE_TYPE = 'office:value-type'; const XML_ATTRIBUTE_NUM_COLUMNS_REPEATED = 'table:number-columns-repeated'; - const XML_ATTRIBUTE_C = 'text:c'; /** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */ protected $xmlReader; + /** @var Helper\CellValueFormatter Helper to format cell values */ + protected $cellValueFormatter; + /** @var bool Whether the iterator has already been rewound once */ protected $hasAlreadyBeenRewound = false; - /** @var \Box\Spout\Common\Escaper\ODS Used to unescape XML data */ - protected $escaper; - /** @var int Number of read rows */ protected $numReadRows = 0; @@ -56,9 +48,7 @@ class RowIterator implements IteratorInterface public function __construct($xmlReader) { $this->xmlReader = $xmlReader; - - /** @noinspection 
PhpUnnecessaryFullyQualifiedNameInspection */ - $this->escaper = new \Box\Spout\Common\Escaper\ODS(); + $this->cellValueFormatter = new CellValueFormatter(); } /** @@ -182,101 +172,13 @@ class RowIterator implements IteratorInterface /** * Returns the (unescaped) correctly marshalled, cell value associated to the given XML node. - * @TODO Add other types !! * * @param \DOMNode $node * @return string|int|float|bool The value associated with the cell (or empty string if cell's type is undefined) */ protected function getCellValue($node) { - $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE); - $pNodeValue = $this->getTextPNodeValue($node); - - switch ($cellType) { - case self::CELL_TYPE_STRING: - return $this->formatStringCellValue($node); - case self::CELL_TYPE_FLOAT: - return $this->formatFloatCellValue($pNodeValue); - case self::CELL_TYPE_BOOLEAN: - return $this->formatBooleanCellValue($pNodeValue); - default: - return ''; - } - } - - /** - * Returns the value of the first "" node within the given node. - * - * @param \DOMNode $node - * @return string Value for the first "" node or empty string if no "" found - */ - protected function getTextPNodeValue($node) - { - $nodeValue = ''; - $pNodes = $node->getElementsByTagName(self::XML_NODE_P); - - if ($pNodes->length > 0) { - $nodeValue = $pNodes->item(0)->nodeValue; - } - - return $nodeValue; - } - - /** - * Returns the cell String value. - * - * @param \DOMNode $node - * @return string The value associated with the cell - */ - protected function formatStringCellValue($node) - { - $pNodeValues = []; - $pNodes = $node->getElementsByTagName(self::XML_NODE_P); - - foreach ($pNodes as $pNode) { - $currentPValue = ''; - - foreach ($pNode->childNodes as $childNode) { - if ($childNode instanceof \DOMText) { - $currentPValue .= $childNode->nodeValue; - } else if ($childNode->nodeName === self::XML_NODE_S) { - $spaceAttribute = $childNode->getAttribute(self::XML_ATTRIBUTE_C); - $numSpaces = (!empty($spaceAttribute)) ? 
intval($spaceAttribute) : 1; - $currentPValue .= str_repeat(' ', $numSpaces); - } - } - - $pNodeValues[] = $currentPValue; - } - - $escapedCellValue = implode("\n", $pNodeValues); - $cellValue = $this->escaper->unescape($escapedCellValue); - return $cellValue; - } - - /** - * Returns the cell Numeric value from string of nodeValue. - * - * @param string $pNodeValue - * @return int|float The value associated with the cell - */ - protected function formatFloatCellValue($pNodeValue) - { - $cellValue = is_int($pNodeValue) ? intval($pNodeValue) : floatval($pNodeValue); - return $cellValue; - } - - /** - * Returns the cell Boolean value from a specific node's Value. - * - * @param string $pNodeValue - * @return bool The value associated with the cell - */ - protected function formatBooleanCellValue($pNodeValue) - { - // !! is similar to boolval() - $cellValue = !!$pNodeValue; - return $cellValue; + return $this->cellValueFormatter->extractAndFormatNodeValue($node); } /** diff --git a/src/Spout/Reader/XLSX/Helper/CellValueFormatter.php b/src/Spout/Reader/XLSX/Helper/CellValueFormatter.php new file mode 100644 index 0000000..99d4920 --- /dev/null +++ b/src/Spout/Reader/XLSX/Helper/CellValueFormatter.php @@ -0,0 +1,180 @@ +sharedStringsHelper = $sharedStringsHelper; + + /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ + $this->escaper = new \Box\Spout\Common\Escaper\XLSX(); + } + + /** + * Returns the (unescaped) correctly marshalled, cell value associated to the given XML node. 
+ * + * @param \DOMNode $node + * @return string|int|float|bool|\DateTime|null The value associated with the cell (null when the cell has an error) + */ + public function extractAndFormatNodeValue($node) + { + // Default cell type is "n" + $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE) ?: self::CELL_TYPE_NUMERIC; + $vNodeValue = $this->getVNodeValue($node); + + if (($vNodeValue === '') && ($cellType !== self::CELL_TYPE_INLINE_STRING)) { + return $vNodeValue; + } + + switch ($cellType) { + case self::CELL_TYPE_INLINE_STRING: + return $this->formatInlineStringCellValue($node); + case self::CELL_TYPE_SHARED_STRING: + return $this->formatSharedStringCellValue($vNodeValue); + case self::CELL_TYPE_STR: + return $this->formatStrCellValue($vNodeValue); + case self::CELL_TYPE_BOOLEAN: + return $this->formatBooleanCellValue($vNodeValue); + case self::CELL_TYPE_NUMERIC: + return $this->formatNumericCellValue($vNodeValue); + case self::CELL_TYPE_DATE: + return $this->formatDateCellValue($vNodeValue); + default: + return null; + } + } + + /** + * Returns the cell's string value from a node's nested value node + * + * @param \DOMNode $node + * @return string The value associated with the cell + */ + protected function getVNodeValue($node) + { + // for cell types having a "v" tag containing the value. + // if not, the returned value should be empty string. + $vNode = $node->getElementsByTagName(self::XML_NODE_VALUE)->item(0); + return ($vNode !== null) ? $vNode->nodeValue : ''; + } + + /** + * Returns the cell String value where string is inline. 
+ * + * @param \DOMNode $node + * @return string The value associated with the cell (null when the cell has an error) + */ + protected function formatInlineStringCellValue($node) + { + // inline strings are formatted this way: + // <c r="A1" t="inlineStr"><is><t>[INLINE_STRING]</t></is></c> + $tNode = $node->getElementsByTagName(self::XML_NODE_INLINE_STRING_VALUE)->item(0); + $escapedCellValue = trim($tNode->nodeValue); + $cellValue = $this->escaper->unescape($escapedCellValue); + return $cellValue; + } + + /** + * Returns the cell String value from shared-strings file using nodeValue index. + * + * @param string $nodeValue + * @return string The value associated with the cell (null when the cell has an error) + */ + protected function formatSharedStringCellValue($nodeValue) + { + // shared strings are formatted this way: + // <c r="A1" t="s"><v>[SHARED_STRING_INDEX]</v></c> + $sharedStringIndex = intval($nodeValue); + $escapedCellValue = $this->sharedStringsHelper->getStringAtIndex($sharedStringIndex); + $cellValue = $this->escaper->unescape($escapedCellValue); + return $cellValue; + } + + /** + * Returns the cell String value, where string is stored in value node. + * + * @param string $nodeValue + * @return string The value associated with the cell (null when the cell has an error) + */ + protected function formatStrCellValue($nodeValue) + { + $escapedCellValue = trim($nodeValue); + $cellValue = $this->escaper->unescape($escapedCellValue); + return $cellValue; + } + + /** + * Returns the cell Numeric value from string of nodeValue. + * + * @param string $nodeValue + * @return int|float The value associated with the cell + */ + protected function formatNumericCellValue($nodeValue) + { + $cellValue = is_int($nodeValue) ? intval($nodeValue) : floatval($nodeValue); + return $cellValue; + } + + /** + * Returns the cell Boolean value from a specific node's Value. + * + * @param string $nodeValue + * @return bool The value associated with the cell + */ + protected function formatBooleanCellValue($nodeValue) + { + // !! 
is similar to boolval() + $cellValue = !!$nodeValue; + return $cellValue; + } + + /** + * Returns a cell's PHP Date value, associated to the given stored nodeValue. + * + * @param string $nodeValue + * @return \DateTime|null The value associated with the cell (null when the cell has an error) + */ + protected function formatDateCellValue($nodeValue) + { + // Mitigate thrown Exception on invalid date-time format (http://php.net/manual/en/datetime.construct.php) + try { + $cellValue = new \DateTime($nodeValue); + return $cellValue; + } catch (\Exception $e) { + return null; + } + } +} diff --git a/src/Spout/Reader/XLSX/RowIterator.php b/src/Spout/Reader/XLSX/RowIterator.php index a6f1287..c9066c1 100644 --- a/src/Spout/Reader/XLSX/RowIterator.php +++ b/src/Spout/Reader/XLSX/RowIterator.php @@ -7,6 +7,7 @@ use Box\Spout\Reader\Exception\XMLProcessingException; use Box\Spout\Reader\IteratorInterface; use Box\Spout\Reader\Wrapper\XMLReader; use Box\Spout\Reader\XLSX\Helper\CellHelper; +use Box\Spout\Reader\XLSX\Helper\CellValueFormatter; /** * Class RowIterator @@ -15,28 +16,16 @@ use Box\Spout\Reader\XLSX\Helper\CellHelper; */ class RowIterator implements IteratorInterface { - /** Definition of all possible cell types */ - const CELL_TYPE_INLINE_STRING = 'inlineStr'; - const CELL_TYPE_STR = 'str'; - const CELL_TYPE_SHARED_STRING = 's'; - const CELL_TYPE_BOOLEAN = 'b'; - const CELL_TYPE_NUMERIC = 'n'; - const CELL_TYPE_DATE = 'd'; - const CELL_TYPE_ERROR = 'e'; - /** Definition of XML nodes names used to parse data */ const XML_NODE_DIMENSION = 'dimension'; const XML_NODE_WORKSHEET = 'worksheet'; const XML_NODE_ROW = 'row'; const XML_NODE_CELL = 'c'; - const XML_NODE_VALUE = 'v'; - const XML_NODE_INLINE_STRING_VALUE = 't'; /** Definition of XML attributes used to parse data */ const XML_ATTRIBUTE_REF = 'ref'; const XML_ATTRIBUTE_SPANS = 'spans'; const XML_ATTRIBUTE_CELL_INDEX = 'r'; - const XML_ATTRIBUTE_TYPE = 't'; /** @var string Path of the XLSX file being read */ 
protected $filePath; @@ -44,14 +33,11 @@ class RowIterator implements IteratorInterface /** @var string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml */ protected $sheetDataXMLFilePath; - /** @var Helper\SharedStringsHelper Helper to work with shared strings */ - protected $sharedStringsHelper; - /** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */ protected $xmlReader; - /** @var \Box\Spout\Common\Escaper\XLSX Used to unescape XML data */ - protected $escaper; + /** @var Helper\CellValueFormatter Helper to format cell values */ + protected $cellValueFormatter; /** @var int Number of read rows */ protected $numReadRows = 0; @@ -74,12 +60,9 @@ class RowIterator implements IteratorInterface { $this->filePath = $filePath; $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath); - $this->sharedStringsHelper = $sharedStringsHelper; $this->xmlReader = new XMLReader(); - - /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ - $this->escaper = new \Box\Spout\Common\Escaper\XLSX(); + $this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper); } /** @@ -200,131 +183,7 @@ class RowIterator implements IteratorInterface */ protected function getCellValue($node) { - // Default cell type is "n" - $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE) ?: self::CELL_TYPE_NUMERIC; - $vNodeValue = $this->getVNodeValue($node); - - if (($vNodeValue === '') && ($cellType !== self::CELL_TYPE_INLINE_STRING)) { - return $vNodeValue; - } - - switch ($cellType) { - case self::CELL_TYPE_INLINE_STRING: - return $this->formatInlineStringCellValue($node); - case self::CELL_TYPE_SHARED_STRING: - return $this->formatSharedStringCellValue($vNodeValue); - case self::CELL_TYPE_STR: - return $this->formatStrCellValue($vNodeValue); - case self::CELL_TYPE_BOOLEAN: - return $this->formatBooleanCellValue($vNodeValue); - case self::CELL_TYPE_NUMERIC: - return 
$this->formatNumericCellValue($vNodeValue); - case self::CELL_TYPE_DATE: - return $this->formatDateCellValue($vNodeValue); - default: - return null; - } - } - - /** - * Returns the cell's string value from a node's nested value node - * - * @param \DOMNode $node - * @return string The value associated with the cell - */ - protected function getVNodeValue($node) - { - // for cell types having a "v" tag containing the value. - // if not, the returned value should be empty string. - $vNode = $node->getElementsByTagName(self::XML_NODE_VALUE)->item(0); - return ($vNode !== null) ? $vNode->nodeValue : ''; - } - - /** - * Returns the cell String value where string is inline. - * - * @param \DOMNode $node - * @return string The value associated with the cell (null when the cell has an error) - */ - protected function formatInlineStringCellValue($node) - { - // inline strings are formatted this way: - // [INLINE_STRING] - $tNode = $node->getElementsByTagName(self::XML_NODE_INLINE_STRING_VALUE)->item(0); - $escapedCellValue = trim($tNode->nodeValue); - $cellValue = $this->escaper->unescape($escapedCellValue); - return $cellValue; - } - - /** - * Returns the cell String value from shared-strings file using nodeValue index. - * - * @param string $nodeValue - * @return string The value associated with the cell (null when the cell has an error) - */ - protected function formatSharedStringCellValue($nodeValue) - { - // shared strings are formatted this way: - // [SHARED_STRING_INDEX] - $sharedStringIndex = intval($nodeValue); - $escapedCellValue = $this->sharedStringsHelper->getStringAtIndex($sharedStringIndex); - $cellValue = $this->escaper->unescape($escapedCellValue); - return $cellValue; - } - - /** - * Returns the cell String value, where string is stored in value node. 
- * - * @param string $nodeValue - * @return string The value associated with the cell (null when the cell has an error) - */ - protected function formatStrCellValue($nodeValue) - { - $escapedCellValue = trim($nodeValue); - $cellValue = $this->escaper->unescape($escapedCellValue); - return $cellValue; - } - - /** - * Returns the cell Numeric value from string of nodeValue. - * - * @param string $nodeValue - * @return int|float The value associated with the cell - */ - protected function formatNumericCellValue($nodeValue) - { - $cellValue = is_int($nodeValue) ? intval($nodeValue) : floatval($nodeValue); - return $cellValue; - } - - /** - * Returns the cell Boolean value from a specific node's Value. - * - * @param string $nodeValue - * @return bool The value associated with the cell - */ - protected function formatBooleanCellValue($nodeValue) - { - // !! is similar to boolval() - $cellValue = !!$nodeValue; - return $cellValue; - } - - /** - * Returns a cell's PHP Date value, associated to the given stored nodeValue. - * - * @param string $nodeValue - * @return \DateTime|null The value associated with the cell (null when the cell has an error) - */ - protected function formatDateCellValue($nodeValue) - { - // Mitigate thrown Exception on invalid date-time format (http://php.net/manual/en/datetime.construct.php) - try { - $cellValue = new \DateTime($nodeValue); - return $cellValue; - } catch (\Exception $e) { - return null; - } + return $this->cellValueFormatter->extractAndFormatNodeValue($node); } /**