Merge pull request #112 from box/cell_formatters

Moved cell value formatting logic into formatters
This commit is contained in:
Adrien Loison 2015-09-02 00:21:25 -07:00
commit aa7978146f
4 changed files with 327 additions and 250 deletions

View File

@ -0,0 +1,136 @@
<?php
namespace Box\Spout\Reader\ODS\Helper;
/**
 * Class CellValueFormatter
 * This class provides helper functions to format cell values.
 * It turns the XML node of an ODS cell into a PHP value (string, int, float or bool),
 * based on the cell's "office:value-type" attribute.
 *
 * @package Box\Spout\Reader\ODS\Helper
 */
class CellValueFormatter
{
    /** Definition of all possible cell types */
    const CELL_TYPE_STRING = 'string';
    const CELL_TYPE_BOOLEAN = 'boolean';
    const CELL_TYPE_FLOAT = 'float';

    /** Definition of XML nodes names used to parse data */
    const XML_NODE_P = 'p';
    const XML_NODE_S = 'text:s';

    /** Definition of XML attribute used to parse data */
    const XML_ATTRIBUTE_TYPE = 'office:value-type';
    const XML_ATTRIBUTE_VALUE = 'office:value';
    const XML_ATTRIBUTE_BOOLEAN_VALUE = 'office:boolean-value';
    const XML_ATTRIBUTE_C = 'text:c';

    /** @var \Box\Spout\Common\Escaper\ODS Used to unescape XML data */
    protected $escaper;

    /**
     * Creates the escaper used to unescape string cell values.
     */
    public function __construct()
    {
        /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
        $this->escaper = new \Box\Spout\Common\Escaper\ODS();
    }

    /**
     * Returns the (unescaped) correctly marshalled, cell value associated to the given XML node.
     * @TODO Add other types !!
     *
     * @param \DOMNode $node Cell node; it must expose getAttribute() and getElementsByTagName()
     * @return string|int|float|bool The value associated with the cell (or empty string if cell's type is undefined)
     */
    public function extractAndFormatNodeValue($node)
    {
        $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE);

        // The raw value is only looked up for the types that need it:
        // string cells are rebuilt from all their paragraphs instead.
        switch ($cellType) {
            case self::CELL_TYPE_STRING:
                return $this->formatStringCellValue($node);
            case self::CELL_TYPE_FLOAT:
                return $this->formatFloatCellValue($this->getRawCellValue($node, self::XML_ATTRIBUTE_VALUE));
            case self::CELL_TYPE_BOOLEAN:
                return $this->formatBooleanCellValue($this->getRawCellValue($node, self::XML_ATTRIBUTE_BOOLEAN_VALUE));
            default:
                return '';
        }
    }

    /**
     * Returns the raw (string) value of a typed cell.
     * The OpenDocument format stores the machine-readable value in an attribute of the cell,
     * whereas the "<text:p>" content is only the text displayed to the user (which may be
     * localized or formatted, e.g. "FALSE" or "1,234.50"). The attribute is therefore preferred
     * and the first "<text:p>" value is only used when the attribute is missing or empty.
     *
     * @param \DOMNode $node
     * @param string $valueAttributeName Name of the attribute holding the raw value
     * @return string The raw value, or empty string if neither the attribute nor a "<text:p>" node is found
     */
    protected function getRawCellValue($node, $valueAttributeName)
    {
        // getAttribute() returns an empty string when the attribute does not exist
        $attributeValue = $node->getAttribute($valueAttributeName);

        return ($attributeValue !== '') ? $attributeValue : $this->getFirstPNodeValue($node);
    }

    /**
     * Returns the value of the first "<text:p>" node within the given node.
     *
     * @param \DOMNode $node
     * @return string Value for the first "<text:p>" node or empty string if no "<text:p>" found
     */
    protected function getFirstPNodeValue($node)
    {
        $nodeValue = '';
        $pNodes = $node->getElementsByTagName(self::XML_NODE_P);

        if ($pNodes->length > 0) {
            $nodeValue = $pNodes->item(0)->nodeValue;
        }

        return $nodeValue;
    }

    /**
     * Returns the cell String value.
     * Every "<text:p>" node found in the cell becomes one line of the resulting string
     * (lines are joined with "\n"). Within a paragraph, only text nodes and "<text:s>"
     * nodes are taken into account; other child nodes are ignored.
     *
     * @param \DOMNode $node
     * @return string The value associated with the cell
     */
    protected function formatStringCellValue($node)
    {
        $pNodeValues = [];
        $pNodes = $node->getElementsByTagName(self::XML_NODE_P);

        foreach ($pNodes as $pNode) {
            $currentPValue = '';

            foreach ($pNode->childNodes as $childNode) {
                if ($childNode instanceof \DOMText) {
                    $currentPValue .= $childNode->nodeValue;
                } else if ($childNode->nodeName === self::XML_NODE_S) {
                    // "<text:s>" stands for a run of spaces; "text:c" gives their count (1 when missing or empty)
                    $spaceAttribute = $childNode->getAttribute(self::XML_ATTRIBUTE_C);
                    $numSpaces = (!empty($spaceAttribute)) ? intval($spaceAttribute) : 1;
                    $currentPValue .= str_repeat(' ', $numSpaces);
                }
            }

            $pNodeValues[] = $currentPValue;
        }

        $escapedCellValue = implode("\n", $pNodeValues);
        $cellValue = $this->escaper->unescape($escapedCellValue);

        return $cellValue;
    }

    /**
     * Returns the cell Numeric value from string of nodeValue.
     * A value written as an integer literal that fits in a PHP int (e.g. "42") is returned
     * as an int; anything else (e.g. "10.43", "1.0", "1e3", out-of-range integers) is
     * returned as a float.
     *
     * @param string $pNodeValue
     * @return int|float The value associated with the cell
     */
    protected function formatFloatCellValue($pNodeValue)
    {
        // is_int() cannot be used here: the value is always a string, so it would never match.
        // FILTER_VALIDATE_INT yields the int for integer literals in range, false otherwise.
        $intValue = filter_var($pNodeValue, FILTER_VALIDATE_INT);

        return ($intValue !== false) ? $intValue : floatval($pNodeValue);
    }

    /**
     * Returns the cell Boolean value from a specific node's Value.
     * "false" (in any letter case), "0" and the empty string give false;
     * any other value gives true.
     *
     * @param string $pNodeValue
     * @return bool The value associated with the cell
     */
    protected function formatBooleanCellValue($pNodeValue)
    {
        // A plain boolean cast would turn the non-empty string "false" into true
        if (strcasecmp($pNodeValue, 'false') === 0) {
            return false;
        }

        // !! is similar to boolval()
        return !!$pNodeValue;
    }
}

View File

@ -6,6 +6,7 @@ use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\Exception\IteratorNotRewindableException;
use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Reader\ODS\Helper\CellValueFormatter;
use Box\Spout\Reader\Wrapper\XMLReader;
/**
@ -15,32 +16,23 @@ use Box\Spout\Reader\Wrapper\XMLReader;
*/
class RowIterator implements IteratorInterface
{
/** Definition of all possible cell types */
const CELL_TYPE_STRING = 'string';
const CELL_TYPE_BOOLEAN = 'boolean';
const CELL_TYPE_FLOAT = 'float';
/** Definition of XML nodes names used to parse data */
const XML_NODE_TABLE = 'table:table';
const XML_NODE_ROW = 'table:table-row';
const XML_NODE_CELL = 'table:table-cell';
const XML_NODE_P = 'p';
const XML_NODE_S = 'text:s';
/** Definition of XML attribute used to parse data */
const XML_ATTRIBUTE_TYPE = 'office:value-type';
const XML_ATTRIBUTE_NUM_COLUMNS_REPEATED = 'table:number-columns-repeated';
const XML_ATTRIBUTE_C = 'text:c';
/** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */
protected $xmlReader;
/** @var Helper\CellValueFormatter Helper to format cell values */
protected $cellValueFormatter;
/** @var bool Whether the iterator has already been rewound once */
protected $hasAlreadyBeenRewound = false;
/** @var \Box\Spout\Common\Escaper\ODS Used to unescape XML data */
protected $escaper;
/** @var int Number of read rows */
protected $numReadRows = 0;
@ -56,9 +48,7 @@ class RowIterator implements IteratorInterface
public function __construct($xmlReader)
{
$this->xmlReader = $xmlReader;
/** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
$this->escaper = new \Box\Spout\Common\Escaper\ODS();
$this->cellValueFormatter = new CellValueFormatter();
}
/**
@ -182,101 +172,13 @@ class RowIterator implements IteratorInterface
/**
* Returns the (unescaped) correctly marshalled, cell value associated to the given XML node.
* @TODO Add other types !!
*
* @param \DOMNode $node
* @return string|int|float|bool The value associated with the cell (or empty string if cell's type is undefined)
*/
protected function getCellValue($node)
{
$cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE);
$pNodeValue = $this->getTextPNodeValue($node);
switch ($cellType) {
case self::CELL_TYPE_STRING:
return $this->formatStringCellValue($node);
case self::CELL_TYPE_FLOAT:
return $this->formatFloatCellValue($pNodeValue);
case self::CELL_TYPE_BOOLEAN:
return $this->formatBooleanCellValue($pNodeValue);
default:
return '';
}
}
/**
* Returns the value of the first "<text:p>" node within the given node.
*
* @param \DOMNode $node
* @return string Value for the first "<text:p>" node or empty string if no "<text:p>" found
*/
protected function getTextPNodeValue($node)
{
$nodeValue = '';
$pNodes = $node->getElementsByTagName(self::XML_NODE_P);
if ($pNodes->length > 0) {
$nodeValue = $pNodes->item(0)->nodeValue;
}
return $nodeValue;
}
/**
* Returns the cell String value.
*
* @param \DOMNode $node
* @return string The value associated with the cell
*/
protected function formatStringCellValue($node)
{
$pNodeValues = [];
$pNodes = $node->getElementsByTagName(self::XML_NODE_P);
foreach ($pNodes as $pNode) {
$currentPValue = '';
foreach ($pNode->childNodes as $childNode) {
if ($childNode instanceof \DOMText) {
$currentPValue .= $childNode->nodeValue;
} else if ($childNode->nodeName === self::XML_NODE_S) {
$spaceAttribute = $childNode->getAttribute(self::XML_ATTRIBUTE_C);
$numSpaces = (!empty($spaceAttribute)) ? intval($spaceAttribute) : 1;
$currentPValue .= str_repeat(' ', $numSpaces);
}
}
$pNodeValues[] = $currentPValue;
}
$escapedCellValue = implode("\n", $pNodeValues);
$cellValue = $this->escaper->unescape($escapedCellValue);
return $cellValue;
}
/**
* Returns the cell Numeric value from string of nodeValue.
*
* @param string $pNodeValue
* @return int|float The value associated with the cell
*/
protected function formatFloatCellValue($pNodeValue)
{
$cellValue = is_int($pNodeValue) ? intval($pNodeValue) : floatval($pNodeValue);
return $cellValue;
}
/**
* Returns the cell Boolean value from a specific node's Value.
*
* @param string $pNodeValue
* @return bool The value associated with the cell
*/
protected function formatBooleanCellValue($pNodeValue)
{
// !! is similar to boolval()
$cellValue = !!$pNodeValue;
return $cellValue;
return $this->cellValueFormatter->extractAndFormatNodeValue($node);
}
/**

View File

@ -0,0 +1,180 @@
<?php
namespace Box\Spout\Reader\XLSX\Helper;
/**
 * Class CellValueFormatter
 * This class provides helper functions to format cell values.
 * It turns the XML node of an XLSX cell into a PHP value, based on the cell's "t" attribute.
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class CellValueFormatter
{
    /** Definition of all possible cell types */
    const CELL_TYPE_INLINE_STRING = 'inlineStr';
    const CELL_TYPE_STR = 'str';
    const CELL_TYPE_SHARED_STRING = 's';
    const CELL_TYPE_BOOLEAN = 'b';
    const CELL_TYPE_NUMERIC = 'n';
    const CELL_TYPE_DATE = 'd';
    const CELL_TYPE_ERROR = 'e';

    /** Definition of XML nodes names used to parse data */
    const XML_NODE_VALUE = 'v';
    const XML_NODE_INLINE_STRING_VALUE = 't';

    /** Definition of XML attributes used to parse data */
    const XML_ATTRIBUTE_TYPE = 't';

    /** @var SharedStringsHelper Helper to work with shared strings */
    protected $sharedStringsHelper;

    /** @var \Box\Spout\Common\Escaper\XLSX Used to unescape XML data */
    protected $escaper;

    /**
     * @param SharedStringsHelper $sharedStringsHelper Helper to work with shared strings
     */
    public function __construct($sharedStringsHelper)
    {
        $this->sharedStringsHelper = $sharedStringsHelper;

        /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
        $this->escaper = new \Box\Spout\Common\Escaper\XLSX();
    }

    /**
     * Returns the (unescaped) correctly marshalled, cell value associated to the given XML node.
     *
     * @param \DOMNode $node Cell node; it must expose getAttribute() and getElementsByTagName()
     * @return string|int|float|bool|\DateTime|null The value associated with the cell (null when the cell has an error)
     */
    public function extractAndFormatNodeValue($node)
    {
        // Default cell type is "n"
        $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE) ?: self::CELL_TYPE_NUMERIC;
        $vNodeValue = $this->getVNodeValue($node);

        // Inline strings do not store their value in a "v" node, so they are the only
        // type for which an empty "v" value does not mean "empty cell".
        if (($vNodeValue === '') && ($cellType !== self::CELL_TYPE_INLINE_STRING)) {
            return $vNodeValue;
        }

        switch ($cellType) {
            case self::CELL_TYPE_INLINE_STRING:
                return $this->formatInlineStringCellValue($node);
            case self::CELL_TYPE_SHARED_STRING:
                return $this->formatSharedStringCellValue($vNodeValue);
            case self::CELL_TYPE_STR:
                return $this->formatStrCellValue($vNodeValue);
            case self::CELL_TYPE_BOOLEAN:
                return $this->formatBooleanCellValue($vNodeValue);
            case self::CELL_TYPE_NUMERIC:
                return $this->formatNumericCellValue($vNodeValue);
            case self::CELL_TYPE_DATE:
                return $this->formatDateCellValue($vNodeValue);
            case self::CELL_TYPE_ERROR:
                // cells holding an error have no usable value (same result as unknown types)
            default:
                return null;
        }
    }

    /**
     * Returns the cell's string value from a node's nested value node
     *
     * @param \DOMNode $node
     * @return string The value associated with the cell
     */
    protected function getVNodeValue($node)
    {
        // for cell types having a "v" tag containing the value.
        // if not, the returned value should be empty string.
        $vNode = $node->getElementsByTagName(self::XML_NODE_VALUE)->item(0);

        return ($vNode !== null) ? $vNode->nodeValue : '';
    }

    /**
     * Returns the cell String value where string is inline.
     *
     * @param \DOMNode $node
     * @return string The trimmed, unescaped value associated with the cell (empty string if the cell has no "t" node)
     */
    protected function formatInlineStringCellValue($node)
    {
        // inline strings are formatted this way:
        // <c r="A1" t="inlineStr"><is><t>[INLINE_STRING]</t></is></c>
        $tNode = $node->getElementsByTagName(self::XML_NODE_INLINE_STRING_VALUE)->item(0);

        // item(0) is null when the cell contains no "t" node: treat it as an empty cell
        // instead of failing on a null dereference.
        if ($tNode === null) {
            return '';
        }

        $escapedCellValue = trim($tNode->nodeValue);
        $cellValue = $this->escaper->unescape($escapedCellValue);

        return $cellValue;
    }

    /**
     * Returns the cell String value from shared-strings file using nodeValue index.
     *
     * @param string $nodeValue Index of the shared string
     * @return string The unescaped string returned by the shared strings helper for that index
     */
    protected function formatSharedStringCellValue($nodeValue)
    {
        // shared strings are formatted this way:
        // <c r="A1" t="s"><v>[SHARED_STRING_INDEX]</v></c>
        $sharedStringIndex = intval($nodeValue);
        $escapedCellValue = $this->sharedStringsHelper->getStringAtIndex($sharedStringIndex);
        $cellValue = $this->escaper->unescape($escapedCellValue);

        return $cellValue;
    }

    /**
     * Returns the cell String value, where string is stored in value node.
     *
     * @param string $nodeValue
     * @return string The trimmed, unescaped value associated with the cell
     */
    protected function formatStrCellValue($nodeValue)
    {
        $escapedCellValue = trim($nodeValue);
        $cellValue = $this->escaper->unescape($escapedCellValue);

        return $cellValue;
    }

    /**
     * Returns the cell Numeric value from string of nodeValue.
     * A value written as an integer literal that fits in a PHP int (e.g. "42") is returned
     * as an int; anything else (e.g. "10.43", "1.0", "1E3", out-of-range integers) is
     * returned as a float.
     *
     * @param string $nodeValue
     * @return int|float The value associated with the cell
     */
    protected function formatNumericCellValue($nodeValue)
    {
        // is_int() cannot be used here: the value is always a string, so it would never match.
        // FILTER_VALIDATE_INT yields the int for integer literals in range, false otherwise.
        $intValue = filter_var($nodeValue, FILTER_VALIDATE_INT);

        return ($intValue !== false) ? $intValue : floatval($nodeValue);
    }

    /**
     * Returns the cell Boolean value from a specific node's Value.
     * The value is cast with PHP's boolean conversion rules, so "0" gives false.
     *
     * @param string $nodeValue
     * @return bool The value associated with the cell
     */
    protected function formatBooleanCellValue($nodeValue)
    {
        // !! is similar to boolval()
        $cellValue = !!$nodeValue;

        return $cellValue;
    }

    /**
     * Returns a cell's PHP Date value, associated to the given stored nodeValue.
     *
     * @param string $nodeValue
     * @return \DateTime|null The value associated with the cell (null when the value cannot be parsed as a date)
     */
    protected function formatDateCellValue($nodeValue)
    {
        // Mitigate thrown Exception on invalid date-time format (http://php.net/manual/en/datetime.construct.php)
        try {
            $cellValue = new \DateTime($nodeValue);
            return $cellValue;
        } catch (\Exception $e) {
            return null;
        }
    }
}

View File

@ -7,6 +7,7 @@ use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Helper\CellHelper;
use Box\Spout\Reader\XLSX\Helper\CellValueFormatter;
/**
* Class RowIterator
@ -15,28 +16,16 @@ use Box\Spout\Reader\XLSX\Helper\CellHelper;
*/
class RowIterator implements IteratorInterface
{
/** Definition of all possible cell types */
const CELL_TYPE_INLINE_STRING = 'inlineStr';
const CELL_TYPE_STR = 'str';
const CELL_TYPE_SHARED_STRING = 's';
const CELL_TYPE_BOOLEAN = 'b';
const CELL_TYPE_NUMERIC = 'n';
const CELL_TYPE_DATE = 'd';
const CELL_TYPE_ERROR = 'e';
/** Definition of XML nodes names used to parse data */
const XML_NODE_DIMENSION = 'dimension';
const XML_NODE_WORKSHEET = 'worksheet';
const XML_NODE_ROW = 'row';
const XML_NODE_CELL = 'c';
const XML_NODE_VALUE = 'v';
const XML_NODE_INLINE_STRING_VALUE = 't';
/** Definition of XML attributes used to parse data */
const XML_ATTRIBUTE_REF = 'ref';
const XML_ATTRIBUTE_SPANS = 'spans';
const XML_ATTRIBUTE_CELL_INDEX = 'r';
const XML_ATTRIBUTE_TYPE = 't';
/** @var string Path of the XLSX file being read */
protected $filePath;
@ -44,14 +33,11 @@ class RowIterator implements IteratorInterface
/** @var string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml */
protected $sheetDataXMLFilePath;
/** @var Helper\SharedStringsHelper Helper to work with shared strings */
protected $sharedStringsHelper;
/** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */
protected $xmlReader;
/** @var \Box\Spout\Common\Escaper\XLSX Used to unescape XML data */
protected $escaper;
/** @var Helper\CellValueFormatter Helper to format cell values */
protected $cellValueFormatter;
/** @var int Number of read rows */
protected $numReadRows = 0;
@ -74,12 +60,9 @@ class RowIterator implements IteratorInterface
{
$this->filePath = $filePath;
$this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath);
$this->sharedStringsHelper = $sharedStringsHelper;
$this->xmlReader = new XMLReader();
/** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
$this->escaper = new \Box\Spout\Common\Escaper\XLSX();
$this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper);
}
/**
@ -200,131 +183,7 @@ class RowIterator implements IteratorInterface
*/
protected function getCellValue($node)
{
// Default cell type is "n"
$cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE) ?: self::CELL_TYPE_NUMERIC;
$vNodeValue = $this->getVNodeValue($node);
if (($vNodeValue === '') && ($cellType !== self::CELL_TYPE_INLINE_STRING)) {
return $vNodeValue;
}
switch ($cellType) {
case self::CELL_TYPE_INLINE_STRING:
return $this->formatInlineStringCellValue($node);
case self::CELL_TYPE_SHARED_STRING:
return $this->formatSharedStringCellValue($vNodeValue);
case self::CELL_TYPE_STR:
return $this->formatStrCellValue($vNodeValue);
case self::CELL_TYPE_BOOLEAN:
return $this->formatBooleanCellValue($vNodeValue);
case self::CELL_TYPE_NUMERIC:
return $this->formatNumericCellValue($vNodeValue);
case self::CELL_TYPE_DATE:
return $this->formatDateCellValue($vNodeValue);
default:
return null;
}
}
/**
* Returns the cell's string value from a node's nested value node
*
* @param \DOMNode $node
* @return string The value associated with the cell
*/
protected function getVNodeValue($node)
{
// for cell types having a "v" tag containing the value.
// if not, the returned value should be empty string.
$vNode = $node->getElementsByTagName(self::XML_NODE_VALUE)->item(0);
return ($vNode !== null) ? $vNode->nodeValue : '';
}
/**
* Returns the cell String value where string is inline.
*
* @param \DOMNode $node
* @return string The value associated with the cell (null when the cell has an error)
*/
protected function formatInlineStringCellValue($node)
{
// inline strings are formatted this way:
// <c r="A1" t="inlineStr"><is><t>[INLINE_STRING]</t></is></c>
$tNode = $node->getElementsByTagName(self::XML_NODE_INLINE_STRING_VALUE)->item(0);
$escapedCellValue = trim($tNode->nodeValue);
$cellValue = $this->escaper->unescape($escapedCellValue);
return $cellValue;
}
/**
* Returns the cell String value from shared-strings file using nodeValue index.
*
* @param string $nodeValue
* @return string The value associated with the cell (null when the cell has an error)
*/
protected function formatSharedStringCellValue($nodeValue)
{
// shared strings are formatted this way:
// <c r="A1" t="s"><v>[SHARED_STRING_INDEX]</v></c>
$sharedStringIndex = intval($nodeValue);
$escapedCellValue = $this->sharedStringsHelper->getStringAtIndex($sharedStringIndex);
$cellValue = $this->escaper->unescape($escapedCellValue);
return $cellValue;
}
/**
* Returns the cell String value, where string is stored in value node.
*
* @param string $nodeValue
* @return string The value associated with the cell (null when the cell has an error)
*/
protected function formatStrCellValue($nodeValue)
{
$escapedCellValue = trim($nodeValue);
$cellValue = $this->escaper->unescape($escapedCellValue);
return $cellValue;
}
/**
* Returns the cell Numeric value from string of nodeValue.
*
* @param string $nodeValue
* @return int|float The value associated with the cell
*/
protected function formatNumericCellValue($nodeValue)
{
$cellValue = is_int($nodeValue) ? intval($nodeValue) : floatval($nodeValue);
return $cellValue;
}
/**
* Returns the cell Boolean value from a specific node's Value.
*
* @param string $nodeValue
* @return bool The value associated with the cell
*/
protected function formatBooleanCellValue($nodeValue)
{
// !! is similar to boolval()
$cellValue = !!$nodeValue;
return $cellValue;
}
/**
* Returns a cell's PHP Date value, associated to the given stored nodeValue.
*
* @param string $nodeValue
* @return \DateTime|null The value associated with the cell (null when the cell has an error)
*/
protected function formatDateCellValue($nodeValue)
{
// Mitigate thrown Exception on invalid date-time format (http://php.net/manual/en/datetime.construct.php)
try {
$cellValue = new \DateTime($nodeValue);
return $cellValue;
} catch (\Exception $e) {
return null;
}
return $this->cellValueFormatter->extractAndFormatNodeValue($node);
}
/**