Introduce XMLProcessor to reduce ODS,XLSX readers' complexity

This commit is contained in:
Adrien Loison 2016-10-18 16:17:32 -07:00
parent 73d5d0ea17
commit fe1dfc997d
3 changed files with 246 additions and 96 deletions

View File

@ -0,0 +1,152 @@
<?php
namespace Box\Spout\Reader\Common;
use Box\Spout\Reader\Wrapper\XMLReader;
/**
 * Class XMLProcessor
 * Helps process XML files: it advances an XMLReader node by node and invokes the
 * callback registered for the name/type of the node that was just read, until a
 * callback asks to stop or the reader has nothing left to read.
 *
 * @package Box\Spout\Reader\Common
 */
class XMLProcessor
{
    /* Node types */
    const NODE_TYPE_START = XMLReader::ELEMENT;
    const NODE_TYPE_END = XMLReader::END_ELEMENT;

    /* Keys associated to reflection attributes to invoke a callback */
    const CALLBACK_REFLECTION_METHOD = 'reflectionMethod';
    const CALLBACK_REFLECTION_OBJECT = 'reflectionObject';

    /* Values returned by the callbacks to indicate what the processor should do next */
    const PROCESSING_CONTINUE = 1;
    const PROCESSING_STOP = 2;

    /** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */
    protected $xmlReader;

    /** @var array Registered callbacks (key = callback key built from node name and type, value = invokable callback data) */
    private $callbacks = [];

    /**
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object
     */
    public function __construct($xmlReader)
    {
        $this->xmlReader = $xmlReader;
    }

    /**
     * Registers the callback to invoke when a node with the given name and type is read.
     * Registering another callback for the same name/type pair replaces the previous one.
     *
     * @param string $nodeName A callback may be triggered when a node with this name is read
     * @param int $nodeType Type of the node [NODE_TYPE_START || NODE_TYPE_END]
     * @param callable $callback Callback to execute when the read node has the given name and type
     * @return XMLProcessor
     * @throws \InvalidArgumentException If the callback is not of the form [OBJECT, METHOD_NAME]
     */
    public function registerCallback($nodeName, $nodeType, $callback)
    {
        $callbackKey = $this->getCallbackKey($nodeName, $nodeType);
        $this->callbacks[$callbackKey] = $this->getInvokableCallbackData($callback);

        return $this;
    }

    /**
     * Builds the key under which the callback for a name/type pair is stored.
     * The name and the type are separated by "|", a character that cannot be part of
     * an XML name. Without a separator, the digits of the type could be confused with
     * trailing digits of the name (e.g. "a1" + 5 and "a" + 15 would both give "a15").
     *
     * @param string $nodeName Name of the node
     * @param int $nodeType Type of the node [NODE_TYPE_START || NODE_TYPE_END]
     * @return string Key used to store the associated callback
     */
    private function getCallbackKey($nodeName, $nodeType)
    {
        return $nodeName . '|' . $nodeType;
    }

    /**
     * Because the callback can be a "protected" function, we don't want to use call_user_func() directly
     * but instead invoke the callback using Reflection. This allows the invocation of "protected" functions.
     * Since some functions can be called a lot, we pre-process the callback to only return the elements that
     * will be needed to invoke the callback later.
     *
     * @param callable $callback Array reference to a callback: [OBJECT, METHOD_NAME]
     * @return array Associative array containing the elements needed to invoke the callback using Reflection
     * @throws \InvalidArgumentException If the callback is not of the form [OBJECT, METHOD_NAME]
     */
    private function getInvokableCallbackData($callback)
    {
        // Fail early, with an explicit message, instead of letting get_class() or
        // the array access below blow up on an unsupported callback format.
        if (!is_array($callback) || !isset($callback[0], $callback[1]) || !is_object($callback[0])) {
            throw new \InvalidArgumentException('The callback must be an array reference: [OBJECT, METHOD_NAME]');
        }

        $callbackObject = $callback[0];
        $callbackMethodName = $callback[1];

        $reflectionMethod = new \ReflectionMethod(get_class($callbackObject), $callbackMethodName);

        // As of PHP 8.1, methods are always invokable through Reflection and
        // setAccessible() has no effect, so it is only called where still needed.
        if (PHP_VERSION_ID < 80100) {
            $reflectionMethod->setAccessible(true);
        }

        return [
            self::CALLBACK_REFLECTION_METHOD => $reflectionMethod,
            self::CALLBACK_REFLECTION_OBJECT => $callbackObject,
        ];
    }

    /**
     * Resumes the reading of the XML file where it was left off.
     * Stops whenever a callback indicates that reading should stop or at the end of the file.
     *
     * @return void
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException
     */
    public function readUntilStopped()
    {
        while ($this->xmlReader->read()) {
            $nodeType = $this->xmlReader->nodeType;
            $nodeNamePossiblyWithPrefix = $this->xmlReader->name;
            $nodeNameWithoutPrefix = $this->xmlReader->localName;

            $callbackData = $this->getRegisteredCallbackData($nodeNamePossiblyWithPrefix, $nodeNameWithoutPrefix, $nodeType);

            if ($callbackData !== null) {
                // The reader is handed to the callback so that it can inspect the current node
                $callbackResponse = $this->invokeCallback($callbackData, [$this->xmlReader]);

                if ($callbackResponse === self::PROCESSING_STOP) {
                    // stop reading
                    break;
                }
            }
        }
    }

    /**
     * @param string $nodeNamePossiblyWithPrefix Name of the node, possibly prefixed
     * @param string $nodeNameWithoutPrefix Name of the same node, un-prefixed
     * @param int $nodeType Type of the node [NODE_TYPE_START || NODE_TYPE_END]
     * @return array|null Callback data to be used for execution when a node of the given name/type is read or NULL if none found
     */
    private function getRegisteredCallbackData($nodeNamePossiblyWithPrefix, $nodeNameWithoutPrefix, $nodeType)
    {
        // With prefixed nodes, we should match if (by order of preference):
        //  1. the callback was registered with the prefixed node name (e.g. "x:worksheet")
        //  2. the callback was registered with the un-prefixed node name (e.g. "worksheet")
        $hasPrefix = ($nodeNamePossiblyWithPrefix !== $nodeNameWithoutPrefix);

        if ($hasPrefix) {
            $callbackKeyForPrefixedName = $this->getCallbackKey($nodeNamePossiblyWithPrefix, $nodeType);

            // Using isset here because it is way faster than array_key_exists...
            if (isset($this->callbacks[$callbackKeyForPrefixedName])) {
                return $this->callbacks[$callbackKeyForPrefixedName];
            }
        }

        $callbackKeyForUnPrefixedName = $this->getCallbackKey($nodeNameWithoutPrefix, $nodeType);

        return isset($this->callbacks[$callbackKeyForUnPrefixedName]) ? $this->callbacks[$callbackKeyForUnPrefixedName] : null;
    }

    /**
     * Invokes a registered callback through Reflection.
     *
     * @param array $callbackData Associative array containing data to invoke the callback using Reflection
     * @param array $args Arguments to pass to the callback
     * @return int Callback response
     */
    private function invokeCallback($callbackData, $args)
    {
        $reflectionMethod = $callbackData[self::CALLBACK_REFLECTION_METHOD];
        $callbackObject = $callbackData[self::CALLBACK_REFLECTION_OBJECT];

        return $reflectionMethod->invokeArgs($callbackObject, $args);
    }
}

View File

@ -8,6 +8,7 @@ use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\IteratorInterface;
use Box\Spout\Reader\ODS\Helper\CellValueFormatter;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\Common\XMLProcessor;
/**
* Class RowIterator
@ -29,6 +30,9 @@ class RowIterator implements IteratorInterface
/** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */
protected $xmlReader;
/** @var \Box\Spout\Reader\Common\XMLProcessor Helper Object to process XML nodes */
protected $xmlProcessor;
/** @var bool Whether empty rows should be returned or skipped */
protected $shouldPreserveEmptyRows;
@ -38,6 +42,9 @@ class RowIterator implements IteratorInterface
/** @var bool Whether the iterator has already been rewound once */
protected $hasAlreadyBeenRewound = false;
/** @var array Contains the data for the currently processed row (key = cell index, value = cell value) */
protected $currentlyProcessedRowData = [];
/** @var array|null Buffer used to store the row data, while checking if there are more rows to read */
protected $rowDataBuffer = null;
@ -72,6 +79,13 @@ class RowIterator implements IteratorInterface
$this->xmlReader = $xmlReader;
$this->shouldPreserveEmptyRows = $options->shouldPreserveEmptyRows();
$this->cellValueFormatter = new CellValueFormatter($options->shouldFormatDates());
// Register all callbacks to process different nodes when reading the XML file
$this->xmlProcessor = new XMLProcessor($this->xmlReader);
$this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_START, [$this, 'processRowStartingNode']);
$this->xmlProcessor->registerCallback(self::XML_NODE_CELL, XMLProcessor::NODE_TYPE_START, [$this, 'processCellStartingNode']);
$this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_END, [$this, 'processRowEndingNode']);
$this->xmlProcessor->registerCallback(self::XML_NODE_TABLE, XMLProcessor::NODE_TYPE_END, [$this, 'processTableEndingNode']);
}
/**
@ -122,7 +136,7 @@ class RowIterator implements IteratorInterface
public function next()
{
if ($this->doesNeedDataForNextRowToBeProcessed()) {
$this->readDataForNextRow($this->xmlReader);
$this->readDataForNextRow();
}
$this->lastRowIndexProcessed++;
@ -148,54 +162,26 @@ class RowIterator implements IteratorInterface
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object
* @return void
* @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
* @throws \Box\Spout\Common\Exception\IOException If unable to read the sheet data XML
*/
protected function readDataForNextRow($xmlReader)
protected function readDataForNextRow()
{
$rowData = [];
$this->currentlyProcessedRowData = [];
try {
while ($xmlReader->read()) {
if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_ROW)) {
$this->processRowStartingNode($xmlReader);
} else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL)) {
$rowData = $this->processCellStartingNode($xmlReader, $rowData);
} else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_ROW)) {
$isEmptyRow = $this->isEmptyRow($rowData, $this->lastProcessedCellValue);
// if the fetched row is empty and we don't want to preserve it...
if (!$this->shouldPreserveEmptyRows && $isEmptyRow) {
// ... skip it
continue;
}
$rowData = $this->processRowEndingNode($rowData, $isEmptyRow);
// at this point, we have all the data we need for the row
// so that we can populate the buffer
break;
} else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_TABLE)) {
$this->processTableEndingNode();
break;
}
}
$this->xmlProcessor->readUntilStopped();
} catch (XMLProcessingException $exception) {
throw new IOException("The sheet's data cannot be read. [{$exception->getMessage()}]");
}
$this->rowDataBuffer = $rowData;
$this->rowDataBuffer = $this->currentlyProcessedRowData;
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<table:table-row>" starting node
* @return void
* @return int A return code that indicates what action should the processor take next
*/
protected function processRowStartingNode($xmlReader)
{
@ -204,14 +190,15 @@ class RowIterator implements IteratorInterface
$this->lastProcessedCellValue = null;
$this->numColumnsRepeated = 1;
$this->numRowsRepeated = $this->getNumRowsRepeatedForCurrentNode($xmlReader);
return XMLProcessor::PROCESSING_CONTINUE;
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<table:table-cell>" starting node
* @param array $rowData Data of all cells read so far
* @return array Original row data + data for the cell that was just read
* @return int A return code that indicates what action should the processor take next
*/
protected function processCellStartingNode($xmlReader, $rowData)
protected function processCellStartingNode($xmlReader)
{
$currentNumColumnsRepeated = $this->getNumColumnsRepeatedForCurrentNode($xmlReader);
@ -221,7 +208,7 @@ class RowIterator implements IteratorInterface
// process cell N only after having read cell N+1 (see below why)
if ($this->hasAlreadyReadOneCellInCurrentRow) {
for ($i = 0; $i < $this->numColumnsRepeated; $i++) {
$rowData[] = $this->lastProcessedCellValue;
$this->currentlyProcessedRowData[] = $this->lastProcessedCellValue;
}
}
@ -229,28 +216,34 @@ class RowIterator implements IteratorInterface
$this->lastProcessedCellValue = $currentCellValue;
$this->numColumnsRepeated = $currentNumColumnsRepeated;
return $rowData;
return XMLProcessor::PROCESSING_CONTINUE;
}
/**
* @param array $rowData Data of all cells read so far
* @param bool $isEmptyRow Whether the given row is empty
* @return array
* @return int A return code that indicates what action should the processor take next
*/
protected function processRowEndingNode($rowData, $isEmptyRow)
protected function processRowEndingNode()
{
$isEmptyRow = $this->isEmptyRow($this->currentlyProcessedRowData, $this->lastProcessedCellValue);
// if the fetched row is empty and we don't want to preserve it...
if (!$this->shouldPreserveEmptyRows && $isEmptyRow) {
// ... skip it
return XMLProcessor::PROCESSING_CONTINUE;
}
// if the row is empty, we don't want to return more than one cell
$actualNumColumnsRepeated = (!$isEmptyRow) ? $this->numColumnsRepeated : 1;
// Only add the value if the last read cell is not a trailing empty cell repeater in Excel.
// The current count of read columns is determined by counting the values in $rowData.
// The current count of read columns is determined by counting the values in "$this->currentlyProcessedRowData".
// This is to avoid creating a lot of empty cells, as Excel adds a last empty "<table:table-cell>"
// with a number-columns-repeated value equals to the number of (supported columns - used columns).
// In Excel, the number of supported columns is 16384, but we don't want to returns rows with
// always 16384 cells.
if ((count($rowData) + $actualNumColumnsRepeated) !== self::MAX_COLUMNS_EXCEL) {
if ((count($this->currentlyProcessedRowData) + $actualNumColumnsRepeated) !== self::MAX_COLUMNS_EXCEL) {
for ($i = 0; $i < $actualNumColumnsRepeated; $i++) {
$rowData[] = $this->lastProcessedCellValue;
$this->currentlyProcessedRowData[] = $this->lastProcessedCellValue;
}
}
@ -258,16 +251,20 @@ class RowIterator implements IteratorInterface
// then the next row to be processed will be row (N+M).
$this->nextRowIndexToBeProcessed += $this->numRowsRepeated;
return $rowData;
// at this point, we have all the data we need for the row
// so that we can populate the buffer
return XMLProcessor::PROCESSING_STOP;
}
/**
* @return void
* @return int A return code that indicates what action should the processor take next
*/
protected function processTableEndingNode()
{
// The closing "</table:table>" marks the end of the file
$this->hasReachedEndOfFile = true;
return XMLProcessor::PROCESSING_STOP;
}
/**

View File

@ -9,6 +9,7 @@ use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Helper\CellHelper;
use Box\Spout\Reader\XLSX\Helper\CellValueFormatter;
use Box\Spout\Reader\XLSX\Helper\StyleHelper;
use Box\Spout\Reader\Common\XMLProcessor;
/**
* Class RowIterator
@ -38,6 +39,9 @@ class RowIterator implements IteratorInterface
/** @var \Box\Spout\Reader\Wrapper\XMLReader The XMLReader object that will help read sheet's XML data */
protected $xmlReader;
/** @var \Box\Spout\Reader\Common\XMLProcessor Helper Object to process XML nodes */
protected $xmlProcessor;
/** @var Helper\CellValueFormatter Helper to format cell values */
protected $cellValueFormatter;
@ -50,6 +54,9 @@ class RowIterator implements IteratorInterface
*/
protected $numReadRows = 0;
/** @var array Contains the data for the currently processed row (key = cell index, value = cell value) */
protected $currentlyProcessedRowData = [];
/** @var array|null Buffer used to store the row data, while checking if there are more rows to read */
protected $rowDataBuffer = null;
@ -88,6 +95,14 @@ class RowIterator implements IteratorInterface
$this->cellValueFormatter = new CellValueFormatter($sharedStringsHelper, $this->styleHelper, $options->shouldFormatDates());
$this->shouldPreserveEmptyRows = $options->shouldPreserveEmptyRows();
// Register all callbacks to process different nodes when reading the XML file
$this->xmlProcessor = new XMLProcessor($this->xmlReader);
$this->xmlProcessor->registerCallback(self::XML_NODE_DIMENSION, XMLProcessor::NODE_TYPE_START, [$this, 'processDimensionStartingNode']);
$this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_START, [$this, 'processRowStartingNode']);
$this->xmlProcessor->registerCallback(self::XML_NODE_CELL, XMLProcessor::NODE_TYPE_START, [$this, 'processCellStartingNode']);
$this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_END, [$this, 'processRowEndingNode']);
$this->xmlProcessor->registerCallback(self::XML_NODE_WORKSHEET, XMLProcessor::NODE_TYPE_END, [$this, 'processWorksheetEndingNode']);
}
/**
@ -152,7 +167,7 @@ class RowIterator implements IteratorInterface
$this->nextRowIndexToBeProcessed++;
if ($this->doesNeedDataForNextRowToBeProcessed()) {
$this->readDataForNextRow($this->xmlReader);
$this->readDataForNextRow();
}
}
@ -180,55 +195,26 @@ class RowIterator implements IteratorInterface
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object
* @return void
* @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
* @throws \Box\Spout\Common\Exception\IOException If unable to read the sheet data XML
*/
protected function readDataForNextRow($xmlReader)
protected function readDataForNextRow()
{
$rowData = [];
$this->currentlyProcessedRowData = [];
try {
while ($xmlReader->read()) {
if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_DIMENSION)) {
$this->processDimensionStartingNode($xmlReader);
} else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_ROW)) {
$rowData = $this->processRowStartingNode($xmlReader);
} else if ($xmlReader->isPositionedOnStartingNode(self::XML_NODE_CELL)) {
$rowData = $this->processCellStartingNode($xmlReader, $rowData);
} else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_ROW)) {
// if the fetched row is empty and we don't want to preserve it..,
if (!$this->shouldPreserveEmptyRows && $this->isEmptyRow($rowData)) {
// ... skip it
continue;
}
$rowData = $this->processRowEndingNode($rowData);
// at this point, we have all the data we need for the row
// so that we can populate the buffer
break;
} else if ($xmlReader->isPositionedOnEndingNode(self::XML_NODE_WORKSHEET)) {
$this->processWorksheetEndingNode();
break;
}
}
$this->xmlProcessor->readUntilStopped();
} catch (XMLProcessingException $exception) {
throw new IOException("The {$this->sheetDataXMLFilePath} file cannot be read. [{$exception->getMessage()}]");
}
$this->rowDataBuffer = $rowData;
$this->rowDataBuffer = $this->currentlyProcessedRowData;
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<dimension>" starting node
* @return void
* @return int A return code that indicates what action should the processor take next
*/
protected function processDimensionStartingNode($xmlReader)
{
@ -238,11 +224,13 @@ class RowIterator implements IteratorInterface
$lastCellIndex = $matches[1];
$this->numColumns = CellHelper::getColumnIndexFromCellIndex($lastCellIndex) + 1;
}
return XMLProcessor::PROCESSING_CONTINUE;
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<row>" starting node
* @return array
* @return int A return code that indicates what action should the processor take next
*/
protected function processRowStartingNode($xmlReader)
{
@ -260,45 +248,58 @@ class RowIterator implements IteratorInterface
$numberOfColumnsForRow = intval($numberOfColumnsForRow);
}
return ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : [];
$this->currentlyProcessedRowData = ($numberOfColumnsForRow !== 0) ? array_fill(0, $numberOfColumnsForRow, '') : [];
return XMLProcessor::PROCESSING_CONTINUE;
}
/**
* @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader object, positioned on a "<cell>" starting node
* @param array $rowData Data of all cells read so far (key = cell index, value = cell value)
* @return array Original row data + data for the cell that was just read (key = cell index, value = cell value)
* @return int A return code that indicates what action should the processor take next
*/
protected function processCellStartingNode($xmlReader, $rowData)
protected function processCellStartingNode($xmlReader)
{
$currentColumnIndex = $this->getColumnIndex($xmlReader);
$node = $xmlReader->expand();
$rowData[$currentColumnIndex] = $this->getCellValue($node);
$this->currentlyProcessedRowData[$currentColumnIndex] = $this->getCellValue($node);
$this->lastColumnIndexProcessed = $currentColumnIndex;
return $rowData;
return XMLProcessor::PROCESSING_CONTINUE;
}
/**
* @param array $rowData Data of all cells read so far (key = cell index, value = cell value)
* @return array
* @return int A return code that indicates what action should the processor take next
*/
protected function processRowEndingNode($rowData)
protected function processRowEndingNode()
{
// if the fetched row is empty and we don't want to preserve it..,
if (!$this->shouldPreserveEmptyRows && $this->isEmptyRow($this->currentlyProcessedRowData)) {
// ... skip it
return XMLProcessor::PROCESSING_CONTINUE;
}
$this->numReadRows++;
// If needed, we fill the empty cells
return ($this->numColumns !== 0) ? $rowData : CellHelper::fillMissingArrayIndexes($rowData);
if ($this->numColumns === 0) {
$this->currentlyProcessedRowData = CellHelper::fillMissingArrayIndexes($this->currentlyProcessedRowData);
}
// at this point, we have all the data we need for the row
// so that we can populate the buffer
return XMLProcessor::PROCESSING_STOP;
}
/**
* @return void
* @return int A return code that indicates what action should the processor take next
*/
protected function processWorksheetEndingNode()
{
// The closing "</worksheet>" marks the end of the file
$this->hasReachedEndOfFile = true;
return XMLProcessor::PROCESSING_STOP;
}
/**