diff --git a/src/Spout/Reader/ODS/Helper/CellValueFormatter.php b/src/Spout/Reader/ODS/Helper/CellValueFormatter.php index 50209ec..0fb0ac6 100644 --- a/src/Spout/Reader/ODS/Helper/CellValueFormatter.php +++ b/src/Spout/Reader/ODS/Helper/CellValueFormatter.php @@ -22,9 +22,11 @@ class CellValueFormatter /** Definition of XML nodes names used to parse data */ const XML_NODE_P = 'p'; - const XML_NODE_S = 'text:s'; - const XML_NODE_A = 'text:a'; - const XML_NODE_SPAN = 'text:span'; + const XML_NODE_TEXT_A = 'text:a'; + const XML_NODE_TEXT_SPAN = 'text:span'; + const XML_NODE_TEXT_S = 'text:s'; + const XML_NODE_TEXT_TAB = 'text:tab'; + const XML_NODE_TEXT_LINE_BREAK = 'text:line-break'; /** Definition of XML attributes used to parse data */ const XML_ATTRIBUTE_TYPE = 'office:value-type'; @@ -41,6 +43,13 @@ class CellValueFormatter /** @var \Box\Spout\Common\Helper\Escaper\ODS Used to unescape XML data */ protected $escaper; + /** @var array List of XML nodes representing whitespaces and their corresponding value */ + private static $WHITESPACE_XML_NODES = [ + self::XML_NODE_TEXT_S => ' ', + self::XML_NODE_TEXT_TAB => "\t", + self::XML_NODE_TEXT_LINE_BREAK => "\n", + ]; + /** * @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings * @param \Box\Spout\Common\Helper\Escaper\ODS $escaper Used to unescape XML data @@ -96,21 +105,7 @@ class CellValueFormatter $pNodes = $node->getElementsByTagName(self::XML_NODE_P); foreach ($pNodes as $pNode) { - $currentPValue = ''; - - foreach ($pNode->childNodes as $childNode) { - if ($childNode instanceof \DOMText) { - $currentPValue .= $childNode->nodeValue; - } elseif ($childNode->nodeName === self::XML_NODE_S) { - $spaceAttribute = $childNode->getAttribute(self::XML_ATTRIBUTE_C); - $numSpaces = (!empty($spaceAttribute)) ? 
(int) $spaceAttribute : 1; - $currentPValue .= str_repeat(' ', $numSpaces); - } elseif ($childNode->nodeName === self::XML_NODE_A || $childNode->nodeName === self::XML_NODE_SPAN) { - $currentPValue .= $childNode->nodeValue; - } - } - - $pNodeValues[] = $currentPValue; + $pNodeValues[] = $this->extractTextValueFromNode($pNode); } $escapedCellValue = implode("\n", $pNodeValues); @@ -119,6 +114,62 @@ class CellValueFormatter return $cellValue; } + /** + * @param \DOMNode $pNode Node ("<text:p>", "<text:a>" or "<text:span>") whose text is extracted; nested links and spans are processed recursively, other child elements are ignored + * @return string The concatenated text of the node, with whitespace elements converted to their literal characters + */ + private function extractTextValueFromNode($pNode) + { + $textValue = ''; + + foreach ($pNode->childNodes as $childNode) { + if ($childNode instanceof \DOMText) { + $textValue .= $childNode->nodeValue; + } elseif ($this->isWhitespaceNode($childNode->nodeName)) { + $textValue .= $this->transformWhitespaceNode($childNode); + } elseif ($childNode->nodeName === self::XML_NODE_TEXT_A || $childNode->nodeName === self::XML_NODE_TEXT_SPAN) { + $textValue .= $this->extractTextValueFromNode($childNode); + } + } + + return $textValue; + } + + /** + * Returns whether the given node is a whitespace node. It must be one of these: + * - <text:s /> + * - <text:tab /> + * - <text:line-break /> + * + * @param string $nodeName Name of the XML node to check + * @return bool Whether the name is a key of the whitespace nodes map + */ + private function isWhitespaceNode($nodeName) + { + return isset(self::$WHITESPACE_XML_NODES[$nodeName]); + } + + /** + * The "<text:p>" node can contain the string value directly + * or contain child elements. In this case, whitespaces contained in + * the child elements are represented by their XML equivalent, which this method converts back to literal characters: + * - space => <text:s /> (repeated when a count attribute is set; an explicit count of 0 yields an empty string) + * - tab => <text:tab /> + * - line break => <text:line-break /> + * + * @see https://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415200_253892949 + * + * @param \DOMElement $node The XML node representing a whitespace + * @return string The corresponding whitespace value + */ + private function transformWhitespaceNode($node) + { + $countAttribute = $node->getAttribute(self::XML_ATTRIBUTE_C); // only defined for "<text:s>"; empty string when the attribute is absent + $numWhitespaces = ($countAttribute !== '') ? 
max(0, (int) $countAttribute) : 1; + + return str_repeat(self::$WHITESPACE_XML_NODES[$node->nodeName], $numWhitespaces); + } + /** * Returns the cell Numeric value from the given node. * diff --git a/tests/Spout/Reader/ODS/ReaderTest.php b/tests/Spout/Reader/ODS/ReaderTest.php index 674e4b0..ce56e4b 100644 --- a/tests/Spout/Reader/ODS/ReaderTest.php +++ b/tests/Spout/Reader/ODS/ReaderTest.php @@ -277,6 +277,17 @@ class ReaderTest extends TestCase $this->assertEquals([$expectedRow], $allRows); } + /** + * @return void + */ + public function testReadShouldSupportWhitespaceAsXML() + { + $allRows = $this->getAllRowsForFile('sheet_with_whitespaces_as_xml.ods'); + + $expectedRow = ["Lorem ipsum\tdolor sit amet"]; + $this->assertEquals([$expectedRow], $allRows); + } + /** * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) * diff --git a/tests/resources/ods/sheet_with_whitespaces_as_xml.ods b/tests/resources/ods/sheet_with_whitespaces_as_xml.ods new file mode 100644 index 0000000..d345951 Binary files /dev/null and b/tests/resources/ods/sheet_with_whitespaces_as_xml.ods differ