From dbdf5f7f386118ab4ef9341924e8463c9bc3f47c Mon Sep 17 00:00:00 2001 From: Adrien Loison Date: Sun, 27 Oct 2019 19:53:55 +0100 Subject: [PATCH] [ODS] Add support for whitespaces inside The `<text:p>` node can contain the string value directly or contain child elements. In this case, whitespaces contained in the child elements should be replaced by their XML equivalent: - space => `<text:s />` - tab => `<text:tab />` - line break => `<text:line-break />` @see https://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415200_253892949 --- .../Reader/ODS/Helper/CellValueFormatter.php | 87 ++++++++++++++---- tests/Spout/Reader/ODS/ReaderTest.php | 11 +++ .../ods/sheet_with_whitespaces_as_xml.ods | Bin 0 -> 2439 bytes 3 files changed, 80 insertions(+), 18 deletions(-) create mode 100644 tests/resources/ods/sheet_with_whitespaces_as_xml.ods diff --git a/src/Spout/Reader/ODS/Helper/CellValueFormatter.php b/src/Spout/Reader/ODS/Helper/CellValueFormatter.php index 50209ec..0fb0ac6 100644 --- a/src/Spout/Reader/ODS/Helper/CellValueFormatter.php +++ b/src/Spout/Reader/ODS/Helper/CellValueFormatter.php @@ -22,9 +22,11 @@ class CellValueFormatter /** Definition of XML nodes names used to parse data */ const XML_NODE_P = 'p'; - const XML_NODE_S = 'text:s'; - const XML_NODE_A = 'text:a'; - const XML_NODE_SPAN = 'text:span'; + const XML_NODE_TEXT_A = 'text:a'; + const XML_NODE_TEXT_SPAN = 'text:span'; + const XML_NODE_TEXT_S = 'text:s'; + const XML_NODE_TEXT_TAB = 'text:tab'; + const XML_NODE_TEXT_LINE_BREAK = 'text:line-break'; /** Definition of XML attributes used to parse data */ const XML_ATTRIBUTE_TYPE = 'office:value-type'; @@ -41,6 +43,13 @@ class CellValueFormatter /** @var \Box\Spout\Common\Helper\Escaper\ODS Used to unescape XML data */ protected $escaper; + /** @var array List of XML nodes representing whitespaces and their corresponding value */ + private static $WHITESPACE_XML_NODES = [ + self::XML_NODE_TEXT_S => ' ', + self::XML_NODE_TEXT_TAB => "\t", + self::XML_NODE_TEXT_LINE_BREAK 
=> "\n", + ]; + /** * @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings * @param \Box\Spout\Common\Helper\Escaper\ODS $escaper Used to unescape XML data @@ -96,21 +105,7 @@ class CellValueFormatter $pNodes = $node->getElementsByTagName(self::XML_NODE_P); foreach ($pNodes as $pNode) { - $currentPValue = ''; - - foreach ($pNode->childNodes as $childNode) { - if ($childNode instanceof \DOMText) { - $currentPValue .= $childNode->nodeValue; - } elseif ($childNode->nodeName === self::XML_NODE_S) { - $spaceAttribute = $childNode->getAttribute(self::XML_ATTRIBUTE_C); - $numSpaces = (!empty($spaceAttribute)) ? (int) $spaceAttribute : 1; - $currentPValue .= str_repeat(' ', $numSpaces); - } elseif ($childNode->nodeName === self::XML_NODE_A || $childNode->nodeName === self::XML_NODE_SPAN) { - $currentPValue .= $childNode->nodeValue; - } - } - - $pNodeValues[] = $currentPValue; + $pNodeValues[] = $this->extractTextValueFromNode($pNode); } $escapedCellValue = implode("\n", $pNodeValues); @@ -119,6 +114,62 @@ class CellValueFormatter return $cellValue; } + /** + * @param \DOMNode $pNode Node whose text, whitespace and nested "text:a" / "text:span" children are concatenated + * @return string Text of the node, with whitespace nodes expanded to their character value + */ + private function extractTextValueFromNode($pNode) + { + $textValue = ''; + + foreach ($pNode->childNodes as $childNode) { + if ($childNode instanceof \DOMText) { + $textValue .= $childNode->nodeValue; + } elseif ($this->isWhitespaceNode($childNode->nodeName)) { + $textValue .= $this->transformWhitespaceNode($childNode); + } elseif ($childNode->nodeName === self::XML_NODE_TEXT_A || $childNode->nodeName === self::XML_NODE_TEXT_SPAN) { + $textValue .= $this->extractTextValueFromNode($childNode); + } + } + + return $textValue; + } + + /** + * Returns whether the given node is a whitespace node. 
It must be one of these: + * - <text:s /> + * - <text:tab /> + * - <text:line-break /> + * + * @param string $nodeName + * @return bool + */ + private function isWhitespaceNode($nodeName) + { + return isset(self::$WHITESPACE_XML_NODES[$nodeName]); + } + + /** + * The "<text:p>" node can contain the string value directly + * or contain child elements. In this case, whitespaces contained in + * the child elements should be replaced by their XML equivalent: + * - space => <text:s /> + * - tab => <text:tab /> + * - line break => <text:line-break /> + * + * @see https://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415200_253892949 + * + * @param \DOMNode $node The XML node representing a whitespace + * @return string The corresponding whitespace value + */ + private function transformWhitespaceNode($node) + { + $countAttribute = $node->getAttribute(self::XML_ATTRIBUTE_C); // only defined for "<text:s>" + $numWhitespaces = (!empty($countAttribute)) ? (int) $countAttribute : 1; + + return str_repeat(self::$WHITESPACE_XML_NODES[$node->nodeName], $numWhitespaces); + } + /** * Returns the cell Numeric value from the given node. 
* diff --git a/tests/Spout/Reader/ODS/ReaderTest.php b/tests/Spout/Reader/ODS/ReaderTest.php index 674e4b0..ce56e4b 100644 --- a/tests/Spout/Reader/ODS/ReaderTest.php +++ b/tests/Spout/Reader/ODS/ReaderTest.php @@ -277,6 +277,17 @@ class ReaderTest extends TestCase $this->assertEquals([$expectedRow], $allRows); } + /** + * @return void + */ + public function testReadShouldSupportWhitespaceAsXML() + { + $allRows = $this->getAllRowsForFile('sheet_with_whitespaces_as_xml.ods'); + + $expectedRow = ["Lorem ipsum\tdolor sit amet"]; + $this->assertEquals([$expectedRow], $allRows); + } + /** * @NOTE: The LIBXML_NOENT is used to ACTUALLY substitute entities (and should therefore not be used) * diff --git a/tests/resources/ods/sheet_with_whitespaces_as_xml.ods b/tests/resources/ods/sheet_with_whitespaces_as_xml.ods new file mode 100644 index 0000000000000000000000000000000000000000..d34595179cbc9f7313ede4380a686396f22d3e21 GIT binary patch literal 2439 zcmZ`*2{aU38@6kZU9t?4?8{`!`k_8Fwy|%CF&e^{kr^Q$qJ*);*p(vLm!HyP%98w+ ziA?y6bt2ifp|6Z3^$-8|pUU~ad(S=ZJ?Eb1x#zy;d7t~*S~9Z;G8~M&2bUo62wkiy zeSffy(N~av5F8T`4EGKWM)>=9WBgG_wd=?$swi)?KUx(P3`br;`Gf|+kr-8Ua0uM{ z3i>J>jkxyOo>jcD?$s3-r+*xA44NsXpmd$Lzp5iFVC#|t~ z=*pAEmQ2f!czCa6IxpWJZYtI4-668N;vEH_ZAMOp%Pw2pb!UQ&6ew(HiL2=FUAE%( z|H4%)k(R}xbNymsALl~`rhlcf76O(nmC4Y8_xXOAcRdIDq1IWZm0|#glXGwF9w>d- z;THVM@=TM)dyIFI+nj4Bf13D}F_H2c#E{nYVYbPk1!(L}aDnB?4-exnsua%h+$5F7 zz-W7GLxjDxLPa)wgy(BsgWlFY^2ga~1wD&mqyBw;aqixZT<@If=g*&qY~W<6<_Uo& z`dzX3SEVRtAFcj**dJ+M13ys!vezhE6XHd;4d9VuHE$v-i2b`zK{9rTam(tQhk%oK z;()a|c}h)&pgv`*iY<{NT1h4-iYcS7t(5B+75TJq$z*!kR)o5`2(xTUKY&Ru=e$-D zG26260-P1{Kf(HK-s8ezR;|XDF2=ngu~#au`&>`e%)(S*o3`;92?Um~OH5sc^AO{+ z*V$n&JM3y&$J5tZ*aHx}Sg>9ype}(lE(s=8Y~uja74V5k5WIRPpXp^p88O1_ulA&y zRI@6pWVgYA`cs`vKD8UpPlhfx%u&GBX{r8@1$IZu94rh3tKA;^UXUR z=EX1M&zAy-?%lCKa|~9iZ^sX-6PgbTXP47~09~Ps?yEi?Lkg$0H`x9J8KU9xGbjAA zaBY=VTgDy4M-=u1pp@yiHXTBy2{MfsD?Nkr#JQhN zRcb!QYQaJD@#)F_rX(N{g(Nh_kViwvyPx`vlIM+n)jz~wvXhySfkBjB1BW#Q$9Nx# 
zKf?rh3&d+m(~X}CV#WJc-{DKYtoH6RLyjTXuH6i;u(ZZ0DOi#W(%z=ln8VC1O?PcF6x=p2bm5uO>XM@pOOVslBLFzr1XtZ8 znro-73X*74Trxif1d+egfot_OYENqGbQ%g&LX*P^oCe>E)DMnGq#y<^*h2L+d(E06 zPr*oEQZ}YO3x#L5X+GO&F6Qn?=f2D5JR8vu%XQXQZ=GWiXs`L*Ny(NQUSD@(1tH0S zw_nHi0OvOClX4iVQ<4q%42Yg-FdmYK!5q(h<^|Th&X%!lm*|GRX~czSrE@WE`wYJ( zipeYKTK|Fh&=Vy@kJhG+CMsYXGqoUZq`xm5eP|iaGTH+=KwNjIQP}Co)u3|kna^ao zQ3>PGHv#o<0fo|tJwp655ubXAvdr6$so&%8?CX}V-`40pZ)3P3H+*V`Uph+X%eU)` zUc=Fo$3^6AdSGq1x;4Hji}IX&1Y;Tkp#DS%WogRMsU`wmrA0(Ky>Jp^RtuO6RawV5 zSdE_hck_GUvNNTvapHyQVm4j8#ktQOIoJkbB_3aUn-tetFmwr0u#&>NohbjtK)&~h z>=jD?uBq3C^88?Q)+#5llVVh)!SOXiKTpy~(uncb1Qc|}B~;TN52nk6gPs61CISIR zA8N&kh`R^kZCOx~pG(n_$>o9@WLG(+>7J34;*8B6I1hf^wX>^vwIQ!Tpve|PTN&xi z9Bs3*7xeR09;q@JUIz$S*-=7<&n=FSsB^-OuI7%Gx1DQDTzm(5KXiD80n2NUTEH_7 zb&czac`9+bmfHX!<6;D5kgf2(SU^g`Kw_b`AyUCN?kzyM?z)go!yOM%O;M*n7@<}u zqmP_Mz5!JXAwPGmz^YhUGEADFotCOI~i%E=~Cz&5G)7ifKB z?3M+s1JYuk0Icv;r^R#R47;XgcWNu+oAy`pDIbH~ zFfV@N-yIh-tUB@MX-)AHeu+<*&(@Rkz8Z8pNLxsU^W<`f6y`{2ii-8+8#AMH&d=P1 z^xop1X=hG^2j+Z_&8Z5w7%GDX*vl+upnqeFvu@uvdcoB~CXY)fNQMjeIYM3kzM9#iWzr@yV2~3VG>tf?G6%Oa( zZ}Y^^CC$hr$o$VO1zp#N10Q=3{?}mqK>h!w;lCIL27Ig|efv`jaTIveZGQqX)35v6 udyk@yTF(zu6w5yt(NT({#ruOIn)T;WwzXtsJ2+#dPbvDhV0Z8*F#HF`_Z~t3 literal 0 HcmV?d00001