Skip to content

Commit 8da89c3

Browse files
authored
Merge pull request #12 from elecena/fix/entities-handling
Improve XML entities handling
2 parents a89ca71 + 542337e commit 8da89c3

File tree

4 files changed

+77
-9
lines changed

4 files changed

+77
-9
lines changed

src/XMLParser.php

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ class XMLParser implements \Iterator
1717
private ?string $currentTagName = null;
1818
private array $currentTagAttributes = [];
1919

20+
private string $currentTagContent = '';
21+
2022
/**
2123
* The stack of the XML node names as we go deeper into the tree.
2224
*
@@ -84,6 +86,8 @@ public function startXML(\XMLParser $parser, string $tagName, array $attributes)
8486
$this->currentTagName = $tagName;
8587
$this->currentTagAttributes = $attributes;
8688

89+
$this->currentTagContent = '';
90+
8791
// append to the queue of items to iterate over
8892
$this->nodesQueue[] = new Nodes\XMLNodeOpen(
8993
name: $this->currentTagName,
@@ -94,19 +98,31 @@ public function startXML(\XMLParser $parser, string $tagName, array $attributes)
9498
$this->nodeNamesStack[] = $tagName;
9599
}
96100

101+
/**
102+
* The XML parser "emits" the character data in separate chunks when the node content contains XML entities.
103+
*
104+
* For instance: <loc>https://example.com/index.html?ACTION=1004&amp;SITE=3</loc>
105+
*
106+
* Would emit: 'https://example.com/index.html?ACTION=1004', '&' and 'SITE=3' separately.
107+
*
108+
* So, just accumulate the characters as we're getting them and "emit" the XMLNodeContent instance
109+
* when the node is closed.
110+
*/
97111
public function charXML(\XMLParser $parser, string $tagContent): void
112+
{
113+
$this->currentTagContent .= $tagContent;
114+
}
115+
116+
public function endXML(\XMLParser $parser, string $tagName): void
98117
{
99118
// append to the queue of items to iterate over
100119
$this->nodesQueue[] = new Nodes\XMLNodeContent(
101120
name: $this->currentTagName,
102121
attributes: $this->currentTagAttributes,
103-
content: $tagContent,
122+
content: $this->currentTagContent,
104123
parentName: array_slice($this->nodeNamesStack, -2, 1)[0] ?: null
105124
);
106-
}
107125

108-
public function endXML(\XMLParser $parser, string $tagName): void
109-
{
110126
// Pop the node name off the end of stack
111127
array_pop($this->nodeNamesStack);
112128

tests/XMLParserEntitiesTest.php

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<?php
2+
3+
use Elecena\XmlIterator\Nodes\XMLNodeContent;
4+
use Elecena\XmlIterator\Nodes\XMLNodeOpen;
5+
use Elecena\XmlIterator\Nodes\XMLNodeClose;
6+
7+
class XMLParserEntitiesTest extends XMLParserTestCase
8+
{
9+
protected function getParserStream()
10+
{
11+
return fopen(__DIR__ . '/fixtures/sitemap-entities.xml', mode: 'rt');
12+
}
13+
14+
public function testParsesTheLocNodesWithAmpersands(): void
15+
{
16+
$locations = [];
17+
18+
foreach($this->getParser()->iterateByNodeContent('loc') as $item) {
19+
$locations[] = $item->content;
20+
}
21+
22+
$this->assertCount(8, $locations);
23+
$this->assertEquals('https://www.reichelt.com/index.html?ACTION=1004&SITE=1', $locations[0]);
24+
$this->assertEquals('https://www.reichelt.com/magazin/en/sitemap.xml', $locations[7]);
25+
}
26+
}

tests/XMLParserTest.php

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,9 @@ public function testParsesTheLocNodes(): void
4747
{
4848
$locations = [];
4949

50-
foreach($this->getParser() as $item) {
51-
if ($item instanceof XMLNodeContent && $item->name === 'loc') {
52-
$locations[] = $item->content;
53-
$this->assertEquals('sitemap', $item->parentName);
54-
}
50+
foreach($this->getParser()->iterateByNodeContent('loc') as $item) {
51+
$locations[] = $item->content;
52+
$this->assertEquals('sitemap', $item->parentName);
5553
}
5654

5755
$this->assertCount(8, $locations);
tests/fixtures/sitemap-entities.xml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3+
<!-- https://www.reichelt.com/sitemap.xml -->
4+
<sitemap>
5+
<loc>https://www.reichelt.com/index.html?ACTION=1004&amp;SITE=1</loc>
6+
</sitemap>
7+
<sitemap>
8+
<loc>https://www.reichelt.com/index.html?ACTION=1004&amp;SITE=2</loc>
9+
</sitemap>
10+
<sitemap>
11+
<loc>https://www.reichelt.com/index.html?ACTION=1004&amp;SITE=3</loc>
12+
</sitemap>
13+
<sitemap>
14+
<loc>https://www.reichelt.com/index.html?ACTION=1004&amp;SITE=4</loc>
15+
</sitemap>
16+
<sitemap>
17+
<loc>https://www.reichelt.de/magazin/sitemap.xml</loc>
18+
</sitemap>
19+
<sitemap>
20+
<loc>https://www.reichelt.com/magazin/fr/sitemap.xml</loc>
21+
</sitemap>
22+
<sitemap>
23+
<loc>https://www.reichelt.com/magazin/nl/sitemap.xml</loc>
24+
</sitemap>
25+
<sitemap>
26+
<loc>https://www.reichelt.com/magazin/en/sitemap.xml</loc>
27+
</sitemap>
28+
</sitemapindex>

0 commit comments

Comments
 (0)