-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathXMLParser.php
More file actions
218 lines (184 loc) · 6.18 KB
/
XMLParser.php
File metadata and controls
218 lines (184 loc) · 6.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
<?php
namespace Elecena\XmlIterator;
use Elecena\XmlIterator\Exceptions\ParsingError;
use Elecena\XmlIterator\Nodes\XMLNodeContent;
/**
 * Implements a fast and memory-efficient XML parser with the iterator interface.
 *
 * The input stream is consumed in chunks of self::BATCH_READ_SIZE bytes. Each chunk is
 * pushed through PHP's event-based XML parser; the registered handlers translate the events
 * into node objects that are queued and handed out one by one by the iterator:
 *
 *  - an opening tag emits a Nodes\XMLNodeOpen,
 *  - a closing tag emits a Nodes\XMLNodeContent (name, attributes and text of the element
 *    being closed) immediately followed by a Nodes\XMLNodeClose.
 *
 * Iteration keys are consecutive integers starting at 0.
 *
 * Note that rewind() re-creates the XML parser but does not rewind the underlying stream.
 *
 * @see https://www.php.net/manual/en/function.xml-parse.php
 *
 * @implements \Iterator<int, Nodes\XMLNode>
 */
class XMLParser implements \Iterator
{
    // how much data to read from the XML at a time
    public const BATCH_READ_SIZE = 4096;

    /**
     * The underlying XML parser, null when no parsing is in progress
     * (before rewind() / setUp() and after the stream has been fully consumed).
     */
    private ?\XMLParser $parser = null;

    /**
     * The stack of the XML elements that are currently open, the outermost one first.
     *
     * Every element keeps its own attributes and text so that closing a nested element
     * does not clobber the data of its ancestors.
     *
     * @var list<array{name: string, attributes: array, content: string}>
     */
    private array $openElements = [];

    /**
     * Holds nodes to iterate over as we parse through XML.
     * The first item is the one returned by self::current().
     *
     * @var Nodes\XMLNode[]
     */
    private array $nodesQueue = [];

    /**
     * Zero-based position of the current node, returned by self::key().
     */
    private int $index = 0;

    /**
     * Whether at least one byte has been passed to the XML parser since the last setUp().
     */
    private bool $hasFedData = false;

    /**
     * @param resource $stream
     */
    public function __construct(private $stream)
    {
    }

    /**
     * Resets the iteration state and creates a fresh XML parser instance.
     */
    public function setUp(): void
    {
        $this->index = 0;
        $this->nodesQueue = [];
        $this->openElements = [];
        $this->hasFedData = false;

        $parser = \xml_parser_create();

        /**
         * Set XML parsing handling methods
         *
         * These methods will push @see Nodes\XMLNode classes to the nodesQueue array.
         *
         * Once the iterator goes through them all, the self::next() method
         * will read and parse the next portion of the input XML stream.
         */
        xml_set_element_handler($parser, [$this, 'startXML'], [$this, 'endXML']);
        xml_set_character_data_handler($parser, [$this, 'charXML']);

        // keep the tag names as they are in the document instead of upper-casing them
        // @see https://www.php.net/manual/en/function.xml-parser-set-option.php
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);

        $this->parser = $parser;
    }

    /**
     * Marks the parsing as finished and lets go of the XML parser instance.
     *
     * xml_parser_free() has no effect for \XMLParser objects (PHP 8.0+),
     * dropping the reference is what allows the parser to be released.
     */
    private function close(): void
    {
        $this->parser = null;
    }

    /**
     * Returns the name of the innermost element that is currently open, null when there is none.
     */
    private function innermostOpenName(): ?string
    {
        $top = array_key_last($this->openElements);

        return $top === null ? null : $this->openElements[$top]['name'];
    }

    /**
     * Handler called by the XML parser for every opening tag.
     *
     * Queues the XMLNodeOpen node and starts tracking the element on the stack.
     *
     * @param \XMLParser $parser
     * @param string $tagName
     * @param array $attributes
     * @return void
     */
    public function startXML(\XMLParser $parser, string $tagName, array $attributes): void
    {
        // append to the queue of items to iterate over,
        // the parent is whatever element is open at this point (none for the root element)
        $this->nodesQueue[] = new Nodes\XMLNodeOpen(
            name: $tagName,
            attributes: $attributes,
            parentName: $this->innermostOpenName()
        );

        $this->openElements[] = [
            'name' => $tagName,
            'attributes' => $attributes,
            'content' => '',
        ];
    }

    /**
     * Handler called by the XML parser for the character data.
     *
     * The XML parser "emits" separate characters when the node has the content with XML entities.
     *
     * For instance: <loc>https://example.com/index.html?ACTION=1004&amp;SITE=3</loc>
     *
     * Would emit: 'https://example.com/index.html?ACTION=1004', '&' and 'SITE=3' separately.
     *
     * So, just accumulate the characters as we're getting them and "emit" the XMLNodeContent instance
     * when the node is closed. The text is attached to the innermost open element.
     */
    public function charXML(\XMLParser $parser, string $tagContent): void
    {
        $top = array_key_last($this->openElements);

        // character data outside of any element has no node to be attached to
        if ($top !== null) {
            $this->openElements[$top]['content'] .= $tagContent;
        }
    }

    /**
     * Handler called by the XML parser for every closing tag.
     *
     * Queues the XMLNodeContent node (with the attributes and the text of the element
     * being closed) followed by the XMLNodeClose node. Both report the same parent.
     */
    public function endXML(\XMLParser $parser, string $tagName): void
    {
        // Pop the element off the end of stack, what is left on top of it is the parent
        $element = array_pop($this->openElements)
            ?? ['name' => $tagName, 'attributes' => [], 'content' => ''];
        $parentName = $this->innermostOpenName();

        // append to the queue of items to iterate over
        $this->nodesQueue[] = new Nodes\XMLNodeContent(
            name: $element['name'],
            attributes: $element['attributes'],
            content: $element['content'],
            parentName: $parentName
        );

        $this->nodesQueue[] = new Nodes\XMLNodeClose(
            name: $tagName,
            parentName: $parentName
        );
    }

    /**
     * Returns the node at the current position without consuming it.
     *
     * @throws \OutOfBoundsException when called while self::valid() is false
     */
    public function current(): Nodes\XMLNode
    {
        return $this->nodesQueue[0]
            ?? throw new \OutOfBoundsException('There is no current XML node, check valid() first');
    }

    /**
     * Moves to the next node, reading more data from the stream when the queue runs dry.
     *
     * @throws ParsingError
     */
    public function next(): void
    {
        // drop the node that has just been visited, every visited node advances the key
        if ($this->nodesQueue !== []) {
            array_shift($this->nodesQueue);
            $this->index++;
        }

        // a no-op when we still have some already parsed nodes on the queue
        $this->parseNextChunk();
    }

    public function key(): int
    {
        return $this->index;
    }

    public function valid(): bool
    {
        return $this->nodesQueue !== [];
    }

    /**
     * Starts the iteration over: sets up a fresh parser and parses the first piece of the stream.
     *
     * The stream itself is read from its current position, it is not rewound.
     *
     * @throws ParsingError
     */
    public function rewind(): void
    {
        $this->setUp();
        $this->parseNextChunk();
    }

    /**
     * Reads and parses the XML stream until at least one node is queued or the stream ends.
     *
     * Callbacks are called, nodes are pushed to the queue and iterator can go over them.
     * A single chunk may not complete any tag (e.g. in the middle of a long text),
     * hence we keep on reading instead of giving up after the first chunk.
     *
     * @return void
     * @throws ParsingError
     */
    private function parseNextChunk(): void
    {
        while ($this->nodesQueue === [] && $this->parser !== null) {
            $data = stream_get_contents($this->stream, length: self::BATCH_READ_SIZE);
            $chunk = is_string($data) ? $data : '';

            // stream_get_contents() gives an empty string (not false) at the end of the stream
            $isFinal = $chunk === '' || feof($this->stream);
            $this->hasFedData = $this->hasFedData || $chunk !== '';

            // A stream with no data at all simply yields no nodes. Otherwise, the final call
            // lets the parser report the documents that end prematurely.
            if ($this->hasFedData && xml_parse($this->parser, $chunk, $isFinal) === 0 /* returns 0 on failure */) {
                // take more details from the parser instance and throw an exception
                $error = ParsingError::fromParserInstance($this->parser, is_string($data) ? $data : null);

                // there is no point in feeding the parser that has already failed
                $this->close();

                throw $error;
            }

            // we're done with reading and parsing the stream, close the XML parser instance
            if ($isFinal) {
                $this->close();
            }
        }
    }

    /**
     * Parses the XML and yields only the @see XMLNodeContent items with matching node name
     *
     * @param string $name
     * @return \Generator<XMLNodeContent>
     */
    public function iterateByNodeContent(string $name): \Generator
    {
        foreach ($this as $node) {
            if ($node instanceof XMLNodeContent && $node->name === $name) {
                yield $node;
            }
        }
    }
}