From 677f3f096e879fab1d63999d2a195620768b2c87 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 16 Mar 2024 21:22:18 +0100 Subject: [PATCH] Fix hasSingleTagInsideElement method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It would fail for e.g. `

<div> <p>foo</p> </div>

`. mozilla/readability uses `children` for the tag lookup, which returns only elements. PHP does not have a `children` property, so b580cf216d9001f82c866bb9a6c8bcad1cc862d8 mistakenly used `childNodes` instead, but that can return any node type. Let’s filter the children ourselves. Also add comments from mozilla/readability’s `_hasSingleTagInsideElement`. --- src/Readability.php | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 836a333..b0b815f 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1477,14 +1477,23 @@ private function isPhrasingContent($node): bool ); } + /** + * Checks if `$node` has only whitespace and a single element with `$tag` for the tag name. + * Returns false if `$node` contains non-empty text nodes + * or if it contains no element with given tag or more than 1 element. + */ private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool { - if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) { + $childNodes = iterator_to_array($node->childNodes); + $children = array_filter($childNodes, fn ($childNode) => $childNode instanceof \DOMElement); + + // There should be exactly 1 element child with given tag + if (1 !== \count($children) || $children[0]->nodeName !== $tag) { return false; } $a = array_filter( - iterator_to_array($node->childNodes), + $childNodes, fn ($childNode) => $childNode instanceof \DOMText && preg_match($this->regexps['hasContent'], $this->getInnerText($childNode)) );