From 677f3f096e879fab1d63999d2a195620768b2c87 Mon Sep 17 00:00:00 2001
From: Jan Tojnar
Date: Sat, 16 Mar 2024 21:22:18 +0100
Subject: [PATCH] Fix hasSingleTagInsideElement method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
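It would fail for e.g. `<div><p>foo</p><!-- comment --></div>`, where the single element child is accompanied by a non-element node.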
mozilla/readability uses `children` for the tag lookup, which returns only elements.
PHP does not have a `children` property, so commit b580cf216d9001f82c866bb9a6c8bcad1cc862d8
mistakenly used `childNodes` instead, but that can return nodes of any type.
Let’s filter the children ourselves.
Also add comments from mozilla/readability’s `_hasSingleTagInsideElement`.
---
src/Readability.php | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/src/Readability.php b/src/Readability.php
index 836a333..b0b815f 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -1477,14 +1477,23 @@ private function isPhrasingContent($node): bool
);
}
+ /**
+ * Checks if `$node` has only whitespace and a single element with `$tag` for the tag name.
+ * Returns false if `$node` contains non-empty text nodes
+ * or if it contains no element with given tag or more than 1 element.
+ */
private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
{
- if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) {
+ $childNodes = iterator_to_array($node->childNodes);
+ $children = array_filter($childNodes, fn ($childNode) => $childNode instanceof \DOMElement);
+
+ // There should be exactly 1 element child with given tag
+ if (1 !== \count($children) || $children[0]->nodeName !== $tag) {
return false;
}
$a = array_filter(
- iterator_to_array($node->childNodes),
+ $childNodes,
fn ($childNode) => $childNode instanceof \DOMText && preg_match($this->regexps['hasContent'], $this->getInnerText($childNode))
);