Skip to content

Commit 90d587f

Browse files
author
Christoph Singer
committed
Adopt Symfony's helper functions for loading HTML
1 parent e8de7cd commit 90d587f

2 files changed

Lines changed: 49 additions & 22 deletions

File tree

composer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"php":"^8.0",
1717
"ext-dom":"*",
1818
"ext-libxml":"*",
19-
"ext-mbstring":"*",
19+
"symfony/polyfill-mbstring": "~1.0",
2020
"symfony/dom-crawler":"^6",
2121
"symfony/css-selector":"^6"
2222
},

src/Helpers.php

Lines changed: 48 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -82,28 +82,55 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
8282

8383
public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
8484
{
85-
$unsafeLibXml = \LIBXML_VERSION < 20900;
86-
$current = libxml_use_internal_errors(true);
87-
if($unsafeLibXml) {
88-
$disableEntities = libxml_disable_entity_loader(true);
89-
}
90-
$d = new \DOMDocument('1.0', $charset);
91-
$d->validateOnParse = true;
92-
if (function_exists('mb_convert_encoding') && in_array(
93-
strtolower($charset),
94-
array_map('strtolower', mb_list_encodings())
95-
)
96-
) {
97-
$html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset);
85+
return self::parseXhtml($html, $charset);
86+
}
87+
/**
88+
* Function originally taken from Symfony\Component\DomCrawler\Crawler
89+
* (c) Fabien Potencier <fabien@symfony.com>
90+
* License: MIT
91+
*/
92+
private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
93+
{
94+
$htmlContent = self::convertToHtmlEntities($htmlContent, $charset);
95+
96+
$internalErrors = libxml_use_internal_errors(true);
97+
98+
$dom = new \DOMDocument('1.0', $charset);
99+
$dom->validateOnParse = true;
100+
101+
if ('' !== trim($htmlContent)) {
102+
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
103+
// Option LIBXML_SCHEMA_CREATE seems to prevent this
104+
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
105+
@$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE);
98106
}
99-
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
100-
// Option LIBXML_SCHEMA_CREATE seems to prevent this
101-
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
102-
@$d->loadHTML($html, \LIBXML_SCHEMA_CREATE);
103-
libxml_use_internal_errors($current);
104-
if($unsafeLibXml) {
105-
libxml_disable_entity_loader($disableEntities);
107+
108+
libxml_use_internal_errors($internalErrors);
109+
110+
return $dom;
111+
}
112+
113+
/**
114+
* Converts the non-ASCII characters of the $charset-encoded content to numeric HTML entities to ensure valid parsing.
115+
* Function taken from Symfony\Component\DomCrawler\Crawler
116+
* (c) Fabien Potencier <fabien@symfony.com>
117+
* License: MIT
118+
*/
119+
private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
120+
{
121+
set_error_handler(function () { throw new \Exception(); });
122+
123+
try {
124+
return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset);
125+
} catch (\Exception|\ValueError) {
126+
try {
127+
$htmlContent = iconv($charset, 'UTF-8', $htmlContent);
128+
$htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
129+
} catch (\Exception|\ValueError) {
130+
}
131+
return $htmlContent;
132+
} finally {
133+
restore_error_handler();
106134
}
107-
return $d;
108135
}
109136
}

0 commit comments

Comments
 (0)