@@ -82,28 +82,55 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
8282
/**
 * Parses an HTML string into a DOM document.
 *
 * Public entry point; the parsing itself is delegated to parseXhtml().
 *
 * @param string $html    Markup to parse.
 * @param string $charset Character set the markup is encoded in.
 *
 * @return \DOMDocument Document built from the markup.
 */
public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
{
    return self::parseXhtml($html, $charset);
}
/**
 * Builds a DOMDocument from an HTML string.
 *
 * Function originally taken from Symfony\Component\DomCrawler\Crawler
 * (c) Fabien Potencier <fabien@symfony.com>
 * License: MIT
 *
 * Characters outside ASCII are first rewritten as numeric entities via
 * convertToHtmlEntities(). Input that is empty after trimming is not handed
 * to the parser; an empty document is returned instead.
 *
 * @param string $htmlContent Markup to parse.
 * @param string $charset     Character set the markup is encoded in.
 *
 * @return \DOMDocument Document built from the markup.
 */
private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $htmlContent = self::convertToHtmlEntities($htmlContent, $charset);

    // Collect libxml parse errors internally instead of emitting them as PHP warnings.
    $internalErrors = libxml_use_internal_errors(true);

    $dom = new \DOMDocument('1.0', $charset);
    $dom->validateOnParse = true;

    try {
        if ('' !== trim($htmlContent)) {
            // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
            // Option LIBXML_SCHEMA_CREATE seems to prevent this
            // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
            @$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE);
        }
    } finally {
        // Put the caller's libxml error mode back even if the parser call throws.
        libxml_use_internal_errors($internalErrors);
    }

    return $dom;
}
112+
/**
 * Converts charset to HTML-entities to ensure valid parsing.
 * Function taken from Symfony\Component\DomCrawler\Crawler
 * (c) Fabien Potencier <fabien@symfony.com>
 * License: MIT
 *
 * Every code point from 0x80 upwards is replaced by a numeric character
 * reference. If mbstring cannot handle $charset, the content is converted to
 * UTF-8 with iconv() and encoded from there. If that fails as well, the
 * content is returned as it was received.
 *
 * @param string $htmlContent Markup to encode.
 * @param string $charset     Character set the markup is encoded in.
 *
 * @return string Markup with non-ASCII characters as numeric entities, or the
 *                unmodified input when no conversion was possible.
 */
private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    // Promote warnings/notices raised by mbstring and iconv to exceptions so
    // that every failure mode lands in the catch blocks below.
    set_error_handler(static function () { throw new \Exception(); });

    try {
        return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset);
    } catch (\Exception|\ValueError) {
        try {
            $converted = iconv($charset, 'UTF-8', $htmlContent);
            // Defensive: iconv() is documented to return false on failure; never
            // forward that to mbstring or out through the string return type.
            if (false !== $converted) {
                $htmlContent = $converted;
                $htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
            }
        } catch (\Exception|\ValueError) {
            // Best effort only: fall through and return what we have.
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}
109136}
0 commit comments