44
55namespace MirazMac \HtmlSanitizer ;
66
7+ use function \chr ;
8+ use function \html_entity_decode ;
9+ use function \htmlspecialchars ;
10+ use function \libxml_clear_errors ;
11+ use function \libxml_disable_entity_loader ;
12+ use function \libxml_use_internal_errors ;
13+ use function \mb_strlen ;
14+ use function \mb_strpos ;
15+ use function \mb_strtolower ;
16+ use function \mb_substr ;
17+ use function \parse_url ;
18+ use function \preg_match ;
19+ use function \range ;
20+ use function \str_replace ;
21+ use function \trim ;
22+ use function \version_compare ;
23+
724/**
825 * HtmlSanitizer
926 *
10- * A lightweight library to make sanitizing HTML easier on PHP. Has no dependencies except Native DomDocument support,
11- * faster than any other sanization library present for PHP
27+ * A lightweight library to make sanitizing HTML easier on PHP.
28+ * Has no dependencies except native PHP extensions like dom, libxml, mbstring.
29+ *
30+ * Should be faster than any other sanitization library present for PHP
1231 *
1332 * @author Miraz Mac <mirazmac@gmail.com>
1433 * @link https://mirazmac.com
@@ -37,17 +56,12 @@ public function __construct(Whitelist $whitelist)
3756 *
3857 * @param string $html
3958 * @return string
40- * @throws \RuntimeException If failed to convert the HTML into UTF-8 via mb_convert_encoding()
59+ * @throws \InvalidArgumentException If supplied HTML is not valid UTF-8
4160 */
4261 public function sanitize (string $ html ) : string
4362 {
44- // Because..
45- libxml_use_internal_errors (true );
46- libxml_clear_errors (true );
47-
48- // deprecated in PHP 8.0
49- if (version_compare (\PHP_VERSION , '8.0.0 ' , '< ' )) {
50- libxml_disable_entity_loader (true );
63+ if (!$ this ->isValidUtf8 ($ html )) {
64+ throw new \InvalidArgumentException ("Provided HTML must be valid utf-8 " );
5165 }
5266
5367 // Remove NULL characters (ignored by some browsers).
@@ -57,41 +71,56 @@ public function sanitize(string $html) : string
5771 return '' ;
5872 }
5973
60- // Construct the DOM Document
61- $ dom = new \DOMDocument ('1.0 ' , 'UTF-8 ' );
62-
63- // Fix encoding issues
64- $ html = @mb_convert_encoding ($ html , 'HTML-ENTITIES ' , 'UTF-8 ' );
74+ // Because..
75+ $ previousState = libxml_use_internal_errors (true );
76+ libxml_clear_errors ();
6577
66- if (empty ($ html )) {
67- throw new \RuntimeException ("Failed to convert the HTML into UTF-8 via mb_convert_encoding(); " );
78+ // deprecated in PHP 8.0
79+ if (\PHP_VERSION_ID < 80000 ) {
80+ libxml_disable_entity_loader (true );
6881 }
6982
70- // Nah, we're not HTMLPurifier (fuck that bloated ass library btw)
83+ // Construct the DOM Document
84+ $ dom = new \DOMDocument ('1.0 ' , 'UTF-8 ' );
85+
86+ // Nah
7187 $ dom ->strictErrorChecking = false ;
7288 // nope
7389 $ dom ->validateOnParse = false ;
7490 $ dom ->substituteEntities = false ;
91+ // Don't even try
7592 $ dom ->resolveExternals = false ;
7693 // whenever possible, please..
7794 $ dom ->recover = true ;
78- // should this be a option to customize?
79- // idk
8095 $ dom ->formatOutput = false ;
81- // same question
8296 $ dom ->preserveWhiteSpace = false ;
8397
8498 // no shit sherlock
8599 $ dom ->encoding = 'UTF-8 ' ;
86100
87101 // Finally load the HTML
88- $ dom ->loadHTML ($ html );
102+ $ dom ->loadHTML (
103+ // Prepend the utf-8 encoding tags
104+ // ugly hack but works better than mb_convert_encoding()
105+ '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta charset="UTF-8"> '
106+ .
107+ $ html ,
108+ \LIBXML_NOERROR | \LIBXML_NOWARNING | \LIBXML_HTML_NODEFDTD
109+ );
89110
90111 // Why again? Apparently it gets set to NULL after calling loadHTML(), so set it back to UTF-8 again,
91112 // otherwise saveHTML produces weird results
92113 $ dom ->encoding = 'UTF-8 ' ;
93114
94- return trim ($ dom ->saveHTML ($ this ->doSanitize ($ dom )));
115+ $ html = trim ($ dom ->saveHTML ($ this ->doSanitize ($ dom )));
116+
117+ // Clear the errors
118+ libxml_clear_errors ();
119+
120+ // Restore the state
121+ libxml_use_internal_errors ($ previousState );
122+
123+ return $ html ;
95124 }
96125
97126 /**
@@ -140,6 +169,12 @@ protected function doSanitize($html)
140169 continue ; // no further action required, let's proceed to the next one
141170 }
142171
172+ // Remove attribute if value doesn't match with an explicitly defined list
173+ if (!$ this ->whitelist ->isValueAllowed ($ html ->nodeName , $ name , $ value )) {
174+ $ html ->removeAttribute ($ name );
175+ continue ;
176+ }
177+
143178 // Handle boolean/blank attributes
144179 if (HtmlDataMap::isBooleanAttribute ($ name ) || $ this ->whitelist ->isBooleanAttribute ($ name )) {
145180 // If it's already empty or a valid boolean don't change anything
@@ -161,6 +196,7 @@ protected function doSanitize($html)
161196 );
162197 }
163198
199+
164200 // Regardless of all this, every attribute gets escaped
165201 $ html ->setAttribute (
166202 $ name ,
@@ -214,7 +250,7 @@ protected function filterURL(string $element, $value) : string
214250 * @param string $string
215251 * @return string
216252 */
217- protected function escapeAttribute (string $ string ) : string
253+ public function escapeAttribute (string $ string ) : string
218254 {
219255 $ string = html_entity_decode ($ string , ENT_QUOTES , 'UTF-8 ' );
220256 return htmlspecialchars ($ string , ENT_QUOTES , 'UTF-8 ' , true );
@@ -255,4 +291,16 @@ protected function stripDangerousProtocols($uri) : string
255291
256292 return $ uri ;
257293 }
294+
/**
 * Checks whether the given string is well-formed UTF-8.
 *
 * An empty string is accepted as valid. Any other input is tested with a
 * single-character PCRE match in UTF-8 mode (the `u` modifier); the string
 * is reported valid only when preg_match() returns exactly 1.
 *
 * @param string $string The string to check
 *
 * @return bool True for an empty string or a successful match, false otherwise
 */
protected function isValidUtf8(string $string): bool
{
    // Nothing to validate, and "." could never match an empty subject.
    if ('' === $string) {
        return true;
    }

    // preg_match() gives 1 on a match; 0 or false both mean "not valid" here.
    $matchResult = preg_match('/^./us', $string);

    return 1 === $matchResult;
}
258306}
0 commit comments