Skip to content

Commit 5ab5c6c

Browse files
committed
Added allowed values support for specific attributes, added tests, added source tags to the basic whitelist
1 parent 4b7976a commit 5ab5c6c

6 files changed

Lines changed: 433 additions & 29 deletions

File tree

src/BasicWhitelist.php

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,21 @@ public function __construct()
2222
parent::setProtocols(static::getBasicProtocols());
2323
}
2424

25+
/**
 * Lists the protocols that the basic whitelist hands to setProtocols().
 *
 * @return array
 */
public static function getBasicProtocols() : array
{
    $protocols = [
        'http',
        'https',
        'ftp',
        '//',
        'mailto',
        'data',
    ];

    return $protocols;
}
2934

35+
/**
36+
* Gets the basic tags.
37+
*
38+
* @return array
39+
*/
3040
public static function getBasicTags() : array
3141
{
3242
return [
@@ -271,6 +281,9 @@ public static function getBasicTags() : array
271281
'xml:lang'
272282
],
273283
'sup' => [],
284+
'source' => [
285+
'src', 'type', 'sizes', 'srcset', 'media'
286+
],
274287
'table' => [
275288
'align',
276289
'bgcolor',

src/Sanitizer.php

Lines changed: 72 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,30 @@
44

55
namespace MirazMac\HtmlSanitizer;
66

7+
use function \chr;
8+
use function \html_entity_decode;
9+
use function \htmlspecialchars;
10+
use function \libxml_clear_errors;
11+
use function \libxml_disable_entity_loader;
12+
use function \libxml_use_internal_errors;
13+
use function \mb_strlen;
14+
use function \mb_strpos;
15+
use function \mb_strtolower;
16+
use function \mb_substr;
17+
use function \parse_url;
18+
use function \preg_match;
19+
use function \range;
20+
use function \str_replace;
21+
use function \trim;
22+
use function \version_compare;
23+
724
/**
825
* HtmlSanitizer
926
*
10-
* A lightweight library to make sanitizing HTML easier on PHP. Has no dependencies except Native DomDocument support,
11-
* faster than any other sanization library present for PHP
27+
* A lightweight library to make sanitizing HTML easier on PHP.
28+
* Has no dependencies except native PHP extensions like dom, libxml, mbstring.
29+
*
30+
* Should be faster than any other sanitization library present for PHP
1231
*
1332
* @author Miraz Mac <mirazmac@gmail.com>
1433
* @link https://mirazmac.com
@@ -37,17 +56,12 @@ public function __construct(Whitelist $whitelist)
3756
*
3857
* @param string $html
3958
* @return string
40-
* @throws \RuntimeException If failed to convert the HTML into UTF-8 via mb_convert_encoding()
59+
* @throws \InvalidArgumentException If supplied HTML is not valid UTF-8
4160
*/
4261
public function sanitize(string $html) : string
4362
{
44-
// Because..
45-
libxml_use_internal_errors(true);
46-
libxml_clear_errors(true);
47-
48-
// deprecated in PHP 8.0
49-
if (version_compare(\PHP_VERSION, '8.0.0', '<')) {
50-
libxml_disable_entity_loader(true);
63+
if (!$this->isValidUtf8($html)) {
64+
throw new \InvalidArgumentException("Provided HTML must be valid utf-8");
5165
}
5266

5367
// Remove NULL characters (ignored by some browsers).
@@ -57,41 +71,56 @@ public function sanitize(string $html) : string
5771
return '';
5872
}
5973

60-
// Construct the DOM Document
61-
$dom = new \DOMDocument('1.0', 'UTF-8');
62-
63-
// Fix encoding issues
64-
$html = @mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
74+
// Because..
75+
$previousState = libxml_use_internal_errors(true);
76+
libxml_clear_errors();
6577

66-
if (empty($html)) {
67-
throw new \RuntimeException("Failed to convert the HTML into UTF-8 via mb_convert_encoding();");
78+
// deprecated in PHP 8.0
79+
if (\PHP_VERSION_ID < 80000) {
80+
libxml_disable_entity_loader(true);
6881
}
6982

70-
// Nah, we're not HTMLPurifier (fuck that bloated ass library btw)
83+
// Construct the DOM Document
84+
$dom = new \DOMDocument('1.0', 'UTF-8');
85+
86+
// Nah
7187
$dom->strictErrorChecking = false;
7288
// nope
7389
$dom->validateOnParse = false;
7490
$dom->substituteEntities = false;
91+
// Don't even try
7592
$dom->resolveExternals = false;
7693
// whenever possible, please..
7794
$dom->recover = true;
78-
// should this be a option to customize?
79-
// idk
8095
$dom->formatOutput = false;
81-
// same question
8296
$dom->preserveWhiteSpace = false;
8397

8498
// no shit sherlock
8599
$dom->encoding = 'UTF-8';
86100

87101
// Finally load the HTML
88-
$dom->loadHTML($html);
102+
$dom->loadHTML(
103+
// Prepend the utf-8 encoding tags
104+
// ugly hack but works better than mb_convert_encoding()
105+
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta charset="UTF-8">'
106+
.
107+
$html,
108+
\LIBXML_NOERROR | \LIBXML_NOWARNING | \LIBXML_HTML_NODEFDTD
109+
);
89110

90111
// Why again? Apparently it gets set to NULL after calling loadHTML(), so set it back to UTF-8 again,
91112
// otherwise saveHTML produces weird results
92113
$dom->encoding = 'UTF-8';
93114

94-
return trim($dom->saveHTML($this->doSanitize($dom)));
115+
$html = trim($dom->saveHTML($this->doSanitize($dom)));
116+
117+
// Clear the errors
118+
libxml_clear_errors();
119+
120+
// Restore the state
121+
libxml_use_internal_errors($previousState);
122+
123+
return $html;
95124
}
96125

97126
/**
@@ -140,6 +169,12 @@ protected function doSanitize($html)
140169
continue; // no further action required, let's proceed to the next one
141170
}
142171

172+
// Remove attribute if value doesn't match with an explicitly defined list
173+
if (!$this->whitelist->isValueAllowed($html->nodeName, $name, $value)) {
174+
$html->removeAttribute($name);
175+
continue;
176+
}
177+
143178
// Handle boolean/blank attributes
144179
if (HtmlDataMap::isBooleanAttribute($name) || $this->whitelist->isBooleanAttribute($name)) {
145180
// If it's already empty or a valid boolean don't change anything
@@ -161,6 +196,7 @@ protected function doSanitize($html)
161196
);
162197
}
163198

199+
164200
// Regardless of all this, every attribute gets escaped
165201
$html->setAttribute(
166202
$name,
@@ -214,7 +250,7 @@ protected function filterURL(string $element, $value) : string
214250
* @param string $string
215251
* @return string
216252
*/
217-
protected function escapeAttribute(string $string) : string
253+
public function escapeAttribute(string $string) : string
218254
{
219255
$string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
220256
return htmlspecialchars($string, ENT_QUOTES, 'UTF-8', true);
@@ -255,4 +291,16 @@ protected function stripDangerousProtocols($uri) : string
255291

256292
return $uri;
257293
}
294+
295+
/**
 * Checks whether the given string is well-formed UTF-8.
 *
 * An empty string is accepted immediately. Any other input is probed with a
 * one-character PCRE match in UTF-8 mode (`u` modifier); the string counts as
 * valid only when that match succeeds.
 *
 * @param string $string The string to check
 *
 * @return bool
 */
protected function isValidUtf8(string $string): bool
{
    // Nothing to validate, and `.` could never match an empty subject
    if ($string === '') {
        return true;
    }

    return preg_match('/^./us', $string) === 1;
}
258306
}

src/Whitelist.php

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44

55
namespace MirazMac\HtmlSanitizer;
66

7+
use function \array_merge;
8+
use function \array_reverse;
9+
use function \explode;
10+
use function \in_array;
11+
use function \is_array;
12+
713
/**
814
* Whitelist
915
*
@@ -41,6 +47,13 @@ class Whitelist
4147
*/
4248
protected $treatAsBoolean = [];
4349

50+
/**
51+
* Allowed values for specific attributes
52+
*
53+
* @var array
54+
*/
55+
protected $values = [];
56+
4457
/**
4558
* Internally required tags
4659
*
@@ -141,6 +154,30 @@ public function removeAttribute(string $tagName, $attributes) : Whitelist
141154
return $this;
142155
}
143156

157+
/**
 * Restricts an attribute of a tag to an explicit list of values.
 *
 * Once a list is registered, isValueAllowed() only accepts values found in
 * it; the sanitizer removes the attribute when its value is not accepted.
 * Mostly useful for custom data attributes that should carry a fixed set
 * of values.
 *
 * @param string $tagName   The tag name
 * @param string $attribute The attribute
 * @param array  $values    The accepted values
 *
 * @throws \LogicException If the tag itself isn't allowed
 *
 * @return self
 */
public function setAllowedValues(string $tagName, string $attribute, array $values)
{
    if ($this->isTagAllowed($tagName)) {
        $this->values[$tagName][$attribute] = $values;

        return $this;
    }

    // Value lists only make sense for tags that are whitelisted
    throw new \LogicException("Failed to allow values on attribute `{$attribute}` on tag `{$tagName}`, because the tag itself isn't allowed.");
}
180+
144181
/**
145182
* Add one or many protocols to the whitelist
146183
*
@@ -424,6 +461,25 @@ public function isHostAllowed(string $tagName, string $host) : bool
424461
return false;
425462
}
426463

464+
/**
 * Determines whether a value is allowed for an attribute under a tag name.
 *
 * Attributes with no list registered through setAllowedValues() accept any
 * value. When a list exists, the value must equal one of its entries exactly
 * once both sides are converted to strings.
 *
 * @param string $tagName   The tag name
 * @param string $attribute The attribute
 * @param string $value     The value
 *
 * @return bool
 */
public function isValueAllowed(string $tagName, string $attribute, $value) : bool
{
    // Allowed by default unless added explicitly
    if (!isset($this->values[$tagName][$attribute])) {
        return true;
    }

    // Something that cannot be represented as a string can never equal a listed value
    if (!\is_scalar($value)) {
        return false;
    }

    $needle = (string) $value;

    // Compare string forms strictly. PHP's loose `==` would let '1e0' or '01'
    // pass for an allowed '1', and let any non-empty string pass for `true`.
    foreach ($this->values[$tagName][$attribute] as $allowed) {
        if (\is_scalar($allowed) && (string) $allowed === $needle) {
            return true;
        }
    }

    return false;
}
482+
427483
/**
428484
* Iteratively ensures the host domain is allowed
429485
* Taken from tgalopin/html-sanitizer

tests/SanitizerTest.php

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public function setUp() : void
2828
*/
2929
public function testSimpleHTML() : void
3030
{
31-
$string = $this->sanitizer->sanitize('<div id="fake"><h5 class="foo">Lorem ipsum</h5></div>');
31+
$string = $this->sanitizer->sanitize('<script>alert("hello");</script><div id="fake"><h5 class="foo">Lorem ipsum</h5></div>');
3232
$this->assertEquals("Lorem ipsum", $string);
3333
}
3434

@@ -41,6 +41,15 @@ public function testEmptyString() : void
4141
$this->assertEmpty($this->sanitizer->sanitize(''));
4242
}
4343

44+
/**
 * Unicode text must come out of the sanitizer unchanged
 */
public function testUnicodeString() : void
{
    $input = '<p>আমি বাংলায় গান গাই</p>';
    $expected = '<p>আমি বাংলায় গান গাই</p>';

    $sanitized = $this->basicSanitizer->sanitize($input);

    $this->assertEquals($expected, $sanitized);
}
52+
4453
/**
4554
* tests host filtering
4655
*
@@ -71,12 +80,22 @@ public function testCustomAttribute() : void
7180
$this->assertEquals('<img src="1.png" data-src="1.png">', $string);
7281
}
7382

83+
/**
 * Test allowed values for an attribute
 *
 * getBasicWhitelist() limits `title` on `a` to one/two/three, so a title of
 * "four" is expected to be removed (assumes setUp() builds basicSanitizer
 * from that whitelist).
 */
public function testAllowedValues() : void
{
    $string = $this->basicSanitizer->sanitize('<a href="#" title="four">hey</a>');
    $this->assertEquals('<a href="#">hey</a>', $string);
}
91+
7492
protected function getBasicWhitelist()
7593
{
7694
$whitelist = new BasicWhitelist;
7795
// Allow support for a few attribute for testing
7896
$whitelist->allowAttribute('img', ['data-src', 'data-lazyload'])
7997
->setAllowedHosts('img', ['google.com'])
98+
->setAllowedValues('a', 'title', ['one', 'two', 'three'])
8099
->treatAttributesAsURL(['data-src'])
81100
->treatAttributesAsBoolean(['data-lazyload']);
82101
return $whitelist;

0 commit comments

Comments
 (0)