Skip to content
This repository was archived by the owner on Jan 11, 2023. It is now read-only.

Commit c782ced

Browse files
committed
Re-escape HTML data after Htmlclean
Htmlclean works by parsing HTML data into normal text. However, when we're done with it, we want the result to become valid HTML again. If we leave things like & unescaped, we can get situations where the title/content of a page depends on how many times we run it through Htmlclean. Fixes #918
1 parent 72848b0 commit c782ced

File tree

2 files changed

+81
-10
lines changed

2 files changed

+81
-10
lines changed

libraries/htmlclean/htmlclean.c

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,37 @@
2626
#include "glib.h"
2727
#include "gumbo.h"
2828

29+
// After parsing, we need to re-escape HTML so we don't remove literal <> and &
30+
// For example if the original text was "The &lt;pre&gt; element is an HTML element", we want
31+
// our final output to be "The &lt;pre&gt; element is an HTML element", not "The <pre> element
32+
// is an HTML element" (which would get stripped if we ran it through this again)
33+
// Returns a new string!
34+
static char* reescape_xml_entities(const char* text)
35+
{
36+
size_t len = strlen(text);
37+
GString* result = g_string_sized_new(len);
38+
for (size_t i = 0; i < len; ++i)
39+
{
40+
char c = text[i];
41+
switch (c)
42+
{
43+
case '<':
44+
g_string_append(result, "&lt;");
45+
break;
46+
case '>':
47+
g_string_append(result, "&gt;");
48+
break;
49+
case '&':
50+
g_string_append(result, "&amp;");
51+
break;
52+
default:
53+
g_string_append_c(result, c);
54+
break;
55+
}
56+
}
57+
return g_string_free(result, FALSE);
58+
}
59+
2960
char *cleantext(GumboNode *node)
3061
{
3162
if (node->type == GUMBO_NODE_TEXT)
@@ -92,5 +123,8 @@ char *htmlclean_strip_html(const char *input)
92123
{
93124
return g_strdup("");
94125
}
95-
return cleaned;
126+
127+
char* cleaned_escaped = reescape_xml_entities(cleaned);
128+
free(cleaned);
129+
return cleaned_escaped;
96130
}

libraries/htmlclean/test_htmlclean.c

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,24 +48,61 @@ int main(int argc, char** argv)
4848
test_no_change
4949
);
5050

51-
// g_test_add_data_func (
52-
// "/htmlclean/nochange/escapedhtml",
53-
// "this string contains &amp; escaped HTML",
54-
// test_no_change
55-
// );
51+
g_test_add_data_func (
52+
"/htmlclean/nochange/escapedhtml",
53+
"this string contains &amp; escaped HTML",
54+
test_no_change
55+
);
56+
57+
g_test_add_data_func (
58+
"/htmlclean/nochange/escapedhtml2",
59+
"CSS, &amp;lt;pre&gt;, and trailing whitespace lead to browser layout weirdness",
60+
test_no_change
61+
);
5662

57-
// Previous versions of the parser crashed or hung when given these inputs
5863
g_test_add_data_func (
59-
"/htmlclean/nochange/justopen",
60-
"<",
64+
"/htmlclean/nochange/justlt",
65+
"&lt;",
66+
test_no_change
67+
);
68+
69+
g_test_add_data_func (
70+
"/htmlclean/nochange/justgt",
71+
"&gt;",
6172
test_no_change
6273
);
6374

6475
g_test_add_data_func (
6576
"/htmlclean/nochange/justamp",
66-
"&",
77+
"&amp;",
6778
test_no_change
6879
);
6980

81+
// Previous versions of the parser crashed or hung when given these inputs
82+
// These get escaped, even though they're not ambiguous in this situation
83+
g_test_add_data_func (
84+
"/htmlclean/change/justamp",
85+
&(inout_t){
86+
"&",
87+
"&amp;"
88+
},
89+
test_change);
90+
91+
g_test_add_data_func (
92+
"/htmlclean/change/justlt",
93+
&(inout_t){
94+
"<",
95+
"&lt;"
96+
},
97+
test_change);
98+
99+
g_test_add_data_func (
100+
"/htmlclean/change/justgt",
101+
&(inout_t){
102+
">",
103+
"&gt;"
104+
},
105+
test_change);
106+
70107
return g_test_run ();
71108
}

0 commit comments

Comments
 (0)