Re-escape HTML data after Htmlclean

brendanlong · brendanlong · commit c782cedfe530 · 2019-07-13T15:04:50.000-04:00
Htmlclean works by parsing HTML data into plain text. However, once it is done, the result needs to be valid HTML again. If we leave entities like &amp; decoded (as a bare &), we can get situations where the title/content of a page depends on how many times we run it through Htmlclean. Fixes #918
diff --git a/libraries/htmlclean/htmlclean.c b/libraries/htmlclean/htmlclean.c
@@ -26,6 +26,37 @@
 #include "glib.h"
 #include "gumbo.h"
 
+// Re-escape the characters '<', '>' and '&' in parsed text so the result is valid HTML again.
+// For example, if the original text was "The &lt;pre&gt; element is an HTML element", the
+// final output must be "The &lt;pre&gt; element is an HTML element", not "The <pre> element
+// is an HTML element" (which would get stripped if we ran it through this again).
+// Returns a newly allocated string; the caller owns it and should release it with g_free().
+static char* reescape_xml_entities(const char* text)
+{
+	size_t len = strlen(text);  // text must be a NUL-terminated string
+	GString* result = g_string_sized_new(len);  // len is only an initial size hint; escapes make the output longer
+	for (size_t i = 0; i < len; ++i)
+	{
+		char c = text[i];
+		switch (c)
+		{
+			case '<':
+				g_string_append(result, "&lt;");
+				break;
+			case '>':
+				g_string_append(result, "&gt;");
+				break;
+			case '&':
+				g_string_append(result, "&amp;");
+				break;
+			default:
+				g_string_append_c(result, c);  // every other byte is copied through unchanged
+				break;
+		}
+	}
+	return g_string_free(result, FALSE);  // FALSE: free the GString wrapper but keep the character buffer
+}
+
 char *cleantext(GumboNode *node)
 {
 	if (node->type == GUMBO_NODE_TEXT)
@@ -92,5 +123,8 @@ char *htmlclean_strip_html(const char *input)
 	{
 		return g_strdup("");
 	}
-	return cleaned;
+
+	char* cleaned_escaped = reescape_xml_entities(cleaned);
+	free(cleaned);
+	return cleaned_escaped;
 }
diff --git a/libraries/htmlclean/test_htmlclean.c b/libraries/htmlclean/test_htmlclean.c
@@ -48,24 +48,61 @@ int main(int argc, char** argv)
 		test_no_change
 	);
 
-	// g_test_add_data_func (
-	// 	"/htmlclean/nochange/escapedhtml",
-	// 	"this string contains &amp; escaped HTML",
-	// 	test_no_change
-	// );
+	g_test_add_data_func (
+		"/htmlclean/nochange/escapedhtml",
+		"this string contains &amp; escaped HTML",
+		test_no_change
+	);
+
+	g_test_add_data_func (
+		"/htmlclean/nochange/escapedhtml2",
+		"CSS, &amp;lt;pre&gt;, and trailing whitespace lead to browser layout weirdness",
+		test_no_change
+	);
 
-	// Previous versions of the parser crashed or hung when given these inputs
 	g_test_add_data_func (
-		"/htmlclean/nochange/justopen",
-		"<",
+		"/htmlclean/nochange/justlt",
+		"&lt;",
+		test_no_change
+	);
+
+	g_test_add_data_func (
+		"/htmlclean/nochange/justgt",
+		"&gt;",
 		test_no_change
 	);
 
 	g_test_add_data_func (
 		"/htmlclean/nochange/justamp",
-		"&",
+		"&amp;",
 		test_no_change
 	);
 
+	// Previous versions of the parser crashed or hung when given these inputs
+	// These get escaped, even though they're not ambiguous in this situation
+	g_test_add_data_func (
+		"/htmlclean/change/justamp",
+		&(inout_t){
+			"&",
+			"&amp;"
+		},
+		test_change);
+
+	g_test_add_data_func (
+		"/htmlclean/change/justlt",
+		&(inout_t){
+			"<",
+			"&lt;"
+		},
+		test_change);
+
+	g_test_add_data_func (
+		"/htmlclean/change/justgt",
+		&(inout_t){
+			">",
+			"&gt;"
+		},
+		test_change);
+
 	return g_test_run ();
 }