Skip to content

Commit 5810c31

Browse files
committed
Start nh3 sanitization and fixes
1 parent 7582bfb commit 5810c31

File tree

23 files changed

+186
-26
lines changed

23 files changed

+186
-26
lines changed

problemtools/md2html.py

+36-14
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
#! /usr/bin/env python3
22
# -*- coding: utf-8 -*-
3+
import nh3
34
import os.path
4-
import string
55
import argparse
6+
import html
7+
import string
68
import subprocess
79
import re
810
import tempfile
@@ -35,13 +37,12 @@ def convert(problem: str, options: argparse.Namespace) -> bool:
3537
statement_common.foreach_image(statement_path,
3638
lambda img_name: copy_image(problem, img_name))
3739

38-
command = ["pandoc", statement_path, "-t" , "html", "-f", "markdown-raw_html", "--mathjax"]
40+
command = ["pandoc", statement_path, "-t" , "html", "--mathjax"]
3941
statement_html = subprocess.run(command, capture_output=True, text=True,
4042
shell=False, check=True).stdout
4143

4244

4345
templatepaths = [os.path.join(os.path.dirname(__file__), 'templates/markdown_html'),
44-
os.path.join(os.path.dirname(__file__), '../templates/markdown_html'),
4546
'/usr/lib/problemtools/templates/markdown_html']
4647
templatepath = next((p for p in templatepaths
4748
if os.path.isdir(p) and os.path.isfile(os.path.join(p, "default-layout.html"))),
@@ -50,30 +51,51 @@ def convert(problem: str, options: argparse.Namespace) -> bool:
5051
if templatepath is None:
5152
raise Exception('Could not find directory with markdown templates')
5253

53-
problem_name = statement_common.get_problem_name(problem, options.language)
54+
problem_name = statement_common.get_yaml_problem_name(problem, options.language)
5455

55-
html_template = _substitute_template(templatepath, "default-layout.html",
56+
if problem_name:
57+
problem_name = html.escape(problem_name)
58+
59+
statement_html = _substitute_template(templatepath, "default-layout.html",
5660
statement_html=statement_html,
5761
language=options.language,
5862
title=problem_name or "Missing problem name",
59-
problemid=problembase)
63+
problemid=problembase) # No need to escape problem shortname, the spec has tight restrictions directory names
6064

6165
samples = statement_common.format_samples(problem, to_pdf=False)
6266

6367
# Insert samples at {{nextsample}} and {{remainingsamples}}
64-
html_template, remaining_samples = statement_common.inject_samples(html_template, samples, "")
68+
statement_html, remaining_samples = statement_common.inject_samples(statement_html, samples, "")
6569

6670
# Insert the remaining samples at the bottom
67-
if FOOTNOTES_STRING in html_template:
68-
pos = html_template.find(FOOTNOTES_STRING)
71+
if FOOTNOTES_STRING in statement_html:
72+
pos = statement_html.find(FOOTNOTES_STRING)
6973
else:
70-
pos = html_template.find("</body>")
71-
html_template = html_template[:pos] + "".join(remaining_samples) + html_template[pos:]
72-
73-
html_template = replace_hr_in_footnotes(html_template)
74+
pos = statement_html.find("</body>")
75+
statement_html = statement_html[:pos] + "".join(remaining_samples) + statement_html[pos:]
76+
77+
statement_html = replace_hr_in_footnotes(statement_html)
78+
html_body = statement_html[statement_html.find("<body>"):]
79+
statement_html = statement_html[:statement_html.find("<body>")]
80+
81+
allowed_classes = ("sample", "problemheader", "problembody", "sampleinteractionwrite", "sampleinteractionread",)
82+
def attribute_filter(tag, attribute, value):
83+
if attribute == "class" and value in allowed_classes:
84+
return value
85+
if tag == "img" and attribute == "src":
86+
return value
87+
return None
88+
89+
html_body = nh3.clean(html_body,
90+
link_rel="noopener nofollow noreferrer",
91+
attribute_filter=attribute_filter,
92+
tags=nh3.ALLOWED_TAGS | {"img"},
93+
attributes={"table": {"class"}, "div": {"class"}, "img": {"src"}},
94+
)
95+
statement_html += html_body
7496

7597
with open(destfile, "w", encoding="utf-8", errors="xmlcharrefreplace") as output_file:
76-
output_file.write(html_template)
98+
output_file.write(statement_html)
7799

78100
if options.css:
79101
with open("problem.css", "w") as output_file:

problemtools/problem2pdf.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def md2pdf(options: argparse.Namespace) -> bool:
3232
statement_common.assert_images_are_valid_md(statement_path)
3333

3434
templatepaths = [os.path.join(os.path.dirname(__file__), 'templates/markdown_pdf'),
35-
os.path.join(os.path.dirname(__file__), '../templates/markdown_pdf'),
3635
'/usr/lib/problemtools/templates/markdown_pdf']
3736
templatepath = next((p for p in templatepaths
3837
if os.path.isdir(p) and os.path.isfile(os.path.join(p, "fix_tables.md"))),
@@ -48,7 +47,7 @@ def md2pdf(options: argparse.Namespace) -> bool:
4847
with open(statement_path, "r") as file:
4948
statement_md = file.read()
5049

51-
problem_name = statement_common.get_problem_name(problem_root, options.language)
50+
problem_name = statement_common.get_yaml_problem_name(problem_root, options.language)
5251

5352
# Add problem name and id to the top
5453
problem_id = os.path.basename(problem_root)
@@ -66,7 +65,7 @@ def md2pdf(options: argparse.Namespace) -> bool:
6665
with tempfile.NamedTemporaryFile(mode='w', suffix=".md") as temp_file:
6766
temp_file.write(statement_md)
6867
temp_file.flush()
69-
command = ["pandoc", temp_file.name, "-o", destfile, f"--resource-path={statement_dir}", "-f", "markdown-raw_html"]
68+
command = ["pandoc", temp_file.name, "-o", destfile, f"--resource-path={statement_dir}"]
7069
try:
7170
return subprocess.run(command, capture_output=True, text=True, shell=False, check=True)
7271
except subprocess.CalledProcessError as e:

problemtools/statement_common.py

+23-9
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import subprocess
77
import re
88
import json
9+
import yaml
910
from pathlib import Path
1011
=======
1112
from typing import Optional
@@ -67,7 +68,7 @@ def json_dfs(data, callback) -> None:
6768

6869
def foreach_image(statement_path, callback):
6970
# Find all images in the statement and call callback for each one
70-
command = ["pandoc", statement_path, "-t" , "json", "-f", "markdown-raw_html"]
71+
command = ["pandoc", statement_path, "-t" , "json"]
7172
statement_json = subprocess.run(command, capture_output=True,
7273
text=True, shell=False, check=True).stdout
7374
json_dfs(json.loads(statement_json), callback)
@@ -92,19 +93,32 @@ def assert_images_are_valid_md(statement_path: str) -> None:
9293
foreach_image(statement_path,
9394
lambda img_name: assert_image_is_valid(problem_root, img_name))
9495

95-
def get_problem_name(problem: str, language: Optional[str]) -> Optional[str]:
96-
"""Load problem.yaml to get problem name"""
97-
if language is None:
98-
language = "en"
99-
with verifyproblem.Problem(problem) as prob:
100-
config = verifyproblem.ProblemConfig(prob)
101-
if not config.check(None):
102-
raise Exception("Invalid problem.yaml")
96+
def get_yaml_problem_name(problem: str, language: Optional[str]) -> Optional[str]:
97+
98+
# TODO: getting this should be done using verifyproblem
99+
# Wait until new config parsing system is in place
100+
config_file = Path(problem) / 'problem.yaml'
101+
if not config_file.is_file():
102+
raise FileNotFoundError(f"No problem.yaml found")
103+
104+
try:
105+
with open(config_file) as f:
106+
config = yaml.safe_load(f)
107+
if config is None:
108+
config = {}
109+
except Exception as e:
110+
raise Exception(f"Invalid problem.yaml: {e}")
111+
112+
if 'name' in config and not isinstance(config['name'], dict):
113+
config['name'] = {'': config['name']}
114+
103115
names = config.get("name")
104116
# If there is only one language, per the spec that is the one we want
105117
if len(names) == 1:
106118
return next(iter(names.values()))
107119

120+
if language is None:
121+
language = "en"
108122
if language not in names:
109123
raise Exception(f"No problem name defined for language {language or 'en'}")
110124
return names[language]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
name: Problem ID xss
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
name: <script>alert("Hello world!");</script>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
XSS injection via problem name.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
PWNED
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<script>
2+
alert("Hello world!");
3+
</script>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
on_reject: continue
2+
range: 0 0
3+
accept_score: 0
4+
grader_flags: first_error
5+
input_validator_flags: nFive=0
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
on_reject: continue
2+
range: 0 2
3+
grader_flags: ignore_sample
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
name: Sample XSS
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
Various XSS methods. Hopefully the sanitizer doesn't let any of them through.
2+
3+
4+
<script>
5+
alert("Hello world!");
6+
</script>
7+
8+
<img src=x onerror=alert('XSS')>
9+
10+
<a href="#" onclick="alert('XSS')">Click me</a>
11+
12+
<svg onload=alert('XSS')></svg>
13+
14+
<a href="javascript:alert('XSS')">Click me</a>
15+
16+
<input type="text" value="<script>alert('XSS')</script>">
17+
18+
<script>eval('\x61\x6c\x65\x72\x74\x28\x27\x58\x53\x53\x27\x29')</script>
19+
20+
<svg><script>alert('XSS')</script></svg>
21+
22+
<iframe src="javascript:alert('XSS')"></iframe>
23+
24+
<math><mtext><script>alert('XSS')</script></mtext></math>
25+
26+
<div style="background:url(javascript:alert('XSS'))">
27+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Nice!
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
on_reject: continue
2+
range: 0 0
3+
accept_score: 0
4+
grader_flags: first_error
5+
input_validator_flags: nFive=0
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
on_reject: continue
2+
range: 0 2
3+
grader_flags: ignore_sample
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
name: Special Characters Sample
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
All printable ASCII characters in sample
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
name: XSS
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
Various XSS methods. Hopefully the sanitizer doesn't let any of them through.
2+
3+
4+
<script>
5+
alert("Hello world!");
6+
</script>
7+
8+
<img src=x onerror=alert('XSS')>
9+
10+
<a href="#" onclick="alert('XSS')">Click me</a>
11+
12+
<svg onload=alert('XSS')></svg>
13+
14+
<a href="javascript:alert('XSS')">Click me</a>
15+
16+
<input type="text" value="<script>alert('XSS')</script>">
17+
18+
<script>eval('\x61\x6c\x65\x72\x74\x28\x27\x58\x53\x53\x27\x29')</script>
19+
20+
<svg><script>alert('XSS')</script></svg>
21+
22+
<iframe src="javascript:alert('XSS')"></iframe>
23+
24+
<math><mtext><script>alert('XSS')</script></mtext></math>
25+
26+
<div style="background:url(javascript:alert('XSS'))">
27+

problemtools/tests/test_markdown.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from pathlib import Path
2+
from problemtools.tests.test_xss import render
3+
4+
def test_sample_escaping():
5+
problem_path = Path(__file__).parent / "problems" / "specialcharacterssample"
6+
html = render(problem_path)
7+
all_printable = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
8+
assert all_printable in html

problemtools/tests/test_xss.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import os
2+
from pathlib import Path
3+
from problemtools.problem2html import convert, get_parser
4+
import tempfile
5+
6+
def render(problem_path):
7+
with tempfile.TemporaryDirectory() as temp_dir:
8+
args, _unknown = get_parser().parse_known_args(['--problem', str(problem_path.resolve()), '--dest-dir', str(temp_dir)])
9+
convert(args)
10+
with open(f"{temp_dir}/index.html", "r") as f:
11+
html = f.read()
12+
return html
13+
14+
def test_no_xss_statement():
15+
problem_path = Path(__file__).parent / "problems" / "statementxss"
16+
html = render(problem_path)
17+
assert "alert" not in html
18+
19+
def test_no_xss_problemname():
20+
problem_path = Path(__file__).parent / "problems" / "problemnamexss"
21+
html = render(problem_path)
22+
assert "<script>" not in html
23+
24+
def test_no_xss_sample():
25+
problem_path = Path(__file__).parent / "problems" / "samplexss"
26+
html = render(problem_path)
27+
assert "<script>" not in html
28+
29+
def test_no_xss_problem_id():
30+
problem_path = Path(__file__).parent / "problems" / '<script>alert("Hello world!");</script>'
31+
html = render(problem_path)
32+
assert "<script>" not in html
33+

0 commit comments

Comments
 (0)