Skip to content

Commit c7826af

Browse files
Parse t-strings into TNodes/TAttributes representation using TemplateParser (#82)
* Parse t-strings into an intermediary TNodes/TAttributes representation using TemplateParser.
* Move the LRU cache from the processor into the parser, with a few tests.
1 parent 737a7f2 commit c7826af

File tree

7 files changed

+1362
-510
lines changed

7 files changed

+1362
-510
lines changed

tdom/escaping.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import re
2+
from string.templatelib import Interpolation
3+
4+
from markupsafe import Markup
5+
6+
from .utils import format_interpolation as base_format_interpolation
7+
8+
9+
def _format_safe(value: object, format_spec: str) -> str:
    """Wrap *value* in ``Markup`` so it is treated as safe HTML.

    Only the ``"safe"`` format spec is expected here; any other spec trips
    the assertion.
    """
    assert format_spec == "safe"
    trusted = Markup(value)
    return trusted
13+
14+
15+
def _format_unsafe(value: object, format_spec: str) -> str:
    """Return ``str(value)``, forcing the result to be treated as unsafe.

    Only the ``"unsafe"`` format spec is expected here; any other spec trips
    the assertion.
    """
    assert format_spec == "unsafe"
    plain_text = str(value)
    return plain_text
19+
20+
21+
# (format spec, formatter) pairs passed as ``formatters`` to the base
# interpolation formatter by ``format_interpolation``.
CUSTOM_FORMATTERS = (("safe", _format_safe), ("unsafe", _format_unsafe))
22+
23+
24+
def format_interpolation(interpolation: Interpolation) -> object:
    """Format *interpolation* with the base formatter plus this module's
    custom ``safe`` / ``unsafe`` formatters."""
    formatted = base_format_interpolation(interpolation, formatters=CUSTOM_FORMATTERS)
    return formatted
29+
30+
31+
def escape_html_comment(text: str) -> str:
    """Escape text injected into an HTML comment.

    The HTML syntax forbids comment text that starts with ``>`` or ``->``,
    contains ``<!--``, ``-->`` or ``--!>``, or ends with ``<!-``.  Each such
    sequence is defused by swapping its angle bracket for ``&lt;`` / ``&gt;``.
    Entities are not decoded inside comments, so the replacement remains
    visible as literal text; the goal is only to stop injected content from
    closing (or re-opening) the comment.

    Args:
        text: Raw comment text. Empty/falsy input is returned unchanged.

    Returns:
        The text with every forbidden sequence neutralised.
    """
    gt = "&gt;"
    lt = "&lt;"

    if not text:
        return text

    # - text must not start with the string ">"
    if text.startswith(">"):
        text = gt + text[1:]
    # - nor start with the string "->"
    elif text.startswith("->"):
        text = "-" + gt + text[2:]

    # - nor contain the strings "<!--", "-->", or "--!>"
    # str.replace covers every occurrence, including one at index 0.
    text = text.replace("<!--", lt + "!--")
    text = text.replace("--!>", "--!" + gt)
    text = text.replace("-->", "--" + gt)

    # - nor end with the string "<!-".
    if text.endswith("<!-"):
        text = text[:-3] + lt + "!-"

    return text
59+
60+
61+
# ASCII case-insensitive "</style" that would actually terminate the element:
# the tokenizer treats the end tag as complete when the tag name is followed
# by whitespace, "/" or ">", not only by ">".
_STYLE_END_TAG_RE = re.compile(r"</style(?=[\t\n\f\r />])", re.I | re.A)


def escape_html_style(text):
    """Escape text injected into a STYLE element.

    Every sequence that would close the element (``</style`` followed by
    whitespace, ``/`` or ``>``; matched ASCII case-insensitively) has its
    ``<`` replaced with ``&lt;`` so injected content cannot end the
    surrounding tag.  The tag name in the replacement is always lowercase.

    Args:
        text: Raw stylesheet text.

    Returns:
        The text with closing-tag sequences neutralised.
    """
    return _STYLE_END_TAG_RE.sub("&lt;/style", text)
67+
68+
69+
# A "<" that begins "<!--", "<script" or "</script" (ASCII case-insensitive).
_SCRIPT_UNSAFE_LT_RE = re.compile(r"<(?=!--|/?script)", re.I | re.A)


def escape_html_script(text):
    r"""Escape text injected into a SCRIPT element.

    https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements

    (from link) The easiest and safest way to avoid the rather strange restrictions
    described in this section is to always escape an ASCII case-insensitive
    match for:
    - "<!--" as "\x3c!--"
    - "<script" as "\x3cscript"
    - "</script" as "\x3c/script"

    Only the leading ``<`` is rewritten, to the four literal characters
    ``\x3c``; the rest of the match keeps its original letter case.

    This does not make a script *run*; it just tries to prevent accidentally injecting
    *another* SCRIPT tag into a SCRIPT tag being rendered.

    Args:
        text: Raw script text.

    Returns:
        The text with the risky ``<`` characters replaced by ``\x3c``.
    """
    # A callable replacement keeps re.sub from interpreting the backslash
    # in the replacement as a template escape.
    return _SCRIPT_UNSAFE_LT_RE.sub(lambda _match: r"\x3c", text)

tdom/nodes.py

Lines changed: 150 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
from dataclasses import dataclass, field
2+
from string.templatelib import Template
23

34
from markupsafe import escape
45

6+
from .escaping import (
7+
escape_html_comment,
8+
escape_html_script,
9+
escape_html_style,
10+
)
11+
12+
513
# See https://developer.mozilla.org/en-US/docs/Glossary/Void_element
614
VOID_ELEMENTS = frozenset(
715
[
@@ -31,8 +39,125 @@
3139
# FUTURE: make nodes frozen (and have the parser work with mutable builders)
3240

3341

42+
def to_template_repr(template):
    """
    Flatten a template into a tuple that supports ``==``.

    Static strings are kept as-is and each interpolation becomes a
    ``(value, expression, conversion, format_spec)`` tuple placed between
    the strings it separates.  This is mostly for testing because
    Templates/Interpolations are not comparable.
    """
    strings = template.strings
    last_position = len(strings) - 1
    flattened = []
    for position, chunk in enumerate(strings):
        flattened.append(chunk)
        if position >= last_position:
            # No interpolation follows the final string.
            continue
        interp = template.interpolations[position]
        flattened.append(
            (interp.value, interp.expression, interp.conversion, interp.format_spec)
        )
    return tuple(flattened)
55+
56+
57+
@dataclass
class TNodeBase:
    """Common base for node classes; serializing it directly is refused."""

    def __str__(self) -> str:
        # Subclasses that can be rendered must override this.
        raise NotImplementedError("Cannot serialize dynamic nodes.")

    def __html__(self) -> str:
        # Same refusal for the ``__html__`` protocol.
        raise NotImplementedError("Cannot serialize dynamic nodes.")
64+
65+
66+
# Any of the attribute dataclasses accepted in the ``attrs`` tuple of
# TElement / TComponent.
type TAttribute = (
    StaticAttribute | SpreadAttribute | TemplatedAttribute | InterpolatedAttribute
)
69+
70+
71+
@dataclass
72+
class StaticAttribute:
73+
name: str
74+
value: str | None = None
75+
76+
77+
@dataclass
class SpreadAttribute:
    """Attribute entry that carries only the index of an interpolation."""

    # NOTE(review): presumably an index into the source template's
    # interpolations -- confirm against the parser.
    interpolation_index: int
80+
81+
82+
@dataclass
class TemplatedAttribute:
    """Named attribute whose value is held as a ``Template``."""

    name: str
    value_t: Template

    def to_comparable(self):
        """Return ``(name, template repr)``, a tuple usable with ``==``."""
        value_repr = to_template_repr(self.value_t)
        return (self.name, value_repr)

    def __eq__(self, other: object):
        # Templates are not comparable themselves, so compare flattened forms.
        if not isinstance(other, TemplatedAttribute):
            return False
        return self.to_comparable() == other.to_comparable()
95+
96+
97+
@dataclass
class InterpolatedAttribute:
    """Named attribute paired with the index of an interpolation."""

    name: str
    # NOTE(review): presumably an index into the source template's
    # interpolations -- confirm against the parser.
    interpolation_index: int
101+
102+
103+
# Any of the T-node dataclasses accepted in a ``children`` tuple.
type TNode = TElement | TComponent | TFragment | TText | TComment | TDocumentType
104+
105+
106+
@dataclass
class TDocumentType(TNodeBase):
    """T-node holding a single ``text`` string."""

    text: str
109+
110+
111+
@dataclass
class TElement(TNodeBase):
    """T-node for an element: a tag name plus attribute and child tuples."""

    tag: str
    # Both tuples default to empty.
    attrs: tuple[TAttribute, ...] = field(default_factory=tuple)
    children: tuple[TNode, ...] = field(default_factory=tuple)
116+
117+
118+
@dataclass
class TFragment(TNodeBase):
    """T-node that only groups child nodes."""

    # Defaults to an empty tuple.
    children: tuple[TNode, ...] = field(default_factory=tuple)
121+
122+
123+
@dataclass
class TComponent(TNodeBase):
    """T-node for a component, located by interpolation and string indexes."""

    starttag_interpolation_index: int
    endtag_interpolation_index: int
    # string index where the starttag > or startendtag /> occurs.
    starttag_string_index: int
    # string index where the endtag > or startendtag /> occurs.
    endtag_string_index: int
    # Both tuples default to empty.
    attrs: tuple[TAttribute, ...] = field(default_factory=tuple)
    children: tuple[TNode, ...] = field(default_factory=tuple)
135+
136+
137+
@dataclass
class TText(TNodeBase):
    """T-node whose text content is held as a ``Template``."""

    text_t: Template

    def __eq__(self, other: object) -> bool:
        # This is primarily of use for testing purposes. Two TText nodes are
        # equal when the flattened forms of their templates (strings plus
        # interpolation tuples) match.
        if not isinstance(other, TText):
            return False
        return to_template_repr(self.text_t) == to_template_repr(other.text_t)
147+
148+
149+
@dataclass
class TComment(TNodeBase):
    """T-node for a comment whose text is held as a ``Template``."""

    text_t: Template

    def __eq__(self, other: object) -> bool:
        # Templates are not comparable, so compare their flattened forms.
        if not isinstance(other, TComment):
            return False
        return to_template_repr(self.text_t) == to_template_repr(other.text_t)
157+
158+
34159
@dataclass(slots=True)
35-
class Node:
160+
class Node(TNodeBase):
36161
def __html__(self) -> str:
37162
"""Return the HTML representation of the node."""
38163
# By default, just return the string representation
@@ -66,7 +191,7 @@ class Comment(Node):
66191
text: str
67192

68193
def __str__(self) -> str:
69-
return f"<!--{self.text}-->"
194+
return f"<!--{escape_html_comment(self.text)}-->"
70195

71196

72197
@dataclass(slots=True)
@@ -100,6 +225,28 @@ def is_void(self) -> bool:
100225
def is_content(self) -> bool:
101226
return self.tag in CONTENT_ELEMENTS
102227

228+
def _children_to_str(self):
    """Serialize this element's children to a string.

    For ``script`` and ``style`` elements, the ``text`` of all children is
    joined and then escaped in one pass with the matching escaper.  For
    every other tag, each child is serialized with ``str()`` and the
    results are concatenated.

    Returns:
        The serialized children, or ``""`` when there are none.

    Raises:
        ValueError: If a ``script``/``style`` element has a child that is
            not a ``Text`` node.
    """
    if not self.children:
        return ""

    if self.tag == "script":
        escape_raw = escape_html_script
    elif self.tag == "style":
        escape_raw = escape_html_style
    else:
        return "".join(str(child) for child in self.children)

    chunks = []
    for child in self.children:
        if not isinstance(child, Text):
            # Name the actual tag; the message used to say "script" for both.
            raise ValueError(
                f"Cannot serialize non-text content inside a {self.tag} tag."
            )
        chunks.append(child.text)
    return escape_raw("".join(chunks))
249+
103250
def __str__(self) -> str:
104251
# We use markupsafe's escape to handle HTML escaping of attribute values
105252
# which means it's possible to mark them as safe if needed.
@@ -111,5 +258,5 @@ def __str__(self) -> str:
111258
return f"<{self.tag}{attrs_str} />"
112259
if not self.children:
113260
return f"<{self.tag}{attrs_str}></{self.tag}>"
114-
children_str = "".join(str(child) for child in self.children)
261+
children_str = self._children_to_str()
115262
return f"<{self.tag}{attrs_str}>{children_str}</{self.tag}>"

0 commit comments

Comments
 (0)