Skip to content

Commit 9b04c18

Browse files
committed
also detect mismatched self-closing tags (where this is significant)
1 parent d8bdb61 commit 9b04c18

File tree

7 files changed

+267
-4
lines changed

7 files changed

+267
-4
lines changed

htmlcompare/compare.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import Optional
55

66
from htmlcompare.compare_css import compare_css, compare_stylesheet
7+
from htmlcompare.elements import is_self_closing_significant
78
from htmlcompare.nodes import Comment, ConditionalComment, Document, Element, Node, TextNode
89
from htmlcompare.normalize import normalize_tree
910
from htmlcompare.options import CompareOptions
@@ -130,6 +131,18 @@ def _compare_elements(
130131
))
131132
return # don't compare children if tags differ
132133

134+
is_self_closing_different = (expected.is_self_closing != actual.is_self_closing)
135+
if is_self_closing_different and is_self_closing_significant(expected.tag):
136+
expected_form = _html_tag(expected.tag, expected.is_self_closing)
137+
actual_form = _html_tag(actual.tag, actual.is_self_closing)
138+
differences.append(Difference(
139+
type=DifferenceType.SELF_CLOSING_MISMATCH,
140+
path=element_path,
141+
expected=expected_form,
142+
actual=actual_form,
143+
message=f"self-closing syntax differs: expected {expected_form}, got {actual_form}",
144+
))
145+
133146
_compare_attributes(expected.attributes, actual.attributes, element_path, differences)
134147
# compare children, passing tag name for context-aware comparison (e.g., CSS in <style> tags)
135148
_compare_node_lists(
@@ -141,6 +154,12 @@ def _compare_elements(
141154
)
142155

143156

157+
def _html_tag(tag: str, is_self_closing) -> str:
158+
if is_self_closing:
159+
return f"<{tag} ... />"
160+
return f"<{tag} ...>"
161+
162+
144163
def _compare_attributes(
145164
expected: dict[str, str],
146165
actual: dict[str, str],

htmlcompare/elements.py

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
# SPDX-License-Identifier: MIT
22

33

4-
__all__ = ['is_block_element', 'is_inline_element', 'is_preformatted_element']
4+
__all__ = [
5+
'is_block_element',
6+
'is_inline_element',
7+
'is_preformatted_element',
8+
'is_self_closing_significant',
9+
]
510

611
# Block-level elements where whitespace between them is typically insignificant.
712
# Based on HTML5 spec and browser rendering behavior.
@@ -67,3 +72,68 @@ def is_inline_element(tag: str) -> bool:
6772
def is_preformatted_element(tag: str) -> bool:
    """
    Return True if the element preserves whitespace.

    The lookup is case-insensitive: the tag name is lower-cased before it
    is checked against PREFORMATTED_ELEMENTS.
    """
    lowered = tag.lower()
    return lowered in PREFORMATTED_ELEMENTS
75+
76+
77+
# HTML5 void elements: they never have content, so <br>, <br/> and <br />
# all mean the same thing and the self-closing slash carries no information.
# https://html.spec.whatwg.org/multipage/syntax.html#void-elements
HTML5_VOID_ELEMENTS = frozenset({
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'source', 'track', 'wbr',
})

# VML (Vector Markup Language) elements used in Microsoft Office/Outlook.
# These are XML-based, so whether a tag is self-closing matters for rendering.
VML_ELEMENTS = frozenset({
    'v:background', 'v:fill', 'v:formulas', 'v:group', 'v:image',
    'v:imagedata', 'v:line', 'v:oval', 'v:path', 'v:polyline',
    'v:rect', 'v:roundrect', 'v:shadow', 'v:shape', 'v:shapetype',
    'v:stroke', 'v:textbox', 'v:textpath',
    # Office namespace elements
    'o:lock', 'o:wrapblock',
})

# Plain HTML elements for which the self-closing form is problematic or
# behaves differently from an explicit open/close pair:
# - script: <script /> may not work correctly in all browsers
# - style: similar to script
# - textarea: <textarea /> is invalid and may cause rendering issues
# - title: empty title vs self-closing can behave differently
# - iframe: self-closing iframe may not work in some browsers
SELF_CLOSING_SIGNIFICANT_HTML = frozenset({
    'script', 'style', 'textarea', 'title', 'iframe',
})


def is_self_closing_significant(tag: str) -> bool:
    """
    Tell whether <tag /> and <tag></tag> (or <tag>) must be told apart.

    The tag name is compared case-insensitively.

    False is returned for:
    - HTML5 void elements (br, img, etc.), checked first
    - any other element not listed below

    True is returned for:
    - the known VML elements (v:rect, v:fill, etc.)
    - any other element whose namespace prefix is "v" or "o"
    - script, style, textarea, title and iframe
    """
    name = tag.lower()

    # void elements win over every other rule
    if name in HTML5_VOID_ELEMENTS:
        return False

    # text before the first colon is the namespace prefix (if there is a colon)
    prefix, colon, _local_name = name.partition(':')
    has_vml_or_office_prefix = bool(colon) and prefix in ('v', 'o')

    return (
        name in VML_ELEMENTS
        or has_vml_or_office_prefix
        or name in SELF_CLOSING_SIGNIFICANT_HTML
    )

htmlcompare/nodes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class Element:
5252
tag: str
5353
attributes: dict[str, str] = field(default_factory=dict)
5454
children: Sequence['Node'] = field(default_factory=list)
55+
is_self_closing: bool = False
5556

5657
def __eq__(self, other):
5758
if not isinstance(other, Element):
@@ -60,6 +61,7 @@ def __eq__(self, other):
6061
self.tag == other.tag
6162
and self.attributes == other.attributes
6263
and self.children == other.children
64+
and self.is_self_closing == other.is_self_closing
6365
)
6466

6567

htmlcompare/normalize.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def _normalize_element(element: Element, options: CompareOptions) -> Element:
147147
tag=element.tag,
148148
attributes=element.attributes,
149149
children=normalized_children,
150+
is_self_closing=element.is_self_closing,
150151
)
151152

152153

htmlcompare/parser.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,50 @@
1616
_CONDITIONAL_START_RE = re.compile(r'^\[if\s+([^\]]+)\]>')
1717
_CONDITIONAL_END_RE = re.compile(r'<!\[endif\]$')
1818

19+
# Marker attribute to track self-closing tags through html5lib parsing
20+
_SELF_CLOSING_MARKER = 'data-htmlcompare-self-closing'
1921

20-
def parse_html(html_string: str) -> Document:
22+
# pattern to detect self-closing tags: <tag ... /> (but not <!-- or <!)
23+
# captures: (1) tag name, (2) attributes section
24+
_SELF_CLOSING_TAG_PATTERN = rb'<([a-zA-Z][a-zA-Z0-9:_-]*)(\s[^>]*)?\s*/>'
25+
26+
27+
def _mark_self_closing_tags(html_string: Union[str, bytes]) -> Union[str, bytes]:
28+
"""
29+
Pre-process HTML to mark self-closing tags with a marker attribute.
30+
31+
This is needed because html5lib normalizes self-closing syntax away.
32+
We inject a marker attribute that survives parsing.
33+
"""
34+
if not isinstance(html_string, (str, bytes)):
35+
raise TypeError("html_string must be str or bytes")
36+
elif isinstance(html_string, bytes):
37+
def _inject_bytes_marker_attribute(match) -> bytes:
38+
tag = match.group(1)
39+
attrs = match.group(2) or b''
40+
_marker = f' {_SELF_CLOSING_MARKER}="true" '.encode('utf-8')
41+
return b'<' + tag + attrs + _marker + b'/>'
42+
_regex = re.compile(_SELF_CLOSING_TAG_PATTERN, re.DOTALL)
43+
return _regex.sub(_inject_bytes_marker_attribute, html_string)
44+
else:
45+
def _inject_str_marker_attribute(match) -> str:
46+
tag = match.group(1)
47+
attrs = match.group(2) or ''
48+
return f'<{tag}{attrs} {_SELF_CLOSING_MARKER}="true" />'
49+
pattern = _SELF_CLOSING_TAG_PATTERN.decode('utf-8')
50+
_SELF_CLOSING_TAG_RE = re.compile(pattern, re.DOTALL)
51+
return _SELF_CLOSING_TAG_RE.sub(_inject_str_marker_attribute, html_string)
52+
53+
54+
def parse_html(html_string: Union[str, bytes]) -> Document:
    """
    Build a Document tree of Node objects from HTML source.

    Self-closing tags are first tagged with a marker attribute, then the
    markup is parsed with html5lib (HTML5-compliant parsing) and the
    resulting tree is converted into our internal node representation.
    """
    parsed_root = html5lib.parse(
        _mark_self_closing_tags(html_string),
        namespaceHTMLElements=False,
    )
    return Document(children=[_element_to_node(parsed_root)])
@@ -36,16 +71,28 @@ def _element_to_node(element) -> Element:
3671
if '}' in tag:
3772
tag = tag.split('}', 1)[1]
3873

74+
is_self_closing = False
3975
attributes = {}
4076
for key, value in element.attrib.items():
4177
if isinstance(key, tuple):
4278
_namespace, attr_name = key
4379
else:
4480
attr_name = key
81+
82+
if attr_name == _SELF_CLOSING_MARKER:
83+
is_self_closing = True
84+
# don't include marker in final attributes
85+
continue
86+
4587
attributes[attr_name] = value
4688

4789
children = _convert_children(element)
48-
return Element(tag=tag, attributes=attributes, children=children)
90+
return Element(
91+
tag=tag,
92+
attributes=attributes,
93+
children=children,
94+
is_self_closing=is_self_closing,
95+
)
4996

5097

5198
def _convert_children(element) -> Sequence[Union[Element, TextNode, Comment, ConditionalComment]]:

htmlcompare/result.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class DifferenceType(Enum):
2525
COMMENT_MISMATCH = auto()
2626
CONDITIONAL_COMMENT_CONDITION_MISMATCH = auto()
2727
CONDITIONAL_COMMENT_CONTENT_MISMATCH = auto()
28+
SELF_CLOSING_MISMATCH = auto()
2829

2930

3031
@dataclass

htmlcompare/tests/compare_test.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,3 +645,126 @@ def test_conditional_comment_ignores_whitespace_inside_nested_block_elements():
645645
single_line = '<div><!--[if mso | IE]><table><tr><td><v:image src="test.jpg" /><![endif]--></div>' # noqa: E501
646646
result = compare_html(multi_line, single_line)
647647
assert result.is_equal
648+
649+
650+
651+
class TestSelfClosingTagDetection:
    """Self-closing vs. explicitly closed tags, for elements where it matters."""

    def test_detects_vml_rect_self_closing_vs_opening_tag(self):
        """v:rect is a VML element where self-closing syntax matters for Outlook."""
        expected_html = '<div><v:rect style="width:100px" /></div>'
        actual_html = '<div><v:rect style="width:100px" ></v:rect></div>'
        outcome = compare_html(expected_html, actual_html)
        assert not outcome.is_equal
        # exactly one self-closing mismatch must be reported
        (mismatch,) = [
            found for found in outcome.differences
            if found.type == DifferenceType.SELF_CLOSING_MISMATCH
        ]
        assert 'v:rect' in mismatch.path

    def test_detects_vml_fill_self_closing_difference(self):
        expected_html = '<v:fill color="red" />'
        actual_html = '<v:fill color="red" ></v:fill>'
        assert not compare_html(expected_html, actual_html).is_equal

    def test_same_vml_self_closing_elements_are_equal(self):
        markup = '<v:rect xmlns:v="urn:schemas-microsoft-com:vml" style="width:100px" />'
        outcome = compare_html(markup, markup)
        assert outcome.is_equal

    def test_same_vml_non_self_closing_elements_are_equal(self):
        markup = '<v:rect style="width:100px"></v:rect>'
        outcome = compare_html(markup, markup)
        assert outcome.is_equal

    def test_detects_script_self_closing_difference(self):
        """<script /> behaves differently from <script></script> in browsers."""
        expected_html = '<script src="app.js" />'
        actual_html = '<script src="app.js"></script>'
        assert not compare_html(expected_html, actual_html).is_equal

    def test_detects_style_self_closing_difference(self):
        """<style /> behaves differently from <style></style> in browsers."""
        expected_html = '<style type="text/css" />'
        actual_html = '<style type="text/css"></style>'
        assert not compare_html(expected_html, actual_html).is_equal

    def test_ignores_html5_void_element_self_closing_br(self):
        spellings = (
            ('<br>', '<br/>'),
            ('<br>', '<br />'),
            ('<br/>', '<br />'),
        )
        for expected_html, actual_html in spellings:
            assert compare_html(expected_html, actual_html).is_equal

    def test_ignores_html5_void_element_self_closing_img(self):
        outcome = compare_html(
            '<img src="test.jpg" alt="">',
            '<img src="test.jpg" alt="" />',
        )
        assert outcome.is_equal

    def test_ignores_html5_void_element_self_closing_input(self):
        outcome = compare_html(
            '<input type="text" name="foo">',
            '<input type="text" name="foo" />',
        )
        assert outcome.is_equal

    def test_ignores_html5_void_element_self_closing_meta(self):
        outcome = compare_html(
            '<meta charset="utf-8">',
            '<meta charset="utf-8" />',
        )
        assert outcome.is_equal

    def test_ignores_regular_div_self_closing(self):
        """
        Regular HTML elements like div - self-closing doesn't matter semantically.

        Note: both become <div></div> after parsing, self-closing div is invalid HTML
        """
        outcome = compare_html('<div />', '<div></div>')
        assert outcome.is_equal

    def test_vml_in_conditional_comment(self):
        """VML self-closing detection works also inside conditional comments."""
        expected_html = '<div><!--[if mso]><v:rect style="width:100px" /><![endif]--></div>'
        actual_html = '<div><!--[if mso]><v:rect style="width:100px" ></v:rect><![endif]--></div>'
        assert not compare_html(expected_html, actual_html).is_equal

    def test_multiple_vml_elements_with_mixed_self_closing(self):
        """Each VML element's self-closing status is checked independently."""
        all_self_closing = '<v:rect /><v:fill /><v:stroke />'
        assert compare_html(all_self_closing, all_self_closing).is_equal

        # different self-closing on v:fill
        fill_explicitly_closed = '<v:rect /><v:fill ></v:fill><v:stroke />'
        outcome = compare_html(
            '<v:rect /><v:fill /><v:stroke />',
            fill_explicitly_closed,
        )
        assert not outcome.is_equal

    def test_detects_textarea_self_closing_difference(self):
        """<textarea /> is invalid and may cause rendering issues."""
        expected_html = '<textarea name="comment" />'
        actual_html = '<textarea name="comment"></textarea>'
        assert not compare_html(expected_html, actual_html).is_equal

    def test_detects_iframe_self_closing_difference(self):
        expected_html = '<iframe src="page.html" />'
        actual_html = '<iframe src="page.html"></iframe>'
        assert not compare_html(expected_html, actual_html).is_equal

    def test_unknown_vml_namespace_element(self):
        """Any v: prefixed element should have self-closing checked."""
        expected_html = '<v:customshape fill="red" />'
        actual_html = '<v:customshape fill="red"></v:customshape>'
        assert not compare_html(expected_html, actual_html).is_equal

    def test_office_namespace_element(self):
        """o: prefixed elements (Office namespace) should have self-closing checked."""
        expected_html = '<o:lock aspectratio="t" />'
        actual_html = '<o:lock aspectratio="t"></o:lock>'
        assert not compare_html(expected_html, actual_html).is_equal

0 commit comments

Comments
 (0)