Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,13 @@ A pure Python HTML5 parser that just works. No C extensions to compile. No syste
| **Chromium**<br>browser engine | ✅ **99%** | 🚀&nbsp;Very&nbsp;Fast | — | — | — |
| **WebKit**<br>browser engine | ✅ **98%** | 🚀 Very Fast | — | — | — |
| **Firefox**<br>browser engine | ✅ **97%** | 🚀 Very Fast | — | — | — |
| **`markupever`**<br>Python wrapper of Rust-based html5ever | ✅ **95%** | 🚀 Very Fast | ✅ CSS selectors | ❌ Needs sanitization | Fast and correct. |
| **`html5lib`**<br>Pure Python | 🟡 88% | 🐢 Slow | 🟡 XPath (lxml) | 🔴 [Deprecated](https://github.com/html5lib/html5lib-python/issues/443) | Unmaintained. Reference implementation; correct but quite slow. |
| **`html5_parser`**<br>Python wrapper of C-based Gumbo | 🟡 84% | 🚀 Very Fast | 🟡 XPath (lxml) | ❌ Needs sanitization | Fast and mostly correct. |
| **`selectolax`**<br>Python wrapper of C-based Lexbor | 🟡 68% | 🚀 Very Fast | ✅ CSS selectors | ❌ Needs sanitization | Very fast but less compliant. |
| **`BeautifulSoup`**<br>Pure Python | 🔴 5% (default) | 🐢 Slow | 🟡 Custom API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
| **`html.parser`**<br>Python stdlib | 🔴 4% | ⚡ Fast | ❌ None | ❌ Needs sanitization | Standard library. Chokes on malformed HTML. |
| **`BeautifulSoup`**<br>Pure Python | 🔴 4% (default) | 🐢 Slow | 🟡 Custom API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
| **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 1% | 🚀 Very Fast | 🟡 XPath | ❌ Needs sanitization | Fast but not HTML5 compliant. Don't use the old lxml.html.clean module! |
| **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 3% | 🚀 Very Fast | 🟡 XPath | ❌ Needs sanitization | Fast but not HTML5 compliant. Don't use the old lxml.html.clean module! |

[1]: Parser compliance scores are from a strict run of the [html5lib-tests](https://github.com/html5lib/html5lib-tests) tree-construction fixtures (1,743 non-script tests). See [docs/correctness.md](docs/correctness.md) for details.

Expand Down
86 changes: 85 additions & 1 deletion benchmarks/correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from justhtml.context import FragmentContext

# Available parsers
PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax"]
PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax", "markupever"]


def check_parser_available(parser_name):
Expand Down Expand Up @@ -58,6 +58,13 @@ def check_parser_available(parser_name):
try:
import html5_parser # noqa: F401

return True
except ImportError:
return False
if parser_name == "markupever":
try:
import markupever # noqa: F401

return True
except ImportError:
return False
Expand Down Expand Up @@ -409,6 +416,22 @@ def run_test_html5_parser(html, fragment_context, expected, xml_coercion=False,
return False, "", str(e)


def run_test_markupever(html, fragment_context, expected, xml_coercion=False, iframe_srcdoc=False):
    """Parse *html* with MarkupEver and compare the result to *expected*.

    Args:
        html: Markup to parse.
        fragment_context: When truthy, the input is parsed with
            ``HtmlOptions(full_document=False)``; only its truthiness is used here.
        expected: Expected tree dump, handed to ``compare_outputs``.
        xml_coercion: Accepted but not used by this runner.
        iframe_srcdoc: Accepted but not used by this runner.

    Returns:
        ``(passed, actual, None)`` when parsing and serialization succeed, or
        ``(False, "", message)`` when any exception is raised along the way.
    """
    import markupever

    try:
        if not fragment_context:
            # Full document: serialize starting from the document root itself.
            roots = [markupever.parse(html).root()]
        else:
            # Fragment: serialize the children of the root's first child.
            # NOTE(review): presumably that first child is a wrapper element
            # added by the parser - confirm against the markupever docs.
            fragment_options = markupever.HtmlOptions(full_document=False)
            fragment_dom = markupever.parse(html, fragment_options)
            roots = fragment_dom.root().first_child.children()
        rendered = _markupever_to_test_format(roots)
        return compare_outputs(expected, rendered), rendered, None
    except Exception as exc:
        return False, "", str(exc)


# =============================================================================
# Test format conversion helpers
# =============================================================================
Expand Down Expand Up @@ -794,6 +817,66 @@ def walk(node, indent):
return "\n".join(walk(root, 0))


def _markupever_to_test_format(nodes):
"""Convert MarkupEver DOM to test format."""
import markupever
import markupever.dom

def process(node, indent):
prefix = " " * indent
match node:
case markupever.dom.Document():
for child in node.children():
yield from process(child, indent)
case markupever.dom.Doctype():
if node.public_id or node.system_id:
yield f'| <!DOCTYPE {node.name} "{node.public_id}" "{node.system_id}">\n'
else:
yield f"| <!DOCTYPE {node.name}>\n"
case markupever.dom.Element():
if node.name.ns == NS_SVG:
tag_name = f"svg {node.name.local}"
elif node.name.ns == NS_MATHML:
tag_name = f"math {node.name.local}"
elif node.name.ns == NS_HTML:
tag_name = node.name.local
else:
tag_name = f"{node.name.ns} {node.name.local}"
yield f"| {prefix}<{tag_name}>\n"

attrs = []
for qual_name, value in zip(node.attrs.keys(), node.attrs.values(), strict=True):
if qual_name.ns == NS_XLINK:
attr_name = f"xlink {qual_name.local}"
elif qual_name.ns == NS_XML:
attr_name = f"xml {qual_name.local}"
elif qual_name.ns == NS_XMLNS:
attr_name = f"xmlns {qual_name.local}"
elif qual_name.ns == "":
attr_name = qual_name.local
else:
attr_name = f"{qual_name.ns} {qual_name.local}"
attrs.append((attr_name, value))
for attr_name, value in sorted(attrs):
yield f'| {prefix} {attr_name}="{value}"\n'

if node.name.ns == NS_HTML and node.name.local == "template":
yield f"| {prefix} content\n"
for child in node.children():
yield from process(child, indent + 4)
else:
for child in node.children():
yield from process(child, indent + 2)
case markupever.dom.Text():
yield f'| {prefix}"{node.content}"\n'
case markupever.dom.Comment():
yield f"| {prefix}<!-- {node.content} -->\n"
case _:
raise ValueError(f"Unknown node type {type(node)}")

return "".join(line for node in nodes for line in process(node, 0))


# Parser dispatch
PARSER_RUNNERS = {
"justhtml": run_test_justhtml,
Expand All @@ -803,6 +886,7 @@ def walk(node, indent):
"bs4": run_test_bs4,
"html.parser": run_test_html_parser,
"selectolax": run_test_selectolax,
"markupever": run_test_markupever,
}


Expand Down
46 changes: 45 additions & 1 deletion benchmarks/performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,47 @@ def benchmark_gumbo(html_source, iterations=1):
}


def benchmark_markupever(html_source, iterations=1):
    """Time ``markupever.parse`` over every document in *html_source*.

    Args:
        html_source: Iterable of 2-item entries; the second item is the markup
            to parse and the first is ignored.
        iterations: Number of timed parses per document.

    Returns:
        ``{"error": ...}`` when markupever cannot be imported; otherwise a dict
        with ``total_time``, ``mean_time``, ``min_time``, ``max_time``,
        ``errors``, ``success_count``, ``file_count`` and ``total_bytes``.
    """
    try:
        from markupever import parse
    except ImportError:
        return {"error": "markupever not installed (pip install markupever)"}

    durations = []
    failures = 0
    byte_total = 0
    documents = 0

    for _label, html in html_source:
        if documents == 0:
            # One untimed parse of the first document as a warm-up; a failure
            # here is deliberately ignored.
            try:
                parse(html)
            except Exception:
                pass
        byte_total += len(html)
        documents += 1

        for _ in range(iterations):
            try:
                began = time.perf_counter()
                dom = parse(html)
                durations.append(time.perf_counter() - began)
                # Touch the root after the timed section; if this raises, the
                # run is counted as an error even though its time was kept.
                dom.root()
            except Exception:
                failures += 1

    has_samples = bool(durations)
    return {
        "total_time": sum(durations),
        "mean_time": sum(durations) / len(durations) if has_samples else 0,
        "min_time": min(durations) if has_samples else 0,
        "max_time": max(durations) if has_samples else 0,
        "errors": failures,
        "success_count": len(durations),
        "file_count": documents,
        "total_bytes": byte_total,
    }


def _benchmark_worker(bench_fn, html_files, iterations, queue):
"""Worker function to run benchmark in a separate process."""
try:
Expand Down Expand Up @@ -630,6 +671,7 @@ def print_results(results, file_count, iterations=1):
"html.parser",
"selectolax",
"gumbo",
"markupever",
]

# Combined header
Expand Down Expand Up @@ -726,8 +768,9 @@ def main():
"html.parser",
"selectolax",
"gumbo",
"markupever",
],
default=["justhtml", "html5lib", "lxml", "bs4", "html.parser", "selectolax", "gumbo"],
default=["justhtml", "html5lib", "lxml", "bs4", "html.parser", "selectolax", "gumbo", "markupever"],
help="Parsers to benchmark (default: all)",
)
# MEMORY: options
Expand Down Expand Up @@ -785,6 +828,7 @@ def run_with_memory(bench_fn, html_source_factory, iterations):
"html.parser": benchmark_html_parser,
"selectolax": benchmark_selectolax,
"gumbo": benchmark_gumbo,
"markupever": benchmark_markupever,
}

file_count = 0
Expand Down
7 changes: 4 additions & 3 deletions docs/correctness.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ We run the same test suite against other Python parsers to compare compliance:
| Parser | Tests Passed | Compliance | Notes |
|--------|-------------|------------|-------|
| **JustHTML** | 1743/1743 | **100%** | Full spec compliance |
| markupever | 1652/1743 | 95% | Rust-based (html5ever), correct |
| html5lib | 1538/1743 | 88% | Reference implementation, but incomplete |
| html5_parser | 1462/1743 | 84% | C-based (Gumbo), mostly correct |
| selectolax | 1187/1743 | 68% | C-based (Lexbor), fast but less compliant |
| BeautifulSoup | 78/1743 | 4% | Uses html.parser, not HTML5 compliant |
| html.parser | 77/1743 | 4% | Python stdlib, basic error recovery only |
| lxml | 13/1743 | 1% | XML-based, not HTML5 compliant |
| BeautifulSoup | 79/1743 | 5% | Uses html.parser, not HTML5 compliant |
| html.parser | 78/1743 | 4% | Python stdlib, basic error recovery only |
| lxml | 44/1743 | 3% | XML-based, not HTML5 compliant |

*Run `python benchmarks/correctness.py` to reproduce these results.*

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ benchmark = [
"beautifulsoup4",
"selectolax",
"html5-parser",
"markupever",
]
dev = [
"ruff==0.14.7",
Expand Down