benchmarks: Add MarkupEver, https://awolverp.github.io/markupever/

andersk · andersk · commit bbd8d3851a62 · 2026-03-06T16:13:19.000-08:00
MarkupEver is based on the Rust html5ever library; it seems reasonably
correct and very fast, so it is well worth adding to the comparison.
Benchmark results on my system:

    Parser          Total (s)  Mean (ms)  Peak (MB)  Delta (MB)
    ----------------------------------------------------------------------------------------------------
    justhtml        4.161      8.323           146.6      101.7
    html5lib        6.377      12.753          171.1      117.2  (1.53x slower)
    lxml            0.346      0.692            65.0       21.3  (12.03x faster)
    bs4             4.325      8.651           135.7       85.3  (1.04x slower)
    html.parser     1.565      3.131            52.6        8.2  (2.66x faster)
    selectolax      0.219      0.437            68.0       10.5  (19.04x faster)
    gumbo           1.194      2.387            70.6       25.4  (3.49x faster)
    markupever      0.435      0.870            64.9       21.0  (9.56x faster)

Signed-off-by: Anders Kaseorg <andersk@mit.edu>
diff --git a/README.md b/README.md
@@ -80,12 +80,13 @@ A pure Python HTML5 parser that just works. No C extensions to compile. No syste
 | **Chromium**<br>browser engine | ✅ **99%** | 🚀&nbsp;Very&nbsp;Fast | — | — | — |
 | **WebKit**<br>browser engine | ✅ **98%** | 🚀 Very Fast | — | — | — |
 | **Firefox**<br>browser engine | ✅ **97%** | 🚀 Very Fast | — | — | — |
+| **`markupever`**<br>Python wrapper of Rust-based html5ever | ✅ **95%** | 🚀 Very Fast | ✅ CSS selectors | ❌ Needs sanitization | Fast and correct. |
 | **`html5lib`**<br>Pure Python | 🟡 88% | 🐢 Slow | 🟡 XPath (lxml) | 🔴 [Deprecated](https://github.com/html5lib/html5lib-python/issues/443) | Unmaintained. Reference implementation;  Correct but quite slow. |
 | **`html5_parser`**<br>Python wrapper of C-based Gumbo | 🟡 84% | 🚀 Very Fast | 🟡 XPath (lxml) | ❌ Needs sanitization | Fast and mostly correct. |
 | **`selectolax`**<br>Python wrapper of C-based Lexbor | 🟡 68% | 🚀 Very Fast | ✅ CSS selectors | ❌ Needs sanitization | Very fast but less compliant. |
+| **`BeautifulSoup`**<br>Pure Python | 🔴 5% (default) | 🐢 Slow | 🟡 Custom API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
 | **`html.parser`**<br>Python stdlib | 🔴 4% | ⚡ Fast | ❌ None | ❌ Needs sanitization | Standard library. Chokes on malformed HTML. |
-| **`BeautifulSoup`**<br>Pure Python | 🔴 4% (default) | 🐢 Slow | 🟡 Custom API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
-| **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 1% | 🚀 Very Fast | 🟡 XPath | ❌ Needs sanitization | Fast but not HTML5 compliant. Don't use the old lxml.html.clean module! |
+| **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 3% | 🚀 Very Fast | 🟡 XPath | ❌ Needs sanitization | Fast but not HTML5 compliant. Don't use the old lxml.html.clean module! |
 
 [1]: Parser compliance scores are from a strict run of the [html5lib-tests](https://github.com/html5lib/html5lib-tests) tree-construction fixtures (1,743 non-script tests). See [docs/correctness.md](docs/correctness.md) for details.
 
diff --git a/benchmarks/correctness.py b/benchmarks/correctness.py
@@ -17,7 +17,7 @@
 from justhtml.context import FragmentContext
 
 # Available parsers
-PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax"]
+PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax", "markupever"]
 
 
 def check_parser_available(parser_name):
@@ -58,6 +58,13 @@ def check_parser_available(parser_name):
         try:
             import html5_parser  # noqa: F401
 
+            return True
+        except ImportError:
+            return False
+    if parser_name == "markupever":
+        try:
+            import markupever  # noqa: F401
+
             return True
         except ImportError:
             return False
@@ -409,6 +416,22 @@ def run_test_html5_parser(html, fragment_context, expected, xml_coercion=False,
         return False, "", str(e)
 
 
+def run_test_markupever(html, fragment_context, expected, xml_coercion=False, iframe_srcdoc=False):
+    """Run a single test with MarkupEver."""
+    import markupever
+
+    try:
+        if fragment_context:
+            nodes = markupever.parse(html, markupever.HtmlOptions(full_document=False)).root().first_child.children()
+        else:
+            nodes = [markupever.parse(html).root()]
+        actual = _markupever_to_test_format(nodes)
+        passed = compare_outputs(expected, actual)
+        return passed, actual, None
+    except Exception as e:
+        return False, "", str(e)
+
+
 # =============================================================================
 # Test format conversion helpers
 # =============================================================================
@@ -794,6 +817,66 @@ def walk(node, indent):
     return "\n".join(walk(root, 0))
 
 
+def _markupever_to_test_format(nodes):
+    """Convert MarkupEver DOM to test format."""
+    import markupever
+    import markupever.dom
+
+    def process(node, indent):
+        prefix = " " * indent
+        match node:
+            case markupever.dom.Document():
+                for child in node.children():
+                    yield from process(child, indent)
+            case markupever.dom.Doctype():
+                if node.public_id or node.system_id:
+                    yield f'| <!DOCTYPE {node.name} "{node.public_id}" "{node.system_id}">\n'
+                else:
+                    yield f"| <!DOCTYPE {node.name}>\n"
+            case markupever.dom.Element():
+                if node.name.ns == NS_SVG:
+                    tag_name = f"svg {node.name.local}"
+                elif node.name.ns == NS_MATHML:
+                    tag_name = f"math {node.name.local}"
+                elif node.name.ns == NS_HTML:
+                    tag_name = node.name.local
+                else:
+                    tag_name = f"{node.name.ns} {node.name.local}"
+                yield f"| {prefix}<{tag_name}>\n"
+
+                attrs = []
+                for qual_name, value in zip(node.attrs.keys(), node.attrs.values(), strict=True):
+                    if qual_name.ns == NS_XLINK:
+                        attr_name = f"xlink {qual_name.local}"
+                    elif qual_name.ns == NS_XML:
+                        attr_name = f"xml {qual_name.local}"
+                    elif qual_name.ns == NS_XMLNS:
+                        attr_name = f"xmlns {qual_name.local}"
+                    elif qual_name.ns == "":
+                        attr_name = qual_name.local
+                    else:
+                        attr_name = f"{qual_name.ns} {qual_name.local}"
+                    attrs.append((attr_name, value))
+                for attr_name, value in sorted(attrs):
+                    yield f'| {prefix}  {attr_name}="{value}"\n'
+
+                if node.name.ns == NS_HTML and node.name.local == "template":
+                    yield f"| {prefix}  content\n"
+                    for child in node.children():
+                        yield from process(child, indent + 4)
+                else:
+                    for child in node.children():
+                        yield from process(child, indent + 2)
+            case markupever.dom.Text():
+                yield f'| {prefix}"{node.content}"\n'
+            case markupever.dom.Comment():
+                yield f"| {prefix}<!-- {node.content} -->\n"
+            case _:
+                raise ValueError(f"Unknown node type {type(node)}")
+
+    return "".join(line for node in nodes for line in process(node, 0))
+
+
 # Parser dispatch
 PARSER_RUNNERS = {
     "justhtml": run_test_justhtml,
@@ -803,6 +886,7 @@ def walk(node, indent):
     "bs4": run_test_bs4,
     "html.parser": run_test_html_parser,
     "selectolax": run_test_selectolax,
+    "markupever": run_test_markupever,
 }
 
 
diff --git a/benchmarks/performance.py b/benchmarks/performance.py
@@ -567,6 +567,47 @@ def benchmark_gumbo(html_source, iterations=1):
     }
 
 
+def benchmark_markupever(html_source, iterations=1):
+    """Benchmark markupever parser."""
+    try:
+        from markupever import parse
+    except ImportError:
+        return {"error": "markupever not installed (pip install markupever)"}
+    times = []
+    errors = 0
+    total_bytes = 0
+    file_count = 0
+    warmup_done = False
+    for _, html in html_source:
+        if not warmup_done:
+            try:
+                parse(html)
+            except Exception:
+                pass
+            warmup_done = True
+        total_bytes += len(html)
+        file_count += 1
+        for _ in range(iterations):
+            try:
+                start = time.perf_counter()
+                result = parse(html)
+                elapsed = time.perf_counter() - start
+                times.append(elapsed)
+                _ = result.root()
+            except Exception:
+                errors += 1
+    return {
+        "total_time": sum(times),
+        "mean_time": sum(times) / len(times) if times else 0,
+        "min_time": min(times) if times else 0,
+        "max_time": max(times) if times else 0,
+        "errors": errors,
+        "success_count": len(times),
+        "file_count": file_count,
+        "total_bytes": total_bytes,
+    }
+
+
 def _benchmark_worker(bench_fn, html_files, iterations, queue):
     """Worker function to run benchmark in a separate process."""
     try:
@@ -630,6 +671,7 @@ def print_results(results, file_count, iterations=1):
         "html.parser",
         "selectolax",
         "gumbo",
+        "markupever",
     ]
 
     # Combined header
@@ -726,8 +768,9 @@ def main():
             "html.parser",
             "selectolax",
             "gumbo",
+            "markupever",
         ],
-        default=["justhtml", "html5lib", "lxml", "bs4", "html.parser", "selectolax", "gumbo"],
+        default=["justhtml", "html5lib", "lxml", "bs4", "html.parser", "selectolax", "gumbo", "markupever"],
         help="Parsers to benchmark (default: all)",
     )
     # MEMORY: options
@@ -785,6 +828,7 @@ def run_with_memory(bench_fn, html_source_factory, iterations):
         "html.parser": benchmark_html_parser,
         "selectolax": benchmark_selectolax,
         "gumbo": benchmark_gumbo,
+        "markupever": benchmark_markupever,
     }
 
     file_count = 0
diff --git a/docs/correctness.md b/docs/correctness.md
@@ -58,12 +58,13 @@ We run the same test suite against other Python parsers to compare compliance:
 | Parser | Tests Passed | Compliance | Notes |
 |--------|-------------|------------|-------|
 | **JustHTML** | 1743/1743 | **100%** | Full spec compliance |
+| markupever | 1652/1743 | 95% | Rust-based (html5ever), correct |
 | html5lib | 1538/1743 | 88% | Reference implementation, but incomplete |
 | html5_parser | 1462/1743 | 84% | C-based (Gumbo), mostly correct |
 | selectolax | 1187/1743 | 68% | C-based (Lexbor), fast but less compliant |
-| BeautifulSoup | 78/1743 | 4% | Uses html.parser, not HTML5 compliant |
-| html.parser | 77/1743 | 4% | Python stdlib, basic error recovery only |
-| lxml | 13/1743 | 1% | XML-based, not HTML5 compliant |
+| BeautifulSoup | 79/1743 | 5% | Uses html.parser, not HTML5 compliant |
+| html.parser | 78/1743 | 4% | Python stdlib, basic error recovery only |
+| lxml | 44/1743 | 3% | XML-based, not HTML5 compliant |
 
 *Run `python benchmarks/correctness.py` to reproduce these results.*
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,6 +21,7 @@ benchmark = [
     "beautifulsoup4",
     "selectolax",
     "html5-parser",
+    "markupever",
 ]
 dev = [
     "ruff==0.14.7",

Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,7 @@ benchmark = [`
`21`	`21`	`"beautifulsoup4",`
`22`	`22`	`"selectolax",`
`23`	`23`	`"html5-parser",`
	`24`	`+ "markupever",`
`24`	`25`	`]`
`25`	`26`	`dev = [`
`26`	`27`	`"ruff==0.14.7",`