EmilStenstrom · EmilStenstrom · Mar 7, 2026 · Mar 6, 2026
diff --git a/README.md b/README.md
@@ -80,12 +80,13 @@ A pure Python HTML5 parser that just works. No C extensions to compile. No syste
 | **Chromium**<br>browser engine | ✅ **99%** | 🚀&nbsp;Very&nbsp;Fast | — | — | — |
 | **WebKit**<br>browser engine | ✅ **98%** | 🚀 Very Fast | — | — | — |
 | **Firefox**<br>browser engine | ✅ **97%** | 🚀 Very Fast | — | — | — |
+| **`markupever`**<br>Python wrapper of Rust-based html5ever | ✅ **95%** | 🚀 Very Fast | ✅ CSS selectors | ❌ Needs sanitization | Fast and correct. |
 | **`html5lib`**<br>Pure Python | 🟡 88% | 🐢 Slow | 🟡 XPath (lxml) | 🔴 [Deprecated](https://github.com/html5lib/html5lib-python/issues/443) | Unmaintained. Reference implementation;  Correct but quite slow. |
 | **`html5_parser`**<br>Python wrapper of C-based Gumbo | 🟡 84% | 🚀 Very Fast | 🟡 XPath (lxml) | ❌ Needs sanitization | Fast and mostly correct. |
 | **`selectolax`**<br>Python wrapper of C-based Lexbor | 🟡 68% | 🚀 Very Fast | ✅ CSS selectors | ❌ Needs sanitization | Very fast but less compliant. |
+| **`BeautifulSoup`**<br>Pure Python | 🔴 5% (default) | 🐢 Slow | 🟡 Custom API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
 | **`html.parser`**<br>Python stdlib | 🔴 4% | ⚡ Fast | ❌ None | ❌ Needs sanitization | Standard library. Chokes on malformed HTML. |
-| **`BeautifulSoup`**<br>Pure Python | 🔴 4% (default) | 🐢 Slow | 🟡 Custom API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
-| **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 1% | 🚀 Very Fast | 🟡 XPath | ❌ Needs sanitization | Fast but not HTML5 compliant. Don't use the old lxml.html.clean module! |
+| **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 3% | 🚀 Very Fast | 🟡 XPath | ❌ Needs sanitization | Fast but not HTML5 compliant. Don't use the old lxml.html.clean module! |
 
 [1]: Parser compliance scores are from a strict run of the [html5lib-tests](https://github.com/html5lib/html5lib-tests) tree-construction fixtures (1,743 non-script tests). See [docs/correctness.md](docs/correctness.md) for details.
 

diff --git a/benchmarks/correctness.py b/benchmarks/correctness.py
@@ -17,7 +17,7 @@
 from justhtml.context import FragmentContext
 
 # Available parsers
-PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax"]
+PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax", "markupever"]
 
 
 def check_parser_available(parser_name):
@@ -58,6 +58,13 @@ def check_parser_available(parser_name):
         try:
             import html5_parser  # noqa: F401
 
+            return True
+        except ImportError:
+            return False
+    if parser_name == "markupever":
+        try:
+            import markupever  # noqa: F401
+
             return True
         except ImportError:
             return False
@@ -409,6 +416,22 @@ def run_test_html5_parser(html, fragment_context, expected, xml_coercion=False,
         return False, "", str(e)
 
 
+def run_test_markupever(html, fragment_context, expected, xml_coercion=False, iframe_srcdoc=False):
+    """Parse ``html`` with MarkupEver; return (passed, actual, None) or (False, "", error text)."""
+    import markupever
+
+    try:
+        if not fragment_context:
+            roots = [markupever.parse(html).root()]
+        else:  # fragment test: parse with full_document=False, serialize the first child's children
+            parsed = markupever.parse(html, markupever.HtmlOptions(full_document=False))
+            roots = parsed.root().first_child.children()
+        rendered = _markupever_to_test_format(roots)
+        return compare_outputs(expected, rendered), rendered, None
+    except Exception as exc:  # any failure while parsing or serializing is reported as a failed test
+        return False, "", str(exc)
+
+
 # =============================================================================
 # Test format conversion helpers
 # =============================================================================
@@ -794,6 +817,66 @@ def walk(node, indent):
     return "\n".join(walk(root, 0))
 
 
+def _markupever_to_test_format(nodes):
+    """Render MarkupEver nodes as "| "-prefixed tree-dump text.
+
+    Returns a single string in which every emitted line ends with a newline.
+    """
+    from markupever import dom
+
+    def element_label(name):
+        # SVG/MathML get a short prefix, HTML none, anything else its raw namespace.
+        if name.ns == NS_SVG:
+            return f"svg {name.local}"
+        if name.ns == NS_MATHML:
+            return f"math {name.local}"
+        if name.ns == NS_HTML:
+            return name.local
+        return f"{name.ns} {name.local}"
+
+    def attribute_label(name):
+        if name.ns == NS_XLINK:
+            return f"xlink {name.local}"
+        if name.ns == NS_XML:
+            return f"xml {name.local}"
+        if name.ns == NS_XMLNS:
+            return f"xmlns {name.local}"
+        if name.ns == "":
+            return name.local
+        return f"{name.ns} {name.local}"
+
+    def emit(node, depth):
+        pad = " " * depth
+        if isinstance(node, dom.Document):
+            for kid in node.children():
+                yield from emit(kid, depth)
+        elif isinstance(node, dom.Doctype):
+            if node.public_id or node.system_id:
+                yield f'| <!DOCTYPE {node.name} "{node.public_id}" "{node.system_id}">\n'
+            else:
+                yield f"| <!DOCTYPE {node.name}>\n"
+        elif isinstance(node, dom.Element):
+            yield f"| {pad}<{element_label(node.name)}>\n"
+            pairs = zip(node.attrs.keys(), node.attrs.values(), strict=True)
+            for label, value in sorted((attribute_label(key), val) for key, val in pairs):
+                yield f'| {pad}  {label}="{value}"\n'
+            step = 2
+            if node.name.ns == NS_HTML and node.name.local == "template":
+                # Template children hang under an extra "content" line, one level deeper.
+                yield f"| {pad}  content\n"
+                step = 4
+            for kid in node.children():
+                yield from emit(kid, depth + step)
+        elif isinstance(node, dom.Text):
+            yield f'| {pad}"{node.content}"\n'
+        elif isinstance(node, dom.Comment):
+            yield f"| {pad}<!-- {node.content} -->\n"
+        else:
+            raise ValueError(f"Unknown node type {type(node)}")
+
+    return "".join(line for node in nodes for line in emit(node, 0))
+
+
 # Parser dispatch
 PARSER_RUNNERS = {
     "justhtml": run_test_justhtml,
@@ -803,6 +886,7 @@ def walk(node, indent):
     "bs4": run_test_bs4,
     "html.parser": run_test_html_parser,
     "selectolax": run_test_selectolax,
+    "markupever": run_test_markupever,
 }
 
 

diff --git a/benchmarks/performance.py b/benchmarks/performance.py
@@ -567,6 +567,47 @@ def benchmark_gumbo(html_source, iterations=1):
     }
 
 
+def benchmark_markupever(html_source, iterations=1):
+    """Time ``markupever.parse`` over ``(name, html)`` pairs, ``iterations`` runs each.
+
+    Returns aggregate timing stats, or ``{"error": ...}`` if markupever is not installed.
+    """
+    try:
+        from markupever import parse
+    except ImportError:
+        return {"error": "markupever not installed (pip install markupever)"}
+    times = []
+    errors = total_bytes = file_count = 0
+    for _, html in html_source:
+        if file_count == 0:  # first document only: one untimed warm-up parse
+            try:
+                parse(html)
+            except Exception:
+                pass  # best effort; the timed loop below counts real failures
+        total_bytes += len(html)
+        file_count += 1
+        for _ in range(iterations):
+            try:
+                start = time.perf_counter()
+                result = parse(html)
+                elapsed = time.perf_counter() - start
+                # Call root() before recording, so a failure counts once, as an error only.
+                result.root()
+                times.append(elapsed)
+            except Exception:
+                errors += 1
+    return {
+        "total_time": sum(times),
+        "mean_time": sum(times) / len(times) if times else 0,
+        "min_time": min(times) if times else 0,
+        "max_time": max(times) if times else 0,
+        "errors": errors,
+        "success_count": len(times),
+        "file_count": file_count,
+        "total_bytes": total_bytes,
+    }
+
+
 def _benchmark_worker(bench_fn, html_files, iterations, queue):
     """Worker function to run benchmark in a separate process."""
     try:
@@ -630,6 +671,7 @@ def print_results(results, file_count, iterations=1):
         "html.parser",
         "selectolax",
         "gumbo",
+        "markupever",
     ]
 
     # Combined header
@@ -726,8 +768,9 @@ def main():
             "html.parser",
             "selectolax",
             "gumbo",
+            "markupever",
         ],
-        default=["justhtml", "html5lib", "lxml", "bs4", "html.parser", "selectolax", "gumbo"],
+        default=["justhtml", "html5lib", "lxml", "bs4", "html.parser", "selectolax", "gumbo", "markupever"],
         help="Parsers to benchmark (default: all)",
     )
     # MEMORY: options
@@ -785,6 +828,7 @@ def run_with_memory(bench_fn, html_source_factory, iterations):
         "html.parser": benchmark_html_parser,
         "selectolax": benchmark_selectolax,
         "gumbo": benchmark_gumbo,
+        "markupever": benchmark_markupever,
     }
 
     file_count = 0

diff --git a/docs/correctness.md b/docs/correctness.md
@@ -58,12 +58,13 @@ We run the same test suite against other Python parsers to compare compliance:
 | Parser | Tests Passed | Compliance | Notes |
 |--------|-------------|------------|-------|
 | **JustHTML** | 1743/1743 | **100%** | Full spec compliance |
+| markupever | 1652/1743 | 95% | Rust-based (html5ever), correct |
 | html5lib | 1538/1743 | 88% | Reference implementation, but incomplete |
 | html5_parser | 1462/1743 | 84% | C-based (Gumbo), mostly correct |
 | selectolax | 1187/1743 | 68% | C-based (Lexbor), fast but less compliant |
-| BeautifulSoup | 78/1743 | 4% | Uses html.parser, not HTML5 compliant |
-| html.parser | 77/1743 | 4% | Python stdlib, basic error recovery only |
-| lxml | 13/1743 | 1% | XML-based, not HTML5 compliant |
+| BeautifulSoup | 79/1743 | 5% | Uses html.parser, not HTML5 compliant |
+| html.parser | 78/1743 | 4% | Python stdlib, basic error recovery only |
+| lxml | 44/1743 | 3% | XML-based, not HTML5 compliant |
 
 *Run `python benchmarks/correctness.py` to reproduce these results.*
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -21,6 +21,7 @@ benchmark = [
     "beautifulsoup4",
     "selectolax",
     "html5-parser",
+    "markupever",
 ]
 dev = [
     "ruff==0.14.7",