Correct pyjanitor benchmark comparison

Vladimir Vilimaitis · Vladimir Vilimaitis · commit 7552f6bcf900 · 2026-05-18T00:32:53.000+02:00
diff --git a/README.md b/README.md
@@ -327,13 +327,13 @@ The compiled extension is CPython-version-specific. If `import polars_janitor` f
 
 These are local medians from this Windows x64 machine using CPython 3.13.5, Polars 1.40.1, pyjanitor 0.32.23 with pandas 3.0.3, and R 4.6.0 with janitor 2.2.1. Setup is outside the timed loop. Treat them as directional, not as a universal performance claim.
 
-The R comparison uses base R `data.frame`s because janitor is a data.frame/tibble package. The pyjanitor comparison uses pandas for the same reason.
+The R comparison uses base R `data.frame`s because janitor is a data.frame/tibble package. pyjanitor has Polars methods for `clean_names` and `row_to_names`, so those two tasks are also shown in a separate pyjanitor/Polars column next to pyjanitor/pandas. Its `compare_df_cols` helper is pandas-only in the tested version, so that pyjanitor/Polars cell is n/a.
 
-| Task | Size | polars-janitor | pyjanitor/pandas | R janitor |
-| --- | ---: | ---: | ---: | ---: |
-| clean_names | 10,000 columns | 45.38 ms | 34.89 ms | 4710.00 ms |
-| compare_df_cols | 5,000 columns | 14.51 ms | 302.32 ms | 70.00 ms |
-| row_to_names + clean_names | 2,000 columns | 8.43 ms | 46.45 ms | 940.00 ms |
+| Task | Size | polars-janitor | pyjanitor/Polars | pyjanitor/pandas | R janitor |
+| --- | ---: | ---: | ---: | ---: | ---: |
+| clean_names | 10,000 columns | 45.49 ms | 139.01 ms | 36.94 ms | 5690.00 ms |
+| compare_df_cols | 5,000 columns | 14.47 ms | n/a | 384.17 ms | 80.00 ms |
+| row_to_names + clean_names | 2,000 columns | 8.78 ms | 32.13 ms | 44.04 ms | 970.00 ms |
 
 Run the same benchmark from a checkout:
 
diff --git a/benchmarks/benchmark_competitors.py b/benchmarks/benchmark_competitors.py
@@ -5,6 +5,7 @@
 import argparse
 import csv
 import gc
+import importlib
 import os
 import shutil
 import statistics
@@ -20,17 +21,26 @@
 
 import polars_janitor as pj
 
-if TYPE_CHECKING:
-    from collections.abc import Callable
-    from types import ModuleType
-
 try:
-    import janitor as pyjanitor
-    import pandas as pd
+    pyjanitor = importlib.import_module("janitor")
 except ImportError:
     pyjanitor = None
+
+try:
+    importlib.import_module("janitor.polars")
+    pyjanitor_polars_available = True
+except ImportError:
+    pyjanitor_polars_available = False
+
+try:
+    pd = importlib.import_module("pandas")
+except ImportError:
     pd = None
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from types import ModuleType
+
 
 PATTERNS = [
     "Customer ID",
@@ -265,7 +275,46 @@ def bench_polars_janitor(
     ]
 
 
-def bench_pyjanitor(
+def bench_pyjanitor_polars(
+    *,
+    repeats: int,
+    clean_size: int,
+    header_size: int,
+) -> list[BenchmarkResult]:
+    """Run pyjanitor's Polars benchmark cases when that namespace is installed."""
+    if not pyjanitor_polars_available:
+        return []
+
+    clean_frame = make_polars_frame(clean_size)
+    sheet = make_polars_sheet(header_size)
+
+    return [
+        BenchmarkResult(
+            "clean_names",
+            clean_size,
+            "pyjanitor/Polars",
+            median_ms(
+                lambda: clean_frame.clean_names(strip_accents=True, remove_special=True),
+                repeats=repeats,
+            ),
+        ),
+        BenchmarkResult(
+            "row_to_names + clean_names",
+            header_size,
+            "pyjanitor/Polars",
+            median_ms(
+                lambda: sheet.row_to_names(
+                    row_numbers=1,
+                    remove_rows=True,
+                    remove_rows_above=True,
+                ).clean_names(strip_accents=True, remove_special=True),
+                repeats=repeats,
+            ),
+        ),
+    ]
+
+
+def bench_pyjanitor_pandas(
     *,
     repeats: int,
     clean_size: int,
@@ -385,10 +434,10 @@ def print_markdown(results: list[BenchmarkResult]) -> None:
     """Print a README-ready Markdown table."""
     by_key = {(result.task, result.size, result.implementation): result for result in results}
     task_sizes = sorted({(result.task, result.size) for result in results})
-    implementations = ["polars-janitor", "pyjanitor/pandas", "R janitor"]
+    implementations = ["polars-janitor", "pyjanitor/Polars", "pyjanitor/pandas", "R janitor"]
 
-    print("| Task | Size | polars-janitor | pyjanitor/pandas | R janitor |")
-    print("| --- | ---: | ---: | ---: | ---: |")
+    print("| Task | Size | polars-janitor | pyjanitor/Polars | pyjanitor/pandas | R janitor |")
+    print("| --- | ---: | ---: | ---: | ---: | ---: |")
     for task, size in task_sizes:
         values = []
         for implementation in implementations:
@@ -420,7 +469,12 @@ def main() -> None:
             header_size=args.header_size,
             compare_size=args.compare_size,
         ),
-        *bench_pyjanitor(
+        *bench_pyjanitor_polars(
+            repeats=args.repeats,
+            clean_size=args.clean_size,
+            header_size=args.header_size,
+        ),
+        *bench_pyjanitor_pandas(
             repeats=args.repeats,
             clean_size=args.clean_size,
             header_size=args.header_size,