Skip to content

Commit 7552f6b

Browse files
author
Vladimir Vilimaitis
committed
Correct pyjanitor benchmark comparison
1 parent a5a8a04 commit 7552f6b

2 files changed

Lines changed: 71 additions & 17 deletions

File tree

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -327,13 +327,13 @@ The compiled extension is CPython-version-specific. If `import polars_janitor` f
327327

328328
These are local medians from this Windows x64 machine using CPython 3.13.5, Polars 1.40.1, pyjanitor 0.32.23 with pandas 3.0.3, and R 4.6.0 with janitor 2.2.1. Setup is outside the timed loop. Treat them as directional, not as a universal performance claim.
329329

330-
The R comparison uses base R `data.frame`s because janitor is a data.frame/tibble package. The pyjanitor comparison uses pandas for the same reason.
330+
The R comparison uses base R `data.frame`s because janitor is a data.frame/tibble package. pyjanitor has Polars methods for `clean_names` and `row_to_names`, so those are shown separately. Its `compare_df_cols` helper is pandas-only in the tested version.
331331

332-
| Task | Size | polars-janitor | pyjanitor/pandas | R janitor |
333-
| --- | ---: | ---: | ---: | ---: |
334-
| clean_names | 10,000 columns | 45.38 ms | 34.89 ms | 4710.00 ms |
335-
| compare_df_cols | 5,000 columns | 14.51 ms | 302.32 ms | 70.00 ms |
336-
| row_to_names + clean_names | 2,000 columns | 8.43 ms | 46.45 ms | 940.00 ms |
332+
| Task | Size | polars-janitor | pyjanitor/Polars | pyjanitor/pandas | R janitor |
333+
| --- | ---: | ---: | ---: | ---: | ---: |
334+
| clean_names | 10,000 columns | 45.49 ms | 139.01 ms | 36.94 ms | 5690.00 ms |
335+
| compare_df_cols | 5,000 columns | 14.47 ms | n/a | 384.17 ms | 80.00 ms |
336+
| row_to_names + clean_names | 2,000 columns | 8.78 ms | 32.13 ms | 44.04 ms | 970.00 ms |
337337

338338
Run the same benchmark from a checkout:
339339

benchmarks/benchmark_competitors.py

Lines changed: 65 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import argparse
66
import csv
77
import gc
8+
import importlib
89
import os
910
import shutil
1011
import statistics
@@ -20,17 +21,26 @@
2021

2122
import polars_janitor as pj
2223

23-
if TYPE_CHECKING:
24-
from collections.abc import Callable
25-
from types import ModuleType
26-
2724
try:
28-
import janitor as pyjanitor
29-
import pandas as pd
25+
pyjanitor = importlib.import_module("janitor")
3026
except ImportError:
3127
pyjanitor = None
28+
29+
try:
30+
importlib.import_module("janitor.polars")
31+
pyjanitor_polars_available = True
32+
except ImportError:
33+
pyjanitor_polars_available = False
34+
35+
try:
36+
pd = importlib.import_module("pandas")
37+
except ImportError:
3238
pd = None
3339

40+
if TYPE_CHECKING:
41+
from collections.abc import Callable
42+
from types import ModuleType
43+
3444

3545
PATTERNS = [
3646
"Customer ID",
@@ -265,7 +275,46 @@ def bench_polars_janitor(
265275
]
266276

267277

268-
def bench_pyjanitor(
278+
def bench_pyjanitor_polars(
    *,
    repeats: int,
    clean_size: int,
    header_size: int,
) -> list[BenchmarkResult]:
    """Benchmark pyjanitor's Polars methods, or skip them when unavailable.

    Args:
        repeats: Forwarded to ``median_ms`` for every timed case.
        clean_size: Passed to ``make_polars_frame`` and recorded as the size
            of the ``clean_names`` case.
        header_size: Passed to ``make_polars_sheet`` and recorded as the size
            of the ``row_to_names + clean_names`` case.

    Returns:
        An empty list when ``pyjanitor_polars_available`` is false; otherwise
        two ``BenchmarkResult`` entries labelled ``pyjanitor/Polars``, in the
        order ``clean_names`` then ``row_to_names + clean_names``.
    """
    if not pyjanitor_polars_available:
        return []

    implementation = "pyjanitor/Polars"

    # Both inputs are created up front, outside the callables handed to
    # median_ms, so frame construction is not part of what gets called
    # repeatedly.
    wide_frame = make_polars_frame(clean_size)
    raw_sheet = make_polars_sheet(header_size)

    def clean_only():
        return wide_frame.clean_names(strip_accents=True, remove_special=True)

    def promote_then_clean():
        promoted = raw_sheet.row_to_names(
            row_numbers=1,
            remove_rows=True,
            remove_rows_above=True,
        )
        return promoted.clean_names(strip_accents=True, remove_special=True)

    # Cases are timed one after another, in the same order they are returned.
    clean_result = BenchmarkResult(
        "clean_names",
        clean_size,
        implementation,
        median_ms(clean_only, repeats=repeats),
    )
    header_result = BenchmarkResult(
        "row_to_names + clean_names",
        header_size,
        implementation,
        median_ms(promote_then_clean, repeats=repeats),
    )
    return [clean_result, header_result]
315+
316+
317+
def bench_pyjanitor_pandas(
269318
*,
270319
repeats: int,
271320
clean_size: int,
@@ -385,10 +434,10 @@ def print_markdown(results: list[BenchmarkResult]) -> None:
385434
"""Print a README-ready Markdown table."""
386435
by_key = {(result.task, result.size, result.implementation): result for result in results}
387436
task_sizes = sorted({(result.task, result.size) for result in results})
388-
implementations = ["polars-janitor", "pyjanitor/pandas", "R janitor"]
437+
implementations = ["polars-janitor", "pyjanitor/Polars", "pyjanitor/pandas", "R janitor"]
389438

390-
print("| Task | Size | polars-janitor | pyjanitor/pandas | R janitor |")
391-
print("| --- | ---: | ---: | ---: | ---: |")
439+
print("| Task | Size | polars-janitor | pyjanitor/Polars | pyjanitor/pandas | R janitor |")
440+
print("| --- | ---: | ---: | ---: | ---: | ---: |")
392441
for task, size in task_sizes:
393442
values = []
394443
for implementation in implementations:
@@ -420,7 +469,12 @@ def main() -> None:
420469
header_size=args.header_size,
421470
compare_size=args.compare_size,
422471
),
423-
*bench_pyjanitor(
472+
*bench_pyjanitor_polars(
473+
repeats=args.repeats,
474+
clean_size=args.clean_size,
475+
header_size=args.header_size,
476+
),
477+
*bench_pyjanitor_pandas(
424478
repeats=args.repeats,
425479
clean_size=args.clean_size,
426480
header_size=args.header_size,

0 commit comments

Comments
 (0)