
Commit fe37e64

Merge pull request #43 from vertti/feat/early-termination-v0.19.0
Early termination optimization (v0.19.0) - 71-124x speedup for error cases
2 parents: 84ae5fd + b817cf5

File tree

4 files changed: +107 −6 lines

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
@@ -2,6 +2,23 @@
 
 All notable changes to this project will be documented in this file.
 
+## 0.19.0
+
+### Performance Improvements
+
+- **Early termination for row validation** - Dramatically faster when validation errors exist
+  - Stops scanning after collecting `max_errors` (default behavior)
+  - **71-124x speedup** when errors are present (1.2ms vs 140ms for 100k rows with errors)
+  - **No overhead** for valid data (maintains 767K rows/sec throughput)
+  - Can be disabled with `early_termination=False` parameter for exact error counts
+  - Error messages now indicate when scanning stopped early: "stopped scanning early (at least N more row(s) with errors)"
+
+### New Features
+
+- Added `early_termination` parameter to `validate_dataframe_rows()`
+  - Default: `True` (stops after `max_errors` for performance)
+  - Set to `False` to scan entire DataFrame and get exact error count
+
 ## 0.18.0
 
 ### Major Performance Improvements
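
A minimal usage sketch of the flag described in the changelog entry above. The `User` model, the sample data, and the `daffy.row_validation` import path are illustrative assumptions; `validate_dataframe_rows`, `max_errors`, and `early_termination` are taken from the diffs in this commit.

# Hypothetical example: validate rows of a pandas DataFrame against a Pydantic model.
import pandas as pd
from pydantic import BaseModel, Field

from daffy.row_validation import validate_dataframe_rows  # import path assumed from this commit's file layout


class User(BaseModel):  # illustrative validator, not part of the library
    name: str
    age: int = Field(ge=0)


df = pd.DataFrame({"name": ["a", "b", "c"], "age": [1, -1, -2]})

# Default behaviour: stop scanning once max_errors failing rows have been collected.
try:
    validate_dataframe_rows(df, User, max_errors=1)
except AssertionError as e:
    print(e)  # may end with "... stopped scanning early (at least N more row(s) with errors)"

# Exact error count: scan the entire DataFrame.
try:
    validate_dataframe_rows(df, User, max_errors=1, early_termination=False)
except AssertionError as e:
    print(e)  # ends with "... and N more row(s) with errors"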

daffy/row_validation.py

Lines changed: 17 additions & 3 deletions
@@ -59,6 +59,7 @@ def validate_dataframe_rows(
     row_validator: type[BaseModel],
     max_errors: int = 5,
     convert_nans: bool = True,
+    early_termination: bool = True,
 ) -> None:
     """
     Validate DataFrame rows against a Pydantic model.
@@ -71,6 +72,7 @@ def validate_dataframe_rows(
         row_validator: Pydantic BaseModel class for validation
         max_errors: Maximum number of errors to collect before stopping
         convert_nans: Whether to convert NaN to None for Pydantic
+        early_termination: Stop scanning after max_errors reached (faster for large datasets)
 
     Raises:
         AssertionError: If any rows fail validation (consistent with Daffy)
@@ -89,14 +91,15 @@
     df_prepared = _prepare_dataframe_for_validation(df, convert_nans)
 
     # Use optimized row-by-row validation
-    _validate_optimized(df_prepared, row_validator, max_errors, convert_nans)
+    _validate_optimized(df_prepared, row_validator, max_errors, convert_nans, early_termination)
 
 
 def _validate_optimized(
     df: DataFrameType,
     row_validator: type[BaseModel],
     max_errors: int,
     convert_nans: bool,
+    early_termination: bool,
 ) -> None:
     """Optimized row-by-row validation with fast DataFrame conversion."""
     failed_rows: list[tuple[Any, PydanticValidationError]] = []
@@ -120,6 +123,9 @@ def _validate_optimized(
                 total_errors += 1
                 if len(failed_rows) < max_errors:
                     failed_rows.append((idx_label, e))
+                elif early_termination:
+                    # Stop scanning after collecting max_errors
+                    break
 
     elif is_polars_dataframe(df):
         # Polars: use iter_rows which is already optimized
@@ -134,17 +140,21 @@
                 total_errors += 1
                 if len(failed_rows) < max_errors:
                     failed_rows.append((idx, e))
+                elif early_termination:
+                    # Stop scanning after collecting max_errors
+                    break
     else:
         raise TypeError(f"Unknown DataFrame type: {type(df)}")
 
     if failed_rows:
-        _raise_validation_error(df, failed_rows, total_errors)
+        _raise_validation_error(df, failed_rows, total_errors, early_termination)
 
 
 def _raise_validation_error(
     df: DataFrameType,
     errors: list[tuple[Any, Any]],
     total_errors: int,
+    stopped_early: bool,
 ) -> None:
     """Format and raise AssertionError with row validation details."""
     error_lines = [
@@ -162,7 +172,11 @@
 
     error_lines.append("")
 
-    if total_errors > len(errors):
+    if stopped_early and total_errors > len(errors):
+        error_lines.append(
+            f" ... stopped scanning early (at least {total_errors - len(errors)} more row(s) with errors)"
+        )
+    elif total_errors > len(errors):
         error_lines.append(f" ... and {total_errors - len(errors)} more row(s) with errors")
 
     raise AssertionError("\n".join(error_lines))
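
The early-termination control flow added above, distilled into a self-contained sketch. The names (`scan_rows`, `validate`, `_check_age`) are hypothetical stand-ins; only the `failed` / `max_errors` / `early_termination` logic mirrors `_validate_optimized` from the diff.

from typing import Any, Callable, Iterable


def scan_rows(
    rows: Iterable[dict[str, Any]],
    validate: Callable[[dict[str, Any]], None],  # raises on an invalid row
    max_errors: int = 5,
    early_termination: bool = True,
) -> tuple[list[tuple[int, Exception]], int]:
    """Collect up to max_errors failing rows; optionally stop scanning once the limit is hit."""
    failed: list[tuple[int, Exception]] = []
    total_errors = 0
    for idx, row in enumerate(rows):
        try:
            validate(row)
        except Exception as e:
            total_errors += 1
            if len(failed) < max_errors:
                failed.append((idx, e))
            elif early_termination:
                # Enough example errors collected; skip the remaining rows.
                break
    return failed, total_errors


def _check_age(row: dict[str, Any]) -> None:
    if row["age"] < 0:
        raise ValueError("age must be >= 0")


failed, seen = scan_rows(({"age": -1} for _ in range(100)), _check_age, max_errors=5)
print(len(failed), seen)  # 5 6 -> five examples kept, scanning stopped at the sixth error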

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "daffy"
-version = "0.18.0"
+version = "0.19.0"
 description = "Function decorators for Pandas and Polars Dataframe column name and data type validation"
 authors = [
     { name="Janne Sinivirta", email="[email protected]" },

tests/test_row_validation.py

Lines changed: 72 additions & 2 deletions
@@ -234,7 +234,7 @@ def test_max_errors_limit() -> None:
     )
 
     with pytest.raises(AssertionError) as exc_info:
-        validate_dataframe_rows(df, SimpleValidator, max_errors=3)
+        validate_dataframe_rows(df, SimpleValidator, max_errors=3, early_termination=False)
 
     message = str(exc_info.value)
 
@@ -243,7 +243,7 @@ def test_max_errors_limit() -> None:
     assert "Row 1:" in message
     assert "Row 2:" in message
 
-    # Should indicate more errors exist
+    # Should indicate more errors exist (exact count since early_termination=False)
     assert "7 more row(s) with errors" in message
 
 
@@ -310,3 +310,73 @@ def __len__(self) -> int:
 
     with pytest.raises(TypeError, match="Expected DataFrame"):
         validate_dataframe_rows(fake_df, SimpleValidator)  # type: ignore[arg-type]
+
+
+def test_early_termination_enabled() -> None:
+    # Create large DataFrame with many invalid rows
+    df = pd.DataFrame(
+        {
+            "name": [str(i) for i in range(100)],
+            "age": [-1] * 100,  # All invalid
+            "price": [10.0] * 100,
+        }
+    )
+
+    with pytest.raises(AssertionError) as exc_info:
+        validate_dataframe_rows(df, SimpleValidator, max_errors=5, early_termination=True)
+
+    message = str(exc_info.value)
+
+    # Should show first 5 errors
+    assert "Row 0:" in message
+    assert "Row 4:" in message
+
+    # Should indicate stopped early
+    assert "stopped scanning early" in message
+    assert "at least" in message
+
+
+def test_early_termination_disabled() -> None:
+    # Create DataFrame with many invalid rows
+    df = pd.DataFrame(
+        {
+            "name": [str(i) for i in range(100)],
+            "age": [-1] * 100,  # All invalid
+            "price": [10.0] * 100,
+        }
+    )
+
+    with pytest.raises(AssertionError) as exc_info:
+        validate_dataframe_rows(df, SimpleValidator, max_errors=5, early_termination=False)
+
+    message = str(exc_info.value)
+
+    # Should show first 5 errors
+    assert "Row 0:" in message
+    assert "Row 4:" in message
+
+    # Should indicate exact count (scanned all rows)
+    assert "95 more row(s) with errors" in message
+    assert "stopped scanning early" not in message
+
+
+def test_early_termination_with_polars() -> None:
+    df = pl.DataFrame(
+        {
+            "name": [str(i) for i in range(100)],
+            "age": [-1] * 100,  # All invalid
+            "price": [10.0] * 100,
+        }
+    )
+
+    with pytest.raises(AssertionError) as exc_info:
+        validate_dataframe_rows(df, SimpleValidator, max_errors=5, early_termination=True)
+
+    message = str(exc_info.value)
+
+    # Should show first 5 errors
+    assert "Row 0:" in message
+    assert "Row 4:" in message
+
+    # Should indicate stopped early
+    assert "stopped scanning early" in message
