
Commit fe37e64

Merge pull request #43 from vertti/feat/early-termination-v0.19.0
Early termination optimization (v0.19.0) - 71-124x speedup for error cases
2 parents: 84ae5fd + b817cf5

File tree

4 files changed: +107 −6 lines

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
@@ -2,6 +2,23 @@
 
 All notable changes to this project will be documented in this file.
 
+## 0.19.0
+
+### Performance Improvements
+
+- **Early termination for row validation** - Dramatically faster when validation errors exist
+  - Stops scanning after collecting `max_errors` (default behavior)
+  - **71-124x speedup** when errors are present (1.2ms vs 140ms for 100k rows with errors)
+  - **No overhead** for valid data (maintains 767K rows/sec throughput)
+  - Can be disabled with `early_termination=False` parameter for exact error counts
+  - Error messages now indicate when scanning stopped early: "stopped scanning early (at least N more row(s) with errors)"
+
+### New Features
+
+- Added `early_termination` parameter to `validate_dataframe_rows()`
+  - Default: `True` (stops after `max_errors` for performance)
+  - Set to `False` to scan entire DataFrame and get exact error count
+
 ## 0.18.0
 
 ### Major Performance Improvements
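
A minimal usage sketch of the flag described in the changelog entry above. The `User` model, the sample data, and the `daffy.row_validation` import path are illustrative assumptions; `validate_dataframe_rows`, `max_errors`, and `early_termination` are taken from the diffs in this commit.

# Hypothetical example: validate rows of a pandas DataFrame against a Pydantic model.
import pandas as pd
from pydantic import BaseModel, Field

from daffy.row_validation import validate_dataframe_rows  # import path assumed from this commit's file layout


class User(BaseModel):  # illustrative validator, not part of the library
    name: str
    age: int = Field(ge=0)


df = pd.DataFrame({"name": ["a", "b", "c"], "age": [1, -1, -2]})

# Default behaviour: stop scanning once max_errors failing rows have been collected.
try:
    validate_dataframe_rows(df, User, max_errors=1)
except AssertionError as e:
    print(e)  # may end with "... stopped scanning early (at least N more row(s) with errors)"

# Exact error count: scan the entire DataFrame.
try:
    validate_dataframe_rows(df, User, max_errors=1, early_termination=False)
except AssertionError as e:
    print(e)  # ends with "... and N more row(s) with errors"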

daffy/row_validation.py

Lines changed: 17 additions & 3 deletions
@@ -59,6 +59,7 @@ def validate_dataframe_rows(
     row_validator: type[BaseModel],
     max_errors: int = 5,
     convert_nans: bool = True,
+    early_termination: bool = True,
 ) -> None:
     """
     Validate DataFrame rows against a Pydantic model.
@@ -71,6 +72,7 @@ def validate_dataframe_rows(
         row_validator: Pydantic BaseModel class for validation
         max_errors: Maximum number of errors to collect before stopping
         convert_nans: Whether to convert NaN to None for Pydantic
+        early_termination: Stop scanning after max_errors reached (faster for large datasets)
 
     Raises:
         AssertionError: If any rows fail validation (consistent with Daffy)
@@ -89,14 +91,15 @@
     df_prepared = _prepare_dataframe_for_validation(df, convert_nans)
 
     # Use optimized row-by-row validation
-    _validate_optimized(df_prepared, row_validator, max_errors, convert_nans)
+    _validate_optimized(df_prepared, row_validator, max_errors, convert_nans, early_termination)
 
 
 def _validate_optimized(
     df: DataFrameType,
     row_validator: type[BaseModel],
     max_errors: int,
     convert_nans: bool,
+    early_termination: bool,
 ) -> None:
     """Optimized row-by-row validation with fast DataFrame conversion."""
     failed_rows: list[tuple[Any, PydanticValidationError]] = []
@@ -120,6 +123,9 @@ def _validate_optimized(
                 total_errors += 1
                 if len(failed_rows) < max_errors:
                     failed_rows.append((idx_label, e))
+                elif early_termination:
+                    # Stop scanning after collecting max_errors
+                    break
 
     elif is_polars_dataframe(df):
         # Polars: use iter_rows which is already optimized
@@ -134,17 +140,21 @@
                 total_errors += 1
                 if len(failed_rows) < max_errors:
                     failed_rows.append((idx, e))
+                elif early_termination:
+                    # Stop scanning after collecting max_errors
+                    break
     else:
         raise TypeError(f"Unknown DataFrame type: {type(df)}")
 
     if failed_rows:
-        _raise_validation_error(df, failed_rows, total_errors)
+        _raise_validation_error(df, failed_rows, total_errors, early_termination)
 
 
 def _raise_validation_error(
     df: DataFrameType,
     errors: list[tuple[Any, Any]],
     total_errors: int,
+    stopped_early: bool,
 ) -> None:
     """Format and raise AssertionError with row validation details."""
     error_lines = [
@@ -162,7 +172,11 @@
 
     error_lines.append("")
 
-    if total_errors > len(errors):
+    if stopped_early and total_errors > len(errors):
+        error_lines.append(
+            f" ... stopped scanning early (at least {total_errors - len(errors)} more row(s) with errors)"
+        )
+    elif total_errors > len(errors):
         error_lines.append(f" ... and {total_errors - len(errors)} more row(s) with errors")
 
     raise AssertionError("\n".join(error_lines))
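
The early-termination control flow added above, distilled into a self-contained sketch. The names (`scan_rows`, `validate`, `_check_age`) are hypothetical stand-ins; only the `failed` / `max_errors` / `early_termination` logic mirrors `_validate_optimized` from the diff.

from typing import Any, Callable, Iterable


def scan_rows(
    rows: Iterable[dict[str, Any]],
    validate: Callable[[dict[str, Any]], None],  # raises on an invalid row
    max_errors: int = 5,
    early_termination: bool = True,
) -> tuple[list[tuple[int, Exception]], int]:
    """Collect up to max_errors failing rows; optionally stop scanning once the limit is hit."""
    failed: list[tuple[int, Exception]] = []
    total_errors = 0
    for idx, row in enumerate(rows):
        try:
            validate(row)
        except Exception as e:
            total_errors += 1
            if len(failed) < max_errors:
                failed.append((idx, e))
            elif early_termination:
                # Enough example errors collected; skip the remaining rows.
                break
    return failed, total_errors


def _check_age(row: dict[str, Any]) -> None:
    if row["age"] < 0:
        raise ValueError("age must be >= 0")


failed, seen = scan_rows(({"age": -1} for _ in range(100)), _check_age, max_errors=5)
print(len(failed), seen)  # 5 6 -> five examples kept, scanning stopped at the sixth error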

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "daffy"
-version = "0.18.0"
+version = "0.19.0"
 description = "Function decorators for Pandas and Polars Dataframe column name and data type validation"
 authors = [
     { name="Janne Sinivirta", email="[email protected]" },

tests/test_row_validation.py

Lines changed: 72 additions & 2 deletions
@@ -234,7 +234,7 @@ def test_max_errors_limit() -> None:
     )
 
     with pytest.raises(AssertionError) as exc_info:
-        validate_dataframe_rows(df, SimpleValidator, max_errors=3)
+        validate_dataframe_rows(df, SimpleValidator, max_errors=3, early_termination=False)
 
     message = str(exc_info.value)
 
@@ -243,7 +243,7 @@ def test_max_errors_limit() -> None:
     assert "Row 1:" in message
     assert "Row 2:" in message
 
-    # Should indicate more errors exist
+    # Should indicate more errors exist (exact count since early_termination=False)
     assert "7 more row(s) with errors" in message
 
 
@@ -310,3 +310,73 @@ def __len__(self) -> int:
 
     with pytest.raises(TypeError, match="Expected DataFrame"):
         validate_dataframe_rows(fake_df, SimpleValidator)  # type: ignore[arg-type]
+
+
+def test_early_termination_enabled() -> None:
+    # Create large DataFrame with many invalid rows
+    df = pd.DataFrame(
+        {
+            "name": [str(i) for i in range(100)],
+            "age": [-1] * 100,  # All invalid
+            "price": [10.0] * 100,
+        }
+    )
+
+    with pytest.raises(AssertionError) as exc_info:
+        validate_dataframe_rows(df, SimpleValidator, max_errors=5, early_termination=True)
+
+    message = str(exc_info.value)
+
+    # Should show first 5 errors
+    assert "Row 0:" in message
+    assert "Row 4:" in message
+
+    # Should indicate stopped early
+    assert "stopped scanning early" in message
+    assert "at least" in message
+
+
+def test_early_termination_disabled() -> None:
+    # Create DataFrame with many invalid rows
+    df = pd.DataFrame(
+        {
+            "name": [str(i) for i in range(100)],
+            "age": [-1] * 100,  # All invalid
+            "price": [10.0] * 100,
+        }
+    )
+
+    with pytest.raises(AssertionError) as exc_info:
+        validate_dataframe_rows(df, SimpleValidator, max_errors=5, early_termination=False)
+
+    message = str(exc_info.value)
+
+    # Should show first 5 errors
+    assert "Row 0:" in message
+    assert "Row 4:" in message
+
+    # Should indicate exact count (scanned all rows)
+    assert "95 more row(s) with errors" in message
+    assert "stopped scanning early" not in message
+
+
+def test_early_termination_with_polars() -> None:
+    df = pl.DataFrame(
+        {
+            "name": [str(i) for i in range(100)],
+            "age": [-1] * 100,  # All invalid
+            "price": [10.0] * 100,
+        }
+    )
+
+    with pytest.raises(AssertionError) as exc_info:
+        validate_dataframe_rows(df, SimpleValidator, max_errors=5, early_termination=True)
+
+    message = str(exc_info.value)
+
+    # Should show first 5 errors
+    assert "Row 0:" in message
+    assert "Row 4:" in message
+
+    # Should indicate stopped early
+    assert "stopped scanning early" in message
