Commit 35f7fe7

Clean up row_validation.py
- Remove verbose/obvious comments
- Simplify docstrings to be concise
- Make _convert_nan_to_none a one-liner dict comprehension
- Remove unnecessary explanatory comments
- More pythonic and easier to read
1 parent b39e30c commit 35f7fe7

File tree: 1 file changed


daffy/row_validation.py

Lines changed: 14 additions & 106 deletions
@@ -27,18 +27,7 @@
 
 
 def _iterate_dataframe_with_index(df: DataFrameType) -> Iterator[tuple[Any, dict[str, Any]]]:
-    """
-    Efficiently iterate over DataFrame rows with proper index handling.
-
-    Returns iterator of (index_label, row_dict) tuples.
-    Works for both pandas and polars, handles all index types.
-
-    Args:
-        df: DataFrame to iterate over
-
-    Yields:
-        Tuple of (index_label, row_dict) for each row
-    """
+    """Iterate over DataFrame rows, yielding (index_label, row_dict) tuples."""
     if is_polars_dataframe(df):
         for idx, row in enumerate(df.iter_rows(named=True)):  # type: ignore[attr-defined]
             yield idx, row
@@ -50,25 +39,8 @@ def _iterate_dataframe_with_index(df: DataFrameType) -> Iterator[tuple[Any, dict
 
 
 def _convert_nan_to_none(row_dict: dict[str, Any]) -> dict[str, Any]:
-    """
-    Convert NaN values to None for Pydantic compatibility.
-
-    Pydantic expects None for missing values, but DataFrames use NaN.
-    This function bridges the gap.
-
-    Args:
-        row_dict: Dictionary representation of a DataFrame row
-
-    Returns:
-        Dictionary with NaN values converted to None
-    """
-    cleaned = {}
-    for key, value in row_dict.items():
-        if isinstance(value, float) and math.isnan(value):
-            cleaned[key] = None
-        else:
-            cleaned[key] = value
-    return cleaned
+    """Convert NaN values to None for Pydantic compatibility."""
+    return {k: (None if isinstance(v, float) and math.isnan(v) else v) for k, v in row_dict.items()}
 
 
 def validate_dataframe_rows(
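
The new one-liner in _convert_nan_to_none is behavior-for-behavior the same as the removed loop: only float NaN values map to None, everything else passes through unchanged. A minimal standalone sketch of the same logic:

import math

def convert_nan_to_none(row_dict):
    # Same comprehension as the one-liner above: only float NaN becomes None.
    return {k: (None if isinstance(v, float) and math.isnan(v) else v) for k, v in row_dict.items()}

print(convert_nan_to_none({"age": float("nan"), "name": "Ada", "score": 1.5}))
# {'age': None, 'name': 'Ada', 'score': 1.5}

Note that non-float NaN stand-ins (for example pandas.NaT, or None itself) fail the isinstance check and pass through untouched.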
@@ -100,18 +72,13 @@ def validate_dataframe_rows(
     if not isinstance(df, get_dataframe_types()):
         raise TypeError(f"Expected DataFrame, got {type(df)}")
 
-    # Empty DataFrames pass validation
     if len(df) == 0:
         return
 
-    # Try batch validation first (fastest)
     try:
         _validate_batch(df, row_validator, max_errors, convert_nans)
     except (TypeError, AttributeError, KeyError):
-        # Fallback to iterative validation if batch conversion fails
-        # This can happen with complex models or unusual DataFrame structures
        _validate_iterative(df, row_validator, max_errors, convert_nans)
-    # AssertionError from validation failures should propagate
 
 
 def _validate_batch(
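
The control flow here tries batch validation first and falls back to per-row validation only when the batch conversion itself fails (TypeError, AttributeError, KeyError); an AssertionError from an actual validation failure propagates from either path. A hypothetical usage sketch, assuming the function is importable from daffy.row_validation and takes the same four arguments it forwards to the helpers (the full signature is cut off in this hunk):

import pandas as pd
from pydantic import BaseModel

from daffy.row_validation import validate_dataframe_rows  # assumed import path

class User(BaseModel):
    name: str
    age: int

# pandas stores [36, None] as float64, so the missing age becomes NaN.
df = pd.DataFrame({"name": ["Ada", "Bob"], "age": [36, None]})

# With convert_nans=True the NaN reaches Pydantic as None, so this should
# raise AssertionError reporting the failing row.
validate_dataframe_rows(df, User, max_errors=10, convert_nans=True)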
@@ -120,59 +87,37 @@ def _validate_batch(
     max_errors: int,
     convert_nans: bool,
 ) -> None:
-    """
-    Batch validation using TypeAdapter - much faster for large DataFrames.
-
-    Args:
-        df: DataFrame to validate
-        row_validator: Pydantic model for validation
-        max_errors: Maximum errors to collect
-        convert_nans: Whether to convert NaN values
-    """
-    # Convert entire DataFrame to list of dicts
+    """Batch validation using TypeAdapter."""
     if is_polars_dataframe(df):
         records = list(df.iter_rows(named=True))  # type: ignore[attr-defined]
     elif is_pandas_dataframe(df):
         records = df.to_dict("records")  # type: ignore[attr-defined]
     else:
         raise TypeError(f"Cannot convert {type(df)} to records")
 
-    # Convert NaNs if needed
     if convert_nans:
         records = [_convert_nan_to_none(r) for r in records]
 
-    # Create TypeAdapter for batch validation
     adapter = TypeAdapter(list[row_validator])  # type: ignore[misc]
 
     try:
-        # Validate all at once - very fast!
         adapter.validate_python(records)
     except PydanticValidationError as e:  # type: ignore[misc]
-        # Extract row-level errors from batch validation
         errors_by_row: list[tuple[Any, Any]] = []
-
-        # Count unique rows with errors (need to iterate through ALL errors)
         unique_row_indices: set[int] = set()
         all_errors = list(e.errors())  # type: ignore[misc]
+
         for error in all_errors:
             if error["loc"] and isinstance(error["loc"][0], int):
                 unique_row_indices.add(error["loc"][0])
 
-        # Collect up to max_errors for display
        for error in all_errors:
             if len(errors_by_row) >= max_errors:
                 break
 
-            # Error location is like [row_index, field_name, ...]
             if error["loc"] and isinstance(error["loc"][0], int):
                 row_idx = error["loc"][0]
-
-                # Get index label for better error message
-                if is_pandas_dataframe(df):
-                    idx_label = df.index[row_idx]  # type: ignore[attr-defined]
-                else:
-                    idx_label = row_idx
-
+                idx_label = df.index[row_idx] if is_pandas_dataframe(df) else row_idx  # type: ignore[attr-defined]
                 errors_by_row.append((idx_label, error))
 
     if errors_by_row:
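
The row-index extraction in this hunk leans on a Pydantic v2 behavior: when a TypeAdapter validates list[Model], each error's loc tuple is prefixed with the index of the offending list element, followed by the field path. A standalone check:

from pydantic import BaseModel, TypeAdapter, ValidationError

class Row(BaseModel):
    x: int

try:
    TypeAdapter(list[Row]).validate_python([{"x": 1}, {"x": "bad"}])
except ValidationError as e:
    # loc starts with the list index of the failing row, then the field name.
    print(e.errors()[0]["loc"])  # (1, 'x')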
@@ -185,15 +130,7 @@ def _validate_iterative(
     max_errors: int,
     convert_nans: bool,
 ) -> None:
-    """
-    Fallback iterative validation - slower but works for all cases.
-
-    Args:
-        df: DataFrame to validate
-        row_validator: Pydantic model for validation
-        max_errors: Maximum errors to collect
-        convert_nans: Whether to convert NaN values
-    """
+    """Fallback iterative validation."""
     failed_rows: list[tuple[Any, PydanticValidationError]] = []
 
     for idx_label, row_dict in _iterate_dataframe_with_index(df):
@@ -216,57 +153,28 @@ def _raise_validation_error(
     errors: list[tuple[Any, Any]],
     total_errors: int,
 ) -> None:
-    """
-    Format and raise AssertionError with detailed row validation information.
-
-    Uses AssertionError for consistency with existing Daffy validation.
-
-    Args:
-        df: DataFrame being validated
-        errors: List of (index_label, error) tuples
-        total_errors: Total number of validation errors
-
-    Raises:
-        AssertionError: With detailed error message
-    """
-    total_rows = len(df)
-    shown_errors = len(errors)
-
-    # Build detailed error message
+    """Format and raise AssertionError with row validation details."""
     error_lines = [
-        f"Row validation failed for {total_errors} out of {total_rows} rows:",
+        f"Row validation failed for {total_errors} out of {len(df)} rows:",
         "",
     ]
 
-    # Show details for each failed row
     for idx_label, error in errors:
         error_lines.append(f"  Row {idx_label}:")
 
-        # Handle Pydantic ValidationError structure
         if isinstance(error, dict):
-            # From batch validation
             field_path = ".".join(str(x) for x in error["loc"][1:] if x != "__root__")
-            if field_path:
-                error_lines.append(f"    - {field_path}: {error['msg']}")
-            else:
-                error_lines.append(f"    - {error['msg']}")
+            error_lines.append(f"    - {field_path}: {error['msg']}" if field_path else f"    - {error['msg']}")
         elif hasattr(error, "errors"):
-            # From iterative validation
             for err_dict in error.errors():
                 field = ".".join(str(loc) for loc in err_dict["loc"] if loc != "__root__")
-                if field:
-                    error_lines.append(f"    - {field}: {err_dict['msg']}")
-                else:
-                    error_lines.append(f"    - {err_dict['msg']}")
+                error_lines.append(f"    - {field}: {err_dict['msg']}" if field else f"    - {err_dict['msg']}")
         else:
             error_lines.append(f"    - {str(error)}")
 
         error_lines.append("")
 
-    # Add truncation notice if needed
-    if total_errors > shown_errors:
-        remaining = total_errors - shown_errors
-        error_lines.append(f"  ... and {remaining} more row(s) with errors")
+    if total_errors > len(errors):
+        error_lines.append(f"  ... and {total_errors - len(errors)} more row(s) with errors")
 
-    message = "\n".join(error_lines)
-    raise AssertionError(message)
+    raise AssertionError("\n".join(error_lines))
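
With the format strings above unchanged, a failure report still looks roughly like this (row labels and messages are illustrative; the per-field messages come from Pydantic):

Row validation failed for 3 out of 100 rows:

  Row 2:
    - age: Input should be a valid integer

  Row 5:
    - name: Field required

  ... and 1 more row(s) with errors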
