2727
2828
2929def _iterate_dataframe_with_index (df : DataFrameType ) -> Iterator [tuple [Any , dict [str , Any ]]]:
30- """
31- Efficiently iterate over DataFrame rows with proper index handling.
32-
33- Returns iterator of (index_label, row_dict) tuples.
34- Works for both pandas and polars, handles all index types.
35-
36- Args:
37- df: DataFrame to iterate over
38-
39- Yields:
40- Tuple of (index_label, row_dict) for each row
41- """
30+ """Iterate over DataFrame rows, yielding (index_label, row_dict) tuples."""
4231 if is_polars_dataframe (df ):
4332 for idx , row in enumerate (df .iter_rows (named = True )): # type: ignore[attr-defined]
4433 yield idx , row
@@ -50,25 +39,8 @@ def _iterate_dataframe_with_index(df: DataFrameType) -> Iterator[tuple[Any, dict
5039
5140
def _convert_nan_to_none(row_dict: dict[str, Any]) -> dict[str, Any]:
    """Convert NaN values to None for Pydantic compatibility.

    Pydantic expects None for missing values, but DataFrames use NaN;
    this helper bridges the gap for a single row.

    Args:
        row_dict: Dictionary representation of a DataFrame row.

    Returns:
        A new dictionary with the same keys in the same order, where every
        value that is a ``float`` NaN has been replaced by ``None``. All
        other values are passed through unchanged; ``row_dict`` itself is
        not modified.
    """
    # The isinstance guard must come first: math.isnan raises TypeError for
    # non-numeric values such as strings or None.
    return {
        key: None if isinstance(value, float) and math.isnan(value) else value
        for key, value in row_dict.items()
    }
7244
7345
7446def validate_dataframe_rows (
@@ -100,18 +72,13 @@ def validate_dataframe_rows(
10072 if not isinstance (df , get_dataframe_types ()):
10173 raise TypeError (f"Expected DataFrame, got { type (df )} " )
10274
103- # Empty DataFrames pass validation
10475 if len (df ) == 0 :
10576 return
10677
107- # Try batch validation first (fastest)
10878 try :
10979 _validate_batch (df , row_validator , max_errors , convert_nans )
11080 except (TypeError , AttributeError , KeyError ):
111- # Fallback to iterative validation if batch conversion fails
112- # This can happen with complex models or unusual DataFrame structures
11381 _validate_iterative (df , row_validator , max_errors , convert_nans )
114- # AssertionError from validation failures should propagate
11582
11683
11784def _validate_batch (
@@ -120,59 +87,37 @@ def _validate_batch(
12087 max_errors : int ,
12188 convert_nans : bool ,
12289) -> None :
123- """
124- Batch validation using TypeAdapter - much faster for large DataFrames.
125-
126- Args:
127- df: DataFrame to validate
128- row_validator: Pydantic model for validation
129- max_errors: Maximum errors to collect
130- convert_nans: Whether to convert NaN values
131- """
132- # Convert entire DataFrame to list of dicts
90+ """Batch validation using TypeAdapter."""
13391 if is_polars_dataframe (df ):
13492 records = list (df .iter_rows (named = True )) # type: ignore[attr-defined]
13593 elif is_pandas_dataframe (df ):
13694 records = df .to_dict ("records" ) # type: ignore[attr-defined]
13795 else :
13896 raise TypeError (f"Cannot convert { type (df )} to records" )
13997
140- # Convert NaNs if needed
14198 if convert_nans :
14299 records = [_convert_nan_to_none (r ) for r in records ]
143100
144- # Create TypeAdapter for batch validation
145101 adapter = TypeAdapter (list [row_validator ]) # type: ignore[misc]
146102
147103 try :
148- # Validate all at once - very fast!
149104 adapter .validate_python (records )
150105 except PydanticValidationError as e : # type: ignore[misc]
151- # Extract row-level errors from batch validation
152106 errors_by_row : list [tuple [Any , Any ]] = []
153-
154- # Count unique rows with errors (need to iterate through ALL errors)
155107 unique_row_indices : set [int ] = set ()
156108 all_errors = list (e .errors ()) # type: ignore[misc]
109+
157110 for error in all_errors :
158111 if error ["loc" ] and isinstance (error ["loc" ][0 ], int ):
159112 unique_row_indices .add (error ["loc" ][0 ])
160113
161- # Collect up to max_errors for display
162114 for error in all_errors :
163115 if len (errors_by_row ) >= max_errors :
164116 break
165117
166- # Error location is like [row_index, field_name, ...]
167118 if error ["loc" ] and isinstance (error ["loc" ][0 ], int ):
168119 row_idx = error ["loc" ][0 ]
169-
170- # Get index label for better error message
171- if is_pandas_dataframe (df ):
172- idx_label = df .index [row_idx ] # type: ignore[attr-defined]
173- else :
174- idx_label = row_idx
175-
120+ idx_label = df .index [row_idx ] if is_pandas_dataframe (df ) else row_idx # type: ignore[attr-defined]
176121 errors_by_row .append ((idx_label , error ))
177122
178123 if errors_by_row :
@@ -185,15 +130,7 @@ def _validate_iterative(
185130 max_errors : int ,
186131 convert_nans : bool ,
187132) -> None :
188- """
189- Fallback iterative validation - slower but works for all cases.
190-
191- Args:
192- df: DataFrame to validate
193- row_validator: Pydantic model for validation
194- max_errors: Maximum errors to collect
195- convert_nans: Whether to convert NaN values
196- """
133+ """Fallback iterative validation."""
197134 failed_rows : list [tuple [Any , PydanticValidationError ]] = []
198135
199136 for idx_label , row_dict in _iterate_dataframe_with_index (df ):
@@ -216,57 +153,28 @@ def _raise_validation_error(
216153 errors : list [tuple [Any , Any ]],
217154 total_errors : int ,
218155) -> None :
219- """
220- Format and raise AssertionError with detailed row validation information.
221-
222- Uses AssertionError for consistency with existing Daffy validation.
223-
224- Args:
225- df: DataFrame being validated
226- errors: List of (index_label, error) tuples
227- total_errors: Total number of validation errors
228-
229- Raises:
230- AssertionError: With detailed error message
231- """
232- total_rows = len (df )
233- shown_errors = len (errors )
234-
235- # Build detailed error message
156+ """Format and raise AssertionError with row validation details."""
236157 error_lines = [
237- f"Row validation failed for { total_errors } out of { total_rows } rows:" ,
158+ f"Row validation failed for { total_errors } out of { len ( df ) } rows:" ,
238159 "" ,
239160 ]
240161
241- # Show details for each failed row
242162 for idx_label , error in errors :
243163 error_lines .append (f" Row { idx_label } :" )
244164
245- # Handle Pydantic ValidationError structure
246165 if isinstance (error , dict ):
247- # From batch validation
248166 field_path = "." .join (str (x ) for x in error ["loc" ][1 :] if x != "__root__" )
249- if field_path :
250- error_lines .append (f" - { field_path } : { error ['msg' ]} " )
251- else :
252- error_lines .append (f" - { error ['msg' ]} " )
167+ error_lines .append (f" - { field_path } : { error ['msg' ]} " if field_path else f" - { error ['msg' ]} " )
253168 elif hasattr (error , "errors" ):
254- # From iterative validation
255169 for err_dict in error .errors ():
256170 field = "." .join (str (loc ) for loc in err_dict ["loc" ] if loc != "__root__" )
257- if field :
258- error_lines .append (f" - { field } : { err_dict ['msg' ]} " )
259- else :
260- error_lines .append (f" - { err_dict ['msg' ]} " )
171+ error_lines .append (f" - { field } : { err_dict ['msg' ]} " if field else f" - { err_dict ['msg' ]} " )
261172 else :
262173 error_lines .append (f" - { str (error )} " )
263174
264175 error_lines .append ("" )
265176
266- # Add truncation notice if needed
267- if total_errors > shown_errors :
268- remaining = total_errors - shown_errors
269- error_lines .append (f" ... and { remaining } more row(s) with errors" )
177+ if total_errors > len (errors ):
178+ error_lines .append (f" ... and { total_errors - len (errors )} more row(s) with errors" )
270179
271- message = "\n " .join (error_lines )
272- raise AssertionError (message )
180+ raise AssertionError ("\n " .join (error_lines ))
0 commit comments