-
Notifications
You must be signed in to change notification settings - Fork 5
Using TypeAdapter for validation #43
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,7 @@ | |
| from typing import Any, Optional | ||
|
|
||
| import pandas as pd | ||
| from pydantic import BaseModel, ValidationError | ||
| from pydantic import TypeAdapter, ValidationError | ||
|
|
||
| from pandantic.basemodel import CoreValidator | ||
|
|
||
|
|
@@ -25,6 +25,15 @@ | |
|
|
||
| @pd.api.extensions.register_dataframe_accessor("pandantic") | ||
| class PydanticAccessor: | ||
| def _validate_schema_type(self, schema: Any) -> None: | ||
| """Raise TypeError if schema is not a valid pydantic model, type, or Union.""" | ||
| try: | ||
| TypeAdapter(schema) | ||
| except Exception as e: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's try to handle the specific exception type you would expect for
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Follow up: #43 (comment) the try/except logic can be removed in favor for a simple type check for |
||
| raise TypeError( | ||
| "Arg `schema` must be a valid pydantic model, type, or Union!" | ||
| ) from e | ||
|
|
||
| def __init__(self, pandas_obj: pd.DataFrame): | ||
| assert isinstance(pandas_obj, pd.DataFrame), "Only works with DataFrames!" | ||
| if not any(isinstance(col, str) for col in pandas_obj.columns): | ||
|
|
@@ -37,13 +46,11 @@ def obj(self) -> pd.DataFrame: | |
|
|
||
| def validate( | ||
| self, | ||
| schema: BaseModel, | ||
| schema: Any, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason you went with
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for your response and sorry for taking the discussion out of the PR comments. I thought of it being more a fundamental question, but I will answer now inside the comments. I hope its not too confusing. Do I understand correctly that what you are proposing basically means that I should revert the typing changes to their original state while keeping the internal changes? This way the default use case of For my use case, this would mean that if I want to leverage these more complex schemas, I would need to either use Thank you already 😃
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Btw on vacation right now will respond next week!
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @xaviernogueira I hope you had a great vacation. I kinda forgot about the PR as well. Do you have any news or novel ideas though concerning the remaining points?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @benbuc its all good, did slip thru. For a data science oriented library, I stand by my view that :
That said, I do see your concern, and I don't want people to have to use However I think the better strategy is: SchemaType = typing.TypeVar("SchemaType", bound=BaseModel)
# make the validation func return the TypeAdaptor (perhaps rename to "get_adapter"
def _validate_schema_type(self, schema: Any) -> TypeAdapter[SchemaType]:
try:
return TypeAdapter(schema)
except Exception:
...
# use the func to get the adaptor in other cases
valid_schema: ValidSchema = self._validate_schema_type(schema: Any)
schema_validator = CoreValidator(schema)
# where CoreValidator __init__ takes TypeAdapter[SchemaType]Haven't done all the linting here, but I do believe this should work fine, you may need to change things slightly, but I do think a |
||
| n_jobs: Optional[int] = None, | ||
| **kwargs: Optional[dict[str, Any]], | ||
| ) -> bool: | ||
| if not isinstance(schema, type(BaseModel)): | ||
| raise TypeError("Arg `schema` must be a pydantic.BaseModel subclass!") | ||
|
|
||
| self._validate_schema_type(schema) | ||
| schema_validator = CoreValidator(schema) # type: ignore | ||
| try: | ||
| _ = schema_validator.validate( | ||
|
|
@@ -60,19 +67,14 @@ def validate( | |
|
|
||
| def filter( | ||
| self, | ||
| schema: BaseModel, | ||
| schema: Any, | ||
| n_jobs: Optional[int] = None, | ||
| verbose: bool = True, | ||
| **kwargs: Optional[dict[str, Any]], | ||
| ) -> pd.DataFrame: | ||
| if not isinstance(schema, type(BaseModel)): | ||
| raise TypeError("Arg `schema` must be a pydantic.BaseModel subclass!") | ||
|
|
||
| self._validate_schema_type(schema) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since you are using |
||
| schema_validator = CoreValidator(schema) # type: ignore | ||
| if verbose: | ||
| errors = "log" | ||
| else: | ||
| errors = "skip" | ||
| errors = "log" if verbose else "skip" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
| filtered_df: pd.DataFrame = schema_validator.validate( | ||
| dataframe=self.obj, | ||
| errors=errors, | ||
|
|
@@ -84,30 +86,36 @@ def filter( | |
|
|
||
| def itertuples( | ||
| self, | ||
| schema: BaseModel, | ||
| schema: Any, | ||
| verbose: bool = True, | ||
| ) -> Iterable[tuple[Any, ...]]: | ||
| """Same as normal .itertuples(), except invalid rows are skipped.""" | ||
| self._validate_schema_type(schema) | ||
| adapter = TypeAdapter(schema) | ||
| for row in self.obj.itertuples(name=None): | ||
| try: | ||
| _ = schema(**dict(zip(self.obj.columns, row[1:]))) # type: ignore | ||
| _ = adapter.validate_python( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we are not using the variable for anything, maybe we just drop the
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Personally would prefer to see it removed this PR.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense to me since he's touching these lines anyways. |
||
| dict(zip(self.obj.columns, row[1:])) | ||
| ) | ||
| except ValidationError as e: | ||
| if verbose: | ||
| logger.info(f"Invalid row {row} with error: {e}") | ||
| continue | ||
| yield row | ||
|
|
||
| def iterrows( # type: ignore[no-untyped-def] | ||
| self, schema: BaseModel, verbose: bool = True, **kwargs | ||
| self, schema: Any, verbose: bool = True, **kwargs | ||
| ) -> Iterable[tuple[Hashable, pd.Series]]: # type: ignore[type-arg] | ||
| """Same as normal .iterrows(), except invalid rows are skipped.""" | ||
| self._validate_schema_type(schema) | ||
| schema_validator = CoreValidator(schema) | ||
| for i, _ in schema_validator.iterate(dataframe=self.obj, context=kwargs, verbose=verbose): | ||
| yield i, self.obj.loc[i] # type: ignore[call-overload] | ||
|
|
||
| def iterschemas( # type: ignore[no-untyped-def] | ||
| self, schema: BaseModel, verbose: bool = True, **kwargs | ||
| self, schema: Any, verbose: bool = True, **kwargs | ||
| ) -> Iterable[tuple[Hashable, Any]]: | ||
| """Iterate over DataFrame rows as validated schema models.""" | ||
| self._validate_schema_type(schema) | ||
| schema_validator = CoreValidator(schema) | ||
| return schema_validator.iterate(dataframe=self.obj, context=kwargs, verbose=verbose) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,5 @@ | ||
| from typing import Union, TypeAlias | ||
|
|
||
| import pandas as pd | ||
| import pydantic | ||
|
|
||
|
|
||
| SchemaTypes: TypeAlias = Union[type[pydantic.BaseModel]] | ||
| TableTypes: TypeAlias = Union[pd.DataFrame] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| """Test complex Union types.""" | ||
|
|
||
| from typing import Annotated, Literal | ||
| import pandas as pd | ||
| import pytest | ||
| from pydantic import BaseModel, Field | ||
|
|
||
| from pandantic import Pandantic | ||
|
|
||
|
|
||
| class Cat(BaseModel): | ||
| pet_type: Literal['cat'] | ||
| extra_lives_left: int | ||
|
|
||
| class Dog(BaseModel): | ||
| pet_type: Literal['dog'] | ||
| extra_lives_left: Literal[0] | ||
|
|
||
| Pet = Annotated[Cat | Dog, Field(discriminator="pet_type")] | ||
|
|
||
| def test_valid_df_passes(): | ||
| """Test that a valid DataFrame with discriminated unions passes validation.""" | ||
|
|
||
| # GIVEN | ||
| validator = Pandantic(schema=Pet) | ||
|
|
||
| example_df_valid = pd.DataFrame( | ||
| data={ | ||
| "pet_type": ["cat", "dog"], | ||
| "extra_lives_left": [9, 0] | ||
| } | ||
| ) | ||
|
|
||
| validator.validate( | ||
| dataframe=example_df_valid | ||
| ) | ||
|
|
||
| def test_invalid_df_raises(): | ||
| """Test that an invalid DataFrame with discriminated unions raises a ValueError.""" | ||
|
|
||
| # GIVEN | ||
| validator = Pandantic(schema=Pet) | ||
|
|
||
| example_df_invalid = pd.DataFrame( | ||
| data={ | ||
| "pet_type": ["cat", "dog"], | ||
| "extra_lives_left": [9, 1] | ||
| } | ||
| ) | ||
|
|
||
| # THEN | ||
| with pytest.raises(ValueError): | ||
| # WHEN | ||
| validator.validate( | ||
| dataframe=example_df_invalid | ||
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Follow up #43 (comment) this change makes sense here for the library internal, pushing input specificity down to the actual implementation classes.