Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/pandantic/basemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@

import pandas as pd

from pandantic.types import SchemaTypes, TableTypes
from pandantic.types import TableTypes
from pandantic.validators.base import BaseValidator
from pandantic.validators.pandas import PandasValidator


class CoreValidator:
"""An implementation of the Pydantic BaseValidator."""

def __init__(self, schema: SchemaTypes):
def __init__(self, schema: Any):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Follow-up to #43 (comment): this change makes sense here for the library internals, pushing input specificity down to the actual implementation classes.

self.schema = schema

def _get_implementation(self, dataframe: TableTypes) -> BaseValidator:
Expand Down
42 changes: 25 additions & 17 deletions src/pandantic/plugins/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from typing import Any, Optional

import pandas as pd
from pydantic import BaseModel, ValidationError
from pydantic import TypeAdapter, ValidationError

from pandantic.basemodel import CoreValidator

Expand All @@ -25,6 +25,15 @@

@pd.api.extensions.register_dataframe_accessor("pandantic")
class PydanticAccessor:
def _validate_schema_type(self, schema: Any) -> None:
"""Raise TypeError if schema is not a valid pydantic model, type, or Union."""
try:
TypeAdapter(schema)
except Exception as e:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's catch the specific exception type(s) you would expect TypeAdapter(schema) to raise when it fails in a normal way, rather than a blanket Exception.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Follow-up to #43 (comment): the try/except logic can be removed in favor of a simple type check for PandasValidator inputs.

raise TypeError(
"Arg `schema` must be a valid pydantic model, type, or Union!"
) from e

def __init__(self, pandas_obj: pd.DataFrame):
assert isinstance(pandas_obj, pd.DataFrame), "Only works with DataFrames!"
if not any(isinstance(col, str) for col in pandas_obj.columns):
Expand All @@ -37,13 +46,11 @@ def obj(self) -> pd.DataFrame:

def validate(
self,
schema: BaseModel,
schema: Any,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you went with Any as opposed to SchemaTypes here? My hunch is you did this because you use self._validate_schema_type(), but handling incorrect input types in the function body does not remove the need for the actual type hint.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for your response, and sorry for taking the discussion out of the PR comments. I thought of it as more of a fundamental question, but I will answer inside the comments now. I hope it's not too confusing.

Do I understand correctly that what you are proposing basically means that I should revert the typing changes to their original state while keeping the internal changes? This way the default use case of pandantic - validating against BaseModel types - stays fully intact and type checked. Under the hood though, through the use of the TypeAdapter, more complex validation will be possible.

For my use case, this would mean that if I want to leverage these more complex schemas, I would need to either use # type: ignore or cast, because there is no way to express all possible valid Pydantic schemas explicitly in static typing (hence the reliance on Any at runtime).

Thank you already 😃

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw, I'm on vacation right now; I will respond next week!

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @xaviernogueira, I hope you had a great vacation. I kinda forgot about the PR as well. Do you have any news or new ideas concerning the remaining points?

Copy link
Copy Markdown
Collaborator

@xaviernogueira xaviernogueira Dec 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@benbuc it's all good, this did slip through.

For a data-science-oriented library, I stand by my view that:

  • The function signature should be as informative as possible, so that someone using JupyterLab with less software-engineering experience (analyst, scientist, etc.) can leverage the LSP to understand things reasonably well without docs.
  • If a type is included in the type hint, the function should work with it. Any is far too broad for this heuristic to work.

That said, I do see your concern, and I don't want people to have to use # type: ignore as it could lead to missing other errors on the same line. At first I thought maybe BaseModel | Any is a good compromise (maintains some LSP type hinting without forcing ignores).

However I think the better strategy is:

SchemaType = typing.TypeVar("SchemaType", bound=BaseModel)

# make the validation func return the TypeAdapter (perhaps rename to "get_adapter")
def _validate_schema_type(self, schema: Any) -> TypeAdapter[SchemaType]:
    try:
        return TypeAdapter(schema)
    except Exception:
        ...

# use the func to get the adaptor in other cases
valid_schema: TypeAdapter[SchemaType] = self._validate_schema_type(schema)
schema_validator = CoreValidator(valid_schema)

# where CoreValidator __init__ takes TypeAdapter[SchemaType]

Haven't done all the linting here, but I do believe this should work fine, you may need to change things slightly, but I do think a TypeAdapter + TypeVar is the exact thing you are looking for.

n_jobs: Optional[int] = None,
**kwargs: Optional[dict[str, Any]],
) -> bool:
if not isinstance(schema, type(BaseModel)):
raise TypeError("Arg `schema` must be a pydantic.BaseModel subclass!")

self._validate_schema_type(schema)
schema_validator = CoreValidator(schema) # type: ignore
try:
_ = schema_validator.validate(
Expand All @@ -60,19 +67,14 @@ def validate(

def filter(
self,
schema: BaseModel,
schema: Any,
n_jobs: Optional[int] = None,
verbose: bool = True,
**kwargs: Optional[dict[str, Any]],
) -> pd.DataFrame:
if not isinstance(schema, type(BaseModel)):
raise TypeError("Arg `schema` must be a pydantic.BaseModel subclass!")

self._validate_schema_type(schema)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you are using _validate_schema_type() in all these functions, consider using a decorator pattern. It would make things clearer and harder to mess up accidentally.

schema_validator = CoreValidator(schema) # type: ignore
if verbose:
errors = "log"
else:
errors = "skip"
errors = "log" if verbose else "skip"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

filtered_df: pd.DataFrame = schema_validator.validate(
dataframe=self.obj,
errors=errors,
Expand All @@ -84,30 +86,36 @@ def filter(

def itertuples(
self,
schema: BaseModel,
schema: Any,
verbose: bool = True,
) -> Iterable[tuple[Any, ...]]:
"""Same as normal .itertuples(), except invalid rows are skipped."""
self._validate_schema_type(schema)
adapter = TypeAdapter(schema)
for row in self.obj.itertuples(name=None):
try:
_ = schema(**dict(zip(self.obj.columns, row[1:]))) # type: ignore
_ = adapter.validate_python(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are not using the variable for anything, maybe we just drop the `_ =`. It's out of scope for this PR though, so don't worry about it if you don't want to.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I would prefer to see it removed in this PR.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense to me since he's touching these lines anyways.

dict(zip(self.obj.columns, row[1:]))
)
except ValidationError as e:
if verbose:
logger.info(f"Invalid row {row} with error: {e}")
continue
yield row

def iterrows( # type: ignore[no-untyped-def]
self, schema: BaseModel, verbose: bool = True, **kwargs
self, schema: Any, verbose: bool = True, **kwargs
) -> Iterable[tuple[Hashable, pd.Series]]: # type: ignore[type-arg]
"""Same as normal .iterrows(), except invalid rows are skipped."""
self._validate_schema_type(schema)
schema_validator = CoreValidator(schema)
for i, _ in schema_validator.iterate(dataframe=self.obj, context=kwargs, verbose=verbose):
yield i, self.obj.loc[i] # type: ignore[call-overload]

def iterschemas( # type: ignore[no-untyped-def]
self, schema: BaseModel, verbose: bool = True, **kwargs
self, schema: Any, verbose: bool = True, **kwargs
) -> Iterable[tuple[Hashable, Any]]:
"""Iterate over DataFrame rows as validated schema models."""
self._validate_schema_type(schema)
schema_validator = CoreValidator(schema)
return schema_validator.iterate(dataframe=self.obj, context=kwargs, verbose=verbose)
3 changes: 0 additions & 3 deletions src/pandantic/types.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
from typing import Union, TypeAlias

import pandas as pd
import pydantic


SchemaTypes: TypeAlias = Union[type[pydantic.BaseModel]]
TableTypes: TypeAlias = Union[pd.DataFrame]
25 changes: 15 additions & 10 deletions src/pandantic/validators/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
Process,
Queue,
)
from pydantic import ValidationError

from pandantic.types import SchemaTypes
from pydantic import ValidationError, TypeAdapter

from pandantic.validators.base import BaseValidator


class PandasValidator(BaseValidator):
def __init__(self, schema: SchemaTypes):
def __init__(self, schema: Any):
self.schema = schema
self.adapter = TypeAdapter(schema)

def validate(
self,
Expand Down Expand Up @@ -47,6 +48,10 @@ def validate(
# check for extra columns and handle strict mode
# NOTE: this will need to be abstracted to handle different types of schema objects
if strict:
if not hasattr(self.schema, "model_fields"):
# TODO: Implement for complex schemas which rely on TypeAdapter for validation
raise ValueError("Strict mode is only supported for BaseModel schemas.")

extras = {
col for col in dataframe.columns if col not in self.schema.model_fields.keys()
}
Expand Down Expand Up @@ -108,8 +113,8 @@ def validate(
else:
for index, row_dict in dataframe.to_dict("index").items():
try:
self.schema.model_validate(
obj=row_dict,
self.adapter.validate_python(
row_dict,
context=context,
)
except ValidationError as exc: # pylint: disable=broad-exception-caught
Expand Down Expand Up @@ -147,8 +152,8 @@ def _validate_chunk(

for index, row_dict in chunk.items():
try:
self.schema.model_validate(
obj=row_dict,
self.adapter.validate_python(
row_dict,
context=context,
)
except ValidationError as exc: # pylint: disable=broad-exception-caught
Expand All @@ -169,14 +174,14 @@ def iterate(
dict[str, Any]
] = None, # pylint: disable=consider-alternative-union-syntax,useless-suppression
verbose: bool = True,
) -> Iterable[tuple[Hashable, SchemaTypes]]:
) -> Iterable[tuple[Hashable, Any]]:
"""Iterate over a DataFrame and yield validated schema models."""
for i, row in dataframe.iterrows():
try:
yield (
i,
self.schema.model_validate(
obj=row.to_dict(),
self.adapter.validate_python(
row.to_dict(),
context=context,
),
)
Expand Down
56 changes: 56 additions & 0 deletions tests/test_discriminated_union.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Test complex Union types."""

from typing import Annotated, Literal
import pandas as pd
import pytest
from pydantic import BaseModel, Field

from pandantic import Pandantic


class Cat(BaseModel):
    """Member of the ``Pet`` union whose ``pet_type`` tag is ``'cat'``."""

    # Tag field; its literal value identifies this variant.
    pet_type: Literal['cat']
    # Any integer is accepted for a cat.
    extra_lives_left: int

class Dog(BaseModel):
    """Member of the ``Pet`` union whose ``pet_type`` tag is ``'dog'``."""

    # Tag field; its literal value identifies this variant.
    pet_type: Literal['dog']
    # Only the value 0 is accepted for a dog.
    extra_lives_left: Literal[0]

Pet = Annotated[Cat | Dog, Field(discriminator="pet_type")]

def test_valid_df_passes():
    """Test that a valid DataFrame with discriminated unions passes validation.

    The test succeeds when ``validate`` returns without raising.
    """
    # GIVEN a validator for the ``Pet`` union and one cat row plus one dog row,
    # each satisfying the constraints of its own variant
    validator = Pandantic(schema=Pet)
    example_df_valid = pd.DataFrame(
        data={
            "pet_type": ["cat", "dog"],
            "extra_lives_left": [9, 0],
        }
    )

    # WHEN the frame is validated
    # THEN no exception is raised
    validator.validate(dataframe=example_df_valid)

def test_invalid_df_raises():
    """Test that an invalid DataFrame with discriminated unions raises a ValueError."""
    # GIVEN a validator for the ``Pet`` union and a frame whose dog row carries
    # ``extra_lives_left=1``, which the ``Dog`` variant (``Literal[0]``) rejects
    validator = Pandantic(schema=Pet)
    example_df_invalid = pd.DataFrame(
        data={
            "pet_type": ["cat", "dog"],
            "extra_lives_left": [9, 1],
        }
    )

    # THEN a ValueError is expected
    with pytest.raises(ValueError):
        # WHEN the frame is validated
        validator.validate(dataframe=example_df_invalid)