-
-
Notifications
You must be signed in to change notification settings - Fork 370
Description
Describe the bug
A clear and concise description of what the bug is.
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandera.
- (optional) I have confirmed this bug exists on the main branch of pandera.
Code Sample, a copy-pastable example
import pandas as pd
import pandera.pandas as pa
from pandera.typing.pandas import DataFrame as PanderaDataFrame, INT64, Series, Bool, BOOL, Int64
from pydantic import BaseModel, Field, field_validator
class SampleDtoDF(pa.DataFrameModel):
    """Schema declaring one nullable column per dtype annotation under test.

    Every active field is declared with ``pa.Field(nullable=True)``; the
    fields differ only in the type used inside ``Series[...]``.
    """

    nullable_str: Series[str] = pa.Field(nullable=True)
    # The two fields below are intentionally left commented out: the sample
    # is reported to run as-is and to fail once they are enabled.
    # nullable_int: Series[int] = pa.Field(nullable=True) # This one should be working too
    # nullable_Int64: Series[Int64] = pa.Field(nullable=True)
    nullable_INT64: Series[INT64] = pa.Field(nullable=True)
    nullable_bool: Series[bool] = pa.Field(nullable=True)
    nullable_bool_as_Bool: Series[Bool] = pa.Field(nullable=True)
    nullable_bool_as_BOOL: Series[BOOL] = pa.Field(nullable=True)

    class Config:
        # Schema-wide options; see pandera's DataFrameModel Config docs.
        strict = True
        coerce = True
class SampleDto(BaseModel):
    """Pydantic model holding a DataFrame validated against ``SampleDtoDF``."""

    df: PanderaDataFrame[SampleDtoDF] = Field(..., description="Validated DataFrame")

    @field_validator("df")
    @classmethod
    def validate_df(cls, df: pd.DataFrame) -> PanderaDataFrame[SampleDtoDF]:
        """Run ``SampleDtoDF.validate`` on ``df`` and return its result."""
        return SampleDtoDF.validate(df)

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame) -> "SampleDto":
        """Validate ``df`` with ``SampleDtoDF`` and build a ``SampleDto`` from it."""
        df_valid = SampleDtoDF.validate(df)
        return cls(df=df_valid)
# Sample frame: each column holds a value at index 0 and 2 and None at index 1.
df_sample = pd.DataFrame(
    {
        "nullable_str": {
            0: "e983cc46-6fd0-ee77-7a3c-4eb4329c25c3",
            1: None,
            2: "1548abc5-8f3c-52c5-bd44-46fe0091d620",
        },
        # Columns matching the commented-out schema fields; kept disabled
        # together with them.
        # "nullable_int": {
        #     0: 1,
        #     1: None,
        #     2: 0,
        # },
        # "nullable_Int64": {
        #     0: 1,
        #     1: None,
        #     2: 0,
        # },
        "nullable_INT64": {
            0: 1,
            1: None,
            2: 0,
        },
        "nullable_bool": {
            0: True,
            1: None,
            2: False,
        },
        "nullable_bool_as_Bool": {
            0: True,
            1: None,
            2: False,
        },
        "nullable_bool_as_BOOL": {
            0: True,
            1: None,
            2: False,
        },
    }
)

# Show the raw input before validation.
print(df_sample)

# Validate the frame and wrap it in the pydantic model.
dto_sample = SampleDto.from_dataframe(df_sample)
print(dto_sample.df)

As it is, the code runs, with the null elements of the Bool column cast to False and the null elements of the BOOL column remaining null.
If the commented-out lines are uncommented, the code fails, showing inconsistent behaviour between how nullable int columns and nullable bool columns are declared.
Expected behavior
Null elements managed in the same way across data types.
One option is for the capitalized types (Int64, Bool) to cast nulls to a default value (0, False), and for the all-caps types (INT64, BOOL) to allow nulls in the column.
Another option is to raise an error at initialisation if a nullable column is assigned a type that would cast nulls to a value (as in the case of nullable_bool_as_Bool: Series[Bool] = pa.Field(nullable=True), which does not produce a nullable column).
Desktop:
- OS: iOS
- Browser: Chrome
- Version: 142.0.7444.135 (Official Build) (arm64)
Screenshots
With the commented lines:
Removing the comments:
