Skip to content

Commit 1ccf9cc

Browse files
committed
Add Polars pydantic integration with format support and native JSON schema generation
- Add pydantic validation for Polars DataFrames and LazyFrames
- Implement DataFrame type conversion from various formats (dict, CSV, JSON, Parquet, Feather)
- Replace pandas dependency with native Polars JSON schema generation
- Support both Pydantic v1 and v2 with appropriate validators
- Add comprehensive test suite for the integration
1 parent 16dfb8c commit 1ccf9cc

File tree

3 files changed

+575
-23
lines changed

3 files changed

+575
-23
lines changed

pandera/api/polars/model.py

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -168,35 +168,79 @@ def to_json_schema(cls):
168168
This function currently does not fully specify a pandera schema,
169169
and is primarily used internally to render OpenAPI docs via the
170170
FastAPI integration.
171-
172-
:raises ImportError: if ``pandas`` is not installed.
173171
"""
174-
try:
175-
import pandas as pd
176-
except ImportError as exc:
177-
raise ImportError(
178-
"pandas is required to serialize polars schema to json-schema"
179-
) from exc
180-
181172
schema = cls.to_schema()
182-
empty = pl.DataFrame(
183-
schema={k: v.type for k, v in schema.dtypes.items()}
184-
).to_pandas()
185-
table_schema = pd.io.json.build_table_schema(empty)
186-
187-
def _field_json_schema(field):
188-
return {
173+
174+
# Define a mapping from Polars data types to JSON schema types
175+
# This is more robust than string parsing
176+
POLARS_TO_JSON_TYPE_MAP = {
177+
# Integer types
178+
pl.Int8: "integer",
179+
pl.Int16: "integer",
180+
pl.Int32: "integer",
181+
pl.Int64: "integer",
182+
pl.UInt8: "integer",
183+
pl.UInt16: "integer",
184+
pl.UInt32: "integer",
185+
pl.UInt64: "integer",
186+
187+
# Float types
188+
pl.Float32: "number",
189+
pl.Float64: "number",
190+
191+
# Boolean type
192+
pl.Boolean: "boolean",
193+
194+
# String types
195+
pl.Utf8: "string",
196+
pl.String: "string",
197+
198+
# Date/Time types
199+
pl.Date: "datetime",
200+
pl.Datetime: "datetime",
201+
pl.Time: "datetime",
202+
pl.Duration: "datetime",
203+
}
204+
205+
def map_dtype_to_json_type(dtype):
    """Map a Polars data type to a JSON schema type string.

    The lookup table ``POLARS_TO_JSON_TYPE_MAP`` (defined in the enclosing
    scope) is consulted first; a substring heuristic on ``str(dtype)`` is
    used only for dtypes that are not in the table.

    Args:
        dtype: Polars data type, given either as a dtype class
            (e.g. ``pl.Int64``) or as a dtype instance
            (e.g. ``pl.Datetime("us")``).

    Returns:
        str: JSON schema type string. Unrecognised dtypes map to
        ``"string"``.
    """
    # The table is keyed by dtype *classes*. ``dtype.__class__`` on a
    # class is its metaclass, which is never a key, so a dtype passed as a
    # class used to skip the table entirely and fall through to the string
    # heuristic. Normalise to the dtype class first so both spellings hit
    # the table.
    dtype_cls = dtype if isinstance(dtype, type) else type(dtype)
    json_type = POLARS_TO_JSON_TYPE_MAP.get(dtype_cls)
    if json_type is not None:
        return json_type

    # Fallback to string representation check for edge cases.
    # Order matters: "float" is tested before "int" to mirror the original
    # precedence of the checks.
    dtype_str = str(dtype).lower()
    if "float" in dtype_str:
        return "number"
    if "int" in dtype_str:
        return "integer"
    if "bool" in dtype_str:
        return "boolean"
    # "datetime" contains "date", so testing "date" and "time" covers it.
    if "date" in dtype_str or "time" in dtype_str:
        return "datetime"
    return "string"
231+
232+
properties = {}
233+
for col_name, col_schema in schema.dtypes.items():
234+
json_type = map_dtype_to_json_type(col_schema.type)
235+
properties[col_name] = {
189236
"type": "array",
190-
"items": {"type": field["type"]},
237+
"items": {"type": json_type},
191238
}
192239

193240
return {
194241
"title": schema.name or "pandera.DataFrameSchema",
195242
"type": "object",
196-
"properties": {
197-
field["name"]: _field_json_schema(field)
198-
for field in table_schema["fields"]
199-
},
243+
"properties": properties,
200244
}
201245

202246
@classmethod

0 commit comments

Comments
 (0)