-
Notifications
You must be signed in to change notification settings - Fork 316
[WIP] Add Pydantic Schema Support to DocETL Python API #435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,8 +34,10 @@ | |
) | ||
from .validation import ( | ||
convert_dict_schema_to_list_schema, | ||
convert_schema_to_dict_format, | ||
convert_val, | ||
get_user_input_for_schema, | ||
is_pydantic_model, | ||
safe_eval, | ||
strict_render, | ||
) | ||
|
@@ -130,16 +132,32 @@ def call_llm_batch( | |
model: str, | ||
op_type: str, | ||
messages: list[dict[str, str]], | ||
output_schema: dict[str, str], | ||
output_schema: dict[str, str] | Any, | ||
verbose: bool = False, | ||
timeout_seconds: int = 120, | ||
max_retries_per_timeout: int = 2, | ||
bypass_cache: bool = False, | ||
litellm_completion_kwargs: dict[str, Any] = {}, | ||
op_config: dict[str, Any] = {}, | ||
) -> LLMResult: | ||
# Turn the output schema into a list of schemas | ||
output_schema = convert_dict_schema_to_list_schema(output_schema) | ||
# Handle Pydantic schemas | ||
if is_pydantic_model(output_schema): | ||
# Auto-enable structured output for Pydantic schemas | ||
op_config = op_config.copy() | ||
if "output" not in op_config: | ||
op_config["output"] = {} | ||
if "mode" not in op_config["output"]: | ||
op_config["output"]["mode"] = OutputMode.STRUCTURED_OUTPUT.value | ||
|
||
# For structured output mode, pass the Pydantic model directly | ||
# The LLM API will handle the OpenAPI conversion internally | ||
op_config["_pydantic_schema"] = output_schema | ||
# Convert to dict format only for the list schema wrapper | ||
dict_schema = convert_schema_to_dict_format(output_schema, model) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Schema Conversion Function Parameter MismatchThe |
||
output_schema = convert_dict_schema_to_list_schema(dict_schema) | ||
else: | ||
# Regular dict schema processing | ||
output_schema = convert_dict_schema_to_list_schema(output_schema) | ||
|
||
# Invoke the LLM call | ||
return self.call_llm( | ||
|
@@ -442,7 +460,7 @@ def call_llm( | |
model: str, | ||
op_type: str, | ||
messages: list[dict[str, str]], | ||
output_schema: dict[str, str], | ||
output_schema: dict[str, str] | Any, | ||
tools: list[dict[str, str]] | None = None, | ||
scratchpad: str | None = None, | ||
timeout_seconds: int = 120, | ||
|
@@ -479,6 +497,20 @@ def call_llm( | |
Raises: | ||
TimeoutError: If the call times out after retrying. | ||
""" | ||
# Handle Pydantic schemas | ||
if is_pydantic_model(output_schema): | ||
# Auto-enable structured output for Pydantic schemas | ||
op_config = op_config.copy() | ||
if "output" not in op_config: | ||
op_config["output"] = {} | ||
if "mode" not in op_config["output"]: | ||
op_config["output"]["mode"] = OutputMode.STRUCTURED_OUTPUT.value | ||
|
||
# Store the Pydantic schema for structured output | ||
op_config["_pydantic_schema"] = output_schema | ||
# Convert to dict format for internal processing | ||
output_schema = convert_schema_to_dict_format(output_schema, model) | ||
|
||
# Determine output mode using central enum | ||
output_mode_str = op_config.get("output", {}).get( | ||
"mode", OutputMode.TOOLS.value | ||
|
@@ -674,15 +706,38 @@ def _call_llm_with_cache( | |
# Prepare structured output schema if using structured output mode | ||
response_format = None | ||
if use_structured_output: | ||
if scratchpad is not None: | ||
props["updated_scratchpad"] = {"type": "string"} | ||
# Check if we have a Pydantic schema to use directly | ||
pydantic_schema = op_config.get("_pydantic_schema") | ||
if pydantic_schema and is_pydantic_model(pydantic_schema): | ||
# Use the OpenAPI schema from Pydantic directly | ||
from .validation import pydantic_to_openapi_schema | ||
|
||
openapi_schema = pydantic_to_openapi_schema(pydantic_schema) | ||
|
||
if scratchpad is not None: | ||
# Add scratchpad to the schema properties | ||
openapi_schema = openapi_schema.copy() | ||
openapi_schema["properties"] = openapi_schema["properties"].copy() | ||
openapi_schema["properties"]["updated_scratchpad"] = { | ||
"type": "string" | ||
} | ||
if "required" in openapi_schema: | ||
openapi_schema["required"] = list( | ||
openapi_schema["required"] | ||
) + ["updated_scratchpad"] | ||
|
||
schema = { | ||
"type": "object", | ||
"properties": props, | ||
"required": list(props.keys()), | ||
"additionalProperties": False, | ||
} | ||
schema = openapi_schema | ||
else: | ||
# Use the converted dict schema | ||
if scratchpad is not None: | ||
props["updated_scratchpad"] = {"type": "string"} | ||
|
||
schema = { | ||
"type": "object", | ||
"properties": props, | ||
"required": list(props.keys()), | ||
"additionalProperties": False, | ||
} | ||
|
||
response_format = { | ||
"type": "json_schema", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bug: Schema Mutation Causes Inconsistent Behavior
The
ReduceOperation
andResolveOperation
classes mutateself.config["output"]["schema"]
in place. When a Pydantic model is provided for the output schema, it's converted to a dictionary and directly overwrites the original Pydantic model in the configuration. This can lead to inconsistent behavior or break functionality if the operation instance is reused, as subsequent executions will operate on the converted dictionary schema.Additional Locations (1)
docetl/operations/resolve.py#L217-L226