Skip to content

Commit 7cd4774

Browse files
fix: ixp extraction tool (#821)
1 parent 975b947 commit 7cd4774

2 files changed

Lines changed: 80 additions & 15 deletions

File tree

src/uipath_langchain/agent/tools/extraction_tool.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
"""Ixp extraction tool."""
22

3-
from typing import Any
3+
import uuid
4+
from typing import Any, Optional
45

56
from langchain.tools import BaseTool
67
from langchain_core.messages import ToolCall, ToolMessage
78
from langchain_core.tools import StructuredTool
89
from langgraph.types import Command, interrupt
10+
from pydantic import BaseModel, Field
911
from uipath.agent.models.agent import AgentIxpExtractionResourceConfig
1012
from uipath.eval.mocks import mockable
11-
from uipath.platform.attachments import Attachment
1213
from uipath.platform.common import DocumentExtraction
1314
from uipath.platform.documents import ExtractionResponseIXP
1415

@@ -26,6 +27,34 @@ class StructuredToolWithWrapper(StructuredToolWithOutputType, ToolWrapperMixin):
2627
pass
2728

2829

30+
class ExtractionToolInputSchema(BaseModel):
31+
"""Alias-free mirror of `Attachment` used as the tool's args_schema.
32+
33+
We don't use `Attachment` directly because its fields carry aliases
34+
(`id` -> `ID`, `full_name` -> `FullName`, ...) and LangChain mishandles
35+
aliased fields in two places (see PR #796):
36+
37+
1. `BaseTool._parse_input()` extracts each field with `getattr(model, key)`,
38+
where `key` is the alias. For aliases that collide with built-in model
39+
attributes (e.g. `schema`), this returns the built-in instead of the
40+
field value, so downstream `kwargs.get("id") / kwargs.get("full_name")`
41+
came back as `None`.
42+
2. `tool_call_schema` rebuilds a subset of the model by copying each field
43+
but drops alias and serialization options, so the rebuilt schema no
44+
longer matches what the LLM emits.
45+
46+
Until LangChain fixes both, exposing an alias-free schema with field
47+
names matching `Attachment`'s python names sidesteps the issue. Keep the
48+
fields here in sync with `Attachment` — the test
49+
`test_extraction_tool_has_attachment_input_schema` enforces this.
50+
"""
51+
52+
id: uuid.UUID
53+
full_name: str
54+
mime_type: str
55+
metadata: Optional[dict[str, Any]] = Field(None)
56+
57+
2958
def create_ixp_extraction_tool(
3059
resource: AgentIxpExtractionResourceConfig,
3160
) -> StructuredTool:
@@ -38,27 +67,21 @@ def create_ixp_extraction_tool(
3867
@mockable(
3968
name=resource.name,
4069
description=resource.description,
41-
input_schema=Attachment.model_json_schema(),
70+
input_schema=ExtractionToolInputSchema.model_json_schema(),
4271
output_schema=ExtractionResponseIXP.model_json_schema(),
4372
example_calls=resource.properties.example_calls,
4473
)
4574
async def extraction_tool_fn(**kwargs: Any) -> ExtractionResponseIXP:
4675
from uipath.platform import UiPath
4776

77+
attachment = ExtractionToolInputSchema.model_validate(kwargs)
4878
uipath = UiPath()
4979

50-
attachment_id = kwargs.get("id")
51-
attachment_full_name = kwargs.get("full_name")
52-
53-
# TODO: attachment_mime_type is currently not used anywhere (attachment_full_name will also be obsolete once attachments api is onboarded)
54-
# should we use them somewhere else? otherwise input_schema should only contain the file id
55-
# attachment_mime_type = kwargs.get("mime_type")
56-
5780
# TODO: current workaround. DocumentExtraction model should support attachment_id and use the
5881
# start_ixp_extraction_from_attachment sdk method once support is added
5982

6083
attachment_local_file_path = await uipath.attachments.download_async(
61-
key=attachment_id, destination_path=attachment_full_name
84+
key=attachment.id, destination_path=attachment.full_name
6285
)
6386
document_extraction_response = interrupt(
6487
DocumentExtraction(
@@ -95,7 +118,7 @@ async def extraction_tool_wrapper(
95118
tool = StructuredToolWithWrapper(
96119
name=tool_name,
97120
description=resource.description,
98-
args_schema=Attachment,
121+
args_schema=ExtractionToolInputSchema,
99122
coroutine=extraction_tool_fn,
100123
output_type=ExtractionResponseIXP,
101124
metadata={

tests/agent/tools/test_extraction_tool.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
from uipath.platform.attachments import Attachment
1212
from uipath.platform.documents import ExtractionResponseIXP
1313

14-
from uipath_langchain.agent.tools.extraction_tool import create_ixp_extraction_tool
14+
from uipath_langchain.agent.tools.extraction_tool import (
15+
ExtractionToolInputSchema,
16+
create_ixp_extraction_tool,
17+
)
1518

1619

1720
class TestExtractionToolMetadata:
@@ -76,10 +79,16 @@ def test_extraction_tool_has_correct_description(self, extraction_resource):
7679
assert tool.description == "Extract data from files"
7780

7881
def test_extraction_tool_has_attachment_input_schema(self, extraction_resource):
79-
"""Test that extraction tool uses Attachment as input schema."""
82+
"""Test that extraction tool's input schema mirrors Attachment fields."""
8083
tool = create_ixp_extraction_tool(extraction_resource)
8184

82-
assert tool.args_schema == Attachment
85+
assert tool.args_schema is ExtractionToolInputSchema
86+
schema_fields = ExtractionToolInputSchema.model_fields
87+
attachment_fields = Attachment.model_fields
88+
89+
assert schema_fields.keys() == attachment_fields.keys()
90+
for name, attachment_field in attachment_fields.items():
91+
assert schema_fields[name].annotation == attachment_field.annotation
8392

8493
def test_extraction_tool_has_extraction_response_output_type(
8594
self, extraction_resource
@@ -235,6 +244,39 @@ async def test_extraction_tool_propagates_download_exception(
235244

236245
assert "Download failed" in str(exc_info.value)
237246

247+
@pytest.mark.asyncio
248+
@patch("uipath.platform.UiPath")
249+
@patch("uipath_langchain.agent.tools.extraction_tool.interrupt")
250+
async def test_extraction_tool_handles_alias_keyed_input(
251+
self, mock_interrupt, mock_uipath_class, extraction_resource
252+
):
253+
"""The tool is invoked with Python-named keys (id/full_name/mime_type) — the
254+
shape ExtractionToolInputSchema.model_dump() produces. download_async
255+
must still be called with the populated UUID, not key=None.
256+
"""
257+
mock_client = MagicMock()
258+
mock_uipath_class.return_value = mock_client
259+
mock_client.attachments.download_async = AsyncMock(
260+
return_value="/path/to/document.pdf"
261+
)
262+
mock_interrupt.return_value = {"extracted_data": {"field1": "value1"}}
263+
264+
tool = create_ixp_extraction_tool(extraction_resource)
265+
266+
attachment = ExtractionToolInputSchema(
267+
id=UUID("fa93f4ca-bd3f-473a-93e5-e6e5b5a8f27f"),
268+
full_name="document.pdf",
269+
mime_type="application/pdf",
270+
)
271+
aliased_input = attachment.model_dump()
272+
273+
await tool.ainvoke(aliased_input)
274+
275+
mock_client.attachments.download_async.assert_called_once_with(
276+
key=UUID("fa93f4ca-bd3f-473a-93e5-e6e5b5a8f27f"),
277+
destination_path="document.pdf",
278+
)
279+
238280

239281
class TestExtractionToolNameSanitization:
240282
"""Test that extraction tool names are properly sanitized."""

0 commit comments

Comments
 (0)