Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
aa23d97
Add FMAPI tool calling contract tests for DatabricksOpenAI
dhruv0811 Feb 24, 2026
3644164
Merge branch 'main' into fmapi-tool-calling-contract-tests
dhruv0811 Mar 2, 2026
6ebbfdc
Fix missing Iterator/AsyncIterator imports for Gemini stream wrappers
dhruv0811 Mar 2, 2026
016c01c
Clean up logging and stale references in FMAPI test files
dhruv0811 Mar 2, 2026
5ca144e
Centralize shared test config, use capabilities API, comment out resp…
dhruv0811 Mar 3, 2026
cca62cc
Simplify Gemini fixes, re-enable response-side fix, add codex to skip…
dhruv0811 Mar 3, 2026
6e49d61
Fix ty error: suppress invalid-assignment on api_client.do() return type
dhruv0811 Mar 3, 2026
5025920
Fix Responses API tool calling: format conversion, message round-trip…
dhruv0811 Mar 4, 2026
a142a8f
Add Responses API test path for GPT models including codex
dhruv0811 Mar 4, 2026
27d2540
Truncate FMAPI response ids to 64 chars (server returns oversized ids)
dhruv0811 Mar 4, 2026
2dcd834
Exclude gpt-oss models from Responses API tests (not supported)
dhruv0811 Mar 4, 2026
34c3c19
Fix LangChain Responses API tests: use correct tool names and prompts
dhruv0811 Mar 4, 2026
17a2085
Wire DatabricksResponses in both clients, fix id prefixes for multi-turn
dhruv0811 Mar 9, 2026
7e71ebb
Merge remote-tracking branch 'origin/main' into codex-responses-api-s…
dhruv0811 Mar 9, 2026
5a60ffb
Merge remote-tracking branch 'origin/main' into codex-responses-api-s…
dhruv0811 Mar 11, 2026
e598d67
Clean up PR: remove Gemini/pyproject/utils changes, keep only codex f…
dhruv0811 Mar 11, 2026
9c7b26b
Add Gemini 2.5 models to chat skip list (list content issues)
dhruv0811 Mar 11, 2026
a1c6878
Rename skip lists for clarity, add gemini-3-1-flash-lite to skip
dhruv0811 Mar 11, 2026
d344179
Add streaming test to TestLangGraphResponsesAPI
dhruv0811 Mar 11, 2026
f22186c
Add streaming test to TestAgentToolCallingResponsesAPI
dhruv0811 Mar 11, 2026
031499f
Replace __dict__ hack with @cached_property for DatabricksResponses w…
dhruv0811 Mar 11, 2026
2ac634c
Minimal clients.py: only add _truncate_response_ids, keep existing @p…
dhruv0811 Mar 11, 2026
927a52f
Clean up comments across PR
dhruv0811 Mar 11, 2026
689dbef
Fix linting: type annotation, format, unit test expectations, Runner.…
dhruv0811 Mar 11, 2026
775868a
Clean up skip list comments in fmapi.py for consistency
dhruv0811 Mar 11, 2026
7719e12
Fix Responses API streaming: truncate oversized input IDs before FMAP…
dhruv0811 Mar 11, 2026
4db9f59
Remove unnecessary strip-nulls: FMAPI accepts null values in input
dhruv0811 Mar 11, 2026
583f639
Remove msg_id truncation and function_call dedup: not needed by FMAPI
dhruv0811 Mar 11, 2026
436a080
Revert test_clients.py changes: no new unit tests in this PR
dhruv0811 Mar 11, 2026
93d570b
Reset test_clients.py to match remote main
dhruv0811 Mar 11, 2026
fdc4b0c
Shorten _truncate_input_ids docstring
dhruv0811 Mar 11, 2026
8037204
Fix Responses API status rejection: strip output-only fields from pas…
dhruv0811 Mar 11, 2026
d96586e
Add ID truncation to LangChain Responses API path (max 64 chars)
dhruv0811 Mar 12, 2026
4bad7b7
Merge main + address PR review nits
dhruv0811 Mar 20, 2026
8348012
Generate UUID fallback when lc_msg.id is blank for passthrough blocks
dhruv0811 Mar 20, 2026
ea9e276
Remove .claude/commands/integration-tests.md from PR
dhruv0811 Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 50 additions & 7 deletions integrations/langchain/src/databricks_langchain/chat_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import json
import logging
import uuid
import warnings
from functools import cached_property
from operator import itemgetter
Expand Down Expand Up @@ -444,6 +445,19 @@ def _prepare_inputs(
# Responses API only supports temperature, not max_tokens, stop, or n
if self.temperature is not None:
data["temperature"] = self.temperature
# Convert tools from Chat Completions format to Responses API format
if "tools" in data:
data["tools"] = [
{
"type": "function",
"name": t["function"]["name"],
"description": t["function"].get("description", ""),
"parameters": t["function"].get("parameters", {}),
}
if "function" in t
else t
for t in data["tools"]
]
else:
# Chat completions API expects "messages" parameter
data["messages"] = [_convert_message_to_dict(msg) for msg in messages]
Expand Down Expand Up @@ -555,9 +569,11 @@ def _convert_responses_api_response_to_chat_result(self, response: Response) ->
"mcp_approval_request",
"image_generation_call",
):
# For these special types, convert to dict if possible
# For these special types, convert to dict if possible.
# Use exclude_none to drop default None fields (e.g. status, namespace)
# that FMAPI rejects as unknown parameters.
if hasattr(item, "model_dump"):
content_blocks.append(item.model_dump())
content_blocks.append(item.model_dump(exclude_none=True))
else:
content_blocks.append(item)

Expand Down Expand Up @@ -1413,6 +1429,18 @@ def _convert_lc_messages_to_responses_api(messages: list[BaseMessage]) -> list[d
"""
Convert a LangChain message to a Responses API message.
"""

# FMAPI enforces max 64-char IDs and requires msg_ prefix on message ids.
_MAX_ID = 64

def _truncate(s: str) -> str:
return s[:_MAX_ID]

def _msg_id(lc_id: str | None) -> str | None:
if not lc_id or lc_id.startswith("msg_"):
return _truncate(lc_id) if lc_id else lc_id
return _truncate(f"msg_{lc_id}")

# TODO: add multimodal support
input_items = []
for lc_msg in messages:
Expand All @@ -1437,7 +1465,7 @@ def _convert_lc_messages_to_responses_api(messages: list[BaseMessage]) -> list[d
}
],
"role": "assistant",
"id": lc_msg.id,
"id": _msg_id(lc_msg.id),
}
)
elif block_type == "refusal":
Expand All @@ -1451,7 +1479,7 @@ def _convert_lc_messages_to_responses_api(messages: list[BaseMessage]) -> list[d
}
],
"role": "assistant",
"id": lc_msg.id,
"id": _msg_id(lc_msg.id),
}
)
elif block_type in (
Expand All @@ -1465,13 +1493,28 @@ def _convert_lc_messages_to_responses_api(messages: list[BaseMessage]) -> list[d
"mcp_list_tools",
"mcp_approval_request",
):
input_items.append(block | {"id": lc_msg.id})
# FMAPI rejects output-only fields on input items.
block.pop("status", None)
# Fix ids: FMAPI requires fc_ prefix on function_call ids
# and max 64 chars on all ids.
if "id" not in block:
call_id = block.get("call_id", "")
if call_id:
raw_id = (
call_id if call_id.startswith("fc_") else f"fc_{call_id}"
)
else:
raw_id = lc_msg.id or str(uuid.uuid4())
block["id"] = _truncate(raw_id)
elif len(block["id"]) > _MAX_ID:
block["id"] = _truncate(block["id"])
input_items.append(block)
elif isinstance(cc_msg.get("content"), str):
input_items.append(
{
"type": "message",
"role": "assistant",
"id": lc_msg.id,
"id": _msg_id(lc_msg.id),
"content": [{"type": "output_text", "text": cc_msg["content"]}],
}
)
Expand All @@ -1481,7 +1524,7 @@ def _convert_lc_messages_to_responses_api(messages: list[BaseMessage]) -> list[d
[
{
"type": "function_call",
"id": lc_msg.id,
"id": _truncate(f"fc_{tool_call['id']}"),
"call_id": tool_call["id"],
"name": tool_call["function"]["name"],
"arguments": tool_call["function"]["arguments"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

import pytest
from databricks_ai_bridge.test_utils.fmapi import (
LANGCHAIN_SKIP_MODELS,
SKIP_CHAT_COMPLETIONS_LANGCHAIN,
SKIP_RESPONSES_API,
async_retry,
discover_foundation_models,
discover_chat_models,
discover_responses_models,
max_tokens_for_model,
retry,
)
Expand All @@ -29,7 +31,8 @@
reason="FMAPI tool calling tests disabled. Set RUN_FMAPI_TOOL_CALLING_TESTS=1 to enable.",
)

_FOUNDATION_MODELS = discover_foundation_models(LANGCHAIN_SKIP_MODELS)
_CHAT_MODELS = discover_chat_models(SKIP_CHAT_COMPLETIONS_LANGCHAIN)
_RESPONSES_MODELS = discover_responses_models(SKIP_RESPONSES_API)


@tool
Expand Down Expand Up @@ -60,7 +63,7 @@ def multiply(a: int, b: int) -> int:


@pytest.mark.integration
@pytest.mark.parametrize("model", _FOUNDATION_MODELS)
@pytest.mark.parametrize("model", _CHAT_MODELS)
class TestLangGraphSync:
"""Sync LangGraph agent tests using ChatDatabricks + create_react_agent."""

Expand Down Expand Up @@ -157,7 +160,7 @@ def _run():

@pytest.mark.integration
@pytest.mark.asyncio
@pytest.mark.parametrize("model", _FOUNDATION_MODELS)
@pytest.mark.parametrize("model", _CHAT_MODELS)
class TestLangGraphAsync:
"""Async LangGraph agent tests using ChatDatabricks + create_react_agent."""

Expand Down Expand Up @@ -251,3 +254,82 @@ async def _run():
assert got_message_chunks, "Expected AIMessageChunk tokens in message stream"

await async_retry(_run)


# =============================================================================
# Responses API — LangGraph (GPT models including codex)
# =============================================================================


@pytest.mark.integration
@pytest.mark.parametrize("model", _RESPONSES_MODELS)
class TestLangGraphResponsesAPI:
    """LangGraph agent tests using ChatDatabricks(use_responses_api=True).

    Tests GPT models (including codex which only supports Responses API).
    """

    def test_single_turn(self, model):
        """Single-turn: agent calls tools and produces a final answer via Responses API."""
        llm = ChatDatabricks(model=model, use_responses_api=True)
        agent = create_react_agent(llm, [add, multiply])

        def _run():
            # Prompt names the tool explicitly so a tool call is expected.
            response = agent.invoke({"messages": [("human", "Use the add tool to compute 10 + 5")]})
            tool_msgs = [m for m in response["messages"] if isinstance(m, ToolMessage)]
            assert len(tool_msgs) >= 1, "Agent should have called at least one tool"
            # The final entry should be the assistant's answer, not a tool result.
            last = response["messages"][-1]
            assert isinstance(last, AIMessage)

        # retry() absorbs transient endpoint flakiness in integration runs.
        retry(_run)

    def test_multi_turn(self, model):
        """Multi-turn: agent maintains context across turns via Responses API."""
        llm = ChatDatabricks(model=model, use_responses_api=True)
        # MemorySaver + a fixed thread_id persists conversation state between invokes.
        checkpointer = MemorySaver()
        agent = create_react_agent(llm, [add, multiply], checkpointer=checkpointer)
        config = {"configurable": {"thread_id": "responses-api-test"}}

        def _run():
            r1 = agent.invoke(
                {"messages": [("human", "Use the add tool to compute 10 + 5")]}, config=config
            )
            tool_msgs_1 = [m for m in r1["messages"] if isinstance(m, ToolMessage)]
            assert len(tool_msgs_1) >= 1

            # Second turn refers to "the result" — only answerable if turn-1
            # history survived the round-trip through the Responses API.
            r2 = agent.invoke(
                {"messages": [("human", "Now multiply the result by 3")]}, config=config
            )
            assert len(r2["messages"]) > len(r1["messages"]), "History should grow across turns"

        retry(_run)

    def test_streaming(self, model):
        """Streaming: agent streams node updates and tool events via Responses API."""
        llm = ChatDatabricks(model=model, use_responses_api=True)
        agent = create_react_agent(llm, [add, multiply])

        def _run():
            event_count = 0
            nodes_seen = set()
            got_message_chunks = False

            # Dual stream mode yields (mode, data) tuples: graph-node updates
            # plus per-token message chunks.
            for event in agent.stream(
                {"messages": [("human", "Use the add tool to compute 10 + 5")]},
                stream_mode=["updates", "messages"],
            ):
                event_count += 1
                mode, data = event
                if mode == "updates":
                    # "updates" payload is {node_name: state_delta}.
                    nodes_seen.update(data.keys())
                elif mode == "messages":
                    chunk, _metadata = data
                    if isinstance(chunk, AIMessageChunk):
                        got_message_chunks = True

            assert event_count > 0, "No stream events received"
            assert "agent" in nodes_seen, f"Expected 'agent' node, got: {nodes_seen}"
            assert "tools" in nodes_seen, f"Expected 'tools' node, got: {nodes_seen}"
            assert got_message_chunks, "Expected AIMessageChunk tokens in message stream"

        retry(_run)
Original file line number Diff line number Diff line change
Expand Up @@ -738,8 +738,8 @@
id=ID,
tool_calls=[
{
"name": tool_calls[0]["function"]["name"], # type: ignore[index]

Check warning on line 741 in integrations/langchain/tests/unit_tests/test_chat_models.py

View workflow job for this annotation

GitHub Actions / typechecking for integrations/langchain

ty (unused-type-ignore-comment)

tests/unit_tests/test_chat_models.py:741:61: unused-type-ignore-comment: Unused blanket `type: ignore` directive help: Remove the unused suppression comment
"args": json.loads(tool_calls[0]["function"]["arguments"]), # type: ignore[index]

Check warning on line 742 in integrations/langchain/tests/unit_tests/test_chat_models.py

View workflow job for this annotation

GitHub Actions / typechecking for integrations/langchain

ty (unused-type-ignore-comment)

tests/unit_tests/test_chat_models.py:742:78: unused-type-ignore-comment: Unused blanket `type: ignore` directive help: Remove the unused suppression comment
"id": ID,
"type": "tool_call",
}
Expand Down Expand Up @@ -924,7 +924,7 @@
"name": "get_weather",
"call_id": "call_123",
"arguments": '{"location": "SF"}',
"id": "msg_123",
"id": "fc_call_123",
},
{
"type": "function_call_output",
Expand Down
50 changes: 46 additions & 4 deletions integrations/openai/src/databricks_openai/utils/clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,44 @@ class DatabricksChat(Chat):
completions: DatabricksCompletions


_FMAPI_MAX_ID_LENGTH = 64


def _truncate_response_ids(response: Any) -> None:
"""Truncate ids that exceed FMAPI's 64-char input limit.

FMAPI returns response and output item ids longer than 64 chars, but rejects
them on the next turn's input. We truncate to prevent multi-turn failures.
"""
if hasattr(response, "id") and response.id and len(response.id) > _FMAPI_MAX_ID_LENGTH:
response.id = response.id[:_FMAPI_MAX_ID_LENGTH]
if not hasattr(response, "output"):
return
for item in response.output:
item_id = getattr(item, "id", None)
if item_id and len(item_id) > _FMAPI_MAX_ID_LENGTH:
item.id = item_id[:_FMAPI_MAX_ID_LENGTH]


def _truncate_input_ids(input_items: Any) -> None:
    """Clamp over-length ids on Responses API input items, in place.

    Covers the streaming path, where _truncate_response_ids cannot intercept
    the assembled response before its items are fed back as input.
    """
    if not isinstance(input_items, list) or not input_items:
        return
    for entry in input_items:
        if isinstance(entry, dict):
            current = entry.get("id")
            if isinstance(current, str) and len(current) > _FMAPI_MAX_ID_LENGTH:
                entry["id"] = current[:_FMAPI_MAX_ID_LENGTH]
            continue
        current = getattr(entry, "id", None)
        if isinstance(current, str) and len(current) > _FMAPI_MAX_ID_LENGTH:
            entry.id = current[:_FMAPI_MAX_ID_LENGTH]


class DatabricksResponses(Responses):
"""Responses resource that handles apps/ prefix routing."""
"""Responses resource that handles apps/ prefix routing and id truncation."""

def __init__(self, client, workspace_client: WorkspaceClient):
super().__init__(client)
Expand All @@ -285,6 +321,7 @@ def _get_app_client(self, app_name: str) -> OpenAI:

def create(self, **kwargs):
model = kwargs.get("model", "")
_truncate_input_ids(kwargs.get("input"))

if isinstance(model, str) and model.startswith(_APPS_ENDPOINT_PREFIX):
app_name = model[len(_APPS_ENDPOINT_PREFIX) :]
Expand All @@ -294,7 +331,9 @@ def create(self, **kwargs):
except (APIStatusError, APIConnectionError) as e:
raise _wrap_app_error(e, app_name) from e

return super().create(**kwargs)
response = super().create(**kwargs)
_truncate_response_ids(response)
return response


class DatabricksOpenAI(OpenAI):
Expand Down Expand Up @@ -407,7 +446,7 @@ class AsyncDatabricksChat(AsyncChat):


class AsyncDatabricksResponses(AsyncResponses):
"""Async Responses resource that handles apps/ prefix routing."""
"""Async Responses resource that handles apps/ prefix routing and id truncation."""

def __init__(self, client, workspace_client: WorkspaceClient):
super().__init__(client)
Expand All @@ -429,6 +468,7 @@ def _get_app_client(self, app_name: str) -> AsyncOpenAI:

async def create(self, **kwargs):
model = kwargs.get("model", "")
_truncate_input_ids(kwargs.get("input"))

if isinstance(model, str) and model.startswith(_APPS_ENDPOINT_PREFIX):
app_name = model[len(_APPS_ENDPOINT_PREFIX) :]
Expand All @@ -438,7 +478,9 @@ async def create(self, **kwargs):
except (APIStatusError, APIConnectionError) as e:
raise _wrap_app_error(e, app_name) from e

return await super().create(**kwargs)
response = await super().create(**kwargs)
_truncate_response_ids(response)
return response


class AsyncDatabricksOpenAI(AsyncOpenAI):
Expand Down
Loading
Loading