Skip to content

Commit 1f6b2b4

Browse files
Simon Rosenbergclaude
andcommitted
harden(acp): fall back to the other model-select mechanism on method-not-found
Response-detection (`_session_selects_model_via_config_option`) picks set_config_option vs set_session_model from the session response and is correct for every validated CLI. But it reads an UNSTABLE capability whose shape can vary by build/auth state, so a misdetect would crash session init or a model switch with JSON-RPC -32601 "Method not found". Add `_apply_acp_model_with_fallback`: if the chosen model-apply call returns -32601, retry once with the other mechanism (and, for runtime switches, remember the working one). Wired into init (`_maybe_set_session_model`) and `ACPAgent.set_acp_model`; resume already tolerates rejection by keeping the server default, so it's left as-is. A non-method-not-found error (e.g. -32602 invalid model) still propagates — it's a real client error, not a wrong-mechanism signal. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent d412c0b commit 1f6b2b4

2 files changed

Lines changed: 195 additions & 21 deletions

File tree

openhands-sdk/openhands/sdk/agent/acp_agent.py

Lines changed: 95 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,13 @@
140140
# upstream model 500s, and transient infrastructure errors.
141141
_RETRIABLE_SERVER_ERROR_CODES: frozenset[int] = frozenset({-32603})
142142

143+
# -32601 = "Method not found" (JSON-RPC spec). An ACP server raises this when a
144+
# model-selection call uses the mechanism it does *not* implement — e.g.
145+
# `session/set_model` on a CLI that moved model selection to `configOptions`
146+
# (codex-acp 0.16+, claude-agent-acp 0.46+), or vice versa. We use it to fall
147+
# back to the other mechanism if response-detection picked the wrong one.
148+
_METHOD_NOT_FOUND_CODE: Final[int] = -32601
149+
143150

144151
# Maximum characters for ACP tool call content — matches MAX_CMD_OUTPUT_SIZE
145152
# used by the terminal tool and the default max_message_chars in LLM config.
@@ -437,6 +444,48 @@ def _apply_acp_model(
437444
return conn.set_session_model(model_id=model, session_id=session_id)
438445

439446

447+
def _is_method_not_found(exc: ACPRequestError) -> bool:
448+
"""Whether ``exc`` is a JSON-RPC "method not found" — i.e. the server does
449+
not implement the model-selection call we used."""
450+
return exc.code == _METHOD_NOT_FOUND_CODE
451+
452+
453+
async def _apply_acp_model_with_fallback(
454+
conn: ClientSideConnection,
455+
session_id: str,
456+
model: str,
457+
*,
458+
via_config_option: bool,
459+
) -> bool:
460+
"""Apply ``model`` via the detected mechanism, falling back to the other if
461+
the server reports the method missing.
462+
463+
Response-detection (``_session_selects_model_via_config_option``) is correct
464+
for every CLI we've validated, but it reads an UNSTABLE capability and the
465+
response shape can vary by build/auth state. If the chosen call raises
466+
``-32601 "Method not found"``, the server simply uses the *other* mechanism,
467+
so we retry with it instead of crashing session init. Returns the
468+
``via_config_option`` value that actually applied the model.
469+
"""
470+
try:
471+
await _apply_acp_model(
472+
conn, session_id, model, via_config_option=via_config_option
473+
)
474+
return via_config_option
475+
except ACPRequestError as exc:
476+
if not _is_method_not_found(exc):
477+
raise
478+
logger.info(
479+
"ACP model-apply via %s rejected as method-not-found; retrying via %s",
480+
"set_config_option" if via_config_option else "set_session_model",
481+
"set_session_model" if via_config_option else "set_config_option",
482+
)
483+
await _apply_acp_model(
484+
conn, session_id, model, via_config_option=not via_config_option
485+
)
486+
return not via_config_option
487+
488+
440489
def _extract_session_models(
441490
response: Any,
442491
) -> tuple[str | None, list[ACPModelInfo] | None]:
@@ -654,7 +703,7 @@ async def _maybe_set_session_model(
654703
return False
655704
provider = detect_acp_provider_by_agent_name(agent_name)
656705
if provider is not None and provider.supports_set_session_model:
657-
await _apply_acp_model(
706+
await _apply_acp_model_with_fallback(
658707
conn, session_id, acp_model, via_config_option=via_config_option
659708
)
660709
return True
@@ -727,7 +776,9 @@ async def _reapply_session_model_on_resume(
727776
if provider is not None:
728777
# Known provider: apply via the mechanism the resumed session uses
729778
# (set_config_option for codex-acp 0.16+/claude-agent-acp 0.46+,
730-
# else set_session_model).
779+
# else set_session_model). A rejection is already tolerated below
780+
# (the session keeps the server default), so resume doesn't need the
781+
# cross-mechanism fallback the init/switch paths use.
731782
await _apply_acp_model(
732783
conn, session_id, acp_model, via_config_option=via_config_option
733784
)
@@ -3643,25 +3694,48 @@ def set_acp_model(self, model: str) -> None:
36433694
timeout=self.acp_prompt_timeout,
36443695
)
36453696
except ACPRequestError as e:
3646-
# Server-internal failures (JSON-RPC -32603) are not the caller's
3647-
# fault, and the prompt path already treats them as retriable. Let
3648-
# them propagate (-> 5xx) instead of mislabeling them as a 400
3649-
# client error.
3650-
if e.code in _RETRIABLE_SERVER_ERROR_CODES:
3651-
raise
3652-
# acp.exceptions.RequestError derives from Exception (not
3653-
# RuntimeError); surface a true client/protocol rejection (e.g.
3654-
# method-not-found, invalid model id) as a ValueError so callers —
3655-
# and the agent-server route — treat it as a 400-class client error
3656-
# rather than an opaque 500.
3657-
method = (
3658-
"set_config_option(model)"
3659-
if self._model_via_config_option
3660-
else "set_session_model"
3661-
)
3662-
raise ValueError(
3663-
f"ACP server rejected {method}(model={model!r}): {e}"
3664-
) from e
3697+
pending_error: ACPRequestError | None = e
3698+
if _is_method_not_found(e):
3699+
# The session uses the other model-selection mechanism (or the
3700+
# init-time detection picked the wrong one): retry once with it,
3701+
# and remember the working mechanism for later switches. If the
3702+
# retry also fails, fall through to the error translation below.
3703+
flipped = not self._model_via_config_option
3704+
try:
3705+
self._executor.run_async(
3706+
_apply_acp_model(
3707+
self._conn,
3708+
self._session_id,
3709+
model,
3710+
via_config_option=flipped,
3711+
),
3712+
timeout=self.acp_prompt_timeout,
3713+
)
3714+
except ACPRequestError as e2:
3715+
pending_error = e2
3716+
else:
3717+
self._model_via_config_option = flipped
3718+
pending_error = None # both selections applied
3719+
if pending_error is not None:
3720+
# Server-internal failures (JSON-RPC -32603) are not the caller's
3721+
# fault, and the prompt path already treats them as retriable.
3722+
# Let them propagate (-> 5xx) instead of mislabeling them as a
3723+
# 400 client error.
3724+
if pending_error.code in _RETRIABLE_SERVER_ERROR_CODES:
3725+
raise pending_error
3726+
# acp.exceptions.RequestError derives from Exception (not
3727+
# RuntimeError); surface a true client/protocol rejection (e.g.
3728+
# method-not-found on both mechanisms, invalid model id) as a
3729+
# ValueError so callers — and the agent-server route — treat it
3730+
# as a 400-class client error rather than an opaque 500.
3731+
method = (
3732+
"set_config_option(model)"
3733+
if self._model_via_config_option
3734+
else "set_session_model"
3735+
)
3736+
raise ValueError(
3737+
f"ACP server rejected {method}(model={model!r}): {pending_error}"
3738+
) from pending_error
36653739
# Reflect the live model on the sentinel LLM + metrics so cost/token
36663740
# accounting and serialized state show the model actually in use
36673741
# (mirrors model_post_init). The ``acp_model`` field is frozen, so the

tests/sdk/agent/test_acp_agent.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@
2020

2121
from openhands.sdk.agent.acp_agent import (
2222
ACPAgent,
23+
_apply_acp_model_with_fallback,
2324
_classify_acp_init_error,
2425
_codex_auth_file,
2526
_codex_base_url_overrides,
2627
_estimate_cost_from_tokens,
2728
_extract_session_models,
2829
_extract_token_usage,
2930
_image_url_to_acp_block,
31+
_is_method_not_found,
3032
_mask_json_value,
3133
_maybe_set_session_model,
3234
_mcp_config_to_acp_servers,
@@ -4711,6 +4713,33 @@ def test_switches_model_via_config_option_session(self):
47114713
assert agent.llm.model == "gpt-5.4/low"
47124714
assert agent._current_model_id == "gpt-5.4/low"
47134715

4716+
def test_switch_falls_back_on_method_not_found(self):
4717+
# Detected set_session_model, but the live server only has the
4718+
# configOptions mechanism: the first apply raises -32601, the switch
4719+
# retries with the other call and remembers it.
4720+
agent = self._wire(_make_agent(), "codex-acp", via_config_option=False)
4721+
agent._executor.run_async.side_effect = [
4722+
ACPRequestError(code=-32601, message="Method not found"),
4723+
None, # retry with the flipped mechanism succeeds
4724+
]
4725+
agent.set_acp_model("gpt-5.4/low")
4726+
assert agent._executor.run_async.call_count == 2
4727+
# The working mechanism is remembered for later switches.
4728+
assert agent._model_via_config_option is True
4729+
assert agent.llm.model == "gpt-5.4/low"
4730+
assert agent._current_model_id == "gpt-5.4/low"
4731+
4732+
def test_switch_invalid_model_does_not_fall_back(self):
4733+
# A -32602 invalid-params is a real client error, not a wrong-mechanism
4734+
# signal: surface it as ValueError without a second call.
4735+
agent = self._wire(_make_agent(), "codex-acp", via_config_option=True)
4736+
agent._executor.run_async.side_effect = ACPRequestError(
4737+
code=-32602, message="Invalid params"
4738+
)
4739+
with pytest.raises(ValueError, match="rejected set_config_option"):
4740+
agent.set_acp_model("nonsense")
4741+
agent._executor.run_async.assert_called_once()
4742+
47144743
def test_claude_provider_supports_runtime_switch(self):
47154744
agent = self._wire(_make_agent(), "claude-agent-acp")
47164745
agent.set_acp_model("claude-haiku-4-5-20251001")
@@ -7187,6 +7216,77 @@ def test_none_response_is_not_config_option(self):
71877216
assert _session_selects_model_via_config_option(None) is False
71887217

71897218

7219+
class TestApplyAcpModelFallback:
7220+
"""Model-apply falls back to the other mechanism on -32601 method-not-found.
7221+
7222+
Response-detection is correct for every validated CLI, but it reads an
7223+
UNSTABLE capability; a misdetect (or a server that moved between mechanisms)
7224+
must self-heal rather than crash session init / a switch.
7225+
"""
7226+
7227+
def test_is_method_not_found(self):
7228+
assert _is_method_not_found(ACPRequestError(code=-32601, message="x")) is True
7229+
assert _is_method_not_found(ACPRequestError(code=-32603, message="x")) is False
7230+
assert _is_method_not_found(ACPRequestError(code=-32602, message="x")) is False
7231+
7232+
@pytest.mark.asyncio
7233+
async def test_config_option_falls_back_to_set_session_model(self):
7234+
# Detected configOptions, but the server only has set_session_model.
7235+
conn = AsyncMock()
7236+
conn.set_config_option.side_effect = ACPRequestError(
7237+
code=-32601, message="Method not found"
7238+
)
7239+
effective = await _apply_acp_model_with_fallback(
7240+
conn, "sess-1", "gemini-3-flash", via_config_option=True
7241+
)
7242+
conn.set_config_option.assert_awaited_once()
7243+
conn.set_session_model.assert_awaited_once_with(
7244+
model_id="gemini-3-flash", session_id="sess-1"
7245+
)
7246+
assert effective is False # the mechanism that actually worked
7247+
7248+
@pytest.mark.asyncio
7249+
async def test_set_session_model_falls_back_to_config_option(self):
7250+
# Detected set_session_model, but the server moved to configOptions
7251+
# (codex 0.16 / claude 0.46 reached via a stale/degraded detection).
7252+
conn = AsyncMock()
7253+
conn.set_session_model.side_effect = ACPRequestError(
7254+
code=-32601, message="Method not found"
7255+
)
7256+
effective = await _apply_acp_model_with_fallback(
7257+
conn, "sess-1", "opus[1m]", via_config_option=False
7258+
)
7259+
conn.set_session_model.assert_awaited_once()
7260+
conn.set_config_option.assert_awaited_once_with(
7261+
config_id="model", value="opus[1m]", session_id="sess-1"
7262+
)
7263+
assert effective is True
7264+
7265+
@pytest.mark.asyncio
7266+
async def test_no_fallback_when_primary_succeeds(self):
7267+
conn = AsyncMock()
7268+
effective = await _apply_acp_model_with_fallback(
7269+
conn, "sess-1", "sonnet", via_config_option=True
7270+
)
7271+
conn.set_config_option.assert_awaited_once()
7272+
conn.set_session_model.assert_not_called()
7273+
assert effective is True
7274+
7275+
@pytest.mark.asyncio
7276+
async def test_non_method_not_found_error_propagates(self):
7277+
# An invalid-model / server error is NOT a wrong-mechanism signal — it
7278+
# must propagate, not silently try the other call.
7279+
conn = AsyncMock()
7280+
conn.set_config_option.side_effect = ACPRequestError(
7281+
code=-32602, message="Invalid params"
7282+
)
7283+
with pytest.raises(ACPRequestError):
7284+
await _apply_acp_model_with_fallback(
7285+
conn, "sess-1", "bad", via_config_option=True
7286+
)
7287+
conn.set_session_model.assert_not_called()
7288+
7289+
71907290
class TestACPAgentAvailableModelsProperty:
71917291
"""``available_models`` exposes the server's model list verbatim.
71927292

0 commit comments

Comments
 (0)