Skip to content

Commit b507986

Browse files
committed
fix(codex): respect non-Claude model selection and OAuth, fix demo summary (#2075)
Three defects reported in #2075 made the Codex adapter unusable with a ChatGPT OAuth login. 1. Model routing handed Claude tier names to Codex. The batch/heuristic selector emits opus/sonnet/haiku with no adapter awareness, so a high-stakes role (manager/architect/security) produced `codex exec -m opus`, which Codex rejects. The spawner now substitutes the adapter's default model for an unpinned Claude tier name when the run-level adapter is non-Claude, so the model recorded for the run matches what actually runs. The Claude path is unchanged. CodexAdapter also maps any residual tier name to its default as a last-resort net. 2. Spurious OPENAI_API_KEY warning. The adapter warned on every spawn when the env var was absent, even with a valid ChatGPT OAuth session. It now detects ~/.codex/auth.json (written by `codex login`) and only warns when neither an API key nor an OAuth session is present. 3. `bernstein demo --real` crash. _print_demo_summary read the /status `tasks` field as a list, but the endpoint returns {"count", "items"}. Iterating the dict yielded its string keys and raised AttributeError on `.get`. It now unwraps the items list and keeps only dict rows. Fixes #2075
1 parent 1c7a345 commit b507986

7 files changed

Lines changed: 249 additions & 15 deletions

File tree

src/bernstein/adapters/codex.py

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,55 @@
1212
import logging
1313
import os
1414
import subprocess
15-
from typing import TYPE_CHECKING, Any
16-
17-
if TYPE_CHECKING:
18-
from pathlib import Path
15+
from pathlib import Path
16+
from typing import Any
1917

2018
from bernstein.adapters.base import DEFAULT_TIMEOUT_SECONDS, CLIAdapter, SpawnResult, build_worker_cmd
2119
from bernstein.adapters.env_isolation import build_filtered_env
2220
from bernstein.core.models import ApiTier, ApiTierInfo, ModelConfig, ProviderType, RateLimit
2321

2422
logger = logging.getLogger(__name__)
2523

24+
# Codex authenticates via either OPENAI_API_KEY or a ChatGPT OAuth session that
25+
# ``codex login`` stores in ~/.codex/auth.json. ~/.codex is already the canonical
26+
# Codex config dir (see agent_discovery and preflight), so its auth.json sibling
27+
# is the right signal for "an OAuth session exists".
28+
_CODEX_AUTH_FILE = Path.home() / ".codex" / "auth.json"
29+
30+
# Claude cascade tier names are not valid Codex model identifiers. If an upstream
31+
# selector hands one to this adapter (e.g. the high-stakes-role default), fall
32+
# back to a Codex model so ``codex exec -m`` receives something the CLI accepts.
33+
# The real selection fix lives in the spawner; this is a last-resort safety net.
34+
_DEFAULT_CODEX_MODEL = "gpt-5.4"
35+
_CLAUDE_TIER_MODELS = frozenset({"opus", "sonnet", "haiku"})
36+
37+
38+
def _has_codex_auth() -> bool:
39+
"""Return True when Codex has a usable credential: an API key or OAuth session."""
40+
return bool(os.environ.get("OPENAI_API_KEY")) or _CODEX_AUTH_FILE.exists()
41+
42+
43+
def _codex_model(model: str) -> str:
44+
"""Map a Claude cascade tier name to the Codex default; pass any other model through."""
45+
if model in _CLAUDE_TIER_MODELS:
46+
logger.warning(
47+
"CodexAdapter: model %r is a Claude tier name Codex cannot run; using %r "
48+
"instead. Set role_model_policy.<role>.model or default_model to a Codex "
49+
"model (e.g. gpt-5.4) to choose explicitly.",
50+
model,
51+
_DEFAULT_CODEX_MODEL,
52+
)
53+
return _DEFAULT_CODEX_MODEL
54+
return model
55+
2656

2757
class CodexAdapter(CLIAdapter):
2858
"""Spawn and monitor OpenAI Codex CLI sessions."""
2959

3060
registry_name = "codex"
61+
# Default model when no operator-pinned model reaches this adapter. Read by
62+
# the spawner to substitute Claude tier names for non-Claude adapters.
63+
default_model = _DEFAULT_CODEX_MODEL
3164
external_endpoints = (("api.openai.com", 443),)
3265
# OpenAI returns HTTP 429 with ``rate_limit_exceeded`` /
3366
# ``insufficient_quota`` error codes; the meter records both under
@@ -54,17 +87,21 @@ def spawn(
5487
log_path.parent.mkdir(parents=True, exist_ok=True)
5588
output_path = workdir / ".sdd" / "runtime" / f"{session_id}.last-message.txt"
5689

57-
api_key = os.environ.get("OPENAI_API_KEY")
58-
if not api_key:
59-
logger.warning("CodexAdapter: OPENAI_API_KEY is not set - spawn will fail")
90+
if not _has_codex_auth():
91+
logger.warning(
92+
"CodexAdapter: no OPENAI_API_KEY and no Codex OAuth session "
93+
"(~/.codex/auth.json) detected - spawn may fail until `codex login` is "
94+
"run or OPENAI_API_KEY is set",
95+
)
6096

97+
model = _codex_model(model_config.model)
6198
cmd = [
6299
"codex",
63100
"exec",
64101
"--sandbox",
65102
"workspace-write",
66103
"-m",
67-
model_config.model,
104+
model,
68105
"--json",
69106
"-o",
70107
str(output_path),
@@ -87,7 +124,7 @@ def spawn(
87124
pid_dir=pid_dir,
88125
workdir=workdir,
89126
log_path=log_path,
90-
model=model_config.model,
127+
model=model,
91128
)
92129

93130
env = build_filtered_env(["OPENAI_API_KEY", "OPENAI_ORG_ID", "OPENAI_BASE_URL"])

src/bernstein/cli/run_confirm.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,8 +452,16 @@ def _print_demo_summary(project_dir: Path, server_url: str, elapsed_secs: float
452452
resp = httpx.get(f"{server_url}/status", timeout=3.0, headers=auth_headers())
453453
if resp.status_code == 200:
454454
payload = resp.json()
455-
tasks_data = payload.get("tasks", [])
456-
total_cost = payload.get("total_cost_usd", 0.0)
455+
# /status returns tasks as {"count": N, "items": [...]}; tolerate a
456+
# bare list too. Iterating the dict form would yield its string keys
457+
# and crash on ``t.get`` (the historical AttributeError), so unwrap
458+
# to the items list and keep only dict rows.
459+
raw_tasks = payload.get("tasks", [])
460+
if isinstance(raw_tasks, dict):
461+
raw_tasks = raw_tasks.get("items", [])
462+
if isinstance(raw_tasks, list):
463+
tasks_data = [t for t in raw_tasks if isinstance(t, dict)]
464+
total_cost = float(payload.get("total_cost_usd", 0.0) or 0.0)
457465

458466
done = sum(1 for t in tasks_data if t.get("status") == "done")
459467
failed = sum(1 for t in tasks_data if t.get("status") == "failed")

src/bernstein/core/agents/spawner_core.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,11 @@
4848
submit_session_exec,
4949
write_prompt_to_session,
5050
)
51-
from bernstein.core.agents.spawner_warm_pool import _select_batch_config, _should_use_router
51+
from bernstein.core.agents.spawner_warm_pool import (
52+
_coerce_model_for_non_claude_adapter,
53+
_select_batch_config,
54+
_should_use_router,
55+
)
5256
from bernstein.core.agents.spawner_worktree import (
5357
cleanup_worktree as _cleanup_worktree,
5458
)
@@ -1794,6 +1798,19 @@ def _spawn_for_tasks_internal(self, tasks: list[Task], model_override: str | Non
17941798
preferred_provider,
17951799
)
17961800

1801+
# When the run-level adapter is non-Claude and no model was pinned by the
1802+
# operator, the heuristic/batch selector may still have produced a Claude
1803+
# tier name (opus/sonnet/haiku). Substitute the adapter's own default so
1804+
# the model recorded here matches what the adapter actually runs (e.g.
1805+
# Codex gets gpt-5.4, not `codex exec -m opus`). Claude-compatible
1806+
# adapters are returned unchanged.
1807+
if provider_name is None and not tasks[0].model and not role_policy.get("model"):
1808+
model_config = _coerce_model_for_non_claude_adapter(
1809+
model_config,
1810+
adapter_name=self._adapter.name(),
1811+
adapter_default_model=getattr(self._adapter, "default_model", None),
1812+
)
1813+
17971814
logger.info(
17981815
"Model selection for role=%s: model=%s effort=%s provider=%s source=%s",
17991816
tasks[0].role,

src/bernstein/core/agents/spawner_warm_pool.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,44 @@ def _should_use_router(
6060
return BanditRouter.router_applicable(effective_adapter)
6161

6262

63+
_CLAUDE_TIER_MODELS = frozenset({"opus", "sonnet", "haiku"})
64+
65+
66+
def _coerce_model_for_non_claude_adapter(
67+
model_config: ModelConfig,
68+
*,
69+
adapter_name: str,
70+
adapter_default_model: str | None,
71+
) -> ModelConfig:
72+
"""Replace an unpinned Claude tier name with the adapter's default model.
73+
74+
The batch/heuristic selectors emit Claude cascade tier names (opus/sonnet/
75+
haiku) with no adapter awareness. Handing one to a non-Claude adapter (e.g.
76+
Codex) produces ``codex exec -m opus``, which the CLI rejects, and records a
77+
model in the manifest that never actually ran. This normalises the selected
78+
model so the recorded and executed model agree.
79+
80+
Returns the input unchanged for Claude-compatible adapters (byte-identical),
81+
for models that are not Claude tiers, or when no adapter default is known.
82+
Callers must only invoke this when the operator did not pin a model
83+
(no per-task ``model`` and no ``role_model_policy`` model).
84+
"""
85+
from bernstein.core.bandit_router import BanditRouter
86+
87+
if BanditRouter.router_applicable(adapter_name):
88+
return model_config
89+
if model_config.model not in _CLAUDE_TIER_MODELS:
90+
return model_config
91+
if not adapter_default_model:
92+
return model_config
93+
return ModelConfig(
94+
model=adapter_default_model,
95+
effort=model_config.effort,
96+
max_tokens=model_config.max_tokens,
97+
is_batch=model_config.is_batch,
98+
)
99+
100+
63101
def _load_role_config(role: str, templates_dir: Path) -> ModelConfig | None:
64102
"""Load ModelConfig from a role's config.yaml if present.
65103

tests/unit/test_adapter_codex.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -299,11 +299,13 @@ def test_permission_error_raises_runtime_error(self, tmp_path: Path) -> None:
299299

300300

301301
class TestCodexWarningsAndFastExit:
302-
def test_warns_when_openai_api_key_missing(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
302+
def test_warns_when_no_key_and_no_oauth(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
303303
adapter = CodexAdapter()
304304
proc_mock = _make_popen_mock(pid=301)
305+
missing_auth = tmp_path / "no-codex" / "auth.json"
305306
with (
306307
patch("bernstein.adapters.codex.subprocess.Popen", return_value=proc_mock),
308+
patch("bernstein.adapters.codex._CODEX_AUTH_FILE", missing_auth),
307309
patch.dict("os.environ", {"PATH": "/usr/bin"}, clear=True),
308310
caplog.at_level("WARNING"),
309311
):
@@ -313,7 +315,42 @@ def test_warns_when_openai_api_key_missing(self, tmp_path: Path, caplog: pytest.
313315
model_config=ModelConfig(model="o3", effort="high"),
314316
session_id="warn-missing-key",
315317
)
316-
assert "OPENAI_API_KEY is not set - spawn will fail" in caplog.text
318+
assert "no OPENAI_API_KEY and no Codex OAuth session" in caplog.text
319+
320+
def test_no_auth_warning_with_oauth_session(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
321+
"""A valid ChatGPT OAuth session (~/.codex/auth.json) must not warn (issue #2075)."""
322+
adapter = CodexAdapter()
323+
proc_mock = _make_popen_mock(pid=303)
324+
auth_file = tmp_path / "auth.json"
325+
auth_file.write_text("{}")
326+
with (
327+
patch("bernstein.adapters.codex.subprocess.Popen", return_value=proc_mock),
328+
patch("bernstein.adapters.codex._CODEX_AUTH_FILE", auth_file),
329+
patch.dict("os.environ", {"PATH": "/usr/bin"}, clear=True),
330+
caplog.at_level("WARNING"),
331+
):
332+
adapter.spawn(
333+
prompt="hello",
334+
workdir=tmp_path,
335+
model_config=ModelConfig(model="o3", effort="high"),
336+
session_id="oauth-session",
337+
)
338+
assert not any("OPENAI_API_KEY" in r.message or "OAuth session" in r.message for r in caplog.records)
339+
340+
def test_claude_tier_model_mapped_to_codex_default(self, tmp_path: Path) -> None:
341+
"""A Claude tier name reaching the adapter must not become `codex exec -m opus` (issue #2075)."""
342+
adapter = CodexAdapter()
343+
proc_mock = _make_popen_mock(pid=304)
344+
with patch("bernstein.adapters.codex.subprocess.Popen", return_value=proc_mock) as popen:
345+
adapter.spawn(
346+
prompt="hello",
347+
workdir=tmp_path,
348+
model_config=ModelConfig(model="opus", effort="max"),
349+
session_id="codex-opus",
350+
)
351+
inner = _inner_cmd(popen.call_args.args[0])
352+
assert inner[inner.index("-m") + 1] == "gpt-5.4"
353+
assert "opus" not in inner
317354

318355
def test_fast_exit_rate_limit_raises(self, tmp_path: Path) -> None:
319356
adapter = CodexAdapter()

tests/unit/test_cli_demo.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22

33
from __future__ import annotations
44

5-
from unittest.mock import patch
5+
import io
6+
from unittest.mock import MagicMock, patch
67

78
from click.testing import CliRunner
9+
from rich.console import Console
810

911
from bernstein.cli.main import (
1012
DEMO_TASKS,
@@ -13,6 +15,48 @@
1315
setup_demo_project,
1416
)
1517

18+
19+
def test_demo_summary_handles_status_tasks_dict_shape(tmp_path):
20+
"""GET /status returns tasks as {"count", "items"}; the summary must not crash.
21+
22+
Regression for issue #2075: iterating the dict form yielded its string keys
23+
and raised ``AttributeError: 'str' object has no attribute 'get'``.
24+
"""
25+
from bernstein.cli import run_confirm
26+
27+
payload = {
28+
"total": 3,
29+
"done": 1,
30+
"failed": 1,
31+
"total_cost_usd": 0.5,
32+
"tasks": {
33+
"count": 3,
34+
"items": [
35+
{"status": "done", "title": "a"},
36+
{"status": "failed", "title": "b"},
37+
{"status": "open", "title": "c"},
38+
],
39+
},
40+
}
41+
resp = MagicMock()
42+
resp.status_code = 200
43+
resp.json.return_value = payload
44+
45+
buf = io.StringIO()
46+
rec_console = Console(file=buf, force_terminal=False, width=100)
47+
with (
48+
patch.object(run_confirm.httpx, "get", return_value=resp),
49+
patch.object(run_confirm, "console", rec_console),
50+
):
51+
run_confirm._print_demo_summary(tmp_path, "http://127.0.0.1:9999", elapsed_secs=12.0)
52+
53+
out = buf.getvalue()
54+
# 1 done out of 3 total, rendered without raising.
55+
assert "1" in out
56+
assert "/ 3" in out
57+
assert "$0.5000" in out
58+
59+
1660
# ---------------------------------------------------------------------------
1761
# detect_available_adapter
1862
# ---------------------------------------------------------------------------

tests/unit/test_model_coercion.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""Adapter-aware model coercion for non-Claude adapters (issue #2075).
2+
3+
The batch/heuristic selector emits Claude cascade tier names (opus/sonnet/haiku)
4+
with no adapter awareness. For a non-Claude adapter that produces an invalid
5+
``-m`` value (e.g. ``codex exec -m opus``) and records a model that never ran.
6+
``_coerce_model_for_non_claude_adapter`` normalises the selection so the recorded
7+
and executed model agree, while leaving the Claude path byte-identical.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
from bernstein.core.models import ModelConfig
13+
14+
from bernstein.core.agents.spawner_warm_pool import _coerce_model_for_non_claude_adapter
15+
16+
17+
def test_claude_tier_replaced_with_adapter_default_for_codex() -> None:
18+
out = _coerce_model_for_non_claude_adapter(
19+
ModelConfig(model="opus", effort="max"),
20+
adapter_name="Codex",
21+
adapter_default_model="gpt-5.4",
22+
)
23+
assert out.model == "gpt-5.4"
24+
# Effort and other fields are preserved.
25+
assert out.effort == "max"
26+
27+
28+
def test_claude_adapter_left_unchanged() -> None:
29+
cfg = ModelConfig(model="opus", effort="max")
30+
out = _coerce_model_for_non_claude_adapter(
31+
cfg,
32+
adapter_name="claude",
33+
adapter_default_model="gpt-5.4",
34+
)
35+
assert out.model == "opus"
36+
37+
38+
def test_non_tier_model_passed_through() -> None:
39+
out = _coerce_model_for_non_claude_adapter(
40+
ModelConfig(model="gpt-5.4", effort="high"),
41+
adapter_name="Codex",
42+
adapter_default_model="gpt-5.4",
43+
)
44+
assert out.model == "gpt-5.4"
45+
46+
47+
def test_no_default_leaves_model_unchanged() -> None:
48+
out = _coerce_model_for_non_claude_adapter(
49+
ModelConfig(model="sonnet", effort="high"),
50+
adapter_name="Codex",
51+
adapter_default_model=None,
52+
)
53+
assert out.model == "sonnet"

0 commit comments

Comments
 (0)