fix(codex): respect non-Claude model selection and OAuth, fix demo summary (#2075)

chernistry · chernistry · commit b5079863a620 · 2026-06-25T12:33:12.000+03:00
Three defects reported in #2075 made the Codex adapter unusable with a ChatGPT OAuth login.

1. Model routing handed Claude tier names to Codex. The batch/heuristic selector emits opus/sonnet/haiku with no adapter awareness, so a high-stakes role (manager/architect/security) produced `codex exec -m opus`, which Codex rejects. The spawner now substitutes the adapter's default model for an unpinned Claude tier name when the run-level adapter is non-Claude, so the model recorded for the run matches what actually runs. The Claude path is unchanged. CodexAdapter also maps any residual tier name to its default as a last-resort safety net.

2. Spurious OPENAI_API_KEY warning. The adapter warned on every spawn when the env var was absent, even with a valid ChatGPT OAuth session. It now detects ~/.codex/auth.json (written by `codex login`) and only warns when neither an API key nor an OAuth session is present.

3. `bernstein demo --real` crash. _print_demo_summary read the /status `tasks` field as a list, but the endpoint returns {"count", "items"}. Iterating the dict yielded its string keys and raised AttributeError on `.get`. It now unwraps the items list and keeps only dict rows.

Fixes #2075
diff --git a/src/bernstein/adapters/codex.py b/src/bernstein/adapters/codex.py
@@ -12,22 +12,55 @@
 import logging
 import os
 import subprocess
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
-    from pathlib import Path
+from pathlib import Path
+from typing import Any
 
 from bernstein.adapters.base import DEFAULT_TIMEOUT_SECONDS, CLIAdapter, SpawnResult, build_worker_cmd
 from bernstein.adapters.env_isolation import build_filtered_env
 from bernstein.core.models import ApiTier, ApiTierInfo, ModelConfig, ProviderType, RateLimit
 
 logger = logging.getLogger(__name__)
 
+# Codex authenticates via either OPENAI_API_KEY or a ChatGPT OAuth session that
+# ``codex login`` stores in ~/.codex/auth.json. ~/.codex is already the canonical
+# Codex config dir (see agent_discovery and preflight), so its auth.json sibling
+# is the right signal for "an OAuth session exists".
+_CODEX_AUTH_FILE = Path.home() / ".codex" / "auth.json"
+
+# Claude cascade tier names are not valid Codex model identifiers. If an upstream
+# selector hands one to this adapter (e.g. the high-stakes-role default), fall
+# back to a Codex model so ``codex exec -m`` receives something the CLI accepts.
+# The real selection fix lives in the spawner; this is a last-resort safety net.
+_DEFAULT_CODEX_MODEL = "gpt-5.4"
+_CLAUDE_TIER_MODELS = frozenset({"opus", "sonnet", "haiku"})
+
+
+def _has_codex_auth() -> bool:
+    """Return True when Codex has a usable credential: an API key or OAuth session."""
+    return bool(os.environ.get("OPENAI_API_KEY")) or _CODEX_AUTH_FILE.exists()
+
+
+def _codex_model(model: str) -> str:
+    """Map a Claude cascade tier name to the Codex default; pass any other model through."""
+    if model in _CLAUDE_TIER_MODELS:
+        logger.warning(
+            "CodexAdapter: model %r is a Claude tier name Codex cannot run; using %r "
+            "instead. Set role_model_policy.<role>.model or default_model to a Codex "
+            "model (e.g. gpt-5.4) to choose explicitly.",
+            model,
+            _DEFAULT_CODEX_MODEL,
+        )
+        return _DEFAULT_CODEX_MODEL
+    return model
+
 
 class CodexAdapter(CLIAdapter):
     """Spawn and monitor OpenAI Codex CLI sessions."""
 
     registry_name = "codex"
+    # Default model when no operator-pinned model reaches this adapter. Read by
+    # the spawner to replace Claude tier names when the adapter is non-Claude.
+    default_model = _DEFAULT_CODEX_MODEL
     external_endpoints = (("api.openai.com", 443),)
     # OpenAI returns HTTP 429 with ``rate_limit_exceeded`` /
     # ``insufficient_quota`` error codes; the meter records both under
@@ -54,17 +87,21 @@ def spawn(
         log_path.parent.mkdir(parents=True, exist_ok=True)
         output_path = workdir / ".sdd" / "runtime" / f"{session_id}.last-message.txt"
 
-        api_key = os.environ.get("OPENAI_API_KEY")
-        if not api_key:
-            logger.warning("CodexAdapter: OPENAI_API_KEY is not set - spawn will fail")
+        if not _has_codex_auth():
+            logger.warning(
+                "CodexAdapter: no OPENAI_API_KEY and no Codex OAuth session "
+                "(~/.codex/auth.json) detected - spawn may fail until `codex login` is "
+                "run or OPENAI_API_KEY is set",
+            )
 
+        model = _codex_model(model_config.model)
         cmd = [
             "codex",
             "exec",
             "--sandbox",
             "workspace-write",
             "-m",
-            model_config.model,
+            model,
             "--json",
             "-o",
             str(output_path),
@@ -87,7 +124,7 @@ def spawn(
             pid_dir=pid_dir,
             workdir=workdir,
             log_path=log_path,
-            model=model_config.model,
+            model=model,
         )
 
         env = build_filtered_env(["OPENAI_API_KEY", "OPENAI_ORG_ID", "OPENAI_BASE_URL"])
diff --git a/src/bernstein/cli/run_confirm.py b/src/bernstein/cli/run_confirm.py
@@ -452,8 +452,16 @@ def _print_demo_summary(project_dir: Path, server_url: str, elapsed_secs: float
         resp = httpx.get(f"{server_url}/status", timeout=3.0, headers=auth_headers())
         if resp.status_code == 200:
             payload = resp.json()
-            tasks_data = payload.get("tasks", [])
-            total_cost = payload.get("total_cost_usd", 0.0)
+            # /status returns tasks as {"count": N, "items": [...]}; tolerate a
+            # bare list too. Iterating the dict form would yield its string keys
+            # and crash on ``t.get`` (the historical AttributeError), so unwrap
+            # to the items list and keep only dict rows.
+            raw_tasks = payload.get("tasks", [])
+            if isinstance(raw_tasks, dict):
+                raw_tasks = raw_tasks.get("items", [])
+            if isinstance(raw_tasks, list):
+                tasks_data = [t for t in raw_tasks if isinstance(t, dict)]
+            total_cost = float(payload.get("total_cost_usd", 0.0) or 0.0)
 
     done = sum(1 for t in tasks_data if t.get("status") == "done")
     failed = sum(1 for t in tasks_data if t.get("status") == "failed")
diff --git a/src/bernstein/core/agents/spawner_core.py b/src/bernstein/core/agents/spawner_core.py
@@ -48,7 +48,11 @@
     submit_session_exec,
     write_prompt_to_session,
 )
-from bernstein.core.agents.spawner_warm_pool import _select_batch_config, _should_use_router
+from bernstein.core.agents.spawner_warm_pool import (
+    _coerce_model_for_non_claude_adapter,
+    _select_batch_config,
+    _should_use_router,
+)
 from bernstein.core.agents.spawner_worktree import (
     cleanup_worktree as _cleanup_worktree,
 )
@@ -1794,6 +1798,19 @@ def _spawn_for_tasks_internal(self, tasks: list[Task], model_override: str | Non
             preferred_provider,
         )
 
+        # When the run-level adapter is non-Claude and no model was pinned by the
+        # operator, the heuristic/batch selector may still have produced a Claude
+        # tier name (opus/sonnet/haiku). Substitute the adapter's own default so
+        # the model recorded here matches what the adapter actually runs (e.g.
+        # Codex gets gpt-5.4, not `codex exec -m opus`). For Claude-compatible
+        # adapters the model config is returned unchanged.
+        if provider_name is None and not tasks[0].model and not role_policy.get("model"):
+            model_config = _coerce_model_for_non_claude_adapter(
+                model_config,
+                adapter_name=self._adapter.name(),
+                adapter_default_model=getattr(self._adapter, "default_model", None),
+            )
+
         logger.info(
             "Model selection for role=%s: model=%s effort=%s provider=%s source=%s",
             tasks[0].role,
diff --git a/src/bernstein/core/agents/spawner_warm_pool.py b/src/bernstein/core/agents/spawner_warm_pool.py
@@ -60,6 +60,44 @@ def _should_use_router(
     return BanditRouter.router_applicable(effective_adapter)
 
 
+_CLAUDE_TIER_MODELS = frozenset({"opus", "sonnet", "haiku"})
+
+
+def _coerce_model_for_non_claude_adapter(
+    model_config: ModelConfig,
+    *,
+    adapter_name: str,
+    adapter_default_model: str | None,
+) -> ModelConfig:
+    """Replace an unpinned Claude tier name with the adapter's default model.
+
+    The batch/heuristic selectors emit Claude cascade tier names (opus/sonnet/
+    haiku) with no adapter awareness. Handing one to a non-Claude adapter (e.g.
+    Codex) produces ``codex exec -m opus``, which the CLI rejects, and records a
+    model in the manifest that never actually ran. This normalises the selected
+    model so the recorded and executed model agree.
+
+    Returns the input unchanged for Claude-compatible adapters (byte-identical),
+    for models that are not Claude tiers, or when no adapter default is known.
+    Callers must only invoke this when the operator did not pin a model
+    (no per-task ``model`` and no ``role_model_policy`` model).
+    """
+    from bernstein.core.bandit_router import BanditRouter
+
+    if BanditRouter.router_applicable(adapter_name):
+        return model_config
+    if model_config.model not in _CLAUDE_TIER_MODELS:
+        return model_config
+    if not adapter_default_model:
+        return model_config
+    return ModelConfig(
+        model=adapter_default_model,
+        effort=model_config.effort,
+        max_tokens=model_config.max_tokens,
+        is_batch=model_config.is_batch,
+    )
+
+
 def _load_role_config(role: str, templates_dir: Path) -> ModelConfig | None:
     """Load ModelConfig from a role's config.yaml if present.
 
diff --git a/tests/unit/test_adapter_codex.py b/tests/unit/test_adapter_codex.py
@@ -299,11 +299,13 @@ def test_permission_error_raises_runtime_error(self, tmp_path: Path) -> None:
 
 
 class TestCodexWarningsAndFastExit:
-    def test_warns_when_openai_api_key_missing(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
+    def test_warns_when_no_key_and_no_oauth(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
         adapter = CodexAdapter()
         proc_mock = _make_popen_mock(pid=301)
+        missing_auth = tmp_path / "no-codex" / "auth.json"
         with (
             patch("bernstein.adapters.codex.subprocess.Popen", return_value=proc_mock),
+            patch("bernstein.adapters.codex._CODEX_AUTH_FILE", missing_auth),
             patch.dict("os.environ", {"PATH": "/usr/bin"}, clear=True),
             caplog.at_level("WARNING"),
         ):
@@ -313,7 +315,42 @@ def test_warns_when_openai_api_key_missing(self, tmp_path: Path, caplog: pytest.
                 model_config=ModelConfig(model="o3", effort="high"),
                 session_id="warn-missing-key",
             )
-        assert "OPENAI_API_KEY is not set - spawn will fail" in caplog.text
+        assert "no OPENAI_API_KEY and no Codex OAuth session" in caplog.text
+
+    def test_no_auth_warning_with_oauth_session(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
+        """A valid ChatGPT OAuth session (~/.codex/auth.json) must not warn (issue #2075)."""
+        adapter = CodexAdapter()
+        proc_mock = _make_popen_mock(pid=303)
+        auth_file = tmp_path / "auth.json"
+        auth_file.write_text("{}")
+        with (
+            patch("bernstein.adapters.codex.subprocess.Popen", return_value=proc_mock),
+            patch("bernstein.adapters.codex._CODEX_AUTH_FILE", auth_file),
+            patch.dict("os.environ", {"PATH": "/usr/bin"}, clear=True),
+            caplog.at_level("WARNING"),
+        ):
+            adapter.spawn(
+                prompt="hello",
+                workdir=tmp_path,
+                model_config=ModelConfig(model="o3", effort="high"),
+                session_id="oauth-session",
+            )
+        assert not any("OPENAI_API_KEY" in r.message or "OAuth session" in r.message for r in caplog.records)
+
+    def test_claude_tier_model_mapped_to_codex_default(self, tmp_path: Path) -> None:
+        """A Claude tier name reaching the adapter must not become `codex exec -m opus` (issue #2075)."""
+        adapter = CodexAdapter()
+        proc_mock = _make_popen_mock(pid=304)
+        with patch("bernstein.adapters.codex.subprocess.Popen", return_value=proc_mock) as popen:
+            adapter.spawn(
+                prompt="hello",
+                workdir=tmp_path,
+                model_config=ModelConfig(model="opus", effort="max"),
+                session_id="codex-opus",
+            )
+        inner = _inner_cmd(popen.call_args.args[0])
+        assert inner[inner.index("-m") + 1] == "gpt-5.4"
+        assert "opus" not in inner
 
     def test_fast_exit_rate_limit_raises(self, tmp_path: Path) -> None:
         adapter = CodexAdapter()
diff --git a/tests/unit/test_cli_demo.py b/tests/unit/test_cli_demo.py
@@ -2,9 +2,11 @@
 
 from __future__ import annotations
 
-from unittest.mock import patch
+import io
+from unittest.mock import MagicMock, patch
 
 from click.testing import CliRunner
+from rich.console import Console
 
 from bernstein.cli.main import (
     DEMO_TASKS,
@@ -13,6 +15,48 @@
     setup_demo_project,
 )
 
+
+def test_demo_summary_handles_status_tasks_dict_shape(tmp_path):
+    """GET /status returns tasks as {"count", "items"}; the summary must not crash.
+
+    Regression for issue #2075: iterating the dict form yielded its string keys
+    and raised ``AttributeError: 'str' object has no attribute 'get'``.
+    """
+    from bernstein.cli import run_confirm
+
+    payload = {
+        "total": 3,
+        "done": 1,
+        "failed": 1,
+        "total_cost_usd": 0.5,
+        "tasks": {
+            "count": 3,
+            "items": [
+                {"status": "done", "title": "a"},
+                {"status": "failed", "title": "b"},
+                {"status": "open", "title": "c"},
+            ],
+        },
+    }
+    resp = MagicMock()
+    resp.status_code = 200
+    resp.json.return_value = payload
+
+    buf = io.StringIO()
+    rec_console = Console(file=buf, force_terminal=False, width=100)
+    with (
+        patch.object(run_confirm.httpx, "get", return_value=resp),
+        patch.object(run_confirm, "console", rec_console),
+    ):
+        run_confirm._print_demo_summary(tmp_path, "http://127.0.0.1:9999", elapsed_secs=12.0)
+
+    out = buf.getvalue()
+    # 1 done out of 3 total, rendered without raising.
+    assert "1" in out
+    assert "/ 3" in out
+    assert "$0.5000" in out
+
+
 # ---------------------------------------------------------------------------
 # detect_available_adapter
 # ---------------------------------------------------------------------------
diff --git a/tests/unit/test_model_coercion.py b/tests/unit/test_model_coercion.py
@@ -0,0 +1,53 @@
+"""Adapter-aware model coercion for non-Claude adapters (issue #2075).
+
+The batch/heuristic selector emits Claude cascade tier names (opus/sonnet/haiku)
+with no adapter awareness. For a non-Claude adapter this produces an invalid
+``-m`` value (e.g. ``codex exec -m opus``) and records a model that never ran.
+``_coerce_model_for_non_claude_adapter`` normalises the selection so the recorded
+and executed model agree, while leaving the Claude path byte-identical.
+"""
+
+from __future__ import annotations
+
+from bernstein.core.models import ModelConfig
+
+from bernstein.core.agents.spawner_warm_pool import _coerce_model_for_non_claude_adapter
+
+
+def test_claude_tier_replaced_with_adapter_default_for_codex() -> None:
+    out = _coerce_model_for_non_claude_adapter(
+        ModelConfig(model="opus", effort="max"),
+        adapter_name="Codex",
+        adapter_default_model="gpt-5.4",
+    )
+    assert out.model == "gpt-5.4"
+    # Effort and other fields are preserved.
+    assert out.effort == "max"
+
+
+def test_claude_adapter_left_unchanged() -> None:
+    cfg = ModelConfig(model="opus", effort="max")
+    out = _coerce_model_for_non_claude_adapter(
+        cfg,
+        adapter_name="claude",
+        adapter_default_model="gpt-5.4",
+    )
+    assert out.model == "opus"
+
+
+def test_non_tier_model_passed_through() -> None:
+    out = _coerce_model_for_non_claude_adapter(
+        ModelConfig(model="gpt-5.4", effort="high"),
+        adapter_name="Codex",
+        adapter_default_model="gpt-5.4",
+    )
+    assert out.model == "gpt-5.4"
+
+
+def test_no_default_leaves_model_unchanged() -> None:
+    out = _coerce_model_for_non_claude_adapter(
+        ModelConfig(model="sonnet", effort="high"),
+        adapter_name="Codex",
+        adapter_default_model=None,
+    )
+    assert out.model == "sonnet"