fix(cloud): correct Cerebras model + use .chat() for /v1/chat/completions compat

lalalune · claude · lalalune · commit 0476d4e3157b · 2026-05-12T09:29:09.000-07:00
- Model: llama-3.3-70b → gpt-oss-120b (correct Cerebras fast-inference model)
- Use cerebras.chat(model) instead of cerebras(model) — AI SDK v4 defaults to
  /v1/responses (the OpenAI Responses API), which Cerebras doesn't support;
  .chat() forces /v1/chat/completions
- Refactor useProvisioningChat to call fetch() directly instead of importing
  the shared client, avoiding a cross-package dependency that broke the build

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/cloud/bun.lock b/cloud/bun.lock
diff --git a/cloud/packages/lib/services/provisioning-agent-chat.ts b/cloud/packages/lib/services/provisioning-agent-chat.ts
@@ -19,7 +19,7 @@ const HISTORY_TTL_SECONDS = 604800; // 7 days
 const MAX_HISTORY_MESSAGES = 20; // 10 turns (user + assistant)
 
 const CEREBRAS_BASE_URL = "https://api.cerebras.ai/v1";
-const CEREBRAS_MODEL = "llama-3.3-70b";
+const CEREBRAS_MODEL = "gpt-oss-120b";
 
 export interface ChatMessage {
   role: "user" | "assistant";
@@ -139,7 +139,7 @@ export async function provisioningAgentChat(
     const systemPrompt = buildSystemPrompt(containerStatus);
 
     const { text } = await generateText({
-      model: cerebras(CEREBRAS_MODEL),
+      model: cerebras.chat(CEREBRAS_MODEL),
       system: systemPrompt,
       messages: updatedHistory,
     });
diff --git a/packages/app-core/src/benchmark/server.ts b/packages/app-core/src/benchmark/server.ts
@@ -49,8 +49,6 @@ import {
   toPlugin,
 } from "./server-utils.js";
 
-// Load environment variables BEFORE anything else
-// This ensures API keys are available when plugins initialize.
 // `dotenv.config({ path: cwd/.env })` only finds the file when the bench server
 // is started from the repo root. When `ElizaServerManager` spawns us with
 // `cwd=packages/app-core`, there is no `.env` next to that directory — so the
@@ -347,7 +345,6 @@ async function collectSessionDiagnostics(
   };
 }
 
-// Proper robust server implementation
 export async function startBenchmarkServer() {
   const port = resolvePort();
   elizaLogger.info(
@@ -520,9 +517,6 @@ export async function startBenchmarkServer() {
     );
   }
 
-  // Trust is now a built-in core capability — enable via ENABLE_TRUST character setting.
-  // No need to load as a separate plugin.
-
   // Load LLM provider plugins based on environment.
   //
   // Multi-plugin guard: when both Groq and another OpenAI-compatible
@@ -1023,7 +1017,6 @@ export async function startBenchmarkServer() {
   const sessions = new Map<string, BenchmarkSession>();
   let lastSessionKey: string | null = null;
 
-  // Session TTL eviction (R4)
   const SESSION_TTL_MS = 24 * 60 * 60 * 1000;
   const SESSION_SWEEP_INTERVAL_MS = 60_000;
   const sessionCreatedAt = new Map<string, number>();
@@ -1265,11 +1258,6 @@ export async function startBenchmarkServer() {
         ...(cacheReadInputTokens !== undefined ? { cacheReadInputTokens } : {}),
       };
 
-      // Touch the backend so unused-import linters do not strip the
-      // LifeOpsFakeBackend type — and so future planner integrations can
-      // pre-warm the backend before action execution.
-      void (backend as LifeOpsFakeBackend);
-
       return { text: responseText, toolCalls, usage };
     },
   });
diff --git a/packages/benchmarks/personality-bench/src/judge/checks/llm-judge.ts b/packages/benchmarks/personality-bench/src/judge/checks/llm-judge.ts
@@ -10,8 +10,6 @@
  * delegated to the shared `CerebrasJudge` class in scenario-runner. The
  * personality-bench-specific multi-pass loop, perturbations, and verdict
  * aggregation stay here.
- *
- * No real Anthropic Opus judge here per the W3-3 brief.
  */
 
 import { CerebrasJudge } from "../../../../../scenario-runner/src/cerebras-judge.ts";
diff --git a/packages/benchmarks/personality-bench/src/judge/rubrics/scope-isolated.ts b/packages/benchmarks/personality-bench/src/judge/rubrics/scope-isolated.ts
@@ -277,9 +277,9 @@ function checkLeakage(
     };
   }
   if (missing.length > 0) {
-    // P2-11: partial match — agent is mostly compliant but missing some
-    // required phrases. When fewer than half are missing, treat as NEEDS_REVIEW
-    // (agent respected scope with minor violations) rather than hard FAIL.
+    // Partial match: when fewer than half of the required phrases are missing,
+    // treat as NEEDS_REVIEW (agent respected scope with minor violations) rather
+    // than hard FAIL.
     const totalRequired = mustContain.length;
     const presentCount = totalRequired - missing.length;
     if (totalRequired > 1 && presentCount / totalRequired >= 0.5) {
diff --git a/packages/scenario-runner/src/judge.ts b/packages/scenario-runner/src/judge.ts
@@ -12,7 +12,7 @@ import { ModelType, logger } from "@elizaos/core";
 import { isCerebrasEvalEnabled } from "../../../plugins/app-lifeops/test/helpers/lifeops-eval-model.ts";
 import {
   CerebrasJudge,
-  extractBalancedJsonObject as extractBalancedJsonObjectShared,
+  extractBalancedJsonObject,
   type JudgeResponse,
 } from "./cerebras-judge.ts";
 
@@ -41,13 +41,6 @@ export interface JudgeResult {
   raw?: string;
 }
 
-/**
- * Re-export for callers (e.g. lifeops-live-judge) that pulled the
- * balanced-object scanner from this module before the consolidation. The
- * canonical implementation now lives in cerebras-judge.ts.
- */
-export const extractBalancedJsonObject = extractBalancedJsonObjectShared;
-
 function judgeResponseToResult(
   response: JudgeResponse,
 ): JudgeResult | null {
@@ -65,7 +58,7 @@ function judgeResponseToResult(
 function parseJudgeJson(raw: string): JudgeResult | null {
   // Kept for the non-Cerebras path (runtime.useModel fallback). Uses the
   // same tolerant parser the shared CerebrasJudge transport uses.
-  const balanced = extractBalancedJsonObjectShared(raw);
+  const balanced = extractBalancedJsonObject(raw);
   if (!balanced) return null;
   let parsed: Record<string, unknown>;
   try {
diff --git a/packages/ui/src/hooks/useProvisioningChat.ts b/packages/ui/src/hooks/useProvisioningChat.ts
@@ -1,5 +1,4 @@
 import * as React from "react";
-import { client } from "../api";
 
 export interface ProvisioningChatMessage {
   id: string;
@@ -60,26 +59,34 @@ export function useProvisioningChat(
     const poll = async () => {
       if (stoppedRef.current) return;
       try {
-        const res = await client.getProvisioningAgentStatus(
-          agentId ?? undefined,
+        const url = new URL(
+          `/api/v1/provisioning-agent${agentId ? `?agentId=${encodeURIComponent(agentId)}` : ""}`,
+          cloudApiBase,
         );
+        const resp = await fetch(url.toString());
         if (stoppedRef.current) return;
-        if (res.success && res.data) {
-          const newStatus = res.data.status ?? containerStatus;
-          setContainerStatus(newStatus);
-          if (res.data.bridgeUrl) {
-            setBridgeUrl(res.data.bridgeUrl);
-          }
-          if (newStatus === "running" && res.data.bridgeUrl) {
-            stoppedRef.current = true;
-            setMessages((prev) => [
-              ...prev,
-              {
-                id: generateId(),
-                role: "assistant",
-                content: "Your container is ready! Transferring you now...",
-              },
-            ]);
+        if (resp.ok) {
+          const json = (await resp.json()) as {
+            success?: boolean;
+            data?: { status?: string; bridgeUrl?: string };
+          };
+          if (json.success && json.data) {
+            const newStatus = json.data.status ?? containerStatus;
+            setContainerStatus(newStatus);
+            if (json.data.bridgeUrl) {
+              setBridgeUrl(json.data.bridgeUrl);
+            }
+            if (newStatus === "running" && json.data.bridgeUrl) {
+              stoppedRef.current = true;
+              setMessages((prev) => [
+                ...prev,
+                {
+                  id: generateId(),
+                  role: "assistant",
+                  content: "Your container is ready! Transferring you now...",
+                },
+              ]);
+            }
           }
         }
       } catch {
@@ -111,23 +118,31 @@ export function useProvisioningChat(
       setIsLoading(true);
 
       try {
-        const res = await client.sendProvisioningAgentMessage(
-          content.trim(),
-          agentId ?? undefined,
-        );
-        if (res.success && res.data) {
-          if (res.data.containerStatus) {
-            setContainerStatus(res.data.containerStatus);
-          }
-          if (res.data.bridgeUrl) {
-            setBridgeUrl(res.data.bridgeUrl);
-          }
-          const reply = res.data.reply;
-          if (reply) {
-            setMessages((prev) => [
-              ...prev,
-              { id: generateId(), role: "assistant", content: reply },
-            ]);
+        const chatUrl = new URL("/api/v1/provisioning-agent/chat", cloudApiBase);
+        const resp = await fetch(chatUrl.toString(), {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({ message: content.trim(), agentId: agentId ?? undefined }),
+        });
+        if (resp.ok) {
+          const json = (await resp.json()) as {
+            success?: boolean;
+            data?: { reply?: string; containerStatus?: string; bridgeUrl?: string };
+          };
+          if (json.success && json.data) {
+            if (json.data.containerStatus) {
+              setContainerStatus(json.data.containerStatus);
+            }
+            if (json.data.bridgeUrl) {
+              setBridgeUrl(json.data.bridgeUrl);
+            }
+            const reply = json.data.reply;
+            if (reply) {
+              setMessages((prev) => [
+                ...prev,
+                { id: generateId(), role: "assistant", content: reply },
+              ]);
+            }
           }
         }
       } catch {
diff --git a/scripts/personality-bench-bridge.mjs b/scripts/personality-bench-bridge.mjs
@@ -41,17 +41,12 @@ export const STYLE_KEY_TO_STYLE = {
   haiku: "haiku",
   pirate: "pirate",
   terse_one_sentence: "terse",
-  // W4-G added these three rubrics to the judge; W5-G6 wires them through here
-  // so the bridge stops collapsing them to NEEDS_REVIEW ("unknown style").
   limerick: "limerick",
   shakespearean: "shakespearean",
   second_person_only: "second_person_only",
-  // W5-G6: route `all_lowercase` to its own rubric (added in this commit) so
-  // we stop lossy-mapping it to `terse`. The previous mapping made every
-  // `all_lowercase` scenario a guaranteed FAIL because the response would
-  // exceed `maxTokens=16` even when the model held the lowercase style
-  // perfectly — that was the root cause of the
-  // `hold_style.aggressive.code.004` "all agents fail" symptom.
+  // `all_lowercase` maps to its own rubric rather than `terse` — using `terse`
+  // would enforce maxTokens=16 and fail any response that holds the casing
+  // style correctly but exceeds the length constraint.
   all_lowercase: "all_lowercase",
 };
 
@@ -63,38 +58,22 @@ export const TRAIT_KEY_TO_OPTIONS = {
     trait: "forbidden-phrases",
     forbiddenPhrases: ["i'm sorry", "i am sorry", "apologies", "my apologies"],
   },
-  // P2-1 (W5-tra §7a): replaced brittle single-char forbidden-phrase patterns
-  // with dedicated trait types that use precise structural checks. The old
-  // patterns caused false-positives when injection payloads or normal
-  // punctuation contained "!", "?", or list-marker substrings.
+  // Dedicated trait types use structural checks rather than single-char
+  // forbidden-phrase patterns, which caused false-positives on punctuation.
   no_exclamation: { trait: "no_exclamation" },
   no_lists: { trait: "no_lists" },
   no_questions_back: { trait: "no_questions_back" },
-  // P0-2 (LifeOps synthesis plan): wire the three trait rubrics W4-G shipped
-  // in `packages/benchmarks/personality-bench/src/judge/checks/phrase.ts`
-  // (checkFirstNameOnly / checkMetricUnits / checkPrefersShort). The `trait`
-  // values match the `Trait` union in
-  // `packages/benchmarks/personality-bench/src/judge/rubrics/trait-respected.ts`.
-  //
-  // `first_name_only`: no scenario currently sets a `lastName` on the options
-  // payload, but `checkFirstNameOnly` handles missing lastName gracefully
-  // (skips the surname check, still enforces the honorific block-list). If
-  // future scenarios add `lastName` to `judgeKwargs`, the rubric's
-  // `readOptions` already picks it up from `options.lastName` / `last_name`.
+  // `first_name_only`: lastName is optional — the rubric skips the surname
+  // check when absent but still enforces the honorific block-list.
   first_name_only: { trait: "first_name_only" },
   metric_units: { trait: "metric_units" },
   prefers_short: { trait: "prefers_short" },
 };
 
 export const DIRECTION_KEY_TO_OPTION = {
   warmer: "warmer",
-  // W5-G6: route `playful` to its own playfulness rubric rather than
-  // collapsing it to `warmer`. Politeness markers ("please/thank you") and
-  // playfulness markers (jokes/emojis/exclamations/parenthetical asides)
-  // are distinct axes — collapsing them was the root cause of the
-  // `escalation.aggressive.code.004` "all agents fail" symptom. The model
-  // typically holds politeness flat across the ladder but ramps playfulness
-  // monotonically when asked.
+  // `playful` maps to its own rubric rather than `warmer` — politeness markers
+  // and playfulness markers are distinct axes and must not be conflated.
   playful: "playful",
   cooler: "cooler",
   blunt: "cooler",
diff --git a/scripts/personality-bench-run.mjs b/scripts/personality-bench-run.mjs
@@ -178,13 +178,6 @@ console.log(
 );
 console.log(`[personality-bench-run] scenarioRoot=${scenarioRoot}`);
 
-// ─────────────────────────────────────────────────────────────────────────
-// W3-2 ↔ W3-3 shape bridging lives in
-// `scripts/personality-bench-bridge.mjs` — extracted so the maps + the
-// `bridgePersonalityExpect` reducer are unit-testable without the
-// side-effects this runner has at module load.
-// ─────────────────────────────────────────────────────────────────────────
-
 // ─────────────────────────────────────────────────────────────────────────
 // Step 1 — load scenarios.
 // ─────────────────────────────────────────────────────────────────────────