Update default persona model to deepseek-v4-flash with hidden medium-effort reasoning

Swiftyos · Swiftyos · commit ca9c8b22a30c · 2026-05-15T14:20:36.000+02:00
diff --git a/docs/generated/workspace-inventory.md b/docs/generated/workspace-inventory.md
@@ -1,6 +1,6 @@
 # Workspace Inventory
 
-Generated: 2026-05-15T07:43:59.795Z
+Generated: 2026-05-15T12:20:13.570Z
 
 ```text
   AGENTS.md
@@ -446,6 +446,7 @@ src/shared/utils/
   src/shared/utils/json.ts
   src/shared/utils/logging.ts
   src/shared/utils/safe-static-path.ts
+  src/shared/utils/scoring.ts
   src/shared/utils/secret-cipher.ts
   src/shared/utils/template.ts
 tests/
diff --git a/docs/product-specs/current-state.md b/docs/product-specs/current-state.md
@@ -1,6 +1,6 @@
 # Current State
 
-Last validated against `platform.md`: 2026-04-17
+Last validated against `platform.md`: 2026-05-15
 
 ## Implemented scenarios
 
@@ -10,6 +10,7 @@ Last validated against `platform.md`: 2026-04-17
 - [x] List command shows available scenarios
 - [x] Dry-run mode records intent without contacting external systems
 - [x] Judge requests preserve cache-friendly prompt prefixes
+- [x] Persona simulation uses a configurable default model with hidden reasoning
 - [x] Parallel mode overlaps scenario execution while preserving ordering
 - [ ] Multi-session memory scenarios preserve pinned identity and session controls
 - [ ] AutoGPT preset forges auth tokens internally
@@ -52,6 +53,9 @@ Last validated against `platform.md`: 2026-04-17
 - Judge-model requests now preserve a stable rubric-first prefix, add a stable
   prompt cache key, and enable supported provider caching on the OpenRouter
   Responses path.
+- Persona simulator requests default to `deepseek/deepseek-v4-flash` unless a
+  persona-level `model` or `AGENTPROBE_PERSONA_MODEL` override is present, and
+  they use medium reasoning effort while excluding reasoning from responses.
 - The OpenClaw CLI surface is implemented behind websocket endpoint presets and
   can create sessions, send chat turns, and read session history.
 - `bun run fast-feedback` now refreshes generated docs and quality score before
diff --git a/docs/product-specs/e2e-checklist.md b/docs/product-specs/e2e-checklist.md
@@ -10,6 +10,7 @@ Derived from `platform.md`. Every scenario should have a coverage owner.
 | List command shows available scenarios | `tests/e2e/cli.e2e.test.ts` | ⏳ planned |
 | Dry-run mode records intent without contacting external systems | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
 | Judge requests preserve cache-friendly prompt prefixes | `tests/unit/judge.test.ts` | ✅ covered |
+| Persona simulation uses a configurable default model with hidden reasoning | `tests/unit/simulator.test.ts` | ✅ covered |
 | Parallel mode overlaps scenario execution while preserving ordering | `tests/e2e/cli.e2e.test.ts` | ✅ covered |
 | Multi-session memory scenarios preserve pinned identity and session controls | `tests/unit/runner.test.ts` + `tests/unit/scenario-parsing.test.ts` | ⏳ planned |
 | AutoGPT preset forges auth tokens internally | `tests/unit/autogpt-auth.test.ts` + `tests/unit/adapters.test.ts` | ⏳ expanding |
diff --git a/docs/product-specs/platform.md b/docs/product-specs/platform.md
@@ -58,6 +58,15 @@ judge-model calls
 pushes transcript-specific content to the tail, and enables supported provider
 prompt caching without changing the scoring contract
 
+### Persona simulation uses a configurable default model with hidden reasoning
+
+**Given** a persona without an explicit `model` field and no
+`AGENTPROBE_PERSONA_MODEL` override
+**When** AgentProbe simulates the next persona turn
+**Then** the CLI sends the simulator request with
+`deepseek/deepseek-v4-flash` as the default model, medium reasoning effort, and
+reasoning excluded from the response
+
 ### Parallel mode overlaps scenario execution while preserving ordering
 
 **Given** valid endpoint, scenario, persona, and rubric YAML files with more
diff --git a/src/domains/evaluation/simulator.ts b/src/domains/evaluation/simulator.ts
@@ -8,7 +8,7 @@ import type {
 import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts";
 import type { LlmResponsesClient } from "./ports.ts";
 
-const DEFAULT_PERSONA_MODEL = "moonshotai/kimi-k2.6";
+const DEFAULT_PERSONA_MODEL = "deepseek/deepseek-v4-flash";
 
 type ConversationHistory =
   | string
@@ -582,6 +582,10 @@ export async function generatePersonaStep(
     model: resolvePersonaModel(persona),
     instructions: simulatorInstructions(persona, requireResponse),
     input: baseInput,
+    reasoning: {
+      effort: "medium",
+      exclude: true,
+    },
     text: {
       format: {
         type: "json_schema",
diff --git a/src/providers/sdk/openai-responses.ts b/src/providers/sdk/openai-responses.ts
@@ -251,6 +251,14 @@ export class OpenAiResponsesClient {
       },
       temperature: request.temperature,
       max_output_tokens: request.maxOutputTokens,
+      reasoning: request.reasoning
+        ? {
+            effort: request.reasoning.effort,
+            max_tokens: request.reasoning.maxTokens,
+            exclude: request.reasoning.exclude,
+            enabled: request.reasoning.enabled,
+          }
+        : undefined,
       prompt_cache_key: request.promptCacheKey,
       cache_control: request.cacheControl
         ? {
diff --git a/src/shared/types/contracts.ts b/src/shared/types/contracts.ts
@@ -633,6 +633,12 @@ export type OpenAiResponsesRequest = {
   model: string;
   instructions: string;
   input: string | OpenAiResponsesInputMessage[];
+  reasoning?: {
+    effort?: "xhigh" | "high" | "medium" | "low" | "minimal" | "none";
+    maxTokens?: number;
+    exclude?: boolean;
+    enabled?: boolean;
+  };
   text: {
     format: {
       type: "json_schema";
diff --git a/tests/unit/simulator.test.ts b/tests/unit/simulator.test.ts
@@ -3,6 +3,7 @@ import { beforeEach, describe, expect, test } from "bun:test";
 import {
   generateNextStep,
   generatePersonaStep,
+  resolvePersonaModel,
 } from "../../src/domains/evaluation/simulator.ts";
 import { AgentProbeRuntimeError } from "../../src/shared/utils/errors.ts";
 import {
@@ -23,6 +24,14 @@ describe("simulator", () => {
     }
   });
 
+  test("uses DeepSeek Flash as the default persona model", () => {
+    delete process.env.AGENTPROBE_PERSONA_MODEL;
+
+    expect(resolvePersonaModel(buildPersona())).toBe(
+      "deepseek/deepseek-v4-flash",
+    );
+  });
+
   test("uses env default model and guidance for required turns", async () => {
     process.env.AGENTPROBE_PERSONA_MODEL = "env-persona-model";
     const client = new FakeResponsesClient([
@@ -80,6 +89,10 @@ describe("simulator", () => {
     });
     expect(client.calls).toHaveLength(1);
     expect(client.calls[0]?.model).toBe("env-persona-model");
+    expect(client.calls[0]?.reasoning).toEqual({
+      effort: "medium",
+      exclude: true,
+    });
     expect(client.calls[0]?.instructions).toContain("Frustrated Customer");
     expect(client.calls[0]?.input).toContain("Ask about refund timing.");
     expect(client.calls[0]?.input).toContain("Conversation so far:");