grafana · vortegatorres · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
@@ -0,0 +1,72 @@
+// Evaluate a REAL agent — Claude Code — in a SINGLE `k6 run`.
+//
+// CliAgent runs the `claude` CLI headless as part of the test, captures its
+// trajectory (tool calls + final answer), and scores it. No simulation, no mocks,
+// and no separate capture step.
+//
+//   ANTHROPIC_API_KEY_JUDGE=sk-...  k6 run eval.test.js
+//
+// (Run it from a directory that contains some .go files — the default task counts
+// them. `claude` must be installed and logged in.)
+import { check } from 'k6';
+import { CliAgent, judge } from 'k6/experimental/ageval';
+
+const TASK =
+  __ENV.TASK || // prompt can be overridden with the TASK environment variable
+  'Use the Glob tool to find all *.go files in the current directory, then tell me exactly how many there are.';
+
+const claude = new CliAgent({ // agent under test: the `claude` CLI, run headless by the test itself
+  name: 'claude-code',
+  command: 'claude', // must be installed and logged in (see the header comment)
+  args: [
+    '-p',
+    '{{input}}', // presumably substituted with the `input` given to run() - confirm against the ageval docs
+    '--allowedTools', // the three entries that follow are the tools the run is allowed to use
+    'Glob',
+    'Read',
+    'LS',
+    '--output-format',
+    'stream-json',
+    '--verbose', // NOTE(review): believed to be required by stream-json in -p mode - confirm against the CLI reference
+  ],
+  format: 'claude-code', // presumably selects the adapter that parses the CLI output into a trajectory - confirm
+  timeoutSeconds: 180,
+});
+
+export const options = {
+  vus: 1, // one virtual user ...
+  iterations: 1, // ... running the default function once, so the agent is invoked a single time
+  thresholds: { // `checks` is k6's built-in metric; the agent_* names are presumably emitted by ageval - confirm
+    checks: ['rate>0.9'],
+    agent_tool_correctness: ['rate>0.9'],
+    agent_quality_score: ['avg>0.7'],
+    agent_judge_pass: ['rate>0.9'],
+  },
+};
+
+export default function () { // one eval pass: run the agent, apply cheap checks, grade tools, then judge the answer
+  const res = claude.run({
+    input: TASK,
+    expectedTools: [{ name: 'Glob' }], // graded by expectSequence() below
+    tags: { case: 'count-go-files' },
+  });
+
+  check(res, { // deterministic checks; they feed the `checks` threshold in options
+    'used a file-search tool (Glob)': (r) => r.calledTool('Glob'),
+    'produced a final answer': (r) => r.output.length > 0,
+    'answer contains a number': (r) => /\d/.test(r.output), // any digit passes; the exact count is not verified here
+  });
+
+  res.expectSequence();
+
+  judge(res, { // LLM-based grading of the run against the rubric below
+    name: 'counts_go_files',
+    provider: 'anthropic',
+    model: 'claude-haiku-4-5',
+    apiKey: __ENV.ANTHROPIC_API_KEY_JUDGE || __ENV.ANTHROPIC_API_KEY, // judge-specific key wins; generic key is the fallback
+    rubric:
+      'The agent was asked to count the *.go files in the current directory. A good answer used a ' +
+      'file-search tool and reports a specific, concrete count (a number).',
+    threshold: 0.7,
+  });
+}
@@ -0,0 +1,70 @@
+// Evaluate a REAL agent — OpenAI Codex CLI — in a SINGLE `k6 run`.
+//
+// CliAgent runs the `codex` CLI non-interactively (`codex exec --json`) as
+// part of the test, captures its trajectory (shell/tool calls + final answer) via
+// the built-in `codex` adapter, and scores it. No simulation, no mocks, and no
+// separate capture step.
+//
+//   ANTHROPIC_API_KEY_JUDGE=sk-...  k6 run eval.test.js
+//
+// (Run it from a directory that contains some .go files — the default task counts
+// them. `codex` must be installed and logged in: `codex login`.)
+import { check } from 'k6';
+import { CliAgent, judge } from 'k6/experimental/ageval';
+
+const TASK =
+  __ENV.TASK || // prompt can be overridden with the TASK environment variable
+  'How many .go files are in the current directory? Use a shell command to list them, then state the number.';
+
+const codex = new CliAgent({ // agent under test: the `codex` CLI, run non-interactively by the test itself
+  name: 'codex',
+  command: 'codex', // must be installed and logged in (`codex login`, see the header comment)
+  args: [
+    'exec', // non-interactive mode, per the header comment
+    '--json',
+    '--skip-git-repo-check',
+    '--sandbox',
+    'read-only',
+    '{{input}}', // presumably substituted with the `input` given to run() - confirm against the ageval docs
+  ],
+  format: 'codex', // the built-in `codex` adapter mentioned in the header comment
+  timeoutSeconds: 180,
+});
+
+export const options = {
+  vus: 1, // one virtual user ...
+  iterations: 1, // ... running the default function once, so the agent is invoked a single time
+  thresholds: { // `checks` is k6's built-in metric; the agent_* names are presumably emitted by ageval - confirm
+    checks: ['rate>0.9'],
+    agent_tool_correctness: ['rate>0.9'],
+    agent_quality_score: ['avg>0.7'],
+    agent_judge_pass: ['rate>0.9'],
+  },
+};
+
+export default function () { // one eval pass: run the agent, apply cheap checks, grade tools, then judge the answer
+  const res = codex.run({
+    input: TASK,
+    expectedTools: [{ name: 'shell' }], // graded by expectSequence() below
+    tags: { case: 'count-go-files' },
+  });
+
+  check(res, { // deterministic checks; they feed the `checks` threshold in options
+    'ran a shell command': (r) => r.calledTool('shell'),
+    'produced a final answer': (r) => r.output.length > 0,
+    'answer contains a number': (r) => /\d/.test(r.output), // any digit passes; the exact count is not verified here
+  });
+
+  res.expectSequence();
+
+  judge(res, { // LLM-based grading of the run against the rubric below
+    name: 'counts_go_files',
+    provider: 'anthropic',
+    model: 'claude-haiku-4-5',
+    apiKey: __ENV.ANTHROPIC_API_KEY_JUDGE || __ENV.ANTHROPIC_API_KEY, // judge-specific key wins; generic key is the fallback
+    rubric:
+      'The agent was asked to count the *.go files in the current directory. A good answer ran a ' +
+      'shell command to list them and reports a specific, concrete count (a number).',
+    threshold: 0.7,
+  });
+}
@@ -0,0 +1,82 @@
+# Evaluating real agent frameworks with `k6/experimental/ageval`
+
+These examples validate that ageval is **framework-agnostic** and that its scoring
+is **valid** (it discriminates good runs from bad, and reacts to real model
+changes). Four very different agent frameworks are evaluated with the *same*
+assertions and *zero* changes to the k6 module:
+
+| Example | Framework | Mode |
+|---|---|---|
+| [`pydantic-ai/`](./pydantic-ai/) | [Pydantic-AI](https://ai.pydantic.dev) | **live** — one `k6 run` executes the agent |
+| [`langgraph/`](./langgraph/) | [LangGraph](https://langchain-ai.github.io/langgraph/) ReAct agent | recorded |
+| [`openai-agents/`](./openai-agents/) | [OpenAI Agents SDK](https://openai.github.io/openai-agents-python/) (on Claude via LiteLLM) | recorded |
+| [`crewai/`](./crewai/) | [CrewAI](https://docs.crewai.com) — **multi-agent** crew | recorded |
+
+## The one pattern: canonical shape
+
+ageval's only hard contract is the canonical trajectory:
+
+```json
+{ "input": "...", "output": "...",
+  "toolCalls": [{ "name": "...", "input": {}, "output": "..." }],
+  "usage": { "inputTokens": 0, "outputTokens": 0 } }
+```
+
+Every framework exposes its run differently, so each example has a tiny
+**capture shim** (~15-20 lines, `agent.py` / `capture.py`) that maps the
+framework's result onto this shape:
+
+- **Pydantic-AI** — `result.output`, `result.all_messages()` (`ToolCallPart`/`ToolReturnPart`), `result.usage`.
+- **LangGraph** — message list: `AIMessage.tool_calls` + `ToolMessage` + `usage_metadata`.
+- **OpenAI Agents SDK** — `result.final_output`, `result.new_items` (`ToolCallItem`/`ToolCallOutputItem`), `result.context_wrapper.usage`.
+- **CrewAI** — a `TRACE` list inside the tools (captures calls across both agents), `CrewOutput.raw` + `token_usage`.
+
+That shim is the *entire* integration cost. No new k6/Go code, no per-framework
+adapter baked into the module. (For agents instrumented with OpenTelemetry GenAI /
+OpenInference, a future built-in `otel` adapter would remove even the shim.)
+
+**Live vs recorded.** The live example (`pydantic-ai`) runs the agent inside
+`k6 run` via `CliAgent`. The recorded examples replay a committed
+`trajectory.json` via `new AgentTestCase(...)`, so they are deterministic and CI-safe — only
+the LLM judge needs a key. Each recorded dir's `capture.py` regenerates its fixture.
+
+## What each track validates
+
+- **Usefulness** — four real frameworks (including a multi-agent crew) plug in with only a shim.
+- **Validity, tool dimension** — `expectSequence(...)` is a deterministic, non-LLM oracle: the agent must call the right tools, with the right args, in the right order (in CrewAI, *across* the Researcher → Writer handoff).
+- **Validity, answer dimension** — the LLM `judge` scores the final answer against a rubric.
+- **Discrimination** — `pydantic-ai/eval_golden.test.js` feeds a real good run and a tampered bad run (wrong invoice, leaked id) through the same assertions and checks that good passes and bad fails. (Verified: judge 1.0 vs 0.0.)
+
+## Differential check: Sonnet → Haiku
+
+The examples default to the cheaper `claude-haiku-4-5` as the **agent** model (the
+judge stays on a stronger model for fair grading). Re-running the identical evals
+on Haiku vs Claude Sonnet is itself a validation that the eval reacts to real model
+changes:
+
+- **Pydantic-AI (live):** still 4/4; cost dropped from **$0.0067 → $0.0023** (measured by `agent_cost_usd`).
+- **LangGraph, OpenAI SDK:** unchanged (4/4).
+- **CrewAI:** Haiku phrased the answer as *"widget manufacturer"* (singular) instead of *"widgets"*, which tripped an over-strict `/widgets/` string check **while the LLM judge still scored it 1.0**. Lesson the differential surfaced: pair a deterministic tool oracle with a semantic judge, and avoid brittle exact-string checks. (The check was loosened to `/widget/i`.)
+
+Set `AGENT_MODEL=claude-sonnet-4-5` to regenerate fixtures on Sonnet and compare.
+
+## Running everything
+
+```bash
+# build k6 once (from repo root)
+go build -o k6 .
+
+# recorded (deterministic; only the judge key needed)
+ANTHROPIC_API_KEY_JUDGE=sk-ant-...  ./k6 run examples/experimental/ageval/frameworks/langgraph/eval.test.js
+ANTHROPIC_API_KEY_JUDGE=sk-ant-...  ./k6 run examples/experimental/ageval/frameworks/openai-agents/eval.test.js
+ANTHROPIC_API_KEY_JUDGE=sk-ant-...  ./k6 run examples/experimental/ageval/frameworks/crewai/eval.test.js
+
+# discrimination
+ANTHROPIC_API_KEY_JUDGE=sk-ant-...  ./k6 run examples/experimental/ageval/frameworks/pydantic-ai/eval_golden.test.js
+
+# live (also runs the agent; needs the agent key + a venv — see pydantic-ai/README.md)
+cd examples/experimental/ageval/frameworks/pydantic-ai
+ANTHROPIC_API_KEY_AGENT=sk-ant-... ANTHROPIC_API_KEY_JUDGE=sk-ant-... PYTHON=./venv/bin/python  ../../../../../k6 run eval.test.js
+```
+
+`ANTHROPIC_API_KEY_AGENT` is used by the agent; `ANTHROPIC_API_KEY_JUDGE` by the judge.
@@ -0,0 +1,25 @@
+# CrewAI × ageval (recorded, multi-agent)
+
+Evaluates a [CrewAI](https://docs.crewai.com) **multi-agent crew** with
+`k6/experimental/ageval`: a Researcher (tool: `lookup_company`) hands off to a
+Writer (tool: `publish_summary`). The captured `trajectory.json` spans **both
+agents' tool calls**, so the eval grades the cross-agent pipeline — `expectSequence`
+requires research *before* publishing. Replayed by `eval.test.js` (deterministic /
+CI-safe; only the judge key is needed). See [`../README.md`](../README.md) for the
+overall pattern.
+
+## Run (replay the committed fixture)
+
+```bash
+ANTHROPIC_API_KEY_JUDGE=sk-ant-...  k6 run eval.test.js
+```
+
+## Regenerate the fixture (live)
+
+```bash
+python3 -m venv venv && venv/bin/pip install "crewai[anthropic]"
+# CrewAI is chatty, so capture.py writes the file directly (not stdout):
+ANTHROPIC_API_KEY_AGENT=sk-ant-...  venv/bin/python capture.py        # writes ./trajectory.json
+```
+
+Defaults to `claude-haiku-4-5`; set `AGENT_MODEL=claude-sonnet-4-5` to compare models.
@@ -0,0 +1,106 @@
+"""CrewAI MULTI-AGENT crew -> canonical ageval trajectory (written to a JSON file).
+
+A two-agent crew on Claude: a Researcher (tool: lookup_company) hands off to a
+Writer (tool: publish_summary). This exercises ageval's multi-agent / sub-agent
+story: the trajectory spans BOTH agents' tool calls, and the eval grades the
+combined sequence. Tool calls are captured by a small TRACE list inside the tools
+(version-independent). CrewAI is chatty and some of its loggers bypass a stdout
+redirect, so this writes the JSON straight to a file (default ./trajectory.json)
+rather than stdout. Regenerate with:
+
+    ANTHROPIC_API_KEY_AGENT=sk-...  python capture.py [company] [out.json]
+"""
+
+import contextlib
+import json
+import os
+import sys
+
+os.environ.setdefault("CREWAI_DISABLE_TELEMETRY", "true")
+
+from crewai import LLM, Agent, Crew, Process, Task  # noqa: E402
+from crewai.tools import tool  # noqa: E402
+
+MODEL = os.environ.get("AGENT_MODEL", "claude-haiku-4-5")  # agent model; override with AGENT_MODEL
+API_KEY = os.environ.get("ANTHROPIC_API_KEY_AGENT") or os.environ.get("ANTHROPIC_API_KEY")  # agent-specific key wins; None if neither is set
+
+FACTS = {"Acme Corp": {"founded": 1998, "industry": "widgets", "hq": "Springfield"}}  # canned data served by lookup_company
+TRACE = []  # ordered tool calls across both agents; appended to by the tools, emitted as "toolCalls"
+
+
+@tool("lookup_company")
+def lookup_company(name: str) -> str:
+    """Look up basic facts (founding year, industry, HQ) about a company by name."""
+    out = json.dumps(FACTS.get(name, {"error": "unknown company"}))  # exact-match lookup; unknown names yield an error payload, not an exception
+    TRACE.append({"name": "lookup_company", "input": {"name": name}, "output": out})  # record the call for the trajectory
+    return out
+
+
+@tool("publish_summary")
+def publish_summary(summary: str) -> str:
+    """Publish the final one-sentence company summary."""
+    TRACE.append({"name": "publish_summary", "input": {"summary": summary}, "output": "published"})  # nothing is actually published; the call is only recorded
+    return "published"
+
+
+def main():  # run the two-agent crew once and write the canonical trajectory JSON to a file
+    task_text = sys.argv[1] if len(sys.argv) > 1 else "Acme Corp"  # argv[1]: company name to research
+    if not API_KEY:
+        print("ANTHROPIC_API_KEY_AGENT (or ANTHROPIC_API_KEY) must be set", file=sys.stderr)
+        sys.exit(1)
+
+    llm = LLM(model=f"anthropic/{MODEL}", api_key=API_KEY)  # one LLM instance shared by both agents
+
+    researcher = Agent(
+        role="Company Researcher",
+        goal="Find accurate facts about a company using the lookup_company tool.",
+        backstory="You look up authoritative facts and never invent them.",
+        tools=[lookup_company],
+        llm=llm,
+        verbose=False,
+    )
+    writer = Agent(
+        role="Summary Writer",
+        goal="Write a concise one-sentence company summary and publish it.",
+        backstory="You turn researched facts into a single clear sentence.",
+        tools=[publish_summary],
+        llm=llm,
+        verbose=False,
+    )
+
+    research = Task(
+        description=f"Look up the company '{task_text}' and report its founding year and industry.",
+        expected_output="The company's founding year and industry.",
+        agent=researcher,
+    )
+    write = Task(
+        description="Write a one-sentence summary from the research, then publish it with publish_summary.",
+        expected_output="A one-sentence published summary.",
+        agent=writer,
+        context=[research],  # presumably hands the research task's result to the writer - confirm against the CrewAI Task docs
+    )
+
+    crew = Crew(agents=[researcher, writer], tasks=[research, write], process=Process.sequential, verbose=False)
+
+    # The JSON itself is written to a file below; this only diverts crew/LLM chatter that goes through stdout to stderr.
+    with contextlib.redirect_stdout(sys.stderr):
+        result = crew.kickoff()
+
+    usage = getattr(result, "token_usage", None)  # tolerate results without usage data
+    in_tok = int(getattr(usage, "prompt_tokens", 0) or 0)  # missing or None counts become 0
+    out_tok = int(getattr(usage, "completion_tokens", 0) or 0)
+    trajectory = {
+        "input": task_text,
+        "output": str(getattr(result, "raw", result)),  # prefer result.raw; otherwise stringify the result itself
+        "model": MODEL,
+        "toolCalls": TRACE,  # calls recorded by the tools, in invocation order
+        "usage": {"inputTokens": in_tok, "outputTokens": out_tok},
+    }
+    out_path = sys.argv[2] if len(sys.argv) > 2 else "trajectory.json"  # argv[2]: output path
+    with open(out_path, "w") as f:
+        json.dump(trajectory, f, indent=4)
+    print(f"wrote {out_path}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()