Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions examples/experimental/ageval/claude-code/eval.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Evaluate a REAL agent — Claude Code — in a SINGLE `k6 run`.
//
// CliAgent runs the `claude` CLI headless as part of the test, captures its
// trajectory (tool calls + final answer), and scores it. No simulation, no mocks,
// and no separate capture step.
//
// ANTHROPIC_API_KEY_JUDGE=sk-... k6 run eval.test.js
//
// (Run it from a directory that contains some .go files — the default task counts
// them. `claude` must be installed and logged in.)
import { check } from 'k6';
import { CliAgent, judge } from 'k6/experimental/ageval';

// Prompt handed to the agent. Set the TASK environment variable to override it;
// an unset or empty TASK falls back to the Go-file counting prompt below.
const DEFAULT_TASK =
  'Use the Glob tool to find all *.go files in the current directory, then tell me exactly how many there are.';

const TASK = __ENV.TASK || DEFAULT_TASK;

// Agent under test: the `claude` CLI, run headless by CliAgent.
// NOTE(review): '{{input}}' is presumably replaced with the `input` passed to
// run(), and `format` presumably selects the trajectory parser — confirm
// against the ageval module.
const claude = new CliAgent({
  name: 'claude-code',
  format: 'claude-code',
  command: 'claude',
  timeoutSeconds: 180,
  args: [
    // The prompt.
    ...['-p', '{{input}}'],
    // Tools the agent is allowed to use.
    ...['--allowedTools', 'Glob', 'Read', 'LS'],
    // Output flags.
    ...['--output-format', 'stream-json', '--verbose'],
  ],
});

// k6 run options: one virtual user, one iteration, and thresholds on the
// built-in `checks` metric plus three agent_* metrics.
export const options = {
  iterations: 1,
  vus: 1,
  thresholds: {
    checks: ['rate>0.9'],
    agent_tool_correctness: ['rate>0.9'],
    agent_quality_score: ['avg>0.7'],
    agent_judge_pass: ['rate>0.9'],
  },
};

/**
 * One eval iteration: run the agent on TASK, apply the k6 checks, grade the
 * tool sequence, then ask the LLM judge to score the final answer.
 */
export default function () {
  const run = claude.run({
    input: TASK,
    expectedTools: [{ name: 'Glob' }], // graded by expectSequence() below
    tags: { case: 'count-go-files' },
  });

  const containsDigit = /\d/;
  const assertions = {
    'used a file-search tool (Glob)': (result) => result.calledTool('Glob'),
    'produced a final answer': (result) => result.output.length > 0,
    'answer contains a number': (result) => containsDigit.test(result.output),
  };
  check(run, assertions);

  run.expectSequence();

  // Prefer the judge-specific key; otherwise fall back to the generic one.
  const judgeKey = __ENV.ANTHROPIC_API_KEY_JUDGE || __ENV.ANTHROPIC_API_KEY;
  const rubric =
    'The agent was asked to count the *.go files in the current directory. A good answer used a ' +
    'file-search tool and reports a specific, concrete count (a number).';

  judge(run, {
    name: 'counts_go_files',
    provider: 'anthropic',
    model: 'claude-haiku-4-5',
    apiKey: judgeKey,
    rubric,
    threshold: 0.7,
  });
}
70 changes: 70 additions & 0 deletions examples/experimental/ageval/codex/eval.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Evaluate a REAL agent — OpenAI Codex CLI — in a SINGLE `k6 run`.
//
// CliAgent runs the `codex` CLI non-interactively (`codex exec --json`) as
// part of the test, captures its trajectory (shell/tool calls + final answer) via
// the built-in `codex` adapter, and scores it. No simulation, no mocks, and no
// separate capture step.
//
// ANTHROPIC_API_KEY_JUDGE=sk-... k6 run eval.test.js
//
// (Run it from a directory that contains some .go files — the default task counts
// them. `codex` must be installed and logged in: `codex login`.)
import { check } from 'k6';
import { CliAgent, judge } from 'k6/experimental/ageval';

// Prompt handed to the agent. Set the TASK environment variable to override it;
// an unset or empty TASK falls back to the Go-file counting prompt below.
const DEFAULT_TASK =
  'How many .go files are in the current directory? Use a shell command to list them, then state the number.';

const TASK = __ENV.TASK || DEFAULT_TASK;

// Agent under test: the `codex` CLI, run non-interactively by CliAgent.
// NOTE(review): '{{input}}' is presumably replaced with the `input` passed to
// run() — confirm against the ageval module.
const codex = new CliAgent({
  name: 'codex',
  format: 'codex',
  command: 'codex',
  timeoutSeconds: 180,
  args: [
    // Subcommand and output flag.
    ...['exec', '--json'],
    '--skip-git-repo-check',
    // Sandbox mode.
    ...['--sandbox', 'read-only'],
    // The prompt comes last.
    '{{input}}',
  ],
});

// k6 run options: one virtual user, one iteration, and thresholds on the
// built-in `checks` metric plus three agent_* metrics.
export const options = {
  iterations: 1,
  vus: 1,
  thresholds: {
    checks: ['rate>0.9'],
    agent_tool_correctness: ['rate>0.9'],
    agent_quality_score: ['avg>0.7'],
    agent_judge_pass: ['rate>0.9'],
  },
};

/**
 * One eval iteration: run the agent on TASK, apply the k6 checks, grade the
 * tool sequence, then ask the LLM judge to score the final answer.
 */
export default function () {
  const run = codex.run({
    input: TASK,
    expectedTools: [{ name: 'shell' }], // graded by expectSequence() below
    tags: { case: 'count-go-files' },
  });

  const containsDigit = /\d/;
  const assertions = {
    'ran a shell command': (result) => result.calledTool('shell'),
    'produced a final answer': (result) => result.output.length > 0,
    'answer contains a number': (result) => containsDigit.test(result.output),
  };
  check(run, assertions);

  run.expectSequence();

  // Prefer the judge-specific key; otherwise fall back to the generic one.
  const judgeKey = __ENV.ANTHROPIC_API_KEY_JUDGE || __ENV.ANTHROPIC_API_KEY;
  const rubric =
    'The agent was asked to count the *.go files in the current directory. A good answer ran a ' +
    'shell command to list them and reports a specific, concrete count (a number).';

  judge(run, {
    name: 'counts_go_files',
    provider: 'anthropic',
    model: 'claude-haiku-4-5',
    apiKey: judgeKey,
    rubric,
    threshold: 0.7,
  });
}
82 changes: 82 additions & 0 deletions examples/experimental/ageval/frameworks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Evaluating real agent frameworks with `k6/experimental/ageval`

These examples validate that ageval is **framework-agnostic** and that its scoring
is **valid** (it discriminates good runs from bad, and reacts to real model
changes). Four very different agent frameworks are evaluated with the *same*
assertions and *zero* changes to the k6 module:

| Example | Framework | Mode |
|---|---|---|
| [`pydantic-ai/`](./pydantic-ai/) | [Pydantic-AI](https://ai.pydantic.dev) | **live** — one `k6 run` executes the agent |
| [`langgraph/`](./langgraph/) | [LangGraph](https://langchain-ai.github.io/langgraph/) ReAct agent | recorded |
| [`openai-agents/`](./openai-agents/) | [OpenAI Agents SDK](https://openai.github.io/openai-agents-python/) (on Claude via LiteLLM) | recorded |
| [`crewai/`](./crewai/) | [CrewAI](https://docs.crewai.com) — **multi-agent** crew | recorded |

## The one pattern: canonical shape

ageval's only hard contract is the canonical trajectory:

```json
{ "input": "...", "output": "...",
"toolCalls": [{ "name": "...", "input": {}, "output": "..." }],
"usage": { "inputTokens": 0, "outputTokens": 0 } }
```

Every framework exposes its run differently, so each example has a tiny
**capture shim** (~15-20 lines, `agent.py` / `capture.py`) that maps the
framework's result onto this shape:

- **Pydantic-AI** — `result.output`, `result.all_messages()` (`ToolCallPart`/`ToolReturnPart`), `result.usage`.
- **LangGraph** — message list: `AIMessage.tool_calls` + `ToolMessage` + `usage_metadata`.
- **OpenAI Agents SDK** — `result.final_output`, `result.new_items` (`ToolCallItem`/`ToolCallOutputItem`), `result.context_wrapper.usage`.
- **CrewAI** — a `TRACE` list inside the tools (captures calls across both agents), `CrewOutput.raw` + `token_usage`.

That shim is the *entire* integration cost. No new k6/Go code, no per-framework
adapter baked into the module. (For agents instrumented with OpenTelemetry GenAI /
OpenInference, a future built-in `otel` adapter would remove even the shim.)

**Live vs recorded.** The live example (`pydantic-ai`) runs the agent inside
`k6 run` via `CliAgent`. The recorded examples replay a committed
`trajectory.json` via `new AgentTestCase(...)`, so they are deterministic and CI-safe — only
the LLM judge needs a key. Each recorded dir's `capture.py` regenerates its fixture.

## What each track validates

- **Usefulness** — four real frameworks (including a multi-agent crew) plug in with only a shim.
- **Validity, tool dimension** — `expectSequence(...)` is a deterministic, non-LLM oracle: the agent must call the right tools, with the right args, in the right order (in CrewAI, *across* the Researcher → Writer handoff).
- **Validity, answer dimension** — the LLM `judge` scores the final answer against a rubric.
- **Discrimination** — `pydantic-ai/eval_golden.test.js` feeds a real good run and a tampered bad run (wrong invoice, leaked id) through the same assertions and checks that good passes and bad fails. (Verified: judge 1.0 vs 0.0.)

## Differential check: Sonnet → Haiku

The examples default to the cheaper `claude-haiku-4-5` as the **agent** model (the
judge stays on a stronger model for fair grading). Re-running the identical evals
on Haiku vs Claude Sonnet is itself a validation that the eval reacts to real model
changes:

- **Pydantic-AI (live):** still 4/4; cost dropped from **$0.0067 → $0.0023** (measured by `agent_cost_usd`).
- **LangGraph, OpenAI SDK:** unchanged (4/4).
- **CrewAI:** Haiku phrased the answer as *"widget manufacturer"* (singular) instead of *"widgets"*, which tripped an over-strict `/widgets/` string check **while the LLM judge still scored it 1.0**. The lesson this differential surfaced: pair a deterministic tool oracle with a semantic judge, and avoid brittle exact-string checks. (The check was loosened to `/widget/i`.)

Set `AGENT_MODEL=claude-sonnet-4-5` to regenerate fixtures on Sonnet and compare.

## Running everything

```bash
# build k6 once (from repo root)
go build -o k6 .

# recorded (deterministic; only the judge key needed)
ANTHROPIC_API_KEY_JUDGE=sk-ant-... ./k6 run examples/experimental/ageval/frameworks/langgraph/eval.test.js
ANTHROPIC_API_KEY_JUDGE=sk-ant-... ./k6 run examples/experimental/ageval/frameworks/openai-agents/eval.test.js
ANTHROPIC_API_KEY_JUDGE=sk-ant-... ./k6 run examples/experimental/ageval/frameworks/crewai/eval.test.js

# discrimination
ANTHROPIC_API_KEY_JUDGE=sk-ant-... ./k6 run examples/experimental/ageval/frameworks/pydantic-ai/eval_golden.test.js

# live (also runs the agent; needs the agent key + a venv — see pydantic-ai/README.md)
cd examples/experimental/ageval/frameworks/pydantic-ai
ANTHROPIC_API_KEY_AGENT=sk-ant-... ANTHROPIC_API_KEY_JUDGE=sk-ant-... PYTHON=./venv/bin/python ../../../../../k6 run eval.test.js
```

`ANTHROPIC_API_KEY_AGENT` is used by the agent; `ANTHROPIC_API_KEY_JUDGE` by the judge.
25 changes: 25 additions & 0 deletions examples/experimental/ageval/frameworks/crewai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# CrewAI × ageval (recorded, multi-agent)

Evaluates a [CrewAI](https://docs.crewai.com) **multi-agent crew** with
`k6/experimental/ageval`: a Researcher (tool: `lookup_company`) hands off to a
Writer (tool: `publish_summary`). The captured `trajectory.json` spans **both
agents' tool calls**, so the eval grades the cross-agent pipeline — `expectSequence`
requires research *before* publishing. Replayed by `eval.test.js` (deterministic /
CI-safe; only the judge key is needed). See [`../README.md`](../README.md) for the
overall pattern.

## Run (replay the committed fixture)

```bash
ANTHROPIC_API_KEY_JUDGE=sk-ant-... k6 run eval.test.js
```

## Regenerate the fixture (live)

```bash
python3 -m venv venv && venv/bin/pip install "crewai[anthropic]"
# CrewAI is chatty, so capture.py writes the file directly (not stdout):
ANTHROPIC_API_KEY_AGENT=sk-ant-... venv/bin/python capture.py # writes ./trajectory.json
```

Defaults to `claude-haiku-4-5`; set `AGENT_MODEL=claude-sonnet-4-5` to compare models.
106 changes: 106 additions & 0 deletions examples/experimental/ageval/frameworks/crewai/capture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""CrewAI MULTI-AGENT crew -> canonical ageval trajectory (written to a JSON file).

A two-agent crew on Claude: a Researcher (tool: lookup_company) hands off to a
Writer (tool: publish_summary). This exercises ageval's multi-agent / sub-agent
story: the trajectory spans BOTH agents' tool calls, and the eval grades the
combined sequence. Tool calls are captured by a small TRACE list inside the tools
(version-independent). CrewAI is chatty and some of its loggers bypass a stdout
redirect, so this writes the JSON straight to a file (default ./trajectory.json)
rather than stdout. Regenerate with:

    ANTHROPIC_API_KEY_AGENT=sk-... python capture.py [company] [out.json]
"""

import contextlib
import json
import os
import sys

os.environ.setdefault("CREWAI_DISABLE_TELEMETRY", "true")

from crewai import LLM, Agent, Crew, Process, Task # noqa: E402
from crewai.tools import tool # noqa: E402

# Agent model name; override with the AGENT_MODEL environment variable.
MODEL = os.getenv("AGENT_MODEL", "claude-haiku-4-5")

# Agent API key: the agent-specific variable wins, the generic one is the fallback.
API_KEY = os.getenv("ANTHROPIC_API_KEY_AGENT") or os.getenv("ANTHROPIC_API_KEY")

# Canned facts served by the lookup_company tool, keyed by exact company name.
FACTS = {
    "Acme Corp": {
        "founded": 1998,
        "industry": "widgets",
        "hq": "Springfield",
    },
}

# Ordered record of every tool call, appended to by the tools of both agents.
TRACE = []


@tool("lookup_company")
def lookup_company(name: str) -> str:
    """Look up basic facts (founding year, industry, HQ) about a company by name."""
    # NOTE(review): the docstring above is presumably exposed by @tool as the
    # tool description, so it is kept verbatim — confirm against CrewAI.
    facts = FACTS[name] if name in FACTS else {"error": "unknown company"}
    payload = json.dumps(facts)
    # Record the call so the trajectory keeps its position in the sequence.
    call_record = {"name": "lookup_company", "input": {"name": name}, "output": payload}
    TRACE.append(call_record)
    return payload


@tool("publish_summary")
def publish_summary(summary: str) -> str:
    """Publish the final one-sentence company summary."""
    # NOTE(review): the docstring above is presumably exposed by @tool as the
    # tool description, so it is kept verbatim — confirm against CrewAI.
    status = "published"
    # Nothing is actually published; the call is only recorded in the trace.
    call_record = {"name": "publish_summary", "input": {"summary": summary}, "output": status}
    TRACE.append(call_record)
    return status


def main():
    """Run the two-agent crew once and write its trajectory to a JSON file.

    Optional positional command-line arguments:
      1. company name used in the research task (default "Acme Corp")
      2. path of the JSON file to write (default "trajectory.json")

    Exits with status 1 when neither ANTHROPIC_API_KEY_AGENT nor
    ANTHROPIC_API_KEY is set.
    """
    cli_args = sys.argv[1:]
    task_text = cli_args[0] if cli_args else "Acme Corp"
    if not API_KEY:
        print("ANTHROPIC_API_KEY_AGENT (or ANTHROPIC_API_KEY) must be set", file=sys.stderr)
        sys.exit(1)

    llm = LLM(model=f"anthropic/{MODEL}", api_key=API_KEY)

    # Settings shared by both agents.
    shared = {"llm": llm, "verbose": False}

    researcher = Agent(
        role="Company Researcher",
        goal="Find accurate facts about a company using the lookup_company tool.",
        backstory="You look up authoritative facts and never invent them.",
        tools=[lookup_company],
        **shared,
    )
    writer = Agent(
        role="Summary Writer",
        goal="Write a concise one-sentence company summary and publish it.",
        backstory="You turn researched facts into a single clear sentence.",
        tools=[publish_summary],
        **shared,
    )

    research_task = Task(
        description=f"Look up the company '{task_text}' and report its founding year and industry.",
        expected_output="The company's founding year and industry.",
        agent=researcher,
    )
    # The writer's task receives the research task as context.
    write_task = Task(
        description="Write a one-sentence summary from the research, then publish it with publish_summary.",
        expected_output="A one-sentence published summary.",
        agent=writer,
        context=[research_task],
    )

    crew = Crew(
        agents=[researcher, writer],
        tasks=[research_task, write_task],
        process=Process.sequential,
        verbose=False,
    )

    # Send whatever the crew prints on stdout to stderr while it runs.
    with contextlib.redirect_stdout(sys.stderr):
        result = crew.kickoff()

    usage = getattr(result, "token_usage", None)

    def token_count(field):
        # A missing attribute or a falsy value both count as 0 tokens.
        return int(getattr(usage, field, 0) or 0)

    in_tok = token_count("prompt_tokens")
    out_tok = token_count("completion_tokens")

    trajectory = {
        "input": task_text,
        "output": str(getattr(result, "raw", result)),
        "model": MODEL,
        "toolCalls": TRACE,
        "usage": {"inputTokens": in_tok, "outputTokens": out_tok},
    }

    out_path = cli_args[1] if len(cli_args) > 1 else "trajectory.json"
    with open(out_path, "w") as out_file:
        json.dump(trajectory, out_file, indent=4)
    print(f"wrote {out_path}", file=sys.stderr)


if __name__ == "__main__":
    main()
Loading
Loading