Skip to content

Commit 954dbd1

Browse files
feat: reusable GitHub Action and GITHUB_STEP_SUMMARY support (#152)
- action.yml: composite action (uses: Siddhant-K-code/agent-trace@main) that installs agent-trace, runs eval ci, posts to step summary, and uploads .agent-traces as a workflow artifact
- eval/runner.py: write to $GITHUB_STEP_SUMMARY when set (GitHub Actions native), fall back to .agent-traces/eval-summary.md otherwise
- examples/ci/agent-eval.yml: updated to use the reusable action

Closes #130

Co-authored-by: Ona <no-reply@ona.com>
1 parent faa4cf6 commit 954dbd1

5 files changed

Lines changed: 139 additions & 44 deletions

File tree

action.yml

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Composite action: installs the agent-strace CLI, runs `agent-strace eval ci`,
# lets the CLI write its Markdown report to the GitHub Actions step summary,
# and uploads the trace directory as a workflow artifact.
name: agent-trace eval
description: Run agent-trace evals in CI, post results to the step summary, and fail on regression.

inputs:
  config:
    description: Path to eval config file
    default: .agent-evals.yaml
  baseline:
    description: Path to baseline scores file (optional)
    default: ""
  save-baseline:
    description: Save scores as new baseline after run
    default: "false"
  tolerance:
    description: Regression tolerance (0.0–1.0, default 0.05)
    default: "0.05"
  trace-dir:
    description: Directory where agent-trace sessions are stored
    default: .agent-traces
  python-version:
    description: Python version to use
    default: "3.12"
  install-extras:
    description: Comma-separated optional extras to install (e.g. openai,anthropic)
    default: ""

outputs:
  passed:
    description: "true if all scorers passed, false otherwise"
    value: ${{ steps.eval.outputs.passed }}
  summary-path:
    description: Path to the written eval summary markdown file
    value: ${{ steps.eval.outputs.summary-path }}

runs:
  using: composite
  steps:
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ inputs.python-version }}

    - name: Install agent-trace
      shell: bash
      env:
        # Inputs are passed through the environment instead of being
        # interpolated into the script, so their contents are never parsed
        # as shell code.
        EXTRAS: ${{ inputs.install-extras }}
      run: |
        # NOTE(review): the CLI invoked below is `agent-strace`, and the
        # example workflow this action replaces ran `pip install agent-strace`,
        # so that distribution name is used here — confirm against PyPI.
        if [ -n "$EXTRAS" ]; then
          pip install "agent-strace[$EXTRAS]"
        else
          pip install agent-strace
        fi

    - name: Run evals
      id: eval
      shell: bash
      # GITHUB_STEP_SUMMARY is deliberately NOT re-declared here: the runner
      # already exports it to every step, and the `env` context does not
      # contain default variables, so `${{ env.GITHUB_STEP_SUMMARY }}` would
      # evaluate to an empty string.
      env:
        EVAL_CONFIG: ${{ inputs.config }}
        EVAL_TRACE_DIR: ${{ inputs.trace-dir }}
        EVAL_TOLERANCE: ${{ inputs.tolerance }}
        EVAL_BASELINE: ${{ inputs.baseline }}
        EVAL_SAVE_BASELINE: ${{ inputs.save-baseline }}
      run: |
        # Build the argument list as an array so paths containing spaces
        # survive intact.
        args=(
          --config "$EVAL_CONFIG"
          --trace-dir "$EVAL_TRACE_DIR"
          --tolerance "$EVAL_TOLERANCE"
          --github-summary
        )

        if [ -n "$EVAL_BASELINE" ]; then
          args+=(--baseline "$EVAL_BASELINE")
        fi

        if [ "$EVAL_SAVE_BASELINE" = "true" ]; then
          args+=(--save-baseline)
        fi

        # Capture the exit status instead of exiting immediately so that
        # every output is recorded even when the eval fails.
        status=0
        agent-strace eval ci "${args[@]}" || status=$?

        if [ "$status" -eq 0 ]; then
          echo "passed=true" >> "$GITHUB_OUTPUT"
        else
          echo "passed=false" >> "$GITHUB_OUTPUT"
        fi

        # Mirror the CLI's choice of destination: it appends to
        # $GITHUB_STEP_SUMMARY when that variable is set and only otherwise
        # writes the local fallback file.
        FALLBACK_SUMMARY=".agent-traces/eval-summary.md"
        if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
          echo "summary-path=$GITHUB_STEP_SUMMARY" >> "$GITHUB_OUTPUT"
        elif [ -f "$FALLBACK_SUMMARY" ]; then
          echo "summary-path=$FALLBACK_SUMMARY" >> "$GITHUB_OUTPUT"
        fi

        exit "$status"

    - name: Upload trace artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: agent-traces
        path: ${{ inputs.trace-dir }}
        # The default trace directory is a dot-directory; upload-artifact v4
        # skips hidden files and directories unless this is enabled, which
        # together with `if-no-files-found: ignore` would silently upload
        # nothing.
        include-hidden-files: true
        if-no-files-found: ignore

examples/ci/agent-eval.yml

Lines changed: 8 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#
33
# Runs eval scorers on every PR that touches agent config files.
44
# Fails the PR if any scorer drops below its threshold.
5-
# Posts a score summary as a PR comment.
5+
# Posts a score summary to the GitHub Actions step summary.
66
#
77
# Prerequisites:
88
# 1. Capture at least one session: agent-strace record -- <your-agent-command>
@@ -27,49 +27,18 @@ jobs:
2727
steps:
2828
- uses: actions/checkout@v4
2929

30-
- name: Set up Python
31-
uses: actions/setup-python@v5
30+
# Uses the reusable action bundled with agent-trace.
31+
# Results appear in the GitHub Actions step summary automatically.
32+
- name: Run agent-trace eval
33+
uses: Siddhant-K-code/agent-trace@main
3234
with:
33-
python-version: "3.12"
34-
35-
- name: Install agent-strace
36-
run: pip install agent-strace
37-
38-
# Score the latest session in the dataset against all configured scorers.
39-
# Exits 1 if any scorer is below threshold or regresses vs baseline.
40-
- name: Run eval
35+
baseline: .agent-traces/baselines/main.json
36+
tolerance: "0.05"
4137
env:
4238
# Required only if using the llm_judge scorer.
43-
# Remove if using heuristic scorers only (no_errors, cost_under, etc.)
4439
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
45-
run: |
46-
agent-strace eval ci \
47-
--baseline .agent-traces/baselines/main.json \
48-
--tolerance 0.05 \
49-
--github-summary
50-
51-
# Post the Markdown summary as a PR comment so reviewers see the score delta.
52-
- name: Post eval summary
53-
if: always()
54-
uses: actions/github-script@v7
55-
with:
56-
script: |
57-
const fs = require('fs');
58-
const summaryPath = '.agent-traces/eval-summary.md';
59-
if (!fs.existsSync(summaryPath)) {
60-
console.log('No eval summary found — skipping comment.');
61-
return;
62-
}
63-
const summary = fs.readFileSync(summaryPath, 'utf8');
64-
await github.rest.issues.createComment({
65-
issue_number: context.issue.number,
66-
owner: context.repo.owner,
67-
repo: context.repo.repo,
68-
body: summary,
69-
});
7040

71-
# Optional: update the baseline on every merge to main.
72-
# Commit the updated baseline back to the repo so future PRs compare against it.
41+
# Update the baseline on every merge to main so future PRs compare against it.
7342
update-baseline:
7443
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
7544
runs-on: ubuntu-latest

src/agent_trace/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""agent-trace: strace for AI agents."""
22

3-
__version__ = "0.54.0"
3+
__version__ = "0.55.0"

src/agent_trace/eval/runner.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import argparse
99
import json
10+
import os
1011
import sys
1112
from pathlib import Path
1213
from dataclasses import dataclass, field
@@ -267,10 +268,19 @@ def _write_github_summary(report: "EvalReport", baseline: dict[str, float], tole
267268
lines.append("")
268269
lines.append("</details>")
269270

270-
summary_path = Path(".agent-traces/eval-summary.md")
271-
summary_path.parent.mkdir(parents=True, exist_ok=True)
272-
summary_path.write_text("\n".join(lines) + "\n")
273-
sys.stderr.write(f"GitHub summary written to {summary_path}\n")
271+
content = "\n".join(lines) + "\n"
272+
273+
# Write to $GITHUB_STEP_SUMMARY when running inside GitHub Actions
274+
gha_summary = os.environ.get("GITHUB_STEP_SUMMARY", "")
275+
if gha_summary:
276+
with open(gha_summary, "a", encoding="utf-8") as f:
277+
f.write(content)
278+
sys.stderr.write(f"GitHub Actions step summary written to {gha_summary}\n")
279+
else:
280+
summary_path = Path(".agent-traces/eval-summary.md")
281+
summary_path.parent.mkdir(parents=True, exist_ok=True)
282+
summary_path.write_text(content)
283+
sys.stderr.write(f"GitHub summary written to {summary_path}\n")
274284

275285

276286
def cmd_eval_ci(args: argparse.Namespace) -> int:

tests/test_eval_extensions.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,33 @@ def test_github_summary_no_baseline_shows_dashes(self):
398398
os.chdir(orig)
399399
self.assertIn("—", summary)
400400

401+
def test_github_step_summary_env_var_used_when_set(self):
    """When GITHUB_STEP_SUMMARY is set, write there instead of the local file."""
    import os
    import tempfile

    report = _make_report([("no_errors", 1.0, 1.0, True)])

    # Capture the state to restore before anything is mutated, so the
    # cleanup below is correct no matter where a failure occurs.
    orig_env = os.environ.get("GITHUB_STEP_SUMMARY")
    orig_cwd = os.getcwd()

    # delete=False: the file has to outlive this handle because the code
    # under test reopens it by path; it is removed in the outer `finally`.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
        gha_path = f.name

    try:
        try:
            os.environ["GITHUB_STEP_SUMMARY"] = gha_path
            os.chdir(self.tmp)
            _write_github_summary(report, {}, tolerance=0.0)
        finally:
            # Always restore the working directory and the environment so a
            # failure here cannot leak into other tests.
            os.chdir(orig_cwd)
            if orig_env is None:
                os.environ.pop("GITHUB_STEP_SUMMARY", None)
            else:
                os.environ["GITHUB_STEP_SUMMARY"] = orig_env

        # The summary is written as UTF-8; read it back with the same
        # encoding instead of the platform default.
        content = Path(gha_path).read_text(encoding="utf-8")
        self.assertIn("agent-strace eval", content)
        # Local fallback file must NOT have been written
        self.assertFalse((Path(self.tmp) / ".agent-traces" / "eval-summary.md").exists())
    finally:
        os.unlink(gha_path)
427+
401428

402429
if __name__ == "__main__":
403430
unittest.main()

0 commit comments

Comments
 (0)