# Terminal-Bench Regression #5
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Terminal-Bench regression workflow.
#
# `regression` runs `harbor run` against the terminal-bench@2.0 dataset once
# per model in the matrix and uploads the `jobs/` directory as an artifact.
# `report` then downloads every artifact and runs
# benchmarks/terminal_bench/report.py against the checked-in baseline.
name: Terminal-Bench Regression

on:
  schedule:
    # Mondays at 12:00 UTC (05:00 PDT / 04:00 PST).
    - cron: "0 12 * * 1"
  workflow_dispatch:
    inputs:
      model:
        description: "Override model (blank = run both defaults)"
        default: ""
      concurrency:
        description: "Max concurrent tasks"
        default: "10"

env:
  # Quoted so the value is unambiguously the string "true".
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"

jobs:
  regression:
    runs-on: ubuntu-latest
    timeout-minutes: 180
    strategy:
      # Keep the other matrix leg running when one model fails.
      fail-fast: false
      matrix:
        # A non-empty `model` dispatch input becomes a one-element matrix;
        # otherwise (blank input, or a scheduled run where `inputs` is empty)
        # both default models run. toJSON() escapes the input so arbitrary
        # text cannot break the JSON array handed to fromJSON().
        model: ${{ fromJSON(inputs.model && format('[{0}]', toJSON(inputs.model)) || '["sonnet-4.6-xhigh", "gpt-5.3-codex-xhigh"]') }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Python + uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: false

      - name: Create venv and install deps
        run: |
          uv venv .venv
          source .venv/bin/activate
          uv pip install "harbor>=0.1.45" "litellm>=1.0.0" "modal>=1.3.5"

      - name: Configure Modal
        env:
          MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: |
          # Write a `[letta]` profile to ~/.modal.toml; the secrets reach the
          # shell through env vars rather than being templated into the script.
          printf '[letta]\ntoken_id = "%s"\ntoken_secret = "%s"\nactive = true\nenvironment = "terminal-bench"\nimage_builder_version = "2025.06"\n' \
            "$MODAL_TOKEN_ID" "$MODAL_TOKEN_SECRET" > ~/.modal.toml

      - name: Run regression tasks
        env:
          LETTA_API_KEY: ${{ secrets.LETTA_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Expression values are handed to the script as env vars instead of
          # being spliced into the shell source, so user-supplied dispatch
          # input is never parsed as shell code.
          TB_MODEL: ${{ matrix.model }}
          TB_N_CONCURRENT: ${{ inputs.concurrency || '10' }}
        run: |
          source .venv/bin/activate
          harbor run \
            --dataset terminal-bench@2.0 \
            --agent-import-path benchmarks.terminal_bench.letta_code_agent:LettaCode \
            --model "$TB_MODEL" \
            --env modal \
            --n-concurrent "$TB_N_CONCURRENT" \
            --job-name "regression-${TB_MODEL}-$(date +%Y%m%d)"

      - name: Upload results artifact
        # Upload even when the run step failed so partial results are kept.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: tb-results-${{ matrix.model }}
          path: jobs/

  report:
    needs: regression
    # Run regardless of the outcome of the regression matrix.
    if: always()
    runs-on: ubuntu-latest
    permissions:
      issues: write
      contents: read
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Download all result artifacts
        uses: actions/download-artifact@v4
        with:
          path: results/

      - name: Setup Python + uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: false

      - name: Generate report and update GitHub Issue
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_SERVER_URL: ${{ github.server_url }}
        run: |
          # --repo reads the GITHUB_REPOSITORY env var set above instead of
          # templating the expression into the script text.
          uv run python benchmarks/terminal_bench/report.py \
            --results-dir results/ \
            --baseline benchmarks/terminal_bench/baseline.json \
            --repo "$GITHUB_REPOSITORY"