Skills Eval Daily #323
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
| # SPDX-License-Identifier: Apache-2.0 | |
| # Runs the skills eval agent (.github/skill-eval/skills_eval_agent.py) as a | |
| # nightly full sweep of every skill — the `schedule` cron below, with | |
| # DAILY_RUN=true making plan_matrix.py enumerate all skills — or on demand via | |
| # workflow_dispatch. It deliberately does NOT run per-PR: a full GPU sweep is | |
| # far too heavy to fire on every pull-request mirror push. See | |
| # .github/skill-eval/AGENTS.md for the agent's system prompt and behaviour. | |
| name: Skills Eval Daily | |
| on: | |
| # Nightly full sweep of all skills (DAILY_RUN below makes plan_matrix.py | |
| # enumerate every skill). This is the only automatic trigger — there is | |
| # deliberately no per-PR `push:` trigger, because a full GPU sweep is far | |
| # too heavy to run on every pull-request mirror push. | |
| schedule: | |
| # Fires at 01:30 every day. | |
| # NOTE(review): the `timezone` key is meant to anchor that time to Pacific | |
| # time — confirm the `schedule` trigger honours it; if it is ignored, the | |
| # cron expression is evaluated in UTC. | |
| - cron: "30 1 * * *" | |
| timezone: "America/Los_Angeles" | |
| # On-demand sweep. The "Run workflow" button only appears when | |
| # `workflow_dispatch:` is configured on the repo's default branch (`main`), | |
| # but the full harness lives on `develop`: click "Run workflow", pick | |
| # `develop` in the ref dropdown, and GitHub runs develop's version of this | |
| # file (env passthrough + agent invocation all come from develop). | |
| workflow_dispatch: | |
| # Authenticate as github-actions[bot] for all GitHub operations (gh pr | |
| # comment, gh pr create, git push of eval-bot/* branches). The agent | |
| # inherits this via env.GH_TOKEN on the run step below. Mirrors how | |
| # helm-sync.yml does it — no PAT, no rotation, no personal-account | |
| # binding. The skill-eval bot PRs hit the source PR's headRefName branch | |
| # in this same repo, so contents:write is enough; pull-requests:write | |
| # covers gh pr comment + gh pr create. | |
| permissions: | |
| contents: write | |
| pull-requests: write | |
| # Only one sweep runs at a time per ref. A fresh run cancels the in-flight | |
| # one; the per-box flocks (/tmp/brev/<INSTANCE_NAME>.lock) | |
| # release automatically when the agent process dies (POSIX flock(2) semantics — | |
| # no userspace trap needed). Any stale containers / volumes / marker that the | |
| # cancelled run left on a box are reconciled by the NEXT run's | |
| # `BrevEnvironment._ensure_prerequisite_deployed`: the active-deploy marker | |
| # carries `<profile_tag>|<run_id>`, so a marker from this run never matches | |
| # the next run's desired marker and the next run always tears down + redeploys. | |
| # Same path handles SIGKILL (the eval job's per-leg `timeout-minutes: 360` | |
| # terminator) and host reboots — there's no exit-side cleanup machinery to | |
| # bypass. | |
| concurrency: | |
| group: skills-eval-daily-${{ github.ref }} | |
| cancel-in-progress: true | |
| # Every `run:` step in this workflow executes under bash unless the step sets | |
| # its own `shell:`. | |
| defaults: | |
| run: | |
| shell: bash | |
| jobs: | |
| # --------------------------------------------------------------------- | |
| # plan — enumerate every skill into the dispatch matrix (full sweep: | |
| # DAILY_RUN=true -> plan_matrix.py list_skill_file_paths()). | |
| # Lightweight: gh + python3, no GPU, no brev, no flock. | |
| # --------------------------------------------------------------------- | |
| plan: | |
| name: Plan eval matrix | |
| runs-on: [self-hosted, vss-skill-eval-runner] | |
| timeout-minutes: 15 | |
| # Both outputs are read by the eval job: `matrix` feeds its strategy.matrix | |
| # and `has_targets` gates whether any leg runs at all. | |
| outputs: | |
| matrix: ${{ steps.plan.outputs.matrix }} | |
| has_targets: ${{ steps.plan.outputs.has_targets }} | |
| steps: | |
| - name: Checkout mirror head | |
| uses: actions/checkout@v4 | |
| with: | |
| # TODO: once this workflow has been merged into develop, uncomment the | |
| # `ref` line below so that scheduled runs check out the develop branch. | |
| # ref: develop | |
| fetch-depth: 0 # plan diffs the cumulative head...base range | |
| - name: Compute matrix | |
| id: plan | |
| env: | |
| # Quoted so the value is an explicit string rather than a YAML boolean, | |
| # matching the `export DAILY_RUN="true"` used by the agent step. | |
| DAILY_RUN: "true" | |
| run: python3 .github/skill-eval/plan_matrix.py | |
| # --------------------------------------------------------------------- | |
| # eval — one leg per (spec, platform). Each leg runs the agent once. | |
| # --------------------------------------------------------------------- | |
| eval: | |
| # Each leg is labelled with the `name` field of its matrix entry. | |
| name: ${{ matrix.name }} | |
| needs: plan | |
| # Skip every leg when plan reported nothing to run. Job outputs are strings, | |
| # hence the comparison against the literal 'true'. | |
| if: needs.plan.outputs.has_targets == 'true' | |
| runs-on: [self-hosted, vss-skill-eval-runner] | |
| strategy: | |
| # One slow/failed spec must not cancel its siblings. | |
| fail-fast: false | |
| # Cap concurrent legs near the vss-eval-* box count so legs don't | |
| # all grab a runner slot only to block on flock. Excess legs queue. | |
| max-parallel: 4 | |
| # plan publishes the matrix as a JSON string output; fromJSON turns it back | |
| # into the structure the strategy expands into legs. | |
| matrix: ${{ fromJSON(needs.plan.outputs.matrix) }} | |
| # Per-leg cap. One spec, but a multi-step spec runs step-1..N as | |
| # sequential harbor invocations, each up to the 1 h agent ceiling | |
| # (AGENTS.md `--agent-timeout-multiplier 6.0`). 360m covers a ~4-step | |
| # cold-box spec plus deploy/dataset/comment overhead without the | |
| # whole-sweep 12 h cap the single-job design needed. AGENTS.md's | |
| # box-lock wait (flock -w 21000 ≈ 5.8 h) is kept strictly under this | |
| # 360 m ceiling so a lock-starved leg emits `BLOCKED: lock timeout` | |
| # before the job-killer fires (a 43200 s / 12 h wait would be | |
| # silently killed mid-flock with no verdict — the exit-4 trap). | |
| timeout-minutes: 360 | |
| steps: | |
| - name: Checkout mirror head | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 # agent needs full history to fetch + commit to the contributor's branch (§ 3c) | |
| - name: Load coordinator env (Anthropic, NGC, Brev, remote endpoints) | |
| run: | | |
| # The runner host (vss-skill-validator-v2) keeps coordinator | |
| # secrets in a single .env. Don't echo contents. | |
| # Variables exported by `source` exist only inside this step's shell. | |
| # The only values that reach later steps are the two lines appended to | |
| # $GITHUB_ENV below, which is why the agent step sources the same file | |
| # again to obtain the secrets themselves. | |
| set -a | |
| source /home/ubuntu/eval-coordinator/.env | |
| set +a | |
| printf "COORDINATOR_ENV=loaded\n" >> "$GITHUB_ENV" | |
| printf "CLAUDE_CODE_DISABLE_THINKING=1\n" >> "$GITHUB_ENV" | |
| - name: Run skills eval agent (single spec) | |
| id: agent | |
| env: | |
| # github-actions[bot] token; scoped by the permissions: block. | |
| GH_TOKEN: ${{ github.token }} | |
| # gh checks ~/.config/gh/hosts.yml BEFORE GH_TOKEN; point it at a | |
| # per-leg scratch dir with no hosts.yml so a stored OAuth on the | |
| # host can't reauthor bot comments. Same fix helm-sync.yml uses. | |
| GH_CONFIG_DIR: ${{ runner.temp }}/gh-skill-eval-${{ github.run_id }}-${{ matrix.slug }} | |
| # Raise the Bash-tool timeout cap above the worst-case single | |
| # `uvx harbor run` (~2h: env-build 1800s + agent 3600s + verify | |
| # 1800s). Without this the runtime auto-backgrounds the foreground | |
| # harbor call at the 10-min default cap and the agent falls into | |
| # polling its task output — silently defeating the "block on | |
| # harbor" wait contract (the PreToolUse hook can't catch a | |
| # runtime-initiated background; it only inspects call input). Both | |
| # stay under the job's timeout-minutes so the call can't outlive it. | |
| BASH_DEFAULT_TIMEOUT_MS: "7200000" | |
| BASH_MAX_TIMEOUT_MS: "10800000" | |
| # Matrix values reach the script as environment variables instead of | |
| # being expanded into the script text. Workflow expressions are | |
| # substituted before bash parses the script, so a skill or spec name | |
| # containing a quote or a command substitution would otherwise run as | |
| # shell code. The "Collect results" step passes matrix.slug the same way. | |
| LEG_KIND: ${{ matrix.kind }} | |
| LEG_SKILL: ${{ matrix.skill }} | |
| LEG_SPEC_PATH: ${{ matrix.spec_path }} | |
| LEG_SPEC_STEM: ${{ matrix.spec_stem }} | |
| LEG_PLATFORM: ${{ matrix.platform }} | |
| LEG_SLUG: ${{ matrix.slug }} | |
| run: | | |
| mkdir -p "$GH_CONFIG_DIR" | |
| set -a | |
| source /home/ubuntu/eval-coordinator/.env # Anthropic / NGC / Brev | |
| set +a | |
| export PR_HEAD_SHA="${{ github.sha }}" | |
| export PR_REPO="${{ github.repository }}" | |
| export GITHUB_RUN_ID="${{ github.run_id }}" | |
| # Single-spec mode — plan already decided this leg's target, so | |
| # the agent skips the diff and processes exactly one (skill, spec). | |
| # Exported after .env is sourced so the matrix values take precedence | |
| # over any same-named entry in that file. | |
| export EVAL_KIND="$LEG_KIND" | |
| export EVAL_SKILL="$LEG_SKILL" | |
| export EVAL_SPEC_PATH="$LEG_SPEC_PATH" | |
| export EVAL_SPEC_STEM="$LEG_SPEC_STEM" | |
| export EVAL_PLATFORM="$LEG_PLATFORM" | |
| export DAILY_RUN="true" | |
| # Per-leg scratch scope. Every leg keys its datasets/results/ | |
| # viewer under <slug>/<run_id> (slug = skill__spec_stem__platform) | |
| # so concurrent legs — and concurrent PR runs on the same host — | |
| # never collide (AGENTS.md § "Per-leg scratch isolation"). | |
| export EVAL_SLUG="$LEG_SLUG" | |
| # `.github` isn't a valid Python package prefix; expose the | |
| # harness via PYTHONPATH for `uvx harbor --environment-import-path`. | |
| export PYTHONPATH="${GITHUB_WORKSPACE}/.github/skill-eval:${PYTHONPATH:-}" | |
| mkdir -p /tmp/brev /tmp/skill-eval | |
| python3 .github/skill-eval/skills_eval_agent.py | |
| - name: Collect results for workflow artifact | |
| if: always() | |
| env: | |
| EVAL_SLUG: ${{ matrix.slug }} | |
| run: | | |
| # This leg's harbor output lives under its own scratch scope, | |
| # /tmp/skill-eval/results/<slug>/<run_id>/, so the artifact | |
| # carries only this (spec,platform)'s trials. missing_adapter / | |
| # blocker legs finish without a results dir — a valid outcome. | |
| LEG_RESULTS="/tmp/skill-eval/results/${EVAL_SLUG}/${GITHUB_RUN_ID}" | |
| if [ ! -d "$LEG_RESULTS" ]; then | |
| echo "no results dir for this leg — blocked before running a trial (expected for blocker outcomes)" | |
| exit 0 | |
| fi | |
| # Exclude per-trial agent/ trajectory subdirs from the tarball | |
| # (NOT from the on-disk tree the Harbor viewer serves). Those | |
| # files capture every tool_use/tool_result verbatim, so a `.env` | |
| # read or `docker compose config` dump would otherwise carry | |
| # secret values into the public artifact (PR #516 leaked | |
| # NGC_CLI_API_KEY this way). Reward / judge / result.json — what | |
| # the report consumes — stay in. | |
| # Per-leg tarball path scoped by slug + run_id. `max-parallel` | |
| # places multiple legs on one runner host, so a fixed | |
| # /tmp/skills-eval-results.tar.gz would let overlapping legs | |
| # overwrite each other's archive before upload — each leg then | |
| # uploads whichever sibling wrote last. The slug makes the path | |
| # unique per (skill,spec,platform). | |
| LEG_TARBALL="/tmp/skills-eval-daily-results-${EVAL_SLUG}-${GITHUB_RUN_ID}.tar.gz" | |
| # Count the result.json files directly instead of counting a list that | |
| # was truncated with `head`, so the log line below reports the true | |
| # number. `|| true` keeps a non-zero find exit status from failing the | |
| # step; wc still prints the count. | |
| RESULT_COUNT=$(find "$LEG_RESULTS" -maxdepth 3 -name "result.json" 2>/dev/null | wc -l || true) | |
| if [ "${RESULT_COUNT:-0}" -gt 0 ]; then | |
| tar czf "$LEG_TARBALL" \ | |
| -C "/tmp/skill-eval/results/${EVAL_SLUG}" --exclude='agent' "$GITHUB_RUN_ID" | |
| echo "archived ${RESULT_COUNT} result.json files (agent/ trajectories excluded)" | |
| else | |
| echo "results dir exists but empty — nothing to archive" | |
| fi | |
| # Publishes the tarball written by the "Collect results" step. The path below | |
| # must stay in sync with that step's LEG_TARBALL. | |
| - name: Upload results artifact | |
| # Runs even when the agent step failed, so partial results are still kept. | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: skills-eval-daily-results-${{ matrix.slug }}-${{ github.run_id }} | |
| path: /tmp/skills-eval-daily-results-${{ matrix.slug }}-${{ github.run_id }}.tar.gz | |
| # A leg that was blocked before any trial, or produced no result.json, | |
| # writes no tarball; that is a valid outcome, not an upload error. | |
| if-no-files-found: ignore | |
| retention-days: 7 | |