feat: add swegym entry (full SWE-Gym 2438 + official lite subset) #29
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Results Check: gatekeeping workflow for pull requests that target main.
name: Results Check

on:
  pull_request:
    branches:
      - main

# Workflow-wide GITHUB_TOKEN scopes. Jobs read the repository contents and
# write to the pull request (comments, labels).
permissions:
  contents: read
  pull-requests: write
| jobs: | |
# ── Path-isolation gate ────────────────────────────────────────────────
# Determines whether this PR is eligible for results auto-merge.
# Eligible only if the diff strictly adds files under results/<cube-id>/*.json
# — every other shape (modify, delete, mixed with non-results) is rejected
# or falls back to standard human review.
#
# Outputs (all strings):
#   added_results           space-separated result files matched by `added`
#   added_results_any       'true' when at least one such file is present
#   modified_or_deleted_any 'true' when results/ contains a non-add change
#   non_results_any         'true' when any path outside results/ changed
classify:
  name: classify
  runs-on: ubuntu-latest
  outputs:
    added_results: ${{ steps.added.outputs.results_all_changed_files }}
    added_results_any: ${{ steps.added.outputs.results_any_changed }}
    modified_or_deleted_any: ${{ steps.modified_or_deleted.outputs.has_any }}
    non_results_any: ${{ steps.non_results.outputs.has_any }}
  steps:
    # Full history is required so the merge base of base/head can be resolved.
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v4.2.2
      with:
        fetch-depth: 0
    # Added results files only. Use the namespaced files_yaml outputs so the
    # group key ("results") is unambiguous; mixing with the bare-form inputs
    # produced version-dependent behavior (W4).
    - name: Added result files
      id: added
      uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
      with:
        separator: " "
        files_yaml: |
          results:
            - results/*/*.json
            - results/*/*.samples.jsonl.gz
            - '!results/*/_*.json'
    # Modified, deleted, renamed, or copied files anywhere under results/.
    # Computed via `git diff --diff-filter=MDRC` so the signal cannot be
    # masked by an additive file in the same PR (W2). The journal is
    # append-only; any non-add inside results/ is grounds for rejection.
    #
    # The three-dot range diffs from the merge base, so records merged into
    # the base branch after this PR forked are not misreported as deletions.
    # There is deliberately no `|| true`: if git cannot compute the diff the
    # step fails (fail closed) instead of reporting "nothing modified".
    - name: Modified-or-deleted files inside results/
      id: modified_or_deleted
      env:
        BASE_SHA: ${{ github.event.pull_request.base.sha }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        set -euo pipefail
        changed=$(git diff --diff-filter=MDRC --name-only "$BASE_SHA...$HEAD_SHA" -- 'results/*')
        if [ -n "$changed" ]; then
          echo "has_any=true" >> "$GITHUB_OUTPUT"
        else
          echo "has_any=false" >> "$GITHUB_OUTPUT"
        fi
        {
          echo "files<<__EOF__"
          printf '%s\n' "$changed"
          echo "__EOF__"
        } >> "$GITHUB_OUTPUT"
    # Was anything outside results/ touched (added, modified, deleted, or
    # renamed)? 'true' marks a mixed PR, which is never auto-merge eligible.
    # The exclude pathspec is the exact complement of the 'results/*'
    # pathspec used by the step above.
    # NOTE(review): files added under results/ that match none of the
    # `added` patterns (e.g. results/<id>/_*.json) are counted by neither
    # signal — confirm that is intended.
    - name: Any non-results changes
      id: non_results
      env:
        BASE_SHA: ${{ github.event.pull_request.base.sha }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        set -euo pipefail
        outside=$(git diff --name-only "$BASE_SHA...$HEAD_SHA" -- . ':(exclude)results/*')
        if [ -n "$outside" ]; then
          echo "has_any=true" >> "$GITHUB_OUTPUT"
        else
          echo "has_any=false" >> "$GITHUB_OUTPUT"
        fi
# ── Reject any modify / delete / rename inside results/ ────────────────
# Append-only enforcement: this job runs whenever classify reports a non-add
# change inside results/, even if the same PR also adds new files (the
# add+modify bypass flagged as W2). It posts an explanatory comment and then
# fails so the check shows red.
reject-modify:
  name: reject-modify
  needs: classify
  if: ${{ needs.classify.outputs.modified_or_deleted_any == 'true' }}
  runs-on: ubuntu-latest
  steps:
    - name: Post rejection comment
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        REPO_FULL: ${{ github.repository }}
        # Folded scalar: the lines below are joined with single spaces.
        COMMENT_BODY: >-
          ❌ This PR modifies, deletes, or renames files in `results/`.
          The journal is append-only — corrections are made by submitting
          a new record with a `supersedes` reference.
      run: |
        gh pr comment "$PR_NUMBER" --repo "$REPO_FULL" --body "$COMMENT_BODY"
        exit 1
# ── Validate added result files ────────────────────────────────────────
# Runs scripts/results_check.py over every added results file and exposes
# the verdict as the `passed` output ('true' / 'false').
validate:
  name: validate
  needs: classify
  # Run only when (a) there is at least one added results file, AND (b)
  # no modify/delete signal — so the script never sees a PR that already
  # failed the append-only gate.
  if: |
    needs.classify.outputs.added_results_any == 'true' &&
    needs.classify.outputs.modified_or_deleted_any == 'false'
  runs-on: ubuntu-latest
  outputs:
    passed: ${{ steps.run.outputs.passed }}
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v4.2.2
    - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.3.0
      with:
        python-version: "3.12"
    - name: Install dependencies
      run: pip install jsonschema 'ruamel.yaml' pyyaml
    # SECURITY: PR-author-controlled file paths (W1). We MUST NOT splice
    # `needs.classify.outputs.added_results` directly into the shell — GHA
    # ${{ }} substitution happens at YAML-render time, before the shell
    # runs, so embedded $(…)/backticks/etc. would execute. Route the value
    # through the env, where the shell only ever sees its literal text.
    - name: Run validator
      id: run
      env:
        ADDED_FILES: ${{ needs.classify.outputs.added_results }}
      run: |
        set +e
        # Split on whitespace into one argument per path with pathname
        # expansion switched off (set -f): an unquoted expansion would
        # otherwise let `*` / `?` / `[` in a PR-controlled name glob against
        # the checkout. Passing the array straight to python also avoids
        # xargs' own quote/backslash processing, so every path reaches the
        # validator as literal text for its filename regex to judge.
        set -f
        added_files=($ADDED_FILES)
        set +f
        python scripts/results_check.py \
          --summary-out /tmp/results-summary.md \
          --added "${added_files[@]}"
        rc=$?
        if [ "$rc" -eq 0 ]; then
          echo "passed=true" >> "$GITHUB_OUTPUT"
        else
          echo "passed=false" >> "$GITHUB_OUTPUT"
        fi
        exit "$rc"
    # Best-effort PR comment. The validation verdict must not depend on the
    # token being able to comment (it is read-only for fork PRs), so a failed
    # post degrades to a warning and the summary goes to the job summary.
    - name: Post validation summary
      if: always()
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        REPO_FULL: ${{ github.repository }}
      run: |
        if [ -f /tmp/results-summary.md ]; then
          if ! gh pr comment "$PR_NUMBER" --repo "$REPO_FULL" \
            --body-file /tmp/results-summary.md; then
            echo "::warning::Could not post the validation summary as a PR comment; see the job summary."
            cat /tmp/results-summary.md >> "$GITHUB_STEP_SUMMARY"
          fi
        fi
# ── Auto-merge eligibility ─────────────────────────────────────────────
# The PR auto-merges iff (1) at least one results file added, (2) no
# modify/delete inside results/, (3) nothing outside results/ touched,
# (4) every added file validates.
#
# Fork PRs: GITHUB_TOKEN is read-only on `pull_request` from a fork in
# public repos, so labelling and `gh pr merge --auto` cannot succeed there.
# The job is therefore skipped outright for forks (same-repo guard below)
# rather than left to fail. That's documented in README + the openspec
# design — community contributors who want auto-merge submit from a branch
# of this repo. Fork PRs fall through to standard human review (a
# maintainer can complete the merge by hand).
auto-merge:
  name: auto-merge
  needs: [classify, validate]
  if: |
    github.event.pull_request.head.repo.full_name == github.repository &&
    needs.classify.outputs.added_results_any == 'true' &&
    needs.classify.outputs.modified_or_deleted_any == 'false' &&
    needs.classify.outputs.non_results_any == 'false' &&
    needs.validate.outputs.passed == 'true'
  runs-on: ubuntu-latest
  # Job-level permissions replace the workflow-level set for this job.
  # Enabling auto-merge needs write access to contents; the label edit needs
  # write access to pull requests. No other job receives contents: write.
  permissions:
    contents: write
    pull-requests: write
  steps:
    - name: Apply auto-merge label
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        REPO_FULL: ${{ github.repository }}
      run: |
        gh pr edit "$PR_NUMBER" --repo "$REPO_FULL" --add-label "auto-merge"
        gh pr merge "$PR_NUMBER" --repo "$REPO_FULL" --auto --squash
# ── Mixed PR: validator passes, but other files were touched ──────────
# The added result files validated, yet the PR also changes paths outside
# results/. No auto-merge: leave a note and let a maintainer review.
request-human-review:
  name: request-human-review
  needs:
    - classify
    - validate
  if: |
    needs.validate.outputs.passed == 'true' &&
    needs.classify.outputs.added_results_any == 'true' &&
    needs.classify.outputs.modified_or_deleted_any == 'false' &&
    needs.classify.outputs.non_results_any == 'true'
  runs-on: ubuntu-latest
  steps:
    - name: Comment on mixed PR
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        REPO_FULL: ${{ github.repository }}
        # Folded scalar: the lines below are joined with single spaces.
        COMMENT_BODY: >-
          ℹ️ The added results files pass validation, but this PR also
          touches paths outside `results/`. Auto-merge is disabled — a
          maintainer will review.
      run: |
        gh pr comment "$PR_NUMBER" --repo "$REPO_FULL" --body "$COMMENT_BODY"