fix(make-pdf): replace regex sanitizer with parser-backed sanitization #1388
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# E2E Evals workflow.
#
# Three jobs run in sequence:
#   1. build-image - builds and pushes the CI Docker image, unless an image for
#                    the current Dockerfile/package.json/bun.lock hash already
#                    exists in the registry.
#   2. evals       - runs each eval suite in its own matrix job inside that image
#                    and uploads the JSON results as artifacts.
#   3. report      - on pull requests only, aggregates the JSON results and
#                    creates or updates a single "## E2E Evals" PR comment.
name: E2E Evals

on:
  pull_request:
    branches: [main]
  workflow_dispatch:

concurrency:
  # github.head_ref is only populated for pull_request events. Fall back to
  # github.ref so manually dispatched runs do not all share the group "evals-"
  # and cancel one another.
  group: evals-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

env:
  IMAGE: ghcr.io/${{ github.repository }}/ci
  EVALS_TIER: gate

jobs:
  # Build Docker image with pre-baked toolchain (cached — only rebuilds on Dockerfile/lockfile change)
  build-image:
    runs-on: ubicloud-standard-2
    permissions:
      contents: read
      packages: write
    outputs:
      image-tag: ${{ steps.meta.outputs.tag }}
    steps:
      - uses: actions/checkout@v4

      # The image tag is a content hash of the files that define the image, so
      # an unchanged toolchain maps to an already-published tag.
      - id: meta
        run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json', 'bun.lock') }}" >> "$GITHUB_OUTPUT"

      - uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Any failure of `docker manifest inspect` is treated as "image missing",
      # which triggers the build steps below.
      - name: Check if image exists
        id: check
        run: |
          if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then
            echo "exists=true" >> "$GITHUB_OUTPUT"
          else
            echo "exists=false" >> "$GITHUB_OUTPUT"
          fi

      # The build context is .github/docker, so the manifests must be copied
      # into it before the build can see them.
      - if: steps.check.outputs.exists == 'false'
        run: cp package.json bun.lock .github/docker/

      - if: steps.check.outputs.exists == 'false'
        uses: docker/build-push-action@v6
        with:
          context: .github/docker
          file: .github/docker/Dockerfile.ci
          push: true
          tags: |
            ${{ steps.meta.outputs.tag }}
            ${{ env.IMAGE }}:latest

  evals:
    # A suite may request a larger runner via `runner:`; all others use the default.
    runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-2' }}
    needs: build-image
    container:
      image: ${{ needs.build-image.outputs.image-tag }}
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      options: --user runner
    timeout-minutes: 25
    strategy:
      # Let every suite finish so the report covers all of them.
      fail-fast: false
      matrix:
        suite:
          - name: llm-judge
            file: test/skill-llm-eval.test.ts
          - name: e2e-browse
            file: test/skill-e2e-bws.test.ts
            runner: ubicloud-standard-8
          - name: e2e-plan
            file: test/skill-e2e-plan.test.ts
          - name: e2e-deploy
            file: test/skill-e2e-deploy.test.ts
          - name: e2e-design
            file: test/skill-e2e-design.test.ts
          - name: e2e-qa-bugs
            file: test/skill-e2e-qa-bugs.test.ts
          - name: e2e-qa-workflow
            file: test/skill-e2e-qa-workflow.test.ts
          - name: e2e-review
            file: test/skill-e2e-review.test.ts
          - name: e2e-workflow
            file: test/skill-e2e-workflow.test.ts
          - name: e2e-routing
            file: test/skill-routing-e2e.test.ts
          - name: e2e-codex
            file: test/codex-e2e.test.ts
          - name: e2e-gemini
            file: test/gemini-e2e.test.ts
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # Bun creates root-owned temp dirs during Docker build. GH Actions runs as
      # runner user with HOME=/github/home. Redirect bun's cache to a writable dir.
      - name: Fix bun temp
        run: |
          mkdir -p /home/runner/.cache/bun
          {
            echo "BUN_INSTALL_CACHE_DIR=/home/runner/.cache/bun"
            echo "BUN_TMPDIR=/home/runner/.cache/bun"
            echo "TMPDIR=/home/runner/.cache"
          } >> "$GITHUB_ENV"

      # Restore pre-installed node_modules from Docker image via recursive
      # copy. Symlink (`ln -s`) breaks bun's module resolution because bun
      # resolves a file's realpath when walking up to find node_modules/<dep>;
      # from a symlinked path, realpath escapes the workspace and sibling
      # deps no longer resolve. Hardlink copy (`cp -al`) fails because /opt
      # and /workspace are on different overlay-fs layers ("Invalid
      # cross-device link"). Recursive copy works on every layout. Cost:
      # ~5s for ~200 packages of small JS files vs ~0s for symlink — still
      # vastly cheaper than rerunning `bun install` (network + resolution).
      #
      # The cache is used only when the package.json snapshot stored inside it
      # matches the checked-out package.json; otherwise fall back to a real install.
      - name: Restore deps
        run: |
          if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then
            cp -r /opt/node_modules_cache node_modules
          else
            bun install
          fi

      - run: bun run build

      # Verify Playwright can launch Chromium (fails fast if sandbox/deps are broken)
      - name: Verify Chromium
        if: matrix.suite.name == 'e2e-browse'
        run: |
          echo "whoami=$(whoami) HOME=$HOME TMPDIR=${TMPDIR:-unset}"
          touch /tmp/.bun-test && rm /tmp/.bun-test && echo "/tmp writable"
          bun -e "import {chromium} from 'playwright';const b=await chromium.launch({args:['--no-sandbox']});console.log('Chromium OK');await b.close()"

      - name: Run ${{ matrix.suite.name }}
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          EVALS_CONCURRENCY: "40"
          PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers
        # --max-concurrency reads EVALS_CONCURRENCY so the two values cannot drift.
        run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency "$EVALS_CONCURRENCY" ${{ matrix.suite.file }}

      # Upload even when the suite failed so the report job can list failures.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-${{ matrix.suite.name }}
          path: ~/.gstack-dev/evals/*.json
          retention-days: 90

  report:
    runs-on: ubicloud-standard-2
    needs: evals
    # always(): report even when some suites failed. PR-only: there is no pull
    # request to comment on for workflow_dispatch runs.
    if: always() && github.event_name == 'pull_request'
    timeout-minutes: 5
    permissions:
      contents: read
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Download all eval artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: eval-*
          path: /tmp/eval-results
          merge-multiple: true

      - name: Post PR comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # shellcheck disable=SC2086
          # Rows are joined with real newlines rather than a literal "\n" that is
          # later expanded by `echo -e`: test names and exit reasons come from
          # the result JSON and may contain backslashes that `echo -e` would
          # reinterpret as escape sequences.
          NL=$'\n'

          RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
          if [ -z "$RESULTS" ]; then
            echo "No eval results found"
            exit 0
          fi

          # Aggregate totals and build one Markdown table row per result file.
          TOTAL=0; PASSED=0; FAILED=0; COST="0"
          SUITE_LINES=""
          for f in $RESULTS; do
            if ! jq -e '.total_tests' "$f" >/dev/null 2>&1; then
              echo "Skipping malformed JSON: $f"
              continue
            fi
            T=$(jq -r '.total_tests // 0' "$f")
            P=$(jq -r '.passed // 0' "$f")
            F=$(jq -r '.failed // 0' "$f")
            C=$(jq -r '.total_cost_usd // 0' "$f")
            TIER=$(jq -r '.tier // "unknown"' "$f")
            # Files that recorded no tests contribute nothing to the table.
            [ "$T" -eq 0 ] && continue
            TOTAL=$((TOTAL + T))
            PASSED=$((PASSED + P))
            FAILED=$((FAILED + F))
            # Costs are decimal values, so sum them with bc rather than $(( )).
            COST=$(echo "$COST + $C" | bc)
            STATUS_ICON="✅"
            [ "$F" -gt 0 ] && STATUS_ICON="❌"
            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |${NL}"
          done

          STATUS="✅ PASS"
          [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"

          # The body must keep starting with "## E2E Evals": the lookup below uses
          # that prefix to find the comment to update. Blank lines keep the
          # heading, table and rule as separate Markdown blocks.
          # NOTE: the "12" figures are hard-coded; keep them in sync with the
          # number of entries in the evals matrix.
          BODY="## E2E Evals: ${STATUS}

          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**

          | Suite | Result | Status | Cost |
          |-------|--------|--------|------|
          ${SUITE_LINES%"$NL"}

          ---

          *12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"

          # Append a per-test failure list when anything failed.
          if [ "$FAILED" -gt 0 ]; then
            FAILURES=""
            for f in $RESULTS; do
              if ! jq -e '.failed' "$f" >/dev/null 2>&1; then continue; fi
              F=$(jq -r '.failed // 0' "$f")
              [ "$F" -eq 0 ] && continue
              FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f" 2>/dev/null || echo "- ⚠️ $(basename "$f"): parse error")
              FAILURES="${FAILURES}${FAILS}${NL}"
            done
            BODY="${BODY}

          ### Failures

          ${FAILURES%"$NL"}"
          fi

          # Update existing comment or create new one.
          # --paginate walks every page of comments; without it only the first
          # page is searched and busy PRs would get a duplicate comment per run.
          COMMENT_ID=$(gh api --paginate repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)
          if [ -n "$COMMENT_ID" ]; then
            gh api "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}" \
              -X PATCH -f body="$BODY"
          else
            gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY"
          fi