# Performance Report #6
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that aggregates integration-test performance data into reports.
name: Performance Report

# Two entry points:
#   * schedule          - weekly cron run (no inputs are available).
#   * workflow_dispatch - manual run for one addon / workflow pair.
on:
  schedule:
    - cron: "0 9 * * 1"  # Every Monday 9am UTC

  workflow_dispatch:
    inputs:
      # Addon identifier; the job passes it to the report script as --addon
      # and uses it to build the output file names.
      addon:
        description: "Addon to generate report for"
        type: choice
        required: true
        options:
          - ocr-onnx
          - nmtcpp
          - llamacpp-llm
          - onnx-tts
          - parakeet

      # Display name of the integration-test workflow whose runs are queried.
      # NOTE(review): nothing here ties this choice to the selected addon, so
      # a mismatched pair can be submitted — confirm the script tolerates it.
      workflow_name:
        description: "Integration test workflow name to query"
        type: choice
        required: true
        options:
          - "Integration Tests (OCR)"
          - "Mobile Integration Tests (OCR)"
          - "Integration Tests (NMTCPP)"
          - "Integration Tests (LLM)"
          - "Mobile Integration Tests (LLM)"
          - "Integration Tests (TTS)"
          - "Mobile Integration Tests (TTS)"
          - "Mobile Integration Tests (Parakeet)"

      # How many recent runs to aggregate; same value (6) the scheduled
      # run hard-codes.
      runs:
        description: "Number of recent runs to aggregate"
        type: number
        required: false
        default: 6
jobs:
  # Single job: builds the performance reports (one addon on manual dispatch,
  # all addons on the weekly schedule), optionally scores NMT output with
  # COMET, then publishes everything under reports/ to the step summary and
  # as a build artifact.
  generate-report:
    runs-on: ubuntu-latest
    # Read-only token scopes: repository contents and Actions data.
    permissions:
      contents: read
      actions: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: lts/*

      - name: Generate performance report (manual)
        if: ${{ github.event_name == 'workflow_dispatch' }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Pass inputs via env rather than ${{ }} interpolation inside
          # the `run:` block. Even though each input has a constrained
          # `type: choice`/`type: number`, reading them here via env
          # removes GH-Actions expression injection as an attack surface
          # class entirely — `bash` cannot re-evaluate an env var as
          # workflow syntax.
          PERF_ADDON: ${{ inputs.addon }}
          PERF_WORKFLOW: ${{ inputs.workflow_name }}
          PERF_RUNS: ${{ inputs.runs }}
        run: |
          # Create the output directory before writing into it, exactly as
          # the scheduled and COMET steps do. Idempotent, so it is harmless
          # if aggregate.js also creates it (not visible from this file).
          mkdir -p reports
          node scripts/perf-report/aggregate.js \
            --addon "$PERF_ADDON" \
            --workflow "$PERF_WORKFLOW" \
            --runs "$PERF_RUNS" \
            --output "reports/${PERF_ADDON}-performance.md" \
            --output-json "reports/${PERF_ADDON}-performance.json" \
            --output-html "reports/${PERF_ADDON}-performance.html"

      # Each invocation ends in `|| true`, so one addon failing to aggregate
      # neither stops the remaining addons nor fails the step.
      - name: Generate performance reports (scheduled - all addons)
        if: ${{ github.event_name == 'schedule' }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          mkdir -p reports
          echo "=== OCR (Desktop) ==="
          node scripts/perf-report/aggregate.js \
            --addon ocr-onnx \
            --workflow "Integration Tests (OCR)" \
            --runs 6 \
            --output reports/ocr-onnx-performance.md \
            --output-json reports/ocr-onnx-performance.json \
            --output-html reports/ocr-onnx-performance.html || true
          echo "=== OCR (Mobile) ==="
          node scripts/perf-report/aggregate.js \
            --addon ocr-onnx \
            --workflow "Mobile Integration Tests (OCR)" \
            --runs 6 \
            --output reports/ocr-onnx-mobile-performance.md \
            --output-json reports/ocr-onnx-mobile-performance.json \
            --output-html reports/ocr-onnx-mobile-performance.html || true
          echo "=== Translation ==="
          node scripts/perf-report/aggregate.js \
            --addon nmtcpp \
            --workflow "Integration Tests (NMTCPP)" \
            --runs 6 \
            --output reports/nmtcpp-performance.md \
            --output-json reports/nmtcpp-performance.json \
            --output-html reports/nmtcpp-performance.html || true
          echo "=== Vision/LLM (Desktop) ==="
          node scripts/perf-report/aggregate.js \
            --addon llamacpp-llm \
            --workflow "Integration Tests (LLM)" \
            --runs 6 \
            --output reports/llamacpp-llm-performance.md \
            --output-json reports/llamacpp-llm-performance.json \
            --output-html reports/llamacpp-llm-performance.html || true
          echo "=== Vision/LLM (Mobile) ==="
          node scripts/perf-report/aggregate.js \
            --addon llamacpp-llm \
            --workflow "Mobile Integration Tests (LLM)" \
            --runs 6 \
            --output reports/llamacpp-llm-mobile-performance.md \
            --output-json reports/llamacpp-llm-mobile-performance.json \
            --output-html reports/llamacpp-llm-mobile-performance.html || true
          echo "=== TTS ==="
          node scripts/perf-report/aggregate.js \
            --addon onnx-tts \
            --workflow "Integration Tests (TTS)" \
            --runs 6 \
            --output reports/onnx-tts-performance.md \
            --output-json reports/onnx-tts-performance.json \
            --output-html reports/onnx-tts-performance.html || true
          echo "=== TTS (Mobile) ==="
          node scripts/perf-report/aggregate.js \
            --addon onnx-tts \
            --workflow "Mobile Integration Tests (TTS)" \
            --runs 6 \
            --output reports/onnx-tts-mobile-performance.md \
            --output-json reports/onnx-tts-mobile-performance.json \
            --output-html reports/onnx-tts-mobile-performance.html || true
          echo "=== Parakeet (Mobile) ==="
          node scripts/perf-report/aggregate.js \
            --addon parakeet \
            --workflow "Mobile Integration Tests (Parakeet)" \
            --runs 6 \
            --output reports/parakeet-mobile-performance.md \
            --output-json reports/parakeet-mobile-performance.json \
            --output-html reports/parakeet-mobile-performance.html || true

      # ─── Phase B: COMET quality scoring for NMT (weekly aggregate only) ───
      # Runs only on the Monday scheduled trigger, or on workflow_dispatch
      # when inputs.addon == 'nmtcpp'. Intentionally NOT wired into per-PR
      # desktop or mobile integration workflows — COMET's 2+ GB model and
      # heavier Python environment would blow through per-PR wall time and
      # mobile bandwidth budgets (see QVAC-17474 Phase B plan).
      #
      # Any failure here (model download, pip install, comet-score crash)
      # is isolated with `continue-on-error: true` so the chrF++ output
      # generated by aggregate.js above always ships.
      #
      # `always()` so COMET still tries to run even when the aggregate
      # step above fails (which happens when the last N NMTCPP runs
      # don't have perf-report-* artifacts yet — e.g. right after the
      # Phase A pipeline first landed). The COMET script downloads its
      # own copies of the per-run performance-report.json artifacts,
      # so it's independent of aggregate.js's output. If aggregate
      # succeeds, COMET complements it; if aggregate fails, COMET at
      # least emits a stub markdown so the Step Summary isn't empty.
      - name: Setup Python 3.11 for COMET
        if: |
          always() && (
            github.event_name == 'schedule' ||
            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
          )
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # 6.2.0
        with:
          python-version: '3.11'
          # `cache: pip` caches ~/.cache/pip keyed on the hash of the
          # `cache-dependency-path` file (we point at this workflow
          # itself, since we pin the unbabel-comet version inline).
          # Saves ~60–90s of PyPI wire time for the weekly run and
          # avoids cold-downloading ~250MB of torch/transformers wheels
          # on every trigger.
          cache: pip
          cache-dependency-path: .github/workflows/perf-report.yml

      # Static cache key: the cached model directory is reused until the key
      # string itself is changed.
      - name: Cache HuggingFace model for COMET
        if: |
          always() && (
            github.event_name == 'schedule' ||
            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
          )
        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7  # 5.0.4
        with:
          path: ~/.cache/huggingface/hub
          key: comet-model-v1-wmt22-comet-da

      - name: Install unbabel-comet
        if: |
          always() && (
            github.event_name == 'schedule' ||
            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
          )
        continue-on-error: true
        run: |
          python -m pip install --upgrade pip
          # Pinned to an exact release so a future 2.2.x patch (or an
          # unexpected PyTorch transitive pin bump) can't silently
          # change the COMET scores or break the weekly run. Bump this
          # deliberately when we want a newer build.
          pip install 'unbabel-comet==2.2.6'
          comet-score --help | head -5 || true

      - name: Score NMT translations with COMET
        if: |
          always() && (
            github.event_name == 'schedule' ||
            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
          )
        continue-on-error: true
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Same reasoning as the "Generate performance report (manual)"
          # step: avoid ${{ }} interpolation inside a `run:` block.
          PERF_RUNS: ${{ inputs.runs }}
        run: |
          mkdir -p reports
          # workflow_dispatch passes `runs` as input; schedule defaults to 6.
          RUNS="${PERF_RUNS:-6}"
          # Query the umbrella "On PR Trigger (NMTCPP)" workflow — that's
          # where perf-report-* artifacts are attached. The inner
          # "Integration Tests (NMTCPP)" workflow is invoked via
          # workflow_call and its artifacts surface on the umbrella run.
          node scripts/perf-report/comet-score-nmt.js \
            --runs "$RUNS" \
            --workflow "On PR Trigger (NMTCPP)" \
            --output reports/nmtcpp-comet.md \
            --model Unbabel/wmt22-comet-da || true

      # Concatenates every markdown report into the job summary. One grouped,
      # quoted redirect replaces the per-line unquoted appends; the emitted
      # text is unchanged.
      - name: Write GitHub Step Summary
        if: always()
        run: |
          {
            echo "# Performance Reports"
            echo ""
            echo "Generated: $(date -u '+%Y-%m-%d %H:%M UTC')"
            echo ""
            for f in reports/*.md; do
              # With no matches the glob stays literal; -f filters that out.
              if [ -f "$f" ]; then
                cat "$f"
                echo ""
                echo "---"
                echo ""
              fi
            done
          } >> "$GITHUB_STEP_SUMMARY"

      # Runs even after earlier failures so partial reports are still kept;
      # an empty or missing reports/ directory is not treated as an error.
      - name: Upload reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: performance-reports-${{ github.run_number }}
          path: reports/
          retention-days: 90
          if-no-files-found: ignore