Skip to content

Performance Report

Performance Report #6

Workflow file for this run

name: Performance Report

on:
  # Weekly trigger; the scheduled step in the job covers every addon.
  schedule:
    - cron: '0 9 * * 1' # Every Monday 9am UTC

  # Manual trigger: one report for the addon/workflow pair picked below.
  workflow_dispatch:
    inputs:
      # Addon identifier forwarded to the report script as `--addon`.
      addon:
        type: choice
        required: true
        description: 'Addon to generate report for'
        options:
          - ocr-onnx
          - nmtcpp
          - llamacpp-llm
          - onnx-tts
          - parakeet
      # Display name of the integration-test workflow whose runs are queried.
      workflow_name:
        type: choice
        required: true
        description: 'Integration test workflow name to query'
        options:
          - 'Integration Tests (OCR)'
          - 'Mobile Integration Tests (OCR)'
          - 'Integration Tests (NMTCPP)'
          - 'Integration Tests (LLM)'
          - 'Mobile Integration Tests (LLM)'
          - 'Integration Tests (TTS)'
          - 'Mobile Integration Tests (TTS)'
          - 'Mobile Integration Tests (Parakeet)'
      # How many recent runs to fold into the report (optional, defaults to 6).
      runs:
        type: number
        required: false
        default: 6
        description: 'Number of recent runs to aggregate'
jobs:
  # Single job: builds the reports, optionally scores NMT output with COMET,
  # then publishes everything to the step summary and as an artifact.
  generate-report:
    runs-on: ubuntu-latest
    # Read-only token scopes: repository contents and Actions data.
    permissions:
      actions: read
      contents: read
    steps:
# Check out the repository so scripts/perf-report/ is available to later steps.
- uses: actions/checkout@v6
  name: Checkout code
# Provide a Node.js runtime (current LTS line) for the `node` report scripts.
- uses: actions/setup-node@v6
  name: Setup Node.js
  with:
    node-version: 'lts/*'
# Manual (workflow_dispatch) path: aggregate one addon/workflow pair chosen
# through the dispatch inputs and write Markdown, JSON and HTML reports.
- name: Generate performance report (manual)
  if: ${{ github.event_name == 'workflow_dispatch' }}
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Pass inputs via env rather than ${{ }} interpolation inside
    # the `run:` block. Even though each input has a constrained
    # `type: choice`/`type: number`, reading them here via env
    # removes GH-Actions expression injection as an attack surface
    # class entirely — `bash` cannot re-evaluate an env var as
    # workflow syntax.
    PERF_ADDON: ${{ inputs.addon }}
    PERF_WORKFLOW: ${{ inputs.workflow_name }}
    PERF_RUNS: ${{ inputs.runs }}
  run: |
    # Create the output directory up front, as the scheduled and COMET
    # steps do, so the report paths below always have a parent directory.
    mkdir -p reports
    node scripts/perf-report/aggregate.js \
      --addon "$PERF_ADDON" \
      --workflow "$PERF_WORKFLOW" \
      --runs "$PERF_RUNS" \
      --output "reports/${PERF_ADDON}-performance.md" \
      --output-json "reports/${PERF_ADDON}-performance.json" \
      --output-html "reports/${PERF_ADDON}-performance.html"
# Scheduled path: build one report set per addon/workflow pair. Each pair is
# best-effort: a failure is surfaced as a warning annotation and the
# remaining pairs still run.
- name: Generate performance reports (scheduled - all addons)
  if: ${{ github.event_name == 'schedule' }}
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    mkdir -p reports

    # aggregate <label> <addon> <workflow> <output-stem>
    #   label     banner text printed to the log
    #   addon     value passed as --addon
    #   workflow  workflow display name passed as --workflow
    #   stem      reports/<stem>-performance.{md,json,html} are written
    # Always returns success so one failing pair cannot abort the step;
    # the failure is reported through a `::warning` workflow command
    # instead of being dropped silently.
    aggregate() {
      local label="$1" addon="$2" workflow="$3" stem="$4"
      echo "=== ${label} ==="
      node scripts/perf-report/aggregate.js \
        --addon "$addon" \
        --workflow "$workflow" \
        --runs 6 \
        --output "reports/${stem}-performance.md" \
        --output-json "reports/${stem}-performance.json" \
        --output-html "reports/${stem}-performance.html" \
        || echo "::warning title=Performance report::aggregate.js failed for ${label} (${workflow})"
    }

    aggregate "OCR (Desktop)"        ocr-onnx     "Integration Tests (OCR)"             ocr-onnx
    aggregate "OCR (Mobile)"         ocr-onnx     "Mobile Integration Tests (OCR)"      ocr-onnx-mobile
    aggregate "Translation"          nmtcpp       "Integration Tests (NMTCPP)"          nmtcpp
    aggregate "Vision/LLM (Desktop)" llamacpp-llm "Integration Tests (LLM)"             llamacpp-llm
    aggregate "Vision/LLM (Mobile)"  llamacpp-llm "Mobile Integration Tests (LLM)"      llamacpp-llm-mobile
    aggregate "TTS"                  onnx-tts     "Integration Tests (TTS)"             onnx-tts
    aggregate "TTS (Mobile)"         onnx-tts     "Mobile Integration Tests (TTS)"      onnx-tts-mobile
    aggregate "Parakeet (Mobile)"    parakeet     "Mobile Integration Tests (Parakeet)" parakeet-mobile
# ─── Phase B: COMET quality scoring for NMT (weekly aggregate only) ───
# Runs only on the Monday scheduled trigger, or on workflow_dispatch
# when inputs.addon == 'nmtcpp'. Intentionally NOT wired into per-PR
# desktop or mobile integration workflows — COMET's 2+ GB model and
# heavier Python environment would blow through per-PR wall time and
# mobile bandwidth budgets (see QVAC-17474 Phase B plan).
#
# Any failure here (model download, pip install, comet-score crash)
# is isolated with `continue-on-error: true` so the chrF++ output
# generated by aggregate.js above always ships.
# Each COMET step's `if:` includes `always()` so that COMET still
# tries to run even when the aggregate step above fails (which
# happens when the last N NMTCPP runs don't have perf-report-*
# artifacts yet — e.g. right after the Phase A pipeline first
# landed). The COMET script downloads its own copies of the per-run
# performance-report.json artifacts, so it's independent of
# aggregate.js's output. If aggregate succeeds, COMET complements
# it; if aggregate fails, COMET at least emits a stub markdown so
# the Step Summary isn't empty.
# Python toolchain for COMET. Runs on the schedule trigger, or on a manual
# dispatch for the nmtcpp addon; `always()` keeps it eligible even when an
# earlier step failed.
- name: Setup Python 3.11 for COMET
  if: >-
    always() &&
    (github.event_name == 'schedule' ||
    (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp'))
  # Best-effort like the other COMET steps: a toolchain or pip-cache setup
  # failure must not fail the job.
  continue-on-error: true
  uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # 6.2.0
  with:
    python-version: '3.11'
    # `cache: pip` caches ~/.cache/pip keyed on the hash of the
    # `cache-dependency-path` file (we point at this workflow
    # itself, since we pin the unbabel-comet version inline).
    # Saves ~60–90s of PyPI wire time for the weekly run and
    # avoids cold-downloading ~250MB of torch/transformers wheels
    # on every trigger.
    cache: pip
    cache-dependency-path: .github/workflows/perf-report.yml
# Persist the HuggingFace hub directory between runs under a fixed cache key.
# Same trigger gating as the other COMET steps.
- name: Cache HuggingFace model for COMET
  if: >-
    always() &&
    (github.event_name == 'schedule' ||
    (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp'))
  uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # 5.0.4
  with:
    key: comet-model-v1-wmt22-comet-da
    path: ~/.cache/huggingface/hub
# Install the pinned COMET release. Failures are tolerated
# (`continue-on-error`) so they cannot fail the job.
- name: Install unbabel-comet
  if: >-
    always() &&
    (github.event_name == 'schedule' ||
    (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp'))
  continue-on-error: true
  run: |
    python -m pip install --upgrade pip
    # Exact version pin: a later 2.2.x patch release, or a surprise bump
    # of a transitive PyTorch pin, must not silently shift COMET scores
    # or break the weekly run. Change this version on purpose only.
    pip install 'unbabel-comet==2.2.6'
    # Smoke check that the CLI is on PATH; never fails the step.
    comet-score --help | head -5 || true
# Run the COMET scoring script over recent NMTCPP runs and write
# reports/nmtcpp-comet.md. Best-effort: the script's exit status is ignored
# and the step itself is `continue-on-error`.
- name: Score NMT translations with COMET
  if: >-
    always() &&
    (github.event_name == 'schedule' ||
    (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp'))
  continue-on-error: true
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # The input reaches the script through the environment rather than
    # through ${{ }} interpolation inside the `run:` block.
    PERF_RUNS: ${{ inputs.runs }}
  run: |
    mkdir -p reports
    # `runs` comes from the workflow_dispatch input; on schedule PERF_RUNS
    # is empty, so the `:-6` fallback applies.
    #
    # The query targets the umbrella "On PR Trigger (NMTCPP)" workflow:
    # that is where perf-report-* artifacts are attached. The inner
    # "Integration Tests (NMTCPP)" workflow is invoked via workflow_call
    # and its artifacts surface on the umbrella run.
    node scripts/perf-report/comet-score-nmt.js \
      --runs "${PERF_RUNS:-6}" \
      --workflow "On PR Trigger (NMTCPP)" \
      --output reports/nmtcpp-comet.md \
      --model Unbabel/wmt22-comet-da || true
# Publish every generated Markdown report to the run's step summary.
# Runs even when earlier steps failed so partial results are still shown.
- name: Write GitHub Step Summary
  if: always()
  run: |
    # One grouped, quoted redirect instead of a separate unquoted
    # `>> $GITHUB_STEP_SUMMARY` on every line.
    {
      echo "# Performance Reports"
      echo ""
      echo "Generated: $(date -u '+%Y-%m-%d %H:%M UTC')"
      echo ""

      found=0
      for f in reports/*.md; do
        # With no matches the glob stays literal; skip anything that is
        # not a regular file.
        [ -f "$f" ] || continue
        found=1
        cat "$f"
        echo ""
        echo "---"
        echo ""
      done

      # Say so explicitly rather than leaving a header-only summary.
      if [ "$found" -eq 0 ]; then
        echo "_No report files were generated._"
      fi
    } >> "$GITHUB_STEP_SUMMARY"
# Archive the whole reports/ directory as a run artifact for 90 days.
# Runs regardless of earlier failures and tolerates an empty directory.
- name: Upload reports
  if: always()
  uses: actions/upload-artifact@v4
  with:
    path: reports/
    name: performance-reports-${{ github.run_number }}
    if-no-files-found: ignore
    retention-days: 90