Performance Report #6

Workflow file for this run

.github/workflows/perf-report.yml at 65f006e

	# Performance Report workflow.
	#
	# Aggregates performance data from recent integration-test runs into
	# Markdown/JSON/HTML reports under `reports/`, optionally adds COMET quality
	# scores for the NMT addon, writes every Markdown report to the GitHub Step
	# Summary and uploads the `reports/` directory as an artifact.
	#
	# Triggers:
	#   - schedule:          every Monday 09:00 UTC, reports for all addons
	#   - workflow_dispatch: one addon / workflow pair chosen by the caller
	name: Performance Report

	on:
	  schedule:
	    - cron: "0 9 * * 1"  # Every Monday 9am UTC
	  workflow_dispatch:
	    # NOTE(review): `addon` and `workflow_name` are selected independently;
	    # nothing in this workflow validates that the pair belongs together.
	    inputs:
	      addon:
	        description: "Addon to generate report for"
	        type: choice
	        required: true
	        options:
	          - ocr-onnx
	          - nmtcpp
	          - llamacpp-llm
	          - onnx-tts
	          - parakeet
	      workflow_name:
	        description: "Integration test workflow name to query"
	        type: choice
	        required: true
	        options:
	          - "Integration Tests (OCR)"
	          - "Mobile Integration Tests (OCR)"
	          - "Integration Tests (NMTCPP)"
	          - "Integration Tests (LLM)"
	          - "Mobile Integration Tests (LLM)"
	          - "Integration Tests (TTS)"
	          - "Mobile Integration Tests (TTS)"
	          - "Mobile Integration Tests (Parakeet)"
	      runs:
	        description: "Number of recent runs to aggregate"
	        type: number
	        required: false
	        default: 6

	jobs:
	  generate-report:
	    runs-on: ubuntu-latest
	    # Read-only token: the job only checks out code and reads workflow
	    # runs/artifacts through GH_TOKEN.
	    permissions:
	      contents: read
	      actions: read

	    steps:
	      # NOTE(review): checkout, setup-node and upload-artifact are referenced
	      # by mutable major-version tag, while setup-python and cache below are
	      # pinned to a commit SHA. Consider pinning these too for consistency.
	      - name: Checkout code
	        uses: actions/checkout@v6

	      - name: Setup Node.js
	        uses: actions/setup-node@v6
	        with:
	          node-version: 'lts/*'

	      - name: Generate performance report (manual)
	        if: ${{ github.event_name == 'workflow_dispatch' }}
	        env:
	          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	          # Pass inputs via env rather than ${{ }} interpolation inside
	          # the `run:` block. Even though each input has a constrained
	          # `type: choice`/`type: number`, reading them here via env
	          # removes GH-Actions expression injection as an attack surface
	          # class entirely — `bash` cannot re-evaluate an env var as
	          # workflow syntax.
	          PERF_ADDON: ${{ inputs.addon }}
	          PERF_WORKFLOW: ${{ inputs.workflow_name }}
	          PERF_RUNS: ${{ inputs.runs }}
	        run: |
	          # Create the output directory up front, as the scheduled and
	          # COMET steps do, instead of relying on the script to create it.
	          mkdir -p reports

	          node scripts/perf-report/aggregate.js \
	            --addon "$PERF_ADDON" \
	            --workflow "$PERF_WORKFLOW" \
	            --runs "$PERF_RUNS" \
	            --output "reports/${PERF_ADDON}-performance.md" \
	            --output-json "reports/${PERF_ADDON}-performance.json" \
	            --output-html "reports/${PERF_ADDON}-performance.html"

	      - name: Generate performance reports (scheduled - all addons)
	        if: ${{ github.event_name == 'schedule' }}
	        env:
	          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        run: |
	          mkdir -p reports

	          # generate_report LABEL ADDON WORKFLOW BASENAME
	          #
	          # Aggregates the last 6 runs of WORKFLOW for ADDON and writes
	          # reports/BASENAME.{md,json,html}. Best-effort: a failing report
	          # is surfaced as a warning annotation but never fails the step,
	          # so one broken addon cannot block the remaining reports.
	          generate_report() {
	            local label="$1" addon="$2" workflow="$3" base="$4"

	            echo "=== ${label} ==="
	            node scripts/perf-report/aggregate.js \
	              --addon "$addon" \
	              --workflow "$workflow" \
	              --runs 6 \
	              --output "reports/${base}.md" \
	              --output-json "reports/${base}.json" \
	              --output-html "reports/${base}.html" \
	              || echo "::warning::Performance report failed for ${label} (${workflow})"
	          }

	          generate_report "OCR (Desktop)" ocr-onnx \
	            "Integration Tests (OCR)" ocr-onnx-performance

	          generate_report "OCR (Mobile)" ocr-onnx \
	            "Mobile Integration Tests (OCR)" ocr-onnx-mobile-performance

	          generate_report "Translation" nmtcpp \
	            "Integration Tests (NMTCPP)" nmtcpp-performance

	          generate_report "Vision/LLM (Desktop)" llamacpp-llm \
	            "Integration Tests (LLM)" llamacpp-llm-performance

	          generate_report "Vision/LLM (Mobile)" llamacpp-llm \
	            "Mobile Integration Tests (LLM)" llamacpp-llm-mobile-performance

	          generate_report "TTS" onnx-tts \
	            "Integration Tests (TTS)" onnx-tts-performance

	          generate_report "TTS (Mobile)" onnx-tts \
	            "Mobile Integration Tests (TTS)" onnx-tts-mobile-performance

	          generate_report "Parakeet (Mobile)" parakeet \
	            "Mobile Integration Tests (Parakeet)" parakeet-mobile-performance

	      # ─── Phase B: COMET quality scoring for NMT (weekly aggregate only) ───
	      # Runs only on the Monday scheduled trigger, or on workflow_dispatch
	      # when inputs.addon == 'nmtcpp'. Intentionally NOT wired into per-PR
	      # desktop or mobile integration workflows — COMET's 2+ GB model and
	      # heavier Python environment would blow through per-PR wall time and
	      # mobile bandwidth budgets (see QVAC-17474 Phase B plan).
	      #
	      # Any failure here (model download, pip install, comet-score crash)
	      # is isolated with `continue-on-error: true` so the chrF++ output
	      # generated by aggregate.js above always ships.
	      #
	      # `always()` so COMET still tries to run even when the aggregate
	      # step above fails (which happens when the last N NMTCPP runs
	      # don't have perf-report-* artifacts yet — e.g. right after the
	      # Phase A pipeline first landed). The COMET script downloads its
	      # own copies of the per-run performance-report.json artifacts,
	      # so it's independent of aggregate.js's output. If aggregate
	      # succeeds, COMET complements it; if aggregate fails, COMET at
	      # least emits a stub markdown so the Step Summary isn't empty.
	      - name: Setup Python 3.11 for COMET
	        if: |
	          always() && (
	            github.event_name == 'schedule' ||
	            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
	          )
	        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # 6.2.0
	        with:
	          python-version: '3.11'
	          # `cache: pip` caches ~/.cache/pip keyed on the hash of the
	          # `cache-dependency-path` file (we point at this workflow
	          # itself, since we pin the unbabel-comet version inline).
	          # Saves ~60–90s of PyPI wire time for the weekly run and
	          # avoids cold-downloading ~250MB of torch/transformers wheels
	          # on every trigger.
	          cache: pip
	          cache-dependency-path: .github/workflows/perf-report.yml

	      - name: Cache HuggingFace model for COMET
	        if: |
	          always() && (
	            github.event_name == 'schedule' ||
	            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
	          )
	        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # 5.0.4
	        with:
	          path: ~/.cache/huggingface/hub
	          # Static key: the cache is reused until the `v1` prefix or the
	          # model name in the key is changed by hand.
	          key: comet-model-v1-wmt22-comet-da

	      - name: Install unbabel-comet
	        if: |
	          always() && (
	            github.event_name == 'schedule' ||
	            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
	          )
	        continue-on-error: true
	        run: |
	          python -m pip install --upgrade pip
	          # Pinned to an exact release so a future 2.2.x patch (or an
	          # unexpected PyTorch transitive pin bump) can't silently
	          # change the COMET scores or break the weekly run. Bump this
	          # deliberately when we want a newer build.
	          pip install 'unbabel-comet==2.2.6'
	          # Smoke check only; never fails the step.
	          comet-score --help | head -5 || true

	      - name: Score NMT translations with COMET
	        if: |
	          always() && (
	            github.event_name == 'schedule' ||
	            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
	          )
	        continue-on-error: true
	        env:
	          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	          # Same reasoning as the "Generate performance report (manual)"
	          # step: avoid ${{ }} interpolation inside a `run:` block.
	          PERF_RUNS: ${{ inputs.runs }}
	        run: |
	          mkdir -p reports
	          # workflow_dispatch passes `runs` as input; schedule defaults to 6.
	          RUNS="${PERF_RUNS:-6}"
	          # Query the umbrella "On PR Trigger (NMTCPP)" workflow — that's
	          # where perf-report-* artifacts are attached. The inner
	          # "Integration Tests (NMTCPP)" workflow is invoked via
	          # workflow_call and its artifacts surface on the umbrella run.
	          node scripts/perf-report/comet-score-nmt.js \
	            --runs "$RUNS" \
	            --workflow "On PR Trigger (NMTCPP)" \
	            --output reports/nmtcpp-comet.md \
	            --model Unbabel/wmt22-comet-da || true

	      - name: Write GitHub Step Summary
	        if: always()
	        run: |
	          # Emit the whole summary through one quoted redirect.
	          {
	            echo "# Performance Reports"
	            echo ""
	            echo "Generated: $(date -u '+%Y-%m-%d %H:%M UTC')"
	            echo ""
	            for f in reports/*.md; do
	              # When nothing matches, the glob stays literal; skip it.
	              [ -f "$f" ] || continue
	              cat "$f"
	              echo ""
	              echo "---"
	              echo ""
	            done
	          } >> "$GITHUB_STEP_SUMMARY"

	      - name: Upload reports
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: performance-reports-${{ github.run_number }}
	          path: reports/
	          retention-days: 90
	          # An empty or missing reports/ directory is not an error.
	          if-no-files-found: ignore

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Performance Report #6

Workflow file

Performance Report #6

Uh oh!

Workflow file for this run