# Source: qvac/.github/workflows/perf-report.yml
# at 65f006ef45fc37b76d08b9bdaf3d7d20de236791 (yuranich/qvac on GitHub)
name: Performance Report

on:
  # Weekly run: every Monday at 09:00 UTC. Produces reports for all addons.
  schedule:
    - cron: '0 9 * * 1'
  # Manual run: produces a report for one addon / workflow pairing.
  workflow_dispatch:
    inputs:
      # Value passed to the report script as `--addon`; also used as the
      # prefix of the generated report file names.
      addon:
        description: 'Addon to generate report for'
        required: true
        type: choice
        options:
          - ocr-onnx
          - nmtcpp
          - llamacpp-llm
          - onnx-tts
          - parakeet
      # Display name of the workflow whose recent runs are queried
      # (passed to the report script as `--workflow`).
      workflow_name:
        description: 'Integration test workflow name to query'
        required: true
        type: choice
        options:
          - 'Integration Tests (OCR)'
          - 'Mobile Integration Tests (OCR)'
          - 'Integration Tests (NMTCPP)'
          - 'Integration Tests (LLM)'
          - 'Mobile Integration Tests (LLM)'
          - 'Integration Tests (TTS)'
          - 'Mobile Integration Tests (TTS)'
          - 'Mobile Integration Tests (Parakeet)'
      # How many recent runs to aggregate (passed as `--runs`).
      runs:
        description: 'Number of recent runs to aggregate'
        required: false
        type: number
        default: 6

jobs:
  # Single job: generates the performance reports, optionally scores NMT
  # output with COMET, then publishes the results to the step summary and
  # as a build artifact.
  generate-report:
    runs-on: ubuntu-latest
    # Read-only token scopes. The steps below hand GITHUB_TOKEN to the report
    # scripts as GH_TOKEN. NOTE(review): presumably `actions: read` is what
    # lets those scripts list runs and download artifacts — confirm against
    # scripts/perf-report/.
    permissions:
      contents: read
      actions: read

    steps:
      # Check out the repository so the scripts under scripts/perf-report/
      # invoked by later steps are present in the workspace.
      - name: Checkout code
        uses: actions/checkout@v6

      # Node.js runtime for the `node scripts/perf-report/*.js` invocations.
      # NOTE(review): checkout, setup-node and upload-artifact are referenced
      # by mutable major tags, while setup-python and cache further down are
      # pinned to commit SHAs — consider pinning these the same way.
      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: lts/*

      # Manual (workflow_dispatch) path: one report for the addon/workflow
      # pair chosen in the dispatch form.
      - name: Generate performance report (manual)
        if: ${{ github.event_name == 'workflow_dispatch' }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Pass inputs via env rather than ${{ }} interpolation inside
          # the `run:` block. Even though each input has a constrained
          # `type: choice`/`type: number`, reading them here via env
          # removes GH-Actions expression injection as an attack surface
          # class entirely — `bash` cannot re-evaluate an env var as
          # workflow syntax.
          PERF_ADDON: ${{ inputs.addon }}
          PERF_WORKFLOW: ${{ inputs.workflow_name }}
          PERF_RUNS: ${{ inputs.runs }}
        run: |
          # Create the output directory up front, exactly as the scheduled
          # and COMET steps do, so the report files always have somewhere
          # to land regardless of what the script itself creates.
          mkdir -p reports

          node scripts/perf-report/aggregate.js \
            --addon "$PERF_ADDON" \
            --workflow "$PERF_WORKFLOW" \
            --runs "$PERF_RUNS" \
            --output "reports/${PERF_ADDON}-performance.md" \
            --output-json "reports/${PERF_ADDON}-performance.json" \
            --output-html "reports/${PERF_ADDON}-performance.html"

      # Scheduled path: one report per addon/workflow pairing, each one
      # best-effort so a failing pairing never blocks the ones after it.
      - name: Generate performance reports (scheduled - all addons)
        if: ${{ github.event_name == 'schedule' }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          mkdir -p reports

          # run_report <heading> <addon> <workflow> <output-stem>
          # Prints a banner, then aggregates the last 6 runs of <workflow>
          # into reports/<output-stem>-performance.{md,json,html}.
          # A failing aggregation is tolerated (`|| true`).
          run_report() {
            local heading="$1" addon="$2" workflow="$3" stem="$4"
            echo "=== ${heading} ==="
            node scripts/perf-report/aggregate.js \
              --addon "$addon" \
              --workflow "$workflow" \
              --runs 6 \
              --output "reports/${stem}-performance.md" \
              --output-json "reports/${stem}-performance.json" \
              --output-html "reports/${stem}-performance.html" || true
          }

          run_report "OCR (Desktop)" ocr-onnx "Integration Tests (OCR)" ocr-onnx

          run_report "OCR (Mobile)" ocr-onnx "Mobile Integration Tests (OCR)" ocr-onnx-mobile

          run_report "Translation" nmtcpp "Integration Tests (NMTCPP)" nmtcpp

          run_report "Vision/LLM (Desktop)" llamacpp-llm "Integration Tests (LLM)" llamacpp-llm

          run_report "Vision/LLM (Mobile)" llamacpp-llm "Mobile Integration Tests (LLM)" llamacpp-llm-mobile

          run_report "TTS" onnx-tts "Integration Tests (TTS)" onnx-tts

          run_report "TTS (Mobile)" onnx-tts "Mobile Integration Tests (TTS)" onnx-tts-mobile

          run_report "Parakeet (Mobile)" parakeet "Mobile Integration Tests (Parakeet)" parakeet-mobile

      # ─── Phase B: COMET quality scoring for NMT (weekly aggregate only) ───
      # Runs only on the Monday scheduled trigger, or on workflow_dispatch
      # when inputs.addon == 'nmtcpp'. Intentionally NOT wired into the
      # per-PR desktop or mobile integration workflows — COMET's 2+ GB model
      # and heavier Python environment would blow through per-PR wall-time
      # and mobile bandwidth budgets (see QVAC-17474 Phase B plan).
      #
      # Failure isolation: any failure in the install or scoring steps
      # (model download, pip install, comet-score crash) is contained with
      # `continue-on-error: true`, so the chrF++ output generated by
      # aggregate.js above always ships.
      #
      # Why `always()`: COMET still tries to run even when the aggregate
      # step above fails (which happens when the last N NMTCPP runs
      # don't have perf-report-* artifacts yet — e.g. right after the
      # Phase A pipeline first landed). The COMET script downloads its
      # own copies of the per-run performance-report.json artifacts,
      # so it is independent of aggregate.js's output. If aggregate
      # succeeds, COMET complements it; if aggregate fails, COMET at
      # least emits a stub markdown so the Step Summary isn't empty.
      - name: Setup Python 3.11 for COMET
        # Weekly schedule, or a manual run that targets nmtcpp. `always()`
        # keeps the step eligible even after an earlier step has failed.
        if: >-
          always() &&
          (github.event_name == 'schedule' ||
          (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp'))
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # 6.2.0
        with:
          python-version: '3.11'
          # Cache pip's download cache (~/.cache/pip). The cache key is the
          # hash of `cache-dependency-path`; that points at this workflow
          # file because the unbabel-comet version is pinned inline here
          # rather than in a requirements file. Spares the weekly run
          # ~60–90s of PyPI transfer and a cold download of ~250MB of
          # torch/transformers wheels on every trigger.
          cache: pip
          cache-dependency-path: .github/workflows/perf-report.yml

      - name: Cache HuggingFace model for COMET
        # Same gate as the other COMET steps: weekly schedule, or a manual
        # run that targets nmtcpp, regardless of earlier step failures.
        if: >-
          always() &&
          (github.event_name == 'schedule' ||
          (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp'))
        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # 5.0.4
        with:
          # NOTE(review): presumably where the scoring step's model download
          # lands — confirm against comet-score-nmt.js.
          path: ~/.cache/huggingface/hub
          # Fixed key with no hash or run-specific component: the same cache
          # entry is reused until this string is edited (e.g. bump `v1`).
          key: comet-model-v1-wmt22-comet-da

      - name: Install unbabel-comet
        if: |
          always() && (
            github.event_name == 'schedule' ||
            (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp')
          )
        # Best-effort: an install failure must not fail the job.
        continue-on-error: true
        run: |
          # Always invoke pip through `python -m pip` so both commands act
          # on the interpreter that `python` resolves to, rather than on
          # whichever `pip` executable happens to be first on PATH.
          python -m pip install --upgrade pip
          # Pinned to an exact release so a future 2.2.x patch (or an
          # unexpected PyTorch transitive pin bump) can't silently
          # change the COMET scores or break the weekly run. Bump this
          # deliberately when we want a newer build.
          python -m pip install 'unbabel-comet==2.2.6'
          # Smoke check that the CLI entry point is on PATH; informational
          # only, so a failure here is ignored.
          comet-score --help | head -5 || true

      - name: Score NMT translations with COMET
        # Weekly schedule, or a manual run that targets nmtcpp; runs even if
        # an earlier step failed.
        if: >-
          always() &&
          (github.event_name == 'schedule' ||
          (github.event_name == 'workflow_dispatch' && inputs.addon == 'nmtcpp'))
        # Best-effort: a scoring failure must not fail the job.
        continue-on-error: true
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Same reasoning as the "Generate performance report (manual)"
          # step: avoid ${{ }} interpolation inside a `run:` block.
          PERF_RUNS: ${{ inputs.runs }}
        run: |
          mkdir -p reports
          # `--runs`: workflow_dispatch supplies the `runs` input through
          # PERF_RUNS; on the schedule trigger it is empty, so fall back to 6.
          #
          # `--workflow`: query the umbrella "On PR Trigger (NMTCPP)"
          # workflow — that's where perf-report-* artifacts are attached.
          # The inner "Integration Tests (NMTCPP)" workflow is invoked via
          # workflow_call and its artifacts surface on the umbrella run.
          node scripts/perf-report/comet-score-nmt.js \
            --runs "${PERF_RUNS:-6}" \
            --workflow "On PR Trigger (NMTCPP)" \
            --output reports/nmtcpp-comet.md \
            --model Unbabel/wmt22-comet-da || true

      # Publish every generated Markdown report to the run's summary page.
      # Runs even when earlier steps failed so partial results are visible.
      - name: Write GitHub Step Summary
        if: always()
        run: |
          # One grouped, quoted redirect: the summary file is opened once and
          # the path is safe from word-splitting/globbing.
          {
            echo "# Performance Reports"
            echo ""
            echo "Generated: $(date -u '+%Y-%m-%d %H:%M UTC')"
            echo ""
            for f in reports/*.md; do
              # With no matches the glob stays literal; skip anything that
              # is not a regular file.
              [ -f "$f" ] || continue
              cat "$f"
              echo ""
              echo "---"
              echo ""
            done
          } >> "$GITHUB_STEP_SUMMARY"

      # Archive the whole reports/ directory as a build artifact.
      # `if: always()` uploads whatever was produced even when an earlier
      # step failed.
      - name: Upload reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          # The run number in the name keeps each run's artifact distinct.
          name: performance-reports-${{ github.run_number }}
          path: reports/
          retention-days: 90
          # An empty or missing reports/ directory is not treated as an error.
          if-no-files-found: ignore