-name: Alarm regression
+name: Benchmark Comparison & Alarm Regression

 on:
   workflow_run:
@@ -9,6 +9,9 @@ permissions:
   contents: read
   actions: read
   issues: write
+  pull-requests: write
+  checks: write
+  statuses: write

 jobs:
   comment-if-regressed:
@@ -18,9 +21,334 @@ jobs:
       contains(fromJson('["success","neutral"]'), github.event.workflow_run.conclusion)

     steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
         with:
-          # Checking out full history is required: shallow cloning marks HEAD as "grafted", which breaks
-          # remote tracking and makes it impossible to detect whether a commit is contained in upstream main.
-          fetch-depth: 0
+          python-version: '3.10'
+
+      - name: Install deps
+        run: |
+          python -m pip install --quiet --upgrade wandb
+
+      - name: Download artifacts from triggering run
+        id: dl
+        uses: actions/download-artifact@v4
+        with:
+          name: speed-test-results
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          path: ./artifacts
+
+      - name: Show downloaded files
+        run: |
+          echo "Downloaded into ${{ steps.dl.outputs.download-path }}"
+          ls -la "${{ steps.dl.outputs.download-path }}" || true
+          (command -v tree >/dev/null && tree -a "${{ steps.dl.outputs.download-path }}") || true
+
+      - name: Resolve PR number
+        id: pr
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const wr = context.payload.workflow_run;
+            let pr = (wr.pull_requests && wr.pull_requests[0]) || null;
+
+            // workflow_run payloads omit pull_requests for PRs from forks,
+            // so fall back to the commit-association lookup.
+            if (!pr) {
+              const sha = wr.head_sha;
+              const { data } = await github.rest.repos.listPullRequestsAssociatedWithCommit({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                commit_sha: sha,
+              });
+              if (data && data.length) pr = data[0];
+            }
+
+            core.setOutput('pr_number', pr ? String(pr.number) : '');
+
+      - name: Check regressions
+        if: ${{ steps.pr.outputs.pr_number != '' }}
+        env:
+          # --- W&B ---
+          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+          WANDB_ENTITY: genesis-ai-company
+          WANDB_PROJECT: genesis-benchmarks
+          WANDB_SILENT: "true"
+
+          # --- Parameters ---
+          MAX_REVISIONS: "5"
+          NO_CHANGE_PATIENCE: "100"
+          RUNTIME_REGRESSION_TOLERANCE_PCT: "10"
+          COMPILE_REGRESSION_TOLERANCE_PCT: "10"
+
+          # Input/Output paths
+          ARTIFACTS_DIR: ${{ steps.dl.outputs.download-path }}
+          PR_COMMENT_PATH: pr_comment.md
+        run: |
+          python - <<'PY'
+          import os, sys, json, math, re
+          import statistics as stats
+          import wandb
+
+          SHA_RE = re.compile(r"[0-9a-fA-F]{7,40}")
+
+          def _norm_rev(text):
+              if not text: return None
+              text = text.split("@", 1)[0]
+              m = SHA_RE.search(text)
+              return m.group(0) if m else text
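+          # e.g. _norm_rev("1a2b3c4d@refs/heads/main") -> "1a2b3c4d";
+          # text without a recognizable SHA passes through (minus any "@..." suffix)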
+
+          def _normalize_kv_id(kv: dict, drop_keys=None) -> str:
+              drop = set(drop_keys or [])
+              pairs = []
+              for k, v in kv.items():
+                  if k in drop or v is None: continue
+                  k = str(k).strip(); v = str(v).strip()
+                  if not k or not v: continue
+                  pairs.append((k, v))
+              pairs.sort(key=lambda x: x[0])
+              return "-".join(f"{k}={v}" for k, v in pairs)
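+          # e.g. _normalize_kv_id({"b": "2", "a": "1"}) -> "a=1-b=2" (keys sorted, "k=v" joined by "-")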
+
+          def wandb_normalize_benchmark_id(bid: str) -> str:
+              kv = {}
+              for token in (p.strip() for p in bid.split("-") if p.strip()):
+                  if "=" not in token: continue
+                  k, v = token.split("=", 1)
+                  kv[k.strip()] = v.strip()
+              return _normalize_kv_id(kv)
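+          # Round-trips a stored benchmark_id through the same normalization, so ids
+          # coming from W&B and from artifacts compare equal regardless of key order.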
+
+          def artifacts_parse_speed_txt_lines(lines):
+              METRIC_KEYS = {"compile_time", "runtime_fps", "realtime_factor"}
+              out = {}
+              for line in lines:
+                  if not line.strip(): continue
+                  parts = [p.strip() for p in line.strip().split("|") if "=" in p]
+                  kv = {}
+                  for p in parts:
+                      k, v = p.split("=", 1)
+                      kv[k.strip()] = v.strip()
+                  tid = _normalize_kv_id(kv, drop_keys=METRIC_KEYS)
+                  rt = kv.get("runtime_fps"); ct = kv.get("compile_time")
+                  try: rt = float(rt) if rt is not None else None
+                  except (TypeError, ValueError): rt = None
+                  try: ct = float(ct) if ct is not None else None
+                  except (TypeError, ValueError): ct = None
+                  out[tid] = {"runtime_fps": rt, "compile_time": ct}
+              return out
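+          # Input lines are pipe-separated "key=value" pairs, e.g. (hypothetical keys):
+          # "n_envs=4096 | runtime_fps=123.4 | compile_time=5.6"
+          # Non-metric keys form the benchmark id; the metrics become the entry values.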
+
+          # ----- read workflow_run id (for tag only; no API calls) -----
+          ev = json.load(open(os.environ["GITHUB_EVENT_PATH"], "r", encoding="utf-8"))
+          wr = ev["workflow_run"]
+          wr_id = wr["id"]
+
+          # ----- Load current PR artifacts -----
+          artifacts_path = os.path.abspath(os.environ.get("ARTIFACTS_DIR", "./artifacts"))
+          if not os.path.exists(artifacts_path):
+              print("No artifacts dir; skip."); open(os.environ.get("PR_COMMENT_PATH", "pr_comment.md"), "w").close(); sys.exit(0)
+
+          current_txt_path = None
+          for root, _, files in os.walk(artifacts_path):
+              for fname in files:
+                  if fname.startswith("speed_test") and fname.endswith(".txt"):
+                      current_txt_path = os.path.join(root, fname); break
+              if current_txt_path: break
+
+          if current_txt_path is None:
+              print("No speed_test*.txt; skip."); open(os.environ.get("PR_COMMENT_PATH", "pr_comment.md"), "w").close(); sys.exit(0)
+
+          with open(current_txt_path, "r", encoding="utf-8") as f:
+              current_benchmark = artifacts_parse_speed_txt_lines(f.readlines())
+
+          # ----- W&B login (anonymous allowed) -----
+          if not os.getenv("WANDB_API_KEY"):
+              try: wandb.login(anonymous="allow", relogin=True)
+              except Exception: pass
+
+          # ----- Collect baselines from W&B -----
+          ENTITY = os.environ.get("WANDB_ENTITY", "")
+          PROJECT = os.environ.get("WANDB_PROJECT", "")
+          MAX_REVISIONS = int(os.environ.get("MAX_REVISIONS", "5"))
+          NO_CHANGE_PATIENCE = int(os.environ.get("NO_CHANGE_PATIENCE", "100"))
+          tol_rt = float(os.environ.get("RUNTIME_REGRESSION_TOLERANCE_PCT", "10"))
+          tol_ct = float(os.environ.get("COMPILE_REGRESSION_TOLERANCE_PCT", "10"))
+
+          api = wandb.Api()
+          runs_iter = api.runs(f"{ENTITY}/{PROJECT}", order="-created_at")
+
+          by_rev = {}         # rev -> {bench_id: {runtime_fps, compile_time}}
+          rev_order = []      # latest -> oldest
+          selected_revs = None
+          no_change_streak = 0
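+          # Scan runs newest-first; once MAX_REVISIONS distinct revisions have been
+          # collected, only those revisions count, and the loop exits after
+          # NO_CHANGE_PATIENCE consecutive runs that contribute nothing new.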
+
+          for run in runs_iter:
+              if run.state != "finished": continue
+              # run.config from the public API is a plain dict with values already
+              # unwrapped (no {"value": ...} wrappers)
+              cfg = getattr(run, "config", None) or {}
+              raw_rev = cfg.get("revision"); raw_bid = cfg.get("benchmark_id")
+              rev = _norm_rev(str(raw_rev)) if raw_rev else None
+              bid = str(raw_bid) if raw_bid else None
+              if not rev or not bid:
+                  if selected_revs is not None:
+                      no_change_streak += 1
+                      if no_change_streak >= NO_CHANGE_PATIENCE: break
+                  continue
+
+              if selected_revs is not None and rev not in selected_revs:
+                  no_change_streak += 1
+                  if no_change_streak >= NO_CHANGE_PATIENCE: break
+                  continue
+
+              if rev not in by_rev:
+                  by_rev[rev] = {}
+                  rev_order.append(rev)
+                  if len(rev_order) >= MAX_REVISIONS:
+                      selected_revs = set(rev_order)
+
+              nbid = wandb_normalize_benchmark_id(bid)
+              if nbid not in by_rev[rev]:
+                  # take the first logged values, scanning at most 10 history rows
+                  runtime_fps = None; compile_time = None; cnt = 0
+                  for row in run.scan_history(keys=["runtime_fps", "compile_time"]):
+                      runtime_fps = row.get("runtime_fps")
+                      compile_time = row.get("compile_time")
+                      if runtime_fps is not None and compile_time is not None: break
+                      cnt += 1
+                      if cnt >= 10: break
+                  by_rev[rev][nbid] = {"runtime_fps": runtime_fps, "compile_time": compile_time}
+                  if selected_revs is not None: no_change_streak = 0
+              else:
+                  if selected_revs is not None:
+                      no_change_streak += 1
+                      if no_change_streak >= NO_CHANGE_PATIENCE: break
+
+          # ----- Compare current vs baselines -----
+          def collect_mean(metric_key, bench_id):
+              vals = []
+              for r in by_rev.keys():
+                  v = by_rev.get(r, {}).get(bench_id, {}).get(metric_key)
+                  if isinstance(v, (int, float)) and not (isinstance(v, float) and math.isnan(v)):
+                      vals.append(float(v))
+              return stats.mean(vals) if vals else None
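+          # Baseline = mean of the metric across all collected revisions, skipping
+          # missing/NaN values; None when no revision recorded it for this benchmark.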
+
+          runtime_regs = []; compile_regs = []
+          for bid, m in current_benchmark.items():
+              cur_rt = m.get("runtime_fps"); cur_ct = m.get("compile_time")
+              base_rt = collect_mean("runtime_fps", bid)
+              base_ct = collect_mean("compile_time", bid)
+              if base_rt is not None and isinstance(cur_rt, (int, float)) and not (isinstance(cur_rt, float) and math.isnan(cur_rt)) and base_rt > 0:
+                  d = (cur_rt - base_rt) / base_rt * 100.0
+                  if d < -tol_rt:  # FPS dropped beyond tolerance
+                      runtime_regs.append((bid, cur_rt, base_rt, d))
+              if base_ct is not None and isinstance(cur_ct, (int, float)) and not (isinstance(cur_ct, float) and math.isnan(cur_ct)) and base_ct > 0:
+                  d = (cur_ct - base_ct) / base_ct * 100.0
+                  if d > tol_ct:  # compile time grew beyond tolerance
+                      compile_regs.append((bid, cur_ct, base_ct, d))
+
+          if not runtime_regs and not compile_regs:
+              print("No regression vs baselines; skip comment.")
+              open(os.environ.get("PR_COMMENT_PATH", "pr_comment.md"), "w").close()
+              sys.exit(0)
+
+          def trunc(s, n=120): return s if len(s) <= n else s[:n] + "…"
+
+          lines = []
+          lines.append(":warning: **Benchmark regression detected**")
+          lines.append(f"- Baseline commits considered: **{len(rev_order)}**")
+          for i, r in enumerate(rev_order, 1):
+              lines.append(f"  - Commit {i}: {r}")
+          lines.append(f"- Runtime tolerance: **-{tol_rt:.1f}%**; Compile tolerance: **+{tol_ct:.1f}%**")
+          lines.append("")
+
+          if runtime_regs:
+              runtime_regs.sort(key=lambda x: x[3])  # most negative delta first
+              lines += ["**Runtime FPS regressions (vs mean of other commits)**",
+                        "| benchmark_id | current | baseline mean | delta % |",
+                        "|---|---:|---:|---:|"]
+              for bid, cur, base, d in runtime_regs[:20]:
+                  lines.append(f"| `{trunc(bid)}` | {cur:,.0f} | {base:,.0f} | {d:.2f}% |")
+              if len(runtime_regs) > 20: lines.append("_Only first 20 shown._")
+              lines.append("")
+
+          if compile_regs:
+              compile_regs.sort(key=lambda x: -x[3])  # largest increase first
+              lines += ["**Compile-time regressions (vs mean of other commits)**",
+                        "| benchmark_id | current | baseline mean | delta % |",
+                        "|---|---:|---:|---:|"]
+              for bid, cur, base, d in compile_regs[:20]:
+                  lines.append(f"| `{trunc(bid)}` | {cur:,.0f} | {base:,.0f} | {d:.2f}% |")
+              if len(compile_regs) > 20: lines.append("_Only first 20 shown._")
+              lines.append("")
+
+          lines.append(f"<!-- bench-guard-run:{wr_id} -->")
+          body = "\n".join(lines)
+
+          with open(os.environ.get("PR_COMMENT_PATH", "pr_comment.md"), "w", encoding="utf-8") as f:
+              f.write(body + "\n")
+          print("[INFO] wrote pr_comment.md")
+          PY
+
+          # capture comment into env for next step
+          if [ -s "pr_comment.md" ]; then
+            {
+              echo 'SCRIPT_OUTPUT<<__EOF__'
+              cat pr_comment.md
+              echo '__EOF__'
+            } >> "$GITHUB_ENV"
+          else
+            echo "SCRIPT_OUTPUT=" >> "$GITHUB_ENV"
+          fi
+
+      - name: Add PR comment
+        if: ${{ steps.pr.outputs.pr_number != '' && env.SCRIPT_OUTPUT != '' }}
+        uses: actions/github-script@v7
+        env:
+          COMMENT_BODY: ${{ env.SCRIPT_OUTPUT }}
+        with:
+          script: |
+            const prNumber = Number('${{ steps.pr.outputs.pr_number }}');
+            if (!prNumber) {
+              core.info('No PR number; skipping comment.');
+              return;
+            }
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+              body: process.env.COMMENT_BODY
+            });
+
+      - name: Publish PR check (show in Checks tab)
+        if: always()  # publish the check even if earlier steps failed
+        uses: actions/github-script@v7
+        env:
+          CHECK_NAME: Benchmark Comparison & Alarm Regression
+          COMMENT_BODY: ${{ env.SCRIPT_OUTPUT }}  # pr_comment.md contents exported to env above
+        with:
+          script: |
+            const sha = context.payload.workflow_run.head_sha;
+            const hasBody = Boolean(process.env.COMMENT_BODY && process.env.COMMENT_BODY.trim());
+            const conclusion = hasBody ? 'failure' : 'success';
+            const summary = hasBody
+              ? 'Benchmark regression detected. See details in the output.'
+              : 'No regression detected.';
+
+            // Checks API: this appears in the "Checks" section at the bottom of the PR.
+            await github.rest.checks.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              name: process.env.CHECK_NAME,
+              head_sha: sha,
+              status: 'completed',
+              conclusion,
+              output: {
+                title: process.env.CHECK_NAME,
+                summary,
+                // if length is not a concern, the full comment body can be surfaced as the check output text
+                text: process.env.COMMENT_BODY || undefined
+              }
+            });