@@ -19,28 +19,20 @@ jobs:
1919 contains(fromJson('["success","neutral"]'), github.event.workflow_run.conclusion)
2020
2121 steps :
22- - name : Checkout base repo (default branch)
23- uses : actions/checkout@v4
24- with :
25- # base(업스트림) 기본 브랜치의 최신 스크립트를 사용
26- fetch-depth : 0
27-
2822 - name : Setup Python
2923 uses : actions/setup-python@v5
3024 with :
3125 python-version : ' 3.10'
3226
3327 - name : Install deps
3428 run : |
35- python -m pip install --quiet --upgrade requests wandb
29+ python -m pip install --quiet --upgrade wandb
3630
3731 - name : Download artifacts from triggering run
3832 id : dl
3933 uses : actions/download-artifact@v4
4034 with :
41- # Production에서 업로드한 아티팩트 이름 (고정 이름이라면 그대로 사용)
4235 name : speed-test-results
43- # 트리거한 워크플로우 런 ID로 정확히 해당 실행의 아티팩트를 가져옴
4436 run-id : ${{ github.event.workflow_run.id }}
4537 github-token : ${{ secrets.GITHUB_TOKEN }}
4638 path : ./artifacts
@@ -49,34 +41,258 @@ jobs:
4941 run : |
5042 echo "Downloaded into ${{ steps.dl.outputs.download-path }}"
5143 ls -la ${{ steps.dl.outputs.download-path }} || true
52- echo
53- echo "Tree:"
5444 (command -v tree >/dev/null && tree -a ${{ steps.dl.outputs.download-path }}) || true
5545
56- - name : Run W&B regression commenter
46+ - name : Check regressions
5747 env :
5848 GITHUB_TOKEN : ${{ github.token }}
5949 GITHUB_API_URL : ${{ github.api_url }}
6050
6151 # --- W&B ---
62- WANDB_API_KEY : ${{ secrets.WANDB_API_KEY }} # 공개 프로젝트면 없어도 동작(anonymous)
52+ WANDB_API_KEY : ${{ secrets.WANDB_API_KEY }}
6353 WANDB_ENTITY : genesis-ai-company
6454 WANDB_PROJECT : genesis-benchmarks
6555 WANDB_SILENT : " true"
6656
67- # --- 수집/비교 파라미터 ---
68- MAX_RUNS : " 1000"
57+ # --- Parameters ---
6958 MAX_REVISIONS : " 5"
7059 NO_CHANGE_PATIENCE : " 100"
7160 RUNTIME_REGRESSION_TOLERANCE_PCT : " 10"
7261 COMPILE_REGRESSION_TOLERANCE_PCT : " 10"
7362
63+ # Input/Output paths
7464 ARTIFACTS_DIR : ${{ steps.dl.outputs.download-path }}
65+ PR_COMMENT_PATH : pr_comment.md
7566 run : |
76- python test_wandb.py
67+ python - <<'PY'
68+ import os, sys, json, math, re, requests
69+ import statistics as stats
70+ import wandb
71+
72+ SHA_RE = re.compile(r"[0-9a-fA-F]{7,40}")
73+
74+ def _norm_rev(text):
75+ if not text: return None
76+ text = text.split("@", 1)[0]
77+ m = SHA_RE.search(text)
78+ return m.group(0) if m else text
79+
80+ def _normalize_kv_id(kv: dict, drop_keys=None) -> str:
81+ drop = set(drop_keys or [])
82+ pairs = []
83+ for k, v in kv.items():
84+ if k in drop or v is None: continue
85+ k = str(k).strip(); v = str(v).strip()
86+ if not k or not v: continue
87+ pairs.append((k, v))
88+ pairs.sort(key=lambda x: x[0])
89+ return "-".join(f"{k}={v}" for k, v in pairs)
90+
91+ def wandb_normalize_benchmark_id(bid: str) -> str:
92+ kv = {}
93+ for token in (p.strip() for p in bid.split("-") if p.strip()):
94+ if "=" not in token: continue
95+ k, v = token.split("=", 1)
96+ kv[k.strip()] = v.strip()
97+ return _normalize_kv_id(kv)
98+
99+ def artifacts_parse_speed_txt_lines(lines):
100+ METRIC_KEYS = {"compile_time", "runtime_fps", "realtime_factor"}
101+ out = {}
102+ for line in lines:
103+ if not line.strip(): continue
104+ parts = [p.strip() for p in line.strip().split("|") if "=" in p]
105+ kv = {}
106+ for p in parts:
107+ k, v = p.split("=", 1)
108+ kv[k.strip()] = v.strip()
109+ tid = _normalize_kv_id(kv, drop_keys=METRIC_KEYS)
110+ rt = kv.get("runtime_fps"); ct = kv.get("compile_time")
111+ try: rt = float(rt) if rt is not None else None
112+ except: rt = None
113+ try: ct = float(ct) if ct is not None else None
114+ except: ct = None
115+ out[tid] = {"runtime_fps": rt, "compile_time": ct}
116+ return out
117+
# ----- Read event / find PR -----
# The workflow_run event payload carries the triggering run and repository.
API = os.environ.get("GITHUB_API_URL", "https://api.github.com")
ev = json.load(open(os.environ["GITHUB_EVENT_PATH"], "r", encoding="utf-8"))
wr = ev["workflow_run"]
repo= ev["repository"]
owner, name = repo["full_name"].split("/", 1)
head_sha = wr.get("head_sha")
wr_id = wr["id"]

s = requests.Session()
s.headers.update({
    "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    "Accept": "application/vnd.github+json",
})

# workflow_run.pull_requests can be empty (e.g. for fork PRs); fall back to
# the "list pull requests associated with a commit" API using the head SHA.
prs = wr.get("pull_requests") or []
pr = prs[0] if prs else None
if not pr:
    r = s.get(f"{API}/repos/{owner}/{name}/commits/{head_sha}/pulls",
              headers={"Accept":"application/vnd.github.groot-preview+json"})
    if r.ok and r.json(): pr = r.json()[0]
if not pr:
    # No PR to comment on: write an empty comment file so the shell step
    # that posts the comment is skipped, then exit successfully.
    print("No PR found; skip."); open(os.environ.get("PR_COMMENT_PATH","pr_comment.md"),"w").close(); sys.exit(0)

# NOTE(review): pr_num and target_repo_api are assigned but never used later
# in this script — confirm whether they can be removed.
pr_num = pr["number"]
target_repo_api = pr["base"]["repo"]["url"]
144+
# ----- Load current PR artifacts -----
artifacts_path = os.path.abspath(os.environ.get("ARTIFACTS_DIR", "./artifacts"))
if not os.path.exists(artifacts_path):
    # Nothing was downloaded: write an empty comment file so the posting
    # step is skipped, and exit successfully.
    print("No artifacts dir; skip."); open(os.environ.get("PR_COMMENT_PATH","pr_comment.md"),"w").close(); sys.exit(0)

# Use the first speed_test*.txt found anywhere under the artifact directory.
current_txt_path = None
for root, _, files in os.walk(artifacts_path):
    for fname in files:
        if fname.startswith("speed_test") and fname.endswith(".txt"):
            current_txt_path = os.path.join(root, fname); break
    if current_txt_path: break

if current_txt_path is None:
    # Same empty-file convention as above: no comment gets posted.
    print("No speed_test*.txt; skip."); open(os.environ.get("PR_COMMENT_PATH","pr_comment.md"),"w").close(); sys.exit(0)

# benchmark_id -> {"runtime_fps": float|None, "compile_time": float|None}
with open(current_txt_path, "r", encoding="utf-8") as f:
    current_benchmark = artifacts_parse_speed_txt_lines(f.readlines())
162+
# ----- W&B login (anonymous allowed) -----
if not os.getenv("WANDB_API_KEY"):
    # Public projects can be read anonymously; best-effort, failures ignored.
    try: wandb.login(anonymous="allow", relogin=True)
    except Exception: pass

# ----- Collect baselines from W&B -----
ENTITY = os.environ.get("WANDB_ENTITY","")
PROJECT= os.environ.get("WANDB_PROJECT","")
MAX_REVISIONS = int(os.environ.get("MAX_REVISIONS","5"))              # baseline commits to compare against
NO_CHANGE_PATIENCE = int(os.environ.get("NO_CHANGE_PATIENCE","100"))  # early-stop after N unproductive runs
tol_rt = float(os.environ.get("RUNTIME_REGRESSION_TOLERANCE_PCT","10"))  # allowed FPS drop, percent
tol_ct = float(os.environ.get("COMPILE_REGRESSION_TOLERANCE_PCT","10"))  # allowed compile-time growth, percent

api = wandb.Api()
# Newest runs first; the iterator pages through the project lazily, so the
# loop below bounds how much is actually fetched.
runs_iter = api.runs(f"{ENTITY}/{PROJECT}", order="-created_at")
77178
78- # if pr_comment.md exists, inject it to the next step as an ENV
79- if [ -f "pr_comment.md" ]; then
by_rev = {}           # rev -> {bench_id: {runtime_fps, compile_time}}
rev_order = []        # latest -> oldest
selected_revs = None  # frozen set of baseline revs once MAX_REVISIONS are seen
no_change_streak = 0  # consecutive runs that contributed nothing (early-stop)

for run in runs_iter:
    if run.state != "finished": continue
    cfg = getattr(run, "config", None)
    if cfg is None: continue
    # NOTE(review): the W&B public API's Run.config is normally already a
    # dict, which would make json.loads raise TypeError here; the
    # `.get("value")` accesses below match the wrapped layout of
    # `run.json_config` instead — confirm against the wandb version in use.
    cfg = json.loads(cfg)
    raw_rev = cfg.get("revision"); raw_bid = cfg.get("benchmark_id")
    if not raw_rev or not raw_bid:
        # Unusable run; only counts against patience once the baseline
        # revision set has been frozen.
        if selected_revs is not None:
            no_change_streak += 1
            if no_change_streak >= NO_CHANGE_PATIENCE: break
        continue

    rev = _norm_rev(raw_rev.get("value"))
    bid = raw_bid.get("value")
    if not rev or not bid:
        if selected_revs is not None:
            no_change_streak += 1
            if no_change_streak >= NO_CHANGE_PATIENCE: break
        continue

    # After freezing, runs from other revisions only feed the early-stop counter.
    if selected_revs is not None and rev not in selected_revs:
        no_change_streak += 1
        if no_change_streak >= NO_CHANGE_PATIENCE: break
        continue

    if rev not in by_rev:
        by_rev[rev] = {}
        rev_order.append(rev)
        if len(rev_order) >= MAX_REVISIONS:
            # Freeze the baseline set at the newest MAX_REVISIONS revisions.
            selected_revs = set(rev_order)

    if wandb_normalize_benchmark_id(bid) not in by_rev[rev]:
        # pull first rows with metrics
        runtime_fps = None; compile_time = None; cnt = 0
        # Scan at most 10 history rows for the first one carrying both metrics.
        for row in run.scan_history(keys=["runtime_fps","compile_time"]):
            runtime_fps = row.get("runtime_fps")
            compile_time = row.get("compile_time")
            if runtime_fps is not None and compile_time is not None: break
            cnt += 1
            if cnt >= 10: break
        nbid = wandb_normalize_benchmark_id(bid)
        by_rev[rev][nbid] = {"runtime_fps": runtime_fps, "compile_time": compile_time}
        if selected_revs is not None: no_change_streak = 0
    else:
        # Benchmark already recorded for this revision: unproductive run.
        if selected_revs is not None:
            no_change_streak += 1
            if no_change_streak >= NO_CHANGE_PATIENCE: break
231+
# ----- Compare current vs baselines -----
def collect_mean(metric_key, bench_id, revisions=None):
    """Mean of `metric_key` for `bench_id` across baseline revisions.

    revisions: mapping rev -> {bench_id: {metric: value}}; defaults to the
    module-level `by_rev` collected above (the parameter is a backward-
    compatible generalization that decouples the helper from the global).
    Non-numeric and NaN samples are skipped. Returns None when no usable
    samples exist.
    """
    data = by_rev if revisions is None else revisions
    vals = []
    for rev_metrics in data.values():
        v = rev_metrics.get(bench_id, {}).get(metric_key)
        if isinstance(v, (int, float)) and not (isinstance(v, float) and math.isnan(v)):
            vals.append(float(v))
    return stats.mean(vals) if vals else None
240+
# Record only deltas that exceed the configured tolerances (tol_rt / tol_ct
# were previously parsed from the env but never applied, so every benchmark
# landed in the "regressions" tables): runtime regresses when FPS drops by
# more than tol_rt percent; compile time regresses when it grows by more
# than tol_ct percent.
def _finite(x):
    # True for real numbers that are not NaN (bools pass, as before).
    return isinstance(x, (int, float)) and not (isinstance(x, float) and math.isnan(x))

runtime_regs = []
compile_regs = []
for bid, metrics in current_benchmark.items():
    cur_rt = metrics.get("runtime_fps")
    cur_ct = metrics.get("compile_time")
    base_rt = collect_mean("runtime_fps", bid)
    base_ct = collect_mean("compile_time", bid)
    if base_rt is not None and _finite(cur_rt) and base_rt > 0:
        d = (cur_rt - base_rt) / base_rt * 100.0
        if d <= -tol_rt:  # FPS dropped beyond tolerance
            runtime_regs.append((bid, cur_rt, base_rt, d))
    if base_ct is not None and _finite(cur_ct) and base_ct > 0:
        d = (cur_ct - base_ct) / base_ct * 100.0
        if d >= tol_ct:  # compile time grew beyond tolerance
            compile_regs.append((bid, cur_ct, base_ct, d))
252+
def trunc(s, n=120):
    """Return s unchanged when it fits in n chars, else cut to n and add '…'."""
    if len(s) <= n:
        return s
    return s[:n] + "…"
254+
# ----- Build PR comment -----
comment_path = os.environ.get("PR_COMMENT_PATH", "pr_comment.md")

if not runtime_regs and not compile_regs:
    # Previously the ":warning: regression detected" header was written (and
    # therefore posted) unconditionally. Leave the comment file empty instead,
    # so the `[ -s pr_comment.md ]` gate in the shell below skips posting —
    # the same convention the no-PR / no-artifact early exits use.
    open(comment_path, "w", encoding="utf-8").close()
    print(f"[INFO] no regressions; wrote empty {comment_path}")
    sys.exit(0)

lines = []
lines.append(":warning: **Benchmark regression detected (vs W&B history)**")
lines.append(f"- Baseline commits considered: **{len(rev_order)}**")
for i, r in enumerate(rev_order, 1):
    lines.append(f"  - Commit {i}: {r}")
lines.append(f"- Runtime tolerance: **-{tol_rt:.1f}%**; Compile tolerance: **+{tol_ct:.1f}%**")
lines.append("")

def _append_table(title, rows):
    # Shared markdown renderer for the two regression tables (top 20 rows).
    lines.append(title)
    lines.append("| benchmark_id | current | baseline mean | delta % |")
    lines.append("|---|---:|---:|---:|")
    for bid, cur, base, d in rows[:20]:
        lines.append(f"| `{trunc(bid)}` | {cur:,.0f} | {base:,.0f} | {d:.2f}% |")
    if len(rows) > 20:
        lines.append("_Only first 20 shown._")
    # Blank separator was previously only emitted when >20 rows; emit always.
    lines.append("")

if runtime_regs:
    runtime_regs.sort(key=lambda x: x[3])   # worst (most negative delta) first
    _append_table("**Runtime FPS regressions (vs mean of other revisions)**", runtime_regs)

if compile_regs:
    compile_regs.sort(key=lambda x: -x[3])  # worst (largest increase) first
    _append_table("**Compile-time regressions (vs mean of other revisions)**", compile_regs)

# Hidden marker lets later tooling find/deduplicate this comment by run id.
tag = f"bench-guard-run:{wr_id}"
lines.append(f"<!-- {tag} -->")
body = "\n".join(lines)

print("********* Body: ")
print(body)

with open(comment_path, "w", encoding="utf-8") as f:
    f.write(body + "\n")
print(f"[INFO] wrote {comment_path}")
292+ PY
293+
294+ # Pass pr_comment.md to the next step
295+ if [ -s "pr_comment.md" ]; then
80296 {
81297 echo 'SCRIPT_OUTPUT<<__EOF__'
82298 cat pr_comment.md
@@ -93,18 +309,15 @@ jobs:
          # Comment body produced by the "Check regressions" step (contents
          # of pr_comment.md, injected via the SCRIPT_OUTPUT env var).
          # NOTE(review): if SCRIPT_OUTPUT is unset this posts an empty body —
          # confirm an `if:` guard exists on this step upstream.
          COMMENT_BODY: ${{ env.SCRIPT_OUTPUT }}
        with:
          script: |
            // workflow_run events have no context.issue; read the PR number
            // from the triggering run's payload instead.
            const prs = (context.payload.workflow_run.pull_requests || []);
            if (!prs.length) {
              // NOTE(review): pull_requests is empty for fork PRs, so those
              // never get a comment even when the Python step located the PR
              // via the commits API — confirm this asymmetry is intended.
              core.info('No associated PR; skipping comment.');
              return;
            }
            const prNumber = prs[0].number;
            await github.rest.issues.createComment({
              issue_number: prNumber,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: process.env.COMMENT_BODY
            });
110-
0 commit comments