test: #706 reachability repros (reverted in next commit; not for merge)

ncylich · ncylich · commit 94fc381bf681 · 2026-06-09T21:33:44.000-07:00
Runnable repros that reach each path #706 touches and show the dropped ones are unreachable: aten_ops mis-routing, scaled-addmm fail-closed, rms_norm non-last-dim mis-fusion, capture_jax strided-slice aliasing, plus the unreachable SDPA mask/dropout and no-training batchnorm cases. This commit is reverted immediately so it stays referenceable by SHA without entering the squash/merge. Signed-off-by: Noah Cylich <noahcylich@gmail.com>
diff --git a/python/tests/transpile/tools/repro_706/README.md b/python/tests/transpile/tools/repro_706/README.md
@@ -0,0 +1,32 @@
+# PR #706 reachability repros
+
+Scripts that reach (or show as unreachable) each code path touched by #706, on the
+real capture/import pipeline. This commit is intentionally reverted in the next
+commit so the repros are referenceable by SHA but are **not** part of the squash/merge.
+
+## Run
+
+```bash
+# PyTorch paths (routing, batchnorm export form, scaled-addmm, SDPA mask, rms_norm)
+python python/tests/transpile/tools/repro_706/repro_pytorch.py
+
+# JAX path (strided-slice aliasing)
+pip install "jax[cpu]"
+python python/tests/transpile/tools/repro_706/repro_jax.py
+
+# origin/main vs this branch, one file swapped at a time
+bash python/tests/transpile/tools/repro_706/origin_vs_pr.sh
+```
+
+## What each shows
+
+| Path | Kept? | Repro result |
+|---|---|---|
+| `aten_ops` longest-prefix | keep | origin mis-routes `addmm→add`, `minimum→min`, `maximum→max`, `slice_scatter→slice`, `select_scatter→index`; a `torch.minimum/maximum` model imports to `{minimum,maximum,add}` here |
+| `importers` scaled-addmm | keep | `torch.addmm(b,a,c, beta=2.0)` fails closed |
+| `fusion/rms_norm` guard | keep | without the guard, an FP16 channel-wise (non-last-dim) RMS norm fuses into a last-dim kernel with weight shape `(5,)` even though the channel dim is 4; with the guard it is skipped, while last-dim RMS norm still fuses |
+| `capture_jax` strided slice | keep | `x[::2]` → origin aliases full length-6 input (`ops=[]`), branch gives length-3 `slice` |
+| `lower.py` | drop | only the non-FP16 branch changes; FP16 engine never executes it |
+| `import_semantics` dropout | drop | `dropout_p` always 0 after `model.eval()` |
+| `import_semantics` mask | drop | `attrs["mask"]` only set for a non-tensor literal; SDPA `attn_mask` is always Tensor/None (3 forms tested, none set it) |
+| `importers` batchnorm no-training | drop | `_native_batch_norm_legit_no_training` only appears after `run_decompositions`, which only feeds CoreML; the IR importer always sees 8-arg `aten.batch_norm.default` |
diff --git a/python/tests/transpile/tools/repro_706/origin_vs_pr.sh b/python/tests/transpile/tools/repro_706/origin_vs_pr.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Origin-vs-PR comparison for PR #706. Run from the repo root with this branch
+# checked out. Swaps a single file to origin/main, runs the probe, then restores.
+# The cactus package is editable-installed against this tree, so a file swap +
+# fresh interpreter is the way to compare; worktrees would all import this tree.
+set -euo pipefail
+cd "$(git rev-parse --show-toplevel)"
+# Ref the swapped files are restored from. Defaults to the PR branch; export
+# BR=<ref> to override when that branch name does not exist locally.
+BR="${BR:-audit/transpiler-lowering}"
+F=python/cactus/transpile
+
+# Path currently swapped to origin/main; empty when nothing is swapped.
+SWAPPED=""
+
+# swap_to_origin PATH: check out origin/main's copy of PATH into the tree.
+swap_to_origin() { SWAPPED="$1"; git checkout -q origin/main -- "$1"; }
+
+# restore PATH: check out $BR's copy of PATH again.
+restore() { git checkout -q "$BR" -- "$1"; SWAPPED=""; }
+
+# `set -e` aborts on a failing probe before its inline `restore` is reached;
+# this trap puts a still-swapped file back when the script exits.
+trap 'if [ -n "$SWAPPED" ]; then git checkout -q "$BR" -- "$SWAPPED"; fi' EXIT
+
+echo "########## aten_ops: ORIGIN routing (expect add/min/max/slice/index) ##########"
+swap_to_origin "$F/aten_ops.py"
+python -c "
+from cactus.transpile.normalize import normalize_target
+import torch
+for op in ['addmm','minimum','maximum','slice_scatter','select_scatter']:
+    print('  ORIGIN', op, '->', normalize_target(getattr(torch.ops.aten, op).default))"
+restore "$F/aten_ops.py"
+
+echo "########## rms_norm: WITH guard, non-last must NOT fuse / last must fuse ##########"
+# this branch ships the guard; show it explicitly
+python -c "
+import torch, torch.nn as nn
+from cactus.transpile.capture_pytorch import capture_model
+from cactus.transpile.optimize_graph import fuse_rms_norm
+def rms(x,d,w,e=1e-6): return x*torch.rsqrt(x.pow(2).mean(d,keepdim=True)+e)*w
+class Chan(nn.Module):
+    def __init__(s): super().__init__(); s.w=nn.Parameter(torch.ones(4,1,1,dtype=torch.float16))
+    def forward(s,x): return rms(x,1,s.w)
+class Last(nn.Module):
+    def __init__(s): super().__init__(); s.w=nn.Parameter(torch.ones(8,dtype=torch.float16))
+    def forward(s,x): return rms(x,-1,s.w)
+for name,m,ex in [('non-last',Chan(),torch.randn(2,4,5,5,dtype=torch.float16)),('last',Last(),torch.randn(2,3,8,dtype=torch.float16))]:
+    g=capture_model(m.half().eval(),(ex,)).ir_graph
+    ch=fuse_rms_norm(g); rn=sum(1 for n in g.order if g.nodes[n].op=='rms_norm')
+    print('  WITH-GUARD',name,'fused=',ch,'rms_norm=',rn)"
+
+echo "########## capture_jax: ORIGIN aliases strided slice (expect ops=[] shape=(6,)) ##########"
+swap_to_origin "$F/capture_jax.py"
+python python/tests/transpile/tools/repro_706/repro_jax.py 2>/dev/null || echo "  (needs jax[cpu])"
+restore "$F/capture_jax.py"
+
+echo "########## batch_norm: run_decompositions turns it into the no_training op ##########"
+python -c "
+import torch, torch.nn as nn
+m=nn.BatchNorm2d(4).eval()
+ep=torch.export.export(m,(torch.randn(1,4,8,8),))
+print('  default export  :', [str(n.target) for n in ep.graph.nodes if 'batch_norm' in str(n.target)])
+ep2=ep.run_decompositions()
+print('  after decompose :', [str(n.target) for n in ep2.graph.nodes if 'batch_norm' in str(n.target)])
+print('  (decompose only runs on the NPU->CoreML path, never into import_captured_to_ir)')"
diff --git a/python/tests/transpile/tools/repro_706/repro_jax.py b/python/tests/transpile/tools/repro_706/repro_jax.py
@@ -0,0 +1,29 @@
+"""Reachability repro for PR #706 (JAX capture path).
+
+    pip install "jax[cpu]"
+    python python/tests/transpile/tools/repro_706/repro_jax.py
+
+Shows the stride-aware slice-aliasing fix in capture_jax.py: a strided slice
+(x[::2]) must not be treated as a no-op alias.
+"""
+from __future__ import annotations
+import numpy as np
+import jax, jax.numpy as jnp
+
+from cactus.transpile.capture_jax import capture_jax_function
+
+
+def describe(tag, fn, args):
+    ir = capture_jax_function(fn, args)
+    ops = [ir.nodes[n].op for n in ir.order]
+    out_id, in_id = ir.outputs[0], ir.inputs[0]
+    out_shape = getattr(ir.values.get(out_id), "shape", None)
+    print(f"[{tag}] ops={ops} out_shape={out_shape} output_is_input_alias={out_id == in_id}")
+
+
+# Strided slice over a length-6 vector -> expected length 3.
+# origin: stride ignored in changed_axes -> start=0,limit=full -> aliased no-op (WRONG).
+# this branch: stride!=1 -> a real slice node, length 3.
+x = jnp.arange(6.0)
+print("eager jax x[::2] =", np.asarray(jax.jit(lambda v: v[::2])(x)))
+describe("strided-slice x[::2]", lambda v: v[::2], (x,))
diff --git a/python/tests/transpile/tools/repro_706/repro_pytorch.py b/python/tests/transpile/tools/repro_706/repro_pytorch.py
@@ -0,0 +1,134 @@
+"""Reachability repros for PR #706 (PyTorch capture/import path).
+
+Run from the repo root with the branch checked out:
+
+    python python/tests/transpile/tools/repro_706/repro_pytorch.py
+
+Each section reaches a code path the PR touches (or shows it is unreachable),
+on the *current* checkout. Pair with origin_vs_pr.sh to see the origin behaviour.
+"""
+from __future__ import annotations
+import traceback
+from collections import Counter
+import torch, torch.nn as nn, torch.nn.functional as F
+
+from cactus.transpile.capture_pytorch import capture_model
+from cactus.transpile.optimize_graph import fuse_rms_norm
+from cactus.transpile.normalize import normalize_target
+from cactus.transpile.import_semantics import apply_import_semantics
+
+
+def opc(g):
+    return Counter(g.nodes[n].op for n in g.order)
+
+
+def line(t):
+    print(f"\n========== {t} ==========")
+
+
+# ---- 1. aten_ops longest-prefix routing (KEEP: reached by natural ops) ----
+line("1. aten_ops routing (this branch)")
+for op in ["addmm", "minimum", "maximum", "slice_scatter", "select_scatter"]:
+    t = getattr(torch.ops.aten, op).default
+    print(f"  normalize_target(aten.{op}) -> {normalize_target(t)}")
+
+class MinMax(nn.Module):
+    def forward(self, a, b):
+        return torch.minimum(a, b) + torch.maximum(a, b)
+c = capture_model(MinMax(), (torch.randn(3), torch.randn(3)))
+print("  MinMax model IR ops:", dict(opc(c.ir_graph)))
+
+
+# ---- 2. BatchNorm: which aten op does the IR import path actually see? ----
+# REMOVED from PR: the no_training (7-arg) branch is unreachable here because
+# the capture path never runs run_decompositions (that only happens on the
+# NPU->CoreML path). torch.export emits the 8-arg aten.batch_norm.default.
+line("2. eval-mode BatchNorm export form")
+class ConvBN(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Conv2d(3, 4, 3, padding=1)
+        self.bn = nn.BatchNorm2d(4)
+    def forward(self, x):
+        return self.bn(self.conv(x))
+ep = torch.export.export(ConvBN().eval(), (torch.randn(1, 3, 8, 8),))
+print("  exported BN target(s):", [str(n.target) for n in ep.graph.nodes if "batch_norm" in str(n.target)])
+print("  (run_decompositions would instead emit _native_batch_norm_legit_no_training -> only feeds CoreML)")
+
+
+# ---- 3. scaled addmm fail-closed (KEEP: reached) ----
+line("3. scaled addmm (beta=2.0)")
+class ScaledAddmm(nn.Module):
+    def forward(self, bias, a, b):
+        return torch.addmm(bias, a, b, beta=2.0)
+try:
+    capture_model(ScaledAddmm(), (torch.randn(4), torch.randn(4, 5), torch.randn(5, 4)))
+    print("  imported WITHOUT failing (origin behaviour: silently drops beta)")
+except Exception as e:
+    print("  fail-closed OK:", type(e).__name__, "->", str(e).split(":")[-1].strip()[:80])
+
+
+# ---- 4. SDPA literal mask -> attrs['mask']? (REMOVED: never produced) ----
+line("4. SDPA mask -> attrs['mask']? (3 natural forms)")
+def check(name, mod, args):
+    g = capture_model(mod, args).ir_graph
+    apply_import_semantics(g)
+    for n in g.order:
+        nd = g.nodes[n]
+        if nd.op == "attention":
+            print(f"  [{name}] attrs={sorted(nd.attrs)} mask_in_attrs={'mask' in nd.attrs} n_inputs={len(nd.inputs)}")
+            return
+    print(f"  [{name}] no attention node")
+
+class MaskTensorArg(nn.Module):
+    def forward(self, q, k, v, m):
+        return F.scaled_dot_product_attention(q, k, v, attn_mask=m)
+check("tensor-mask-as-input", MaskTensorArg(), (torch.randn(1, 1, 4, 8),) * 3 + (torch.zeros(1, 1, 4, 4),))
+
+class MaskInlineConst(nn.Module):
+    def forward(self, q, k, v):
+        return F.scaled_dot_product_attention(q, k, v, attn_mask=torch.zeros(4, 4))
+check("inline-const-mask", MaskInlineConst(), (torch.randn(1, 1, 4, 8),) * 3)
+
+class CausalFlag(nn.Module):
+    def forward(self, q, k, v):
+        return F.scaled_dot_product_attention(q, k, v, is_causal=True)
+check("is_causal-flag", CausalFlag(), (torch.randn(1, 1, 4, 8),) * 3)
+
+
+# ---- 6. rms_norm guard (KEEP: non-last-dim mis-fuses without it) ----
+# NOTE: fuse_rms_norm skips FP32 inputs (kept unfused on purpose), so the inputs
+# must be FP16 to reach the fusion -- as they are in real transpilation.
+def rms(x, dim, w, eps=1e-6):  # RMS-normalise x over `dim` (kept as a size-1 axis), then scale by w
+    return x * torch.rsqrt(x.pow(2).mean(dim, keepdim=True) + eps) * w  # x * rsqrt(mean(x^2) + eps) * w
+
+line("6a. LAST-dim RMSNorm, FP16")
+class RMSLast(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.w = nn.Parameter(torch.ones(8, dtype=torch.float16))
+    def forward(self, x):
+        return rms(x, -1, self.w)
+g = capture_model(RMSLast().half().eval(), (torch.randn(2, 3, 8, dtype=torch.float16),)).ir_graph
+changed = fuse_rms_norm(g)
+print("  fuse_rms_norm changed?", changed, "| rms_norm nodes:", opc(g).get("rms_norm", 0))
+
+line("6b. NON-last-dim RMSNorm (channel dim=1), FP16")
+class RMSChan(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.w = nn.Parameter(torch.ones(4, 1, 1, dtype=torch.float16))
+    def forward(self, x):
+        return rms(x, 1, self.w)
+g = capture_model(RMSChan().half().eval(), (torch.randn(2, 4, 5, 5, dtype=torch.float16),)).ir_graph
+print("  mean axes:", [(n, g.nodes[n].attrs.get("axis")) for n in g.order if g.nodes[n].op == "mean"])
+changed = fuse_rms_norm(g)
+for n in g.order:
+    if g.nodes[n].op == "rms_norm":
+        wv = g.values.get(g.nodes[n].inputs[1])
+        print("  fused rms_norm weight shape:", getattr(wv, "shape", None), "(channel=4, last dim=5)")
+print("  fuse_rms_norm changed?", changed,
+      "| this branch ships the guard so changed=False (correct);",
+      "checkout origin/main rms_norm.py to see changed=True with weight (5,)")
+
+print("\n[done]")