trnsci
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 77 additions & 0 deletions
diff --git a/examples/df_mp2.py b/examples/df_mp2.py
Lines changed: 74 additions & 14 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/scripts/run_neuron_tests.sh b/scripts/run_neuron_tests.sh
Lines changed: 6 additions & 1 deletion
diff --git a/scripts/run_phase3_spike.sh b/scripts/run_phase3_spike.sh
Lines changed: 1 addition & 2 deletions
@@ -7,6 +7,83 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.5.1] — 2026-04-15
+
+### Added
+
+- **Fused GEMM+energy kernel (#38, `nki_fused_gemm_energy`).** A single
+  `@nki.jit` kernel handles one DF-MP2 orbital pair — both GEMMs (T and T_T)
+  and the VE energy expression — without writing the `(nvir, nvir)` T_flat
+  intermediate to HBM.
+
+  **Two-GEMM T_T strategy:** `T.T[a,b] = T[b,a] = (B_j @ B_i.T)[a,b]`.
+  Rather than reloading T from HBM with `nl.load_transpose2d` (which would
+  re-introduce the HBM round-trip), T_T is computed as a second GEMM tile in
+  the same kernel body.  Both T and T_T land in SBUF via `tensor_copy` — no
+  HBM write for either intermediate.
+
+  **Kernel design:**
+  - `TILE = 128` everywhere (`nl.load_transpose2d` constrains both dims to ≤ 128).
+  - Outer a-loop, inner b-loop; two sequential PSUM allocations per (a, b) tile
+    (one for T, one for T_T); VE energy expression fully SBUF-resident.
+  - Cross-b batching: `acc_b (TILE, N_B_TILES)` in SBUF accumulates all b-strip
+    partials before one `nl.store` per a-strip — same pattern as `_mp2_energy_kernel`.
+  - NEFF cache amortises the two-GEMM compile across all `nocc²` pairs (same
+    shape every invocation).
+  - NKI 0.3.0 broadcast fix applied to `denom` construction (same as `_mp2_energy_kernel`).
+
+  **Public API:** `trnblas.nki.nki_fused_gemm_energy(b_i, b_j, eps_occ_i, eps_occ_j, eps_vir)` → scalar.
+
+  **Example integration:** `examples/df_mp2.py --fused-gemm-energy` routes the
+  energy step through the per-pair kernel.  Default path remains the chunk-GEMM
+  path — see benchmark note below.
+
+  **On-hardware benchmark (trn1, small shape: nbasis=128, nocc=16, nvir=112,
+  naux=384; 256 pairs):**
+
+  | Step | Baseline warm | Fused warm |
+  |---|---|---|
+  | energy | **0.13s** | **27.8s** |
+  | total | 3.98s | 31.5s |
+
+  The fused kernel is correct (energies agree to 6 significant figures) but
+  the per-pair loop is **215× slower** on the energy step.  Root cause:
+  Neuron XLA imposes ~100 ms of overhead per NEFF dispatch, independent of
+  kernel compute time: 256 pairs × ~100 ms ≈ 25.6 s, which accounts for most
+  of the 27.8 s observed.  The chunk-GEMM baseline needs only two dispatches.
+
+  Pre-transferring B to the XLA device and accumulating on-device (eliminating
+  per-pair CPU syncs) produces the same warm timing because Neuron XLA's
+  per-dispatch overhead is in the dispatch pipeline itself, not in the
+  CPU→XLA transfer.
+
+  **Follow-on:** production speedup requires a batched kernel that processes
+  all nocc² pairs in one `@nki.jit` invocation — tracked in #43.
+
+  **Tests:** `TestFusedGemmEnergy` in `tests/test_nki_gemm.py`:
+  aligned/unaligned correctness (atol=1e-2), symmetry (`E(i,j) == E(j,i)`),
+  zero-B_i, NEFF cache reuse (cold vs warm timing).
+
+### Fixed
+
+- **NKI closure variable limitation in autotuner (#26 regression).** The
+  v0.5.0 `_make_gemm_kernel` factory returned a `@nki.jit` closure that
+  referenced tile sizes (`tm`, `tk`, `tn`) as Python free variables.
+  NKI's AST-based compiler reads source from the on-disk file and resolves
+  names only from the kernel's locals and module globals — it cannot traverse
+  closure cells, producing `error: unbound variable 'tm'` for every tile config.
+
+  **Fix:** replaced the factory with six static `@nki.jit` kernel definitions
+  at module level (`_gemm_kernel_64_128_128` … `_gemm_kernel_128_128_512`),
+  each with literal integer tile constants.  All six are registered in
+  `_gemm_kernel_registry` at import time; `_get_gemm_kernel()` is now a
+  dict lookup.  `_make_gemm_kernel` is removed.  Autotuner behaviour
+  (sweep, cache, escape-hatch) is unchanged.
+
+  **Root cause note:** NKI `@nki.jit` functions must have tile constants
+  visible as literal integers or module-level globals at AST trace time.
+  Closure variables from an enclosing factory scope are not reachable.
+
 ## [0.5.0] — 2026-04-15
 
 ### Added
 
@@ -84,6 +84,35 @@ def _energy_reduction(
     return float(e_mp2)
 
 
+def _energy_reduction_fused_gemm(
+    B: torch.Tensor,
+    eps_occ: torch.Tensor,
+    eps_vir: torch.Tensor,
+) -> float:
+    """Energy via the fused GEMM+energy kernel (#38, v0.5.1).
+
+    Calls `nki_fused_gemm_energy` once per (i, j) orbital pair and sums the
+    results.  Each call fuses the pair GEMM (B[i] @ B[j].T) and the VE
+    energy expression into one @nki.jit kernel — eliminating the T_flat
+    HBM round-trip present in `_energy_reduction`.
+
+    This is the per-pair loop the RFC (fused_df_mp2_energy_kernel.md)
+    describes: nocc² kernel calls in total.  The NEFF cache amortises the
+    two-GEMM compile cost across them, since every (i, j) invocation has
+    the same shape and hits the same NEFF.
+    """
+    from trnblas.nki import nki_fused_gemm_energy
+
+    nocc, nvir, naux = B.shape  # only nocc is used below
+    e_mp2 = torch.zeros((), dtype=B.dtype, device=B.device)  # 0-dim running sum
+    for i in range(nocc):
+        eps_occ_i = float(eps_occ[i])  # passed to the kernel as a Python float
+        for j in range(nocc):
+            eps_occ_j = float(eps_occ[j])
+            e_mp2 = e_mp2 + nki_fused_gemm_energy(B[i], B[j], eps_occ_i, eps_occ_j, eps_vir)
+    return float(e_mp2)  # convert the accumulated 0-dim tensor to a Python float
+
+
 def df_mp2_energy(
     C_occ: torch.Tensor,  # (nbasis, nocc) — occupied MO coefficients
     C_vir: torch.Tensor,  # (nbasis, nvir) — virtual MO coefficients
@@ -93,12 +122,17 @@ def df_mp2_energy(
     eps_vir: torch.Tensor,  # (nvir,) — virtual orbital energies
     timings: dict | None = None,
     use_fused: bool = False,
+    use_fused_gemm: bool = False,
 ) -> float:
     """Compute DF-MP2 correlation energy.
 
     Returns E_MP2 (scalar). Optionally fills `timings` with per-step seconds.
-    When `use_fused=True`, the energy-reduction step routes through
-    `trnblas.nki.nki_mp2_energy`.
+
+    use_fused:       Route energy-reduction through `nki_mp2_energy`
+                     (fused chunk-level kernel, #15 M2 — 1.48× on energy step).
+    use_fused_gemm:  Route energy through `nki_fused_gemm_energy` per (i,j)
+                     pair (fused GEMM+energy kernel, #38 v0.5.1 — eliminates
+                     T_flat HBM round-trip).
     """
     nbasis, nocc = C_occ.shape
     naux = J_metric.shape[0]
@@ -135,14 +169,18 @@ def df_mp2_energy(
     B = trnblas.batched_gemm(1.0, ia_P, J_b)  # (nocc, nvir, naux)
     t_metric = time.perf_counter() - t0
 
-    # Step 4: Energy via one GEMM (chunked over i if memory-tight).
+    # Step 4: Energy.
     #   T(i,j)_{ab} = Σ_P B[i,a,P] B[j,b,P]
-    # Reshape B → X of shape (nocc·nvir, naux); then T_full = X @ X.T is
-    # one GEMM, and T_full[i·nvir+a, j·nvir+b] = T(i,j)_{ab}. No batching
-    # over (i,j) needed — that was the wrong shape for this contraction.
-    # For shapes where the full T_full doesn't fit HBM, chunk over i.
+    #
+    # Three paths in order of increasing fusion:
+    #   default:         chunk-GEMM (B_flat @ B_flat.T) + torch reduction
+    #   --fused-energy:  chunk-GEMM + fused NKI energy kernel (#15 M2)
+    #   --fused-gemm-energy: per-pair fused GEMM+energy NKI kernel (#38 v0.5.1)
     t0 = time.perf_counter()
-    e_mp2 = _energy_reduction(B, eps_occ, eps_vir, use_fused=use_fused)
+    if use_fused_gemm:
+        e_mp2 = _energy_reduction_fused_gemm(B, eps_occ, eps_vir)
+    else:
+        e_mp2 = _energy_reduction(B, eps_occ, eps_vir, use_fused=use_fused)
     t_energy = time.perf_counter() - t0
 
     if timings is not None:
@@ -195,22 +233,28 @@ def _make_inputs(nbasis: int, nocc: int, naux: int, seed: int = 42, device: str
 }
 
 
-def bench(shape_name: str, device: str = "cpu", use_fused: bool = False):
+def bench(
+    shape_name: str,
+    device: str = "cpu",
+    use_fused: bool = False,
+    use_fused_gemm: bool = False,
+):
     nbasis, nocc, naux = _BENCH_SHAPES[shape_name]
     nvir = nbasis - nocc
     flops = _flops(nbasis, nocc, naux)
     inputs = _make_inputs(nbasis, nocc, naux, device=device)
 
+    energy_mode = "fused-gemm" if use_fused_gemm else ("fused" if use_fused else "torch")
     print(f"[shape={shape_name} nbasis={nbasis} nocc={nocc} nvir={nvir} naux={naux}]")
     print(
         f"  approx flops: {flops / 1e9:.1f} G  backend: {trnblas.get_backend()}  "
-        f"device: {device}  fused_energy: {use_fused}"
+        f"device: {device}  energy_mode: {energy_mode}"
     )
 
     for label in ("cold", "warm"):
         t = {}
         t0 = time.perf_counter()
-        e = df_mp2_energy(*inputs, timings=t, use_fused=use_fused)
+        e = df_mp2_energy(*inputs, timings=t, use_fused=use_fused, use_fused_gemm=use_fused_gemm)
         # Ensure async GPU work completes before stopping the timer.
         if device != "cpu" and torch.cuda.is_available():
             torch.cuda.synchronize()
@@ -246,14 +290,25 @@ def main():
         "--fused-energy",
         action="store_true",
         help="Route the energy-reduction step through trnblas.nki.nki_mp2_energy "
-        "(fused NKI kernel, #15 M2).",
+        "(fused chunk-level kernel, #15 M2 — 1.48× on energy step).",
+    )
+    parser.add_argument(
+        "--fused-gemm-energy",
+        action="store_true",
+        help="Route the energy step through nki_fused_gemm_energy (per-pair fused "
+        "GEMM+energy kernel, #38 v0.5.1 — eliminates T_flat HBM round-trip).",
     )
     args = parser.parse_args()
 
     if args.bench:
         shapes = [args.shape] if args.shape else list(_BENCH_SHAPES)
         for s in shapes:
-            bench(s, device=args.device, use_fused=args.fused_energy)
+            bench(
+                s,
+                device=args.device,
+                use_fused=args.fused_energy,
+                use_fused_gemm=args.fused_gemm_energy,
+            )
         return
 
     if args.demo:
@@ -273,7 +328,12 @@ def main():
 
     timings: dict = {}
     t0 = time.perf_counter()
-    e_mp2 = df_mp2_energy(*inputs, timings=timings, use_fused=args.fused_energy)
+    e_mp2 = df_mp2_energy(
+        *inputs,
+        timings=timings,
+        use_fused=args.fused_energy,
+        use_fused_gemm=args.fused_gemm_energy,
+    )
     total = time.perf_counter() - t0
     for k, v in timings.items():
         print(f"  {k:15s}: {v:.3f}s")
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "trnblas"
-version = "0.5.0"
+version = "0.5.1"
 description = "BLAS operations for AWS Trainium via NKI"
 readme = "README.md"
 license = "Apache-2.0"
 
@@ -34,7 +34,7 @@ SHA="$(git rev-parse HEAD)"
 echo "Looking up instance with Name=$TAG in $REGION..."
 INSTANCE_ID=$(aws ec2 describe-instances \
   --filters "Name=tag:Name,Values=$TAG" \
-            "Name=instance-state-name,Values=stopped,running,pending" \
+            "Name=instance-state-name,Values=stopped,stopping,running,pending" \
   --query 'Reservations[0].Instances[0].InstanceId' \
   --output text \
   --region "$REGION")
@@ -59,6 +59,11 @@ trap cleanup EXIT
 STATE=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" --region "$REGION" \
   --query 'Reservations[0].Instances[0].State.Name' --output text)
 
+if [[ "$STATE" == "stopping" ]]; then
+  echo "Instance is stopping — waiting for stopped..."
+  aws ec2 wait instance-stopped --instance-ids "$INSTANCE_ID" --region "$REGION"
+  STATE=stopped
+fi
 if [[ "$STATE" == "stopped" ]]; then
   echo "Starting instance..."
   aws ec2 start-instances --instance-ids "$INSTANCE_ID" --region "$REGION" >/dev/null
 
@@ -114,9 +114,8 @@ device = xm.xla_device()
 TILE, NPAIRS = 128, 8
 B = torch.randn(NPAIRS, TILE, TILE).to(device)
 D = torch.ones(TILE, TILE).to(device)
-O = torch.zeros(NPAIRS, TILE, 1).to(device)
 print("Compiling spike C...", flush=True)
-_spike_c_te_ve_overlap(B, D, O)
+_spike_c_te_ve_overlap(B, D)
 print("Done.", flush=True)
 ')
 printf '%s' "$WARMUP_PY" > /tmp/spike_c_warmup.py