Fix Python layernorm benchmark throughput reporting to match Julia.

maleadt · claude · maleadt · commit 19117ebc1a30 · 2026-04-08T13:03:14.000+02:00
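Python's layernorm metric() returned a single (total_bytes, unit) tuple,
which benchmarks.py applied to both the fwd and bwd passes, so backward
throughput was computed from the forward byte count. Julia correctly uses
separate multipliers (4x for forward, 5x for backward). Update
layernorm.py to return a dict keyed by implementation name, and update
benchmarks.py to accept either a tuple or such a dict from metric().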

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/examples/benchmarks.py b/examples/benchmarks.py
@@ -98,8 +98,9 @@ def run_benchmark(name: str):
     data = prepare_fn(benchmark=True)
 
     # Get metric info if available
+    # metric() returns either (total, unit) or a dict mapping impl name -> (total, unit)
     metric_fn = getattr(mod, "metric", None)
-    metric_total, metric_unit = (0, "") if not metric_fn else metric_fn(data)
+    metric_result = metric_fn(data) if metric_fn else None
 
     # Run cuTile
     result = run_fn(data, nruns=NRUNS, warmup=WARMUP)
@@ -121,7 +122,7 @@ def run_benchmark(name: str):
         others = run_others_fn(data, nruns=NRUNS, warmup=WARMUP)
         results.update(others)
 
-    return results, metric_total, metric_unit
+    return results, metric_result
 
 
 #=============================================================================
@@ -147,14 +148,21 @@ def main():
             print("  (skipped - no prepare/run functions)")
             continue
 
-        results, metric_total, metric_unit = ret
+        results, metric_result = ret
 
         # Convert to BenchmarkResult for printing
         benchmark_results = []
         for impl_name, times in results.items():
             min_t = min(times)
             mean_t = sum(times) / len(times)
-            tp = format_throughput(metric_total, metric_unit, min_t) if metric_unit else ""
+            tp = ""
+            if isinstance(metric_result, dict):
+                if impl_name in metric_result:
+                    mt, mu = metric_result[impl_name]
+                    tp = format_throughput(mt, mu, min_t)
+            elif isinstance(metric_result, tuple):
+                mt, mu = metric_result
+                tp = format_throughput(mt, mu, min_t) if mu else ""
             benchmark_results.append(BenchmarkResult(impl_name, min_t, mean_t, tp))
 
         # Sort by min time
diff --git a/examples/layernorm.py b/examples/layernorm.py
@@ -255,9 +255,14 @@ def verify(data, result):
         f"DB mismatch! max diff: {np.max(np.abs(cp.asnumpy(result['DB']) - expected_DB))}"
 
 def metric(data):
-    """Return (total_bytes, unit) for throughput calculation."""
-    # Forward: 3 reads of X + W + B reads + Y write + Mean/Rstd writes ≈ 4*M*N floats
-    return 4 * data["M"] * data["N"] * 4, "GB/s"
+    """Return per-implementation (total_bytes, unit) for throughput calculation."""
+    MN = data["M"] * data["N"] * 4  # sizeof(float32)
+    return {
+        # Forward: X read (3 passes: mean, var, normalize) + Y write ≈ 4*M*N floats
+        "cuTile Fwd": (4 * MN, "GB/s"),
+        # Backward: X read (2 passes) + DY read (2 passes) + DX write ≈ 5*M*N floats
+        "cuTile Bwd": (5 * MN, "GB/s"),
+    }
 
 
 # No run_others for layernorm - no simple reference implementation to compare against