awslabs
diff --git a/‎examples/gym.py‎
Lines changed: 6 additions & 23 deletions b/‎examples/gym.py‎
Lines changed: 6 additions & 23 deletions
diff --git a/‎nkigym/src/nkigym/codegen/context.py‎
Lines changed: 62 additions & 3 deletions b/‎nkigym/src/nkigym/codegen/context.py‎
Lines changed: 62 additions & 3 deletions
diff --git a/‎nkigym/src/nkigym/codegen/loop_rolling.py‎
Lines changed: 36 additions & 1 deletion b/‎nkigym/src/nkigym/codegen/loop_rolling.py‎
Lines changed: 36 additions & 1 deletion
diff --git a/‎nkigym/src/nkigym/ops/activation.py‎
Lines changed: 2 additions & 1 deletion b/‎nkigym/src/nkigym/ops/activation.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎nkigym/src/nkigym/ops/matmul.py‎
Lines changed: 5 additions & 2 deletions b/‎nkigym/src/nkigym/ops/matmul.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎nkigym/src/nkigym/ops/nc_transpose.py‎
Lines changed: 2 additions & 1 deletion b/‎nkigym/src/nkigym/ops/nc_transpose.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎nkigym/src/nkigym/ops/tensor_scalar.py‎
Lines changed: 2 additions & 1 deletion b/‎nkigym/src/nkigym/ops/tensor_scalar.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎nkigym/src/nkigym/ops/tensor_tensor.py‎
Lines changed: 2 additions & 1 deletion b/‎nkigym/src/nkigym/ops/tensor_tensor.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎nkigym/src/nkigym/ops/tiling_ops.py‎
Lines changed: 6 additions & 3 deletions b/‎nkigym/src/nkigym/ops/tiling_ops.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎nkigym/src/nkigym/search/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎nkigym/src/nkigym/search/__init__.py‎
Lines changed: 2 additions & 2 deletions
@@ -7,17 +7,13 @@
 
 import argparse
 import logging
-import math
 from pathlib import Path
 
 import numpy as np
 
 import nkigym
-from nkigym.search import benchmark_variants, search
+from nkigym.search import search
 from nkigym.transforms import DataReuseTransform, OperandMergeTransform
-from nkigym.utils import setup_logging
-
-logger = logging.getLogger(__name__)
 
 
 def nkigym_matmul(lhs: np.ndarray, rhs: np.ndarray) -> np.ndarray:
@@ -37,45 +33,32 @@ def parse_args() -> argparse.Namespace:
     """Parse command-line arguments."""
     parser = argparse.ArgumentParser(description="NKI Gym search example")
     parser.add_argument(
-        "--cache-dir", type=Path, default=Path("cache"), help="Directory for storing output logs (default: cache)"
+        "--cache-dir", type=Path, default=Path("cache"), help="Directory for storing output (default: cache)"
     )
     return parser.parse_args()
 
 
 def main() -> None:
     """Run transform search on a tiled matmul workload."""
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
     args = parse_args()
     cache_dir = args.cache_dir
-    cache_dir.mkdir(parents=True, exist_ok=True)
-    log_path = cache_dir / "gym.log"
-    setup_logging(str(log_path))
 
     k, m, n = 256, 256, 256
     rng = np.random.default_rng(42)
     lhs = rng.standard_normal((k, m)).astype(np.float32)
     rhs = rng.standard_normal((k, n)).astype(np.float32)
 
-    variants = search(
+    search(
         func=nkigym_matmul,
         transforms=[DataReuseTransform(), OperandMergeTransform()],
-        num_targets=math.inf,
+        num_targets=1000,
         seed=42,
         min_depth=10,
         save_cache=cache_dir,
         kernel_kwargs={"lhs": lhs, "rhs": rhs},
     )
-    logger.info("Search produced %d unique variants", len(variants))
-
-    results = benchmark_variants(
-        cache_dir=cache_dir,
-        func_name="nkigym_matmul",
-        kernel_kwargs={"lhs": lhs, "rhs": rhs},
-        output_name="output",
-        output_shape=(m, n),
-        warmup=2,
-        iters=5,
-    )
-    results.summary(top_k=5)
 
 
 if __name__ == "__main__":
 
@@ -19,12 +19,14 @@ class _LoweringContext:
         params: Input parameter names (all live in HBM).
         buffers: Variable name to buffer location string.
         aliases: Maps accumulation output names to canonical PSUM variable.
+        alias_offsets: Maps alias names to their start offsets per axis.
         staging_counter: Monotonic counter for staging variable names.
     """
 
     params: tuple[str, ...]
     buffers: dict[str, str] = field(default_factory=dict)
     aliases: dict[str, str] = field(default_factory=dict)
+    alias_offsets: dict[str, tuple[int, ...]] = field(default_factory=dict)
     staging_counter: int = 0
 
     def resolve(self, name: str) -> str:
@@ -40,6 +42,26 @@ def resolve(self, name: str) -> str:
             name = self.aliases[name]
         return name
 
+    def _resolve_offsets(self, name: str) -> tuple[int, ...]:
+        """Sum the per-axis start offsets recorded along an alias chain.
+
+        Follows ``self.aliases`` from ``name`` until a name that is not an
+        alias is reached, adding the ``self.alias_offsets`` entry of every
+        alias visited. An alias without a recorded entry contributes
+        nothing. Entries of different lengths are zero-padded to the
+        longest one, mirroring how ``_compose_slices`` treats a missing
+        offset as 0.
+
+        Args:
+            name: Variable name, possibly an accumulation alias.
+
+        Returns:
+            Tuple of accumulated start offsets per axis; empty when
+            ``name`` is not an alias or no offsets were recorded.
+        """
+        totals: list[int] = []
+        while name in self.aliases:
+            entry_offsets = self.alias_offsets.get(name, ())
+            # Grow the accumulator when this link covers more axes than any
+            # earlier one; indexing past the end would raise IndexError.
+            missing = len(entry_offsets) - len(totals)
+            if missing > 0:
+                totals.extend([0] * missing)
+            for axis, start in enumerate(entry_offsets):
+                totals[axis] += start
+            name = self.aliases[name]
+        return tuple(totals)
+
     def buffer_of(self, name: str) -> str:
         """Look up the buffer location of a variable, resolving aliases.
 
@@ -57,8 +79,9 @@ def buffer_of(self, name: str) -> str:
     def subscript(self, ref: TensorRef) -> str:
         """Render a TensorRef as ``name[s:e, s:e]``, resolving aliases.
 
-        Unconditionally renders slices from the IR. The IR is the
-        source of truth — no shape comparison or optimization.
+        When the name resolves through an alias chain, accumulates
+        start offsets and composes them with the ref slices so the
+        subscript points at the correct region of the canonical buffer.
 
         Args:
             ref: Tensor reference.
@@ -67,25 +90,61 @@ def subscript(self, ref: TensorRef) -> str:
             Subscripted string or plain resolved name.
         """
         resolved = self.resolve(ref.name)
+        offsets = self._resolve_offsets(ref.name)
         result = resolved
         if ref.slices:
-            parts = ", ".join(f"{s}:{e}" for s, e in ref.slices)
+            parts = _compose_slices(ref.slices, offsets)
             result = f"{resolved}[{parts}]"
         return result
 
 
+def _compose_slices(slices: tuple[tuple[int, int], ...], offsets: tuple[int, ...]) -> str:
+    """Render per-axis bounds, shifted by alias offsets, as subscript text.
+
+    Axes that lie beyond the end of ``offsets`` are left unshifted.
+
+    Args:
+        slices: Per-axis (start, stop) bounds from the TensorRef.
+        offsets: Per-axis start offsets from the alias chain.
+
+    Returns:
+        Comma-separated ``s:e`` subscript string.
+    """
+
+    def shift_for(axis: int) -> int:
+        """Return the offset recorded for ``axis``, or 0 when none exists."""
+        return offsets[axis] if axis < len(offsets) else 0
+
+    return ", ".join(
+        f"{lo + shift_for(axis)}:{hi + shift_for(axis)}" for axis, (lo, hi) in enumerate(slices)
+    )
+
+
 def get_kwarg(stmt: GymStatement, key: str) -> object:
     """Extract a keyword argument value from a statement.
 
+    Asserts that kwargs contain no duplicate keys, since duplicates
+    indicate an IR construction bug upstream.
+
     Args:
         stmt: GymStatement to search.
         key: Keyword argument name.
 
     Returns:
         The value if found, None otherwise.
+        A stored value of None cannot be told apart from a missing key.
+
+    Raises:
+        AssertionError: Raised by ``_assert_no_duplicate_kwargs`` when
+            ``stmt.kwargs`` repeats a key.
     """
+    # Validate the whole kwargs sequence before searching it.
+    _assert_no_duplicate_kwargs(stmt)
     result = None
     for k, v in stmt.kwargs:
         if k == key:
             result = v
+            # Stop at the first match; the duplicate-key check above is what
+            # makes first-match and last-match equivalent.
+            break
     return result
+
+
+def _assert_no_duplicate_kwargs(stmt: GymStatement) -> None:
+    """Reject a statement whose kwargs repeat a keyword name.
+
+    The check raises explicitly instead of using an ``assert`` statement,
+    so it stays active when Python runs with ``-O`` (which strips asserts).
+
+    Args:
+        stmt: GymStatement to check; its ``kwargs`` is iterated as
+            ``(key, value)`` pairs.
+
+    Raises:
+        AssertionError: If duplicate kwarg keys are found.
+    """
+    keys = [k for k, _ in stmt.kwargs]
+    if len(keys) != len(set(keys)):
+        raise AssertionError(f"Duplicate kwargs in {stmt.op} stmt '{stmt.output.name}': {keys}")
@@ -228,6 +228,41 @@ def _extract_varying(
     return (valid, varying)
 
 
+def _collect_assigned_names(stmts: list[ast.stmt]) -> set[str]:
+    """Collect the names bound by plain assignments in a list of statements.
+
+    Only ``ast.Assign`` statements that appear directly in ``stmts`` are
+    inspected; nested statement bodies are not descended into. Within each
+    target, every ``ast.Name`` in store context is collected, so unpacking
+    targets such as ``a, *b = ...`` contribute ``a`` and ``b``. Subscript
+    and attribute targets (``x[0] = ...``, ``x.y = ...``) bind no name and
+    contribute nothing.
+
+    Args:
+        stmts: Statements to scan.
+
+    Returns:
+        Set of assigned variable names.
+    """
+    names: set[str] = set()
+    for stmt in stmts:
+        if not isinstance(stmt, ast.Assign):
+            continue
+        for target in stmt.targets:
+            # Walk the target so names nested in tuple/list/starred
+            # unpacking are found, not just a bare ``Name`` target.
+            names.update(
+                node.id
+                for node in ast.walk(target)
+                if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Store)
+            )
+    return names
+
+
+def _collect_referenced_names(stmts: list[ast.stmt]) -> set[str]:
+    """Gather every identifier read anywhere inside the given statements.
+
+    Each statement is walked in full, so names loaded in nested
+    expressions and nested statement bodies are included.
+
+    Args:
+        stmts: Statements to scan.
+
+    Returns:
+        Set of ``ast.Name`` ids that appear in load context.
+    """
+    loaded: set[str] = set()
+    for stmt in stmts:
+        loaded.update(
+            node.id for node in ast.walk(stmt) if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load)
+        )
+    return loaded
+
+
+def _check_scope_safe(working_stmts: list[ast.stmt], start_idx: int, block_size: int, trip_count: int) -> bool:
+    """Tell whether a candidate run can be rolled without orphaning later uses.
+
+    The run spans ``trip_count * block_size`` statements beginning at
+    ``start_idx``; everything after that span counts as "after the loop".
+
+    Args:
+        working_stmts: Full statement list the run is taken from.
+        start_idx: Index of the first statement of the run.
+        block_size: Number of statements per repeated block.
+        trip_count: Number of repeated blocks in the run.
+
+    Returns:
+        True when no name assigned inside the run is loaded by a statement
+        that follows it, False otherwise.
+    """
+    boundary = start_idx + trip_count * block_size
+    assigned_inside = _collect_assigned_names(working_stmts[start_idx:boundary])
+    loaded_later = _collect_referenced_names(working_stmts[boundary:])
+    return assigned_inside.isdisjoint(loaded_later)
+
+
 def _count_matching_blocks(
     working_stmts: list[ast.stmt], start: int, block_size: int, n: int, cache: dict[int, str]
 ) -> int:
@@ -271,7 +306,7 @@ def _find_best_run(working_stmts: list[ast.stmt]) -> _LoopRun:
             count = _count_matching_blocks(working_stmts, p, k, n, cache)
             if count >= 2 and count * k > best_coverage:
                 valid, varying = _extract_varying(working_stmts, k, count, p)
-                if valid:
+                if valid and _check_scope_safe(working_stmts, p, k, count):
                     best = _LoopRun(p, k, count, varying)
                     best_coverage = count * k
             p += 1
 
@@ -73,6 +73,7 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
 
         data = ctx.subscript(data_ref)
         out_name = stmt.output.name
+        out_sub = ctx.subscript(stmt.output)
         ctx.buffers[out_name] = "SBUF"
 
         func_str = "nl.identity"
@@ -82,5 +83,5 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
         shape_str = repr(stmt.output.shape)
         return [
             f"{out_name} = nl.ndarray({shape_str}, dtype=nl.float32, buffer=nl.sbuf)",
-            f"nisa.activation(dst={out_name}, op={func_str}, data={data})",
+            f"nisa.activation(dst={out_sub}, op={func_str}, data={data})",
         ]
@@ -69,17 +69,20 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
         stat_name = ctx.subscript(stat_ref)
         mov_name = ctx.subscript(mov_ref)
         out_name = stmt.output.name
+        out_sub = ctx.subscript(stmt.output)
         ctx.buffers[out_name] = "PSUM"
 
         lines: list[str] = []
         if isinstance(acc_ref, TensorRef):
             canonical = ctx.resolve(acc_ref.name)
+            acc_sub = ctx.subscript(acc_ref)
             ctx.aliases[out_name] = canonical
-            lines = [f"nisa.nc_matmul(dst={canonical}, stationary={stat_name}, moving={mov_name})"]
+            ctx.alias_offsets[out_name] = tuple(s for s, _ in acc_ref.slices)
+            lines = [f"nisa.nc_matmul(dst={acc_sub}, stationary={stat_name}, moving={mov_name})"]
         else:
             shape_str = repr(stmt.output.shape)
             lines = [
                 f"{out_name} = nl.ndarray({shape_str}, dtype=nl.float32, buffer=nl.psum)",
-                f"nisa.nc_matmul(dst={out_name}, stationary={stat_name}, moving={mov_name})",
+                f"nisa.nc_matmul(dst={out_sub}, stationary={stat_name}, moving={mov_name})",
             ]
         return lines
@@ -53,10 +53,11 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
 
         data = ctx.subscript(data_ref)
         out_name = stmt.output.name
+        out_sub = ctx.subscript(stmt.output)
         ctx.buffers[out_name] = "SBUF"
 
         shape_str = repr(stmt.output.shape)
         return [
             f"{out_name} = nl.ndarray({shape_str}, dtype=nl.float32, buffer=nl.sbuf)",
-            f"nisa.nc_transpose(dst={out_name}, data={data})",
+            f"nisa.nc_transpose(dst={out_sub}, data={data})",
         ]
@@ -64,6 +64,7 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
 
         data = ctx.subscript(data_ref)
         out_name = stmt.output.name
+        out_sub = ctx.subscript(stmt.output)
         ctx.buffers[out_name] = "SBUF"
 
         operand = str(operand_ref)
@@ -78,5 +79,5 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
         shape_str = repr(stmt.output.shape)
         return [
             f"{out_name} = nl.ndarray({shape_str}, dtype=nl.float32, buffer=nl.sbuf)",
-            f"nisa.tensor_scalar(dst={out_name}, data={data}{op_kwarg}, operand0={operand})",
+            f"nisa.tensor_scalar(dst={out_sub}, data={data}{op_kwarg}, operand0={operand})",
         ]
@@ -65,6 +65,7 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
         d1 = ctx.subscript(d1_ref)
         d2 = ctx.subscript(d2_ref)
         out_name = stmt.output.name
+        out_sub = ctx.subscript(stmt.output)
         ctx.buffers[out_name] = "SBUF"
 
         op_part = ""
@@ -75,5 +76,5 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
         shape_str = repr(stmt.output.shape)
         return [
             f"{out_name} = nl.ndarray({shape_str}, dtype=nl.float32, buffer=nl.sbuf)",
-            f"nisa.tensor_tensor(dst={out_name}, data1={d1}, data2={d2}{op_part})",
+            f"nisa.tensor_tensor(dst={out_sub}, data1={d1}, data2={d2}{op_part})",
         ]
@@ -104,12 +104,13 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
         shape_str = repr(stmt.output.shape)
         src_subscript = ctx.subscript(src_ref)
 
+        out_sub = ctx.subscript(stmt.output)
         ctx.buffers[out_name] = "SBUF"
         lines = [f"{out_name} = {src_subscript}"]
         if src_buffer != "SBUF":
             lines = [
                 f"{out_name} = nl.ndarray({shape_str}, dtype=nl.float32, buffer=nl.sbuf)",
-                f"nisa.dma_copy(dst={out_name}, src={src_subscript})",
+                f"nisa.dma_copy(dst={out_sub}, src={src_subscript})",
             ]
         return lines
 
@@ -184,9 +185,11 @@ def to_nki(self, stmt: "GymStatement", ctx: "_LoweringContext") -> list[str]:
             staging_name = f"_staging_{ctx.staging_counter}"
             ctx.staging_counter += 1
             shape_str = repr(src_ref.shape)
+            parts = ", ".join(f"0:{s}" for s in src_ref.shape)
+            staging_sub = f"{staging_name}[{parts}]"
             lines = [
                 f"{staging_name} = nl.ndarray({shape_str}, dtype=nl.float32, buffer=nl.sbuf)",
-                f"nisa.tensor_copy(dst={staging_name}, src={src_subscript})",
-                f"nisa.dma_copy(dst={dst_subscript}, src={staging_name})",
+                f"nisa.tensor_copy(dst={staging_sub}, src={src_subscript})",
+                f"nisa.dma_copy(dst={dst_subscript}, src={staging_sub})",
             ]
         return lines
@@ -7,7 +7,7 @@
 provides systematic exploration and sampling of that search space.
 """
 
-from nkigym.search.benchmark import benchmark_variants
+from nkigym.search.compile import SearchResults
 from nkigym.search.search import search
 
-__all__ = ["benchmark_variants", "search"]
+__all__ = ["SearchResults", "search"]