Skip to content

Commit dbece48

Browse files
committed
Add multi-pass reduction support with rmsnorm+matmul ops, renderer, and simulator extensions
1 parent c4673ff commit dbece48

30 files changed

+1994
-183
lines changed

examples/gym.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,25 +33,20 @@ def matmul(a: np.ndarray, b: np.ndarray) -> np.ndarray:
3333
def parse_args() -> argparse.Namespace:
    """Build and run the command-line argument parser."""
    arg_parser = argparse.ArgumentParser(description="NKI Gym search example")
    arg_parser.add_argument(
        "--cache-dir",
        type=Path,
        required=True,
        help="Directory for storing output artifacts",
    )
    return arg_parser.parse_args()
4338

4439

4540
def main() -> None:
46-
"""Run schedule search on a 2048x2048 matmul workload."""
41+
"""Run schedule search on a 1024x1024 matmul workload."""
4742
logging.basicConfig(level=logging.INFO, format="%(message)s")
4843

4944
args = parse_args()
5045
cache_dir = args.cache_dir
5146

5247
rng = np.random.default_rng(42)
53-
a = rng.standard_normal((2048, 2048)).astype(np.float16)
54-
b = rng.standard_normal((2048, 2048)).astype(np.float16)
48+
a = rng.standard_normal((1024, 1024)).astype(np.float16)
49+
b = rng.standard_normal((1024, 1024)).astype(np.float16)
5550

5651
search(func=matmul, num_targets=99999, seed=42, save_cache=cache_dir, kernel_kwargs={"a": a, "b": b})
5752

examples/rmsnorm_matmul.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""NKI Gym search: rmsnorm + matmul two-pass reduction kernel.
2+
3+
Demonstrates multi-pass schedule search: RMSNorm (activation+reduce
4+
over K, then normalize) followed by matrix multiply, producing two
5+
sequential reduction passes over the same dimension.
6+
"""
7+
8+
import argparse
9+
import logging
10+
from pathlib import Path
11+
12+
import numpy as np
13+
14+
import nkigym
15+
from nkigym.search import search
16+
17+
18+
def rmsnorm_matmul(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """RMSNorm(a) @ b: normalize rows of a then multiply by b.

    Args:
        a: Input tensor of shape [M, K].
        b: Weight tensor of shape [K, N].

    Returns:
        Output tensor of shape [M, N].
    """
    # Per-row sum of squares over the K dimension.
    row_sq_sum = nkigym.activation(a, op="square", reduce_op=np.add)
    # Mean of squares plus epsilon; 1/K is a literal (K=1024) because the
    # schedule tracer only folds constant kwarg expressions.
    mean_sq = nkigym.tensor_scalar(row_sq_sum, op0=np.multiply, operand0=1 / 1024, op1=np.add, operand1=1e-6)
    inv_rms = nkigym.activation(mean_sq, op="rsqrt")
    normalized = nkigym.tensor_scalar(a, inv_rms, op0=np.multiply)
    # nc_matmul transposes its stationary (first) operand internally,
    # so hand it the transpose of the normalized input.
    stationary = nkigym.transpose(normalized)
    out = nkigym.nc_matmul(stationary, b)
    return out
35+
36+
37+
def parse_args() -> argparse.Namespace:
    """Build and run the command-line argument parser."""
    arg_parser = argparse.ArgumentParser(description="NKI Gym rmsnorm+matmul search")
    arg_parser.add_argument(
        "--cache-dir",
        type=Path,
        required=True,
        help="Directory for storing output artifacts",
    )
    return arg_parser.parse_args()
42+
43+
44+
def main() -> None:
    """Run schedule search on a 1024x1024 rmsnorm+matmul workload."""
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    cli_args = parse_args()

    # Fixed seed keeps the generated workload reproducible across runs.
    rng = np.random.default_rng(42)
    lhs = rng.standard_normal((1024, 1024)).astype(np.float16)
    rhs = rng.standard_normal((1024, 1024)).astype(np.float16)

    search(func=rmsnorm_matmul, num_targets=99999, seed=42, save_cache=cli_args.cache_dir, kernel_kwargs={"a": lhs, "b": rhs})


if __name__ == "__main__":
    main()

nkigym/src/nkigym/__init__.py

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import numpy as np
1616

1717
from nkigym.ops.activation import NKIActivation
18+
from nkigym.ops.activation_1d import NKIActivation1D
19+
from nkigym.ops.activation_reduce import NKIActivationReduce
1820
from nkigym.ops.add import NKIAdd
1921
from nkigym.ops.base import NKIOp
2022
from nkigym.ops.dma_copy import NKIDmaCopy
@@ -23,6 +25,8 @@
2325
from nkigym.ops.tensor_copy import NKITensorCopy
2426
from nkigym.ops.tensor_reduce import NKITensorReduce
2527
from nkigym.ops.tensor_scalar import NKITensorScalar
28+
from nkigym.ops.tensor_scalar_const import NKITensorScalarConst
29+
from nkigym.ops.transpose import NKITranspose
2630

2731

2832
def nc_matmul(*args: Any, **kwargs: Any) -> Any:
@@ -35,15 +39,27 @@ def nc_matmul(*args: Any, **kwargs: Any) -> Any:
3539
return np.matmul(stationary.T, moving)
3640

3741

42+
def _rsqrt(x: Any) -> Any:
43+
"""Reciprocal square root: 1 / sqrt(x)."""
44+
return 1.0 / np.sqrt(x)
45+
46+
47+
_STR_OPS: dict[str, Any] = {"square": np.square, "rsqrt": _rsqrt}
48+
49+
3850
def activation(*args: Any, **kwargs: Any) -> Any:
    """Apply element-wise activation, optionally with reduction.

    Returns:
        Activated numpy array, or reduced 1D array if reduce_op given.
    """
    data = args[0]
    op_fn = kwargs.get("op")
    # String op names are looked up in the module-level registry.
    if isinstance(op_fn, str):
        op_fn = _STR_OPS[op_fn]
    activated = data if op_fn is None else op_fn(data)
    reduce_op = kwargs.get("reduce_op")
    if reduce_op is None:
        return activated
    # reduce_op is a numpy ufunc; collapse the trailing axis.
    return reduce_op.reduce(activated, axis=-1)
4864

4965

@@ -76,15 +92,34 @@ def tensor_reduce(*args: Any, **kwargs: Any) -> Any:
7692
return op_fn.reduce(data, axis=-1)
7793

7894

95+
def _expand_operand(data: Any, operand0: Any) -> Any:
96+
"""Expand operand0 for broadcasting against data if needed."""
97+
result = operand0
98+
if isinstance(operand0, np.ndarray) and data.ndim > operand0.ndim:
99+
pad = data.ndim - operand0.ndim
100+
result = operand0.reshape(operand0.shape + (1,) * pad)
101+
return result
102+
103+
79104
def tensor_scalar(*args: Any, **kwargs: Any) -> Any:
    """Element-wise op between a tensor and a scalar/column vector.

    Supports two modes:
    - 2D broadcast: ``tensor_scalar(data, tensor_operand, op0=...)``
    - 1D compound: ``tensor_scalar(data, op0=..., operand0=literal, ...)``

    Returns:
        Result numpy array.
    """
    data = args[0]
    # Positional operand wins; otherwise the keyword form is required.
    operand0 = args[1] if len(args) > 1 else kwargs["operand0"]
    # Pad trailing singleton axes so a lower-rank ndarray operand broadcasts.
    if isinstance(operand0, np.ndarray) and data.ndim > operand0.ndim:
        operand0 = operand0.reshape(operand0.shape + (1,) * (data.ndim - operand0.ndim))
    result = kwargs.get("op0", np.add)(data, operand0)
    op1 = kwargs.get("op1")
    # Optional second op chains a scalar operand onto the first result.
    return result if op1 is None else op1(result, kwargs["operand1"])
88123

89124

90125
def transpose(x: Any) -> Any:
@@ -110,12 +145,16 @@ def ndarray(shape: tuple[int, ...], **kwargs: Any) -> np.ndarray:
110145
"NKIOp",
111146
"NKIMatmul",
112147
"NKIActivation",
148+
"NKIActivation1D",
149+
"NKIActivationReduce",
113150
"NKIAdd",
114151
"NKIDmaCopy",
115152
"NKIMultiply",
116153
"NKITensorCopy",
117154
"NKITensorReduce",
118155
"NKITensorScalar",
156+
"NKITensorScalarConst",
157+
"NKITranspose",
119158
"nc_matmul",
120159
"activation",
121160
"add",

nkigym/src/nkigym/codegen/parse.py

Lines changed: 98 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,22 @@
55
"""
66

77
import ast
8+
import operator
89

910
import numpy as np
1011

1112
from nkigym.codegen.analysis import _OpCall
13+
from nkigym.ops.activation import NKIActivation
14+
from nkigym.ops.activation_1d import NKIActivation1D
1215
from nkigym.ops.base import NKIOp
1316

17+
_BINOP_FNS: dict[type, object] = {
18+
ast.Add: operator.add,
19+
ast.Sub: operator.sub,
20+
ast.Mult: operator.mul,
21+
ast.Div: operator.truediv,
22+
}
23+
1424

1525
def find_func_def(source: str) -> ast.FunctionDef:
1626
"""Find the first FunctionDef in parsed source.
@@ -41,10 +51,28 @@ def _is_nkigym_call(call: ast.Call) -> bool:
4151
return isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name) and func.value.id == "nkigym"
4252

4353

54+
def _eval_binop(node: ast.BinOp) -> object:
    """Evaluate a binary operation on constant operands.

    Args:
        node: AST BinOp node.

    Returns:
        Result of the binary operation.

    Raises:
        ValueError: If the operator type has no entry in _BINOP_FNS.
    """
    # Resolve operands first so an unsupported sub-expression raises
    # before the operator itself is validated.
    lhs = _eval_expr(node.left)
    rhs = _eval_expr(node.right)
    fn = _BINOP_FNS.get(type(node.op))
    if fn is None:
        raise ValueError(f"Unsupported binary op: {ast.dump(node)}")
    return fn(lhs, rhs)
69+
70+
4471
def _eval_expr(node: ast.expr) -> object:
4572
"""Evaluate an AST expression to a Python object.
4673
47-
Resolves ``np.X`` attribute accesses and literal constants.
74+
Resolves ``np.X`` attribute accesses, literal constants,
75+
binary operations, and unary negation.
4876
4977
Args:
5078
node: AST expression node.
@@ -53,11 +81,14 @@ def _eval_expr(node: ast.expr) -> object:
5381
The resolved Python object.
5482
"""
5583
result = None
56-
if isinstance(node, ast.Attribute) and isinstance(node.value, ast.Name):
57-
if node.value.id == "np":
58-
result = getattr(np, node.attr)
84+
if isinstance(node, ast.Attribute) and isinstance(node.value, ast.Name) and node.value.id == "np":
85+
result = getattr(np, node.attr)
5986
elif isinstance(node, ast.Constant):
6087
result = node.value
88+
elif isinstance(node, ast.BinOp):
89+
result = _eval_binop(node)
90+
elif isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub):
91+
result = -_eval_expr(node.operand)
6192
if result is None:
6293
raise ValueError(f"Unsupported kwarg expression: {ast.dump(node)}")
6394
return result
@@ -77,6 +108,45 @@ def _arg_name(node: ast.expr) -> str:
77108
return node.id
78109

79110

111+
def _maybe_reclassify_activation(op: _OpCall, output_axes_map: dict[str, tuple[str, ...]]) -> _OpCall:
    """Reclassify NKIActivation to NKIActivation1D if input is 1D.

    Args:
        op: Parsed op call to check.
        output_axes_map: Maps variable name to output axes of its producer op.

    Returns:
        Original or reclassified op call.
    """
    if op.stmt_type is not NKIActivation:
        return op
    source_var = op.input_vars[0]
    if source_var not in output_axes_map:
        return op
    if len(output_axes_map[source_var]) != 1:
        return op
    # Producer emits a single axis, so this activation operates on 1D data.
    return op._replace(stmt_type=NKIActivation1D)
127+
128+
129+
def _resolve_op_variants(op_calls: list[_OpCall]) -> list[_OpCall]:
    """Post-parse pass to reclassify ops based on producer output shapes.

    Traces the SSA chain to determine operand dimensionality and
    reclassifies NKIActivation to NKIActivation1D when input is 1D.

    Args:
        op_calls: Parsed op calls from the function body.

    Returns:
        Op calls with reclassified types where appropriate.
    """
    axes_by_var: dict[str, tuple[str, ...]] = {}
    resolved_calls: list[_OpCall] = []
    for call in op_calls:
        call = _maybe_reclassify_activation(call, axes_by_var)
        # Record this op's output axes so downstream consumers can be resolved.
        axes_by_var[call.output_var] = getattr(call.stmt_type, "OUTPUT_AXES", ())
        resolved_calls.append(call)
    return resolved_calls
148+
149+
80150
def parse_body(func_def: ast.FunctionDef) -> list[_OpCall]:
81151
"""Parse function body into a list of _OpCall.
82152
@@ -91,7 +161,7 @@ def parse_body(func_def: ast.FunctionDef) -> list[_OpCall]:
91161
for node in func_def.body:
92162
if not _try_parse_node(node, op_calls, counter):
93163
raise ValueError(f"Unsupported statement: {ast.dump(node)}")
94-
return op_calls
164+
return _resolve_op_variants(op_calls)
95165

96166

97167
def _try_parse_node(node: ast.stmt, op_calls: list[_OpCall], counter: list[int]) -> bool:
@@ -157,6 +227,28 @@ def _try_parse_return(node: ast.Return, op_calls: list[_OpCall], counter: list[i
157227
return result
158228

159229

230+
def _disambiguate_op(op_name: str, call: ast.Call) -> str:
231+
"""Disambiguate user function name to internal op registry key.
232+
233+
- ``activation`` with ``reduce_op`` kwarg → ``activation_reduce``
234+
- ``tensor_scalar`` with < 2 positional args → ``tensor_scalar_const``
235+
236+
Args:
237+
op_name: User-facing function name from AST.
238+
call: AST Call node with keyword arguments.
239+
240+
Returns:
241+
Internal op registry key.
242+
"""
243+
kwarg_names = {kw.arg for kw in call.keywords}
244+
result = op_name
245+
if op_name == "activation" and "reduce_op" in kwarg_names:
246+
result = "activation_reduce"
247+
elif op_name == "tensor_scalar" and len(call.args) < 2:
248+
result = "tensor_scalar_const"
249+
return result
250+
251+
160252
def _flatten_call(call: ast.Call, output: str, op_calls: list[_OpCall], counter: list[int]) -> None:
161253
"""Flatten a nkigym call (possibly nested) into _OpCall entries.
162254
@@ -167,7 +259,7 @@ def _flatten_call(call: ast.Call, output: str, op_calls: list[_OpCall], counter:
167259
counter: Mutable counter for intermediate variable names.
168260
"""
169261
assert isinstance(call.func, ast.Attribute)
170-
op_name = call.func.attr
262+
op_name = _disambiguate_op(call.func.attr, call)
171263
registry = NKIOp.all_ops()
172264
if op_name not in registry:
173265
raise ValueError(f"Unknown op: {op_name!r}")

0 commit comments

Comments
 (0)