
Commit c5dbbbe

Make store indexing also individually tunable (#1028)
1 parent 0efcf06 commit c5dbbbe

11 files changed: +393 -112

README.md

Lines changed: 6 additions & 5 deletions
@@ -35,7 +35,7 @@ portable between different hardware. Helion automates and autotunes over:
 
 * Automatically calculates strides and indices.
 * Autotunes choices among various indexing methods (pointers, block pointers, TensorDescriptors).
-* Supports per-load indexing strategies for fine-grained memory access control.
+* Supports per-operation indexing strategies for fine-grained memory access control of loads and stores.
 
 2. **Masking:**
 
@@ -259,10 +259,11 @@ cache behavior. A value of `1` disables this optimization, while higher
 values specify the grouping size.
 
 * **indexing** (`"pointer"`, `"tensor_descriptor"`, `"block_ptr"`, or a list of these):
-  Specifies the memory indexing strategy for load operations. Can be:
-  - A single strategy (applies to all loads): `indexing="block_ptr"`
-  - A list of strategies (one per load operation): `indexing=["pointer", "block_ptr", "tensor_descriptor"]`
-  - Empty/omitted (defaults to `"pointer"` for all loads)
+  Specifies the memory indexing strategy for load and store operations. Can be:
+  - A single strategy (applies to all loads and stores): `indexing="block_ptr"`
+  - A list of strategies (one per load/store in execution order): `indexing=["pointer", "pointer", "block_ptr"]`
+  - Empty/omitted (defaults to `"pointer"` for all operations)
+  - When using a list, provide strategies in order: `[load1, load2, ..., store1, store2, ...]`
 
 The `"tensor_descriptor"` option uses Tensor Memory Accelerators (TMAs) but
 requires a Hopper or newer GPU and the latest development version of Triton.
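To make the ordering rule concrete, here is a minimal sketch (not part of this commit) that mirrors the example added to docs/api/config.md further down; the kernel body and config keys are taken from that example, while the kernel name `add_tiles` and tensor shapes are illustrative assumptions:

```python
import torch
import helion
import helion.language as hl

# Two loads followed by one store, so the indexing list needs three entries,
# ordered [load1, load2, store1] as described above.
@helion.kernel(config={"block_size": 16, "indexing": ["pointer", "pointer", "block_ptr"]})
def add_tiles(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    for tile in hl.tile(x.size(0)):
        a = hl.load(x, [tile])  # load 1 -> "pointer"
        b = hl.load(y, [tile])  # load 2 -> "pointer"
        out[tile] = a + b       # store 1 -> "block_ptr"
    return out
```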

docs/api/config.md

Lines changed: 23 additions & 19 deletions
@@ -109,31 +109,37 @@ Configs are typically discovered automatically through autotuning, but can also
 
 .. autoattribute:: Config.indexing
 
-    Memory indexing strategy for load operations. Can be specified as:
+    Memory indexing strategy for load and store operations. Can be specified as:
 
-    **Single strategy (applies to all loads - backward compatible):**
+    **Single strategy (applies to all loads and stores - backward compatible):**
 
     .. code-block:: python
 
-        indexing="block_ptr"  # All loads use block pointers
+        indexing="block_ptr"  # All loads and stores use block pointers
 
-    **Per-load strategies (list, one per load operation):**
+    **Per-operation strategies (list, one per load/store in execution order):**
 
     .. code-block:: python
 
-        indexing=["pointer", "block_ptr", "tensor_descriptor"]
+        # 2 loads + 1 store = 3 indexing strategies
+        indexing=["pointer", "pointer", "block_ptr"]  # loads use pointer, store uses block_ptr
 
-    **Empty/omitted (defaults to** ``"pointer"`` **for all loads):**
+    **Empty/omitted (defaults to** ``"pointer"`` **for all operations):**
 
     .. code-block:: python
 
-        # indexing not specified - all loads use pointer indexing
+        # indexing not specified - all loads and stores use pointer indexing
 
     **Valid strategies:**
 
     - ``"pointer"``: Pointer-based indexing (default)
     - ``"tensor_descriptor"``: Tensor descriptor indexing (requires Hopper+ GPU)
     - ``"block_ptr"``: Block pointer indexing
+
+    .. note::
+        When using a list, provide one strategy for each load and store operation in the order
+        they appear in the kernel. The indexing list is ordered as:
+        ``[load1, load2, ..., loadN, store1, store2, ..., storeM]``
 ```
 
 ### Memory and Caching
@@ -212,32 +218,30 @@ import torch
 import helion
 import helion.language as hl
 
-# Single indexing strategy for all loads (backward compatible)
+# Single indexing strategy for all loads and stores (backward compatible)
 @helion.kernel(config={"indexing": "block_ptr"})
 def kernel_uniform_indexing(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     out = torch.empty_like(x)
     for tile in hl.tile(x.size(0)):
-        a = hl.load(x, [tile])  # Uses block_ptr
-        b = hl.load(y, [tile])  # Uses block_ptr
-        out[tile] = a + b
+        a = hl.load(x, [tile])  # Load: uses block_ptr
+        b = hl.load(y, [tile])  # Load: uses block_ptr
+        out[tile] = a + b  # Store: uses block_ptr
     return out
 
-# Per-load indexing strategies for fine-grained control
+# Per-operation indexing strategies for fine-grained control
+# Indexing list is ordered: [load1, load2, ..., store1, store2, ...]
 @helion.kernel(
     config={
         "block_size": 16,
-        "indexing": ["pointer", "block_ptr", "tensor_descriptor"],
+        "indexing": ["pointer", "pointer", "block_ptr"],  # 2 loads + 1 store
     }
 )
-def kernel_mixed_indexing(
-    x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
-) -> torch.Tensor:
+def kernel_mixed_indexing(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     out = torch.empty_like(x)
     for tile in hl.tile(x.size(0)):
         a = hl.load(x, [tile])  # First load: pointer indexing
-        b = hl.load(y, [tile])  # Second load: block_ptr indexing
-        c = hl.load(z, [tile])  # Third load: tensor_descriptor indexing
-        out[tile] = a + b + c
+        b = hl.load(y, [tile])  # Second load: pointer indexing
+        out[tile] = a + b  # Store: block_ptr indexing
     return out
 ```
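As a usage note beyond what the diff shows: Helion kernels decorated this way are called like ordinary functions on device tensors. The invocation below is a hypothetical sketch; the tensor shape, device, and correctness check are assumptions, not part of this commit:

```python
# Hypothetical call to kernel_mixed_indexing defined above; the fixed config
# (including the per-operation indexing list) is applied when the kernel runs.
x = torch.randn(1024, device="cuda")
y = torch.randn(1024, device="cuda")
out = kernel_mixed_indexing(x, y)
torch.testing.assert_close(out, x + y)
```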

docs/index.md

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ portable between different hardware. Helion automates and autotunes over:
 
 * Automatically calculates strides and indices.
 * Autotunes choices among various indexing methods (pointers, block pointers, TensorDescriptors).
-* Supports per-load indexing strategies for fine-grained memory access control.
+* Supports per-operation indexing strategies for fine-grained memory access control of loads and stores.
 
 2. **Masking:**

helion/_compiler/device_function.py

Lines changed: 11 additions & 11 deletions
@@ -247,40 +247,40 @@ def __init__(self, name: str, config: Config, codegen: GenerateAST) -> None:
 
         self.tile_strategy: TileStrategyDispatch = TileStrategyDispatch(self, config)
 
-        # Store indexing config to lazily create strategies per load
+        # Store indexing config to lazily create strategies per load/store
         self._indexing_config = config.indexing
         self.indexing_strategies: list[IndexingStrategy] = []
-        self.tensor_to_load_index: dict[
-            int, int
-        ] = {}  # Maps tensor id to its load index
 
         self.rng_seed_count = 0
         self.device_load_index = 0
+        self.device_store_index = 0
+        # Single counter for both loads and stores for indexing assignment
+        self.device_memory_op_index = 0
         self.rng_seed_buffer_param_name = None
 
-    def get_indexing_strategy(self, load_index: int) -> IndexingStrategy:
+    def get_indexing_strategy(self, index: int) -> IndexingStrategy:
         from typing import cast
 
         from .indexing_strategy import IndexingStrategy
         from .indexing_strategy import PointerIndexingStrategy
 
         # Expand strategies list if needed
-        while len(self.indexing_strategies) <= load_index:
+        while len(self.indexing_strategies) <= index:
             idx = len(self.indexing_strategies)
 
             if isinstance(self._indexing_config, str):
-                # Single string: all loads use the same strategy
+                # Single string: all loads/stores use the same strategy
                 if not self.indexing_strategies:
                     strategy = IndexingStrategy.select(
                         cast("IndexingLiteral", self._indexing_config)
                     )
                 else:
                     strategy = self.indexing_strategies[0]
             elif isinstance(self._indexing_config, list) and self._indexing_config:
-                # List: one strategy per load
+                # List: one strategy per load/store
                 assert idx < len(self._indexing_config), (
-                    f"Load operation {idx} exceeds indexing config length "
-                    f"{len(self._indexing_config)}. Please specify indexing for all loads."
+                    f"Load/Store operation {idx} exceeds indexing config length "
+                    f"{len(self._indexing_config)}. Please specify indexing for all loads and stores."
                 )
                 strategy = IndexingStrategy.select(
                     cast("IndexingLiteral", self._indexing_config[idx])
@@ -291,7 +291,7 @@ def get_indexing_strategy(self, load_index: int) -> IndexingStrategy:
 
         self.indexing_strategies.append(strategy)
 
-        return self.indexing_strategies[load_index]
+        return self.indexing_strategies[index]
 
     def has_rng_ops(self) -> bool:
         """Check if this kernel uses any RNG operations."""

helion/_compiler/device_ir.py

Lines changed: 65 additions & 30 deletions
@@ -1076,8 +1076,15 @@ def visit_For(self, node: ast.For) -> None:
         self.generic_visit(node)
 
 
-def _count_device_loads(device_ir: DeviceIR) -> int:
-    """Count the number of load operations in all device code for eviction policy tuning."""
+def _count_device_loads_and_stores(device_ir: DeviceIR) -> tuple[int, int, int]:
+    """Count the number of load and store operations in device code for autotuning.
+
+    Returns:
+        tuple[int, int, int]: (total_load_count, loads_without_eviction_policy, store_count)
+        - total_load_count: all loads (for indexing tunable)
+        - loads_without_eviction_policy: loads that need eviction policy tuning
+        - store_count: all stores (for indexing tunable)
+    """
     from ..language import memory_ops
 
     # Build set of rolled graph IDs to exclude (these are duplicates)
@@ -1087,31 +1094,47 @@ def _count_device_loads(device_ir: DeviceIR) -> int:
         if info.new_graph_id is not None
     }
 
-    load_count = 0
+    total_load_count = 0
+    loads_without_eviction_policy = 0
+    store_count = 0
+
     # Walk all graphs except rolled duplicates
     for graph_info in device_ir.graphs:
         if graph_info.graph_id in rolled_graph_ids:
             continue
 
         for node in graph_info.graph.nodes:
-            # Check if this is a load operation
-            if node.op == "call_function" and node.target is memory_ops.load:
-                # Only count loads without explicit eviction policy
-                # (user can still specify eviction_policy to override tuning)
-                # Check kwargs first, then check if 4th arg (eviction_policy) is None
-                eviction_policy_arg = node.kwargs.get("eviction_policy")
-                if eviction_policy_arg is None:
-                    # Check if eviction_policy was passed as positional arg (index 3)
-                    if len(node.args) >= 4:
-                        eviction_policy_arg = node.args[3]
+            if node.op == "call_function":
+                # Check if this is a load operation
+                if node.target is memory_ops.load:
+                    total_load_count += 1
+                    # Check if this load needs eviction policy tuning
+                    # (user can still specify eviction_policy to override tuning)
+                    eviction_policy_arg = node.kwargs.get("eviction_policy")
                     if eviction_policy_arg is None:
-                        load_count += 1
-    return load_count
-
-
-def _register_load_tunables(load_count: int) -> None:
-    """Register list-based tunables (indexing, eviction policies) for all device loads."""
-    if load_count == 0:
+                        # Check if eviction_policy was passed as positional arg (index 3)
+                        if len(node.args) >= 4:
+                            eviction_policy_arg = node.args[3]
+                        if eviction_policy_arg is None:
+                            loads_without_eviction_policy += 1
+                # Check if this is a store operation
+                elif node.target is memory_ops.store:
+                    store_count += 1
+
+    return total_load_count, loads_without_eviction_policy, store_count
+
+
+def _register_load_store_tunables(
+    total_load_count: int, loads_without_eviction_policy: int, store_count: int
+) -> None:
+    """Register list-based tunables (indexing, eviction policies) for all device loads and stores.
+
+    Args:
+        total_load_count: Total number of loads (for indexing tunable)
+        loads_without_eviction_policy: Number of loads that need eviction policy tuning
+        store_count: Total number of stores (for indexing tunable)
+    """
+    if total_load_count == 0 and store_count == 0:
         return
 
     from ..autotuner.config_fragment import EnumFragment
@@ -1120,13 +1143,21 @@ def _register_load_tunables(load_count: int) -> None:
     from ..autotuner.config_spec import ConfigSpec
 
     env = CompileEnvironment.current()
-    env.config_spec.load_eviction_policies = ListOf(
-        EnumFragment(choices=VALID_EVICTION_POLICIES), length=load_count
-    )
-    env.config_spec.indexing = ListOf(
-        EnumFragment(choices=ConfigSpec._valid_indexing_types()), length=load_count
-    )
-    env.device_load_count = load_count
+
+    # Register eviction policies only for loads without explicit eviction_policy
+    if loads_without_eviction_policy > 0:
+        env.config_spec.load_eviction_policies = ListOf(
+            EnumFragment(choices=VALID_EVICTION_POLICIES),
+            length=loads_without_eviction_policy,
+        )
+        env.device_load_count = loads_without_eviction_policy
+
+    # Indexing applies to ALL loads and stores
+    total_count = total_load_count + store_count
+    if total_count > 0:
+        env.config_spec.indexing = ListOf(
+            EnumFragment(choices=ConfigSpec._valid_indexing_types()), length=total_count
+        )
 
 
 def lower_to_device_ir(func: HostFunction) -> DeviceIR:
@@ -1151,9 +1182,13 @@ def lower_to_device_ir(func: HostFunction) -> DeviceIR:
     # xyz not supported with shared program IDs, but persistent kernels are allowed
     CompileEnvironment.current().config_spec.disallow_pid_type("xyz")
 
-    # Count all device loads and register tunables
-    load_count = _count_device_loads(device_ir)
-    _register_load_tunables(load_count)
+    # Count all device loads and stores and register tunables
+    total_load_count, loads_without_eviction_policy, store_count = (
+        _count_device_loads_and_stores(device_ir)
+    )
+    _register_load_store_tunables(
+        total_load_count, loads_without_eviction_policy, store_count
+    )
 
     return device_ir
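One practical consequence of the split counts above: the indexing tunable gets one entry per load and per store, while the eviction-policy tunable only covers loads that did not pass an explicit eviction_policy. A toy sketch of that rule follows; the op tuples are illustrative stand-ins, not the real FX-graph walk:

```python
# (kind, has_explicit_eviction_policy) stand-ins for FX nodes, in kernel order.
ops = [("load", False), ("load", True), ("store", False)]

total_loads = sum(1 for kind, _ in ops if kind == "load")
loads_needing_eviction_tuning = sum(
    1 for kind, explicit in ops if kind == "load" and not explicit
)
stores = sum(1 for kind, _ in ops if kind == "store")

assert total_loads + stores == 3           # indexing list length: every load and store
assert loads_needing_eviction_tuning == 1  # eviction-policy list length: tunable loads only
```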

helion/language/memory_ops.py

Lines changed: 9 additions & 6 deletions
@@ -97,9 +97,11 @@ def _(state: CodegenState) -> ast.AST:
 
     if isinstance(tensor, torch.Tensor):
         device_fn = state.device_function
-        # Use the same strategy that was used to load this tensor, or default to first strategy
-        load_idx = device_fn.tensor_to_load_index.get(id(tensor), 0)
-        strategy = device_fn.get_indexing_strategy(load_idx)
+        device_fn.device_store_index += 1
+        # Use the shared memory op index for indexing strategy
+        indexing_idx = device_fn.device_memory_op_index
+        device_fn.device_memory_op_index += 1
+        strategy = device_fn.get_indexing_strategy(indexing_idx)
         return strategy.codegen_store(state, tensor, [*subscript], value, extra_mask)
     if isinstance(tensor, tuple):
         from .._compiler.indexing_strategy import StackIndexingStrategy
@@ -268,9 +270,10 @@ def _(state: CodegenState) -> ast.AST:
         eviction_policy = ast.Constant(value=eviction_policy)
 
     if isinstance(tensor, torch.Tensor):
-        strategy = device_fn.get_indexing_strategy(load_idx)
-        # Track which strategy was used for this tensor so stores can use the same one
-        device_fn.tensor_to_load_index[id(tensor)] = load_idx
+        # Use the shared memory op index for indexing strategy
+        indexing_idx = device_fn.device_memory_op_index
+        device_fn.device_memory_op_index += 1
+        strategy = device_fn.get_indexing_strategy(indexing_idx)
         return strategy.codegen_load(
             state, tensor, [*subscript], extra_mask, eviction_policy
         )

helion/runtime/config.py

Lines changed: 4 additions & 4 deletions
@@ -61,12 +61,12 @@ def __init__(
             num_warps: Number of warps per block.
             num_stages: Number of stages for software pipelining.
             pid_type: Program ID type strategy ("flat", "xyz", "persistent_blocked", "persistent_interleaved").
-            indexing: Indexing strategy for load operations. Can be:
-                - A single strategy string (all loads use this strategy):
+            indexing: Indexing strategy for load and store operations. Can be:
+                - A single strategy string (all loads/stores use this strategy):
                   indexing="block_ptr"  # backward compatible
-                - A list of strategies (one per load operation, must specify all):
+                - A list of strategies (one per load/store operation, must specify all):
                   indexing=["pointer", "block_ptr", "tensor_descriptor"]
-                - Empty/omitted (all loads default to "pointer")
+                - Empty/omitted (all loads/stores default to "pointer")
             Valid strategies: "pointer", "tensor_descriptor", "block_ptr"
             **kwargs: Additional user-defined configuration parameters.
         """
