Validate incompatible cache_modifier/eviction_policy combinations in NVIDIA backend

swjng · swjng · commit 8b4f25cf3610 · 2026-04-13T12:58:34.000+09:00
When tl.load/tl.store is called with a PTX-illegal combination of
cache_modifier and eviction_policy, Triton previously emitted PTX
containing both modifiers and let ptxas fail with an opaque assembler
error:

    ptxas error: Modifier '.evict_first' cannot be combined with modifier '.cs'

Users saw a low-level message with no indication of which Python
arguments caused it.

Add validation in LoadStoreOpToLLVM.cpp (NVIDIA-specific PTX lowering)
that emits a clear compilation error before any PTX is generated.
Placing the check in the NVIDIA backend, rather than in the
backend-agnostic semantic.py, keeps the frontend neutral to PTX ISA
constraints.

PTX-illegal combinations covered:

| op    | cache_modifier | eviction_policy              |
|-------|----------------|------------------------------|
| store | .cs            | evict_first, evict_last      |
| store | .cg            | evict_first                  |
| load  | .ca            | evict_first, evict_last      |
| load  | .cg            | evict_first                  |
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
@@ -1,4 +1,5 @@
 // RUN: triton-opt %s -split-input-file --allocate-shared-memory-nv --convert-triton-gpu-to-llvm -reconcile-unrealized-casts 2>/dev/null | FileCheck %s --dump-input-context 20
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory-nv --convert-triton-gpu-to-llvm -reconcile-unrealized-casts --verify-diagnostics 2>&1 | FileCheck %s --dump-input-context 20
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK: llvm.func @test_empty_kernel(%arg0: i32, %arg1: !llvm.ptr<1> {tt.pointee_type = f16}, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<1>)
@@ -127,6 +128,80 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 // -----
 
+// PTX-illegal combinations of cache_modifier and eviction_policy.
+// Without this check, ptxas fails with an opaque assembler error
+// (e.g. "Modifier '.evict_first' cannot be combined with modifier '.cs'").
+// The NVIDIA backend instead emits a clear op error before PTX generation.
+
+#blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  tt.func @store_cs_evict_first(%ptrs : tensor<256x!tt.ptr<f32>, #blocked0>,
+                                %vals : tensor<256xf32, #blocked0>) {
+    // expected-error @+1 {{cache_modifier '.cs' is incompatible with eviction_policy 'evict_first'/'evict_last'}}
+    tt.store %ptrs, %vals evictionPolicy = evict_first cacheModifier = cs : tensor<256x!tt.ptr<f32>, #blocked0>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  tt.func @store_cs_evict_last(%ptrs : tensor<256x!tt.ptr<f32>, #blocked0>,
+                               %vals : tensor<256xf32, #blocked0>) {
+    // expected-error @+1 {{cache_modifier '.cs' is incompatible with eviction_policy 'evict_first'/'evict_last'}}
+    tt.store %ptrs, %vals evictionPolicy = evict_last cacheModifier = cs : tensor<256x!tt.ptr<f32>, #blocked0>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  tt.func @store_cg_evict_first(%ptrs : tensor<256x!tt.ptr<f32>, #blocked0>,
+                                %vals : tensor<256xf32, #blocked0>) {
+    // expected-error @+1 {{cache_modifier '.cg' is incompatible with eviction_policy 'evict_first'}}
+    tt.store %ptrs, %vals evictionPolicy = evict_first cacheModifier = cg : tensor<256x!tt.ptr<f32>, #blocked0>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  tt.func @load_ca_evict_first(%ptrs : tensor<256x!tt.ptr<f32>, #blocked0>) {
+    // expected-error @+1 {{cache_modifier '.ca' is incompatible with eviction_policy 'evict_first'/'evict_last'}}
+    %0 = tt.load %ptrs evictionPolicy = evict_first cacheModifier = ca : tensor<256x!tt.ptr<f32>, #blocked0>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  tt.func @load_ca_evict_last(%ptrs : tensor<256x!tt.ptr<f32>, #blocked0>) {
+    // expected-error @+1 {{cache_modifier '.ca' is incompatible with eviction_policy 'evict_first'/'evict_last'}}
+    %0 = tt.load %ptrs evictionPolicy = evict_last cacheModifier = ca : tensor<256x!tt.ptr<f32>, #blocked0>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  tt.func @load_cg_evict_first(%ptrs : tensor<256x!tt.ptr<f32>, #blocked0>) {
+    // expected-error @+1 {{cache_modifier '.cg' is incompatible with eviction_policy 'evict_first'}}
+    %0 = tt.load %ptrs evictionPolicy = evict_first cacheModifier = cg : tensor<256x!tt.ptr<f32>, #blocked0>
+    tt.return
+  }
+}
+
+// -----
+
 #blocked0 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32} {
   // CHECK-LABEL: global_load_store_no_vec
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -188,6 +188,23 @@ struct LoadOpConversion : public ConvertOpToLLVMPattern<triton::LoadOp>,
       otherElems = unpackLLElements(loc, llOther, rewriter);
     }
 
+    // Validate cache_modifier + eviction_policy combinations that the PTX ISA
+    // forbids. This check belongs here (PTX codegen) rather than in the
+    // frontend semantic layer, which is backend-agnostic.
+    auto cache = op.getCache();
+    auto evict = op.getEvict();
+    if ((evict == triton::EvictionPolicy::EVICT_FIRST ||
+         evict == triton::EvictionPolicy::EVICT_LAST) &&
+        cache == triton::CacheModifier::CA)
+      return op.emitOpError(
+          "cache_modifier '.ca' is incompatible with eviction_policy "
+          "'evict_first'/'evict_last': .ca overrides L1 eviction policy");
+    if (evict == triton::EvictionPolicy::EVICT_FIRST &&
+        cache == triton::CacheModifier::CG)
+      return op.emitOpError(
+          "cache_modifier '.cg' is incompatible with eviction_policy "
+          "'evict_first': .cg bypasses L1 cache");
+
     // vectorized iteration through all the pointer/mask/other elements
     const int valueElemNBits =
         std::max(8u, valueElemTy.getIntOrFloatBitWidth());
@@ -399,6 +416,23 @@ struct StoreOpConversion : public ConvertOpToLLVMPattern<triton::StoreOp>,
                        << mask << "\n";
     }
 
+    // Validate cache_modifier + eviction_policy combinations that the PTX ISA
+    // forbids. This check belongs here (PTX codegen) rather than in the
+    // frontend semantic layer, which is backend-agnostic.
+    auto cache = op.getCache();
+    auto evict = op.getEvict();
+    if ((evict == triton::EvictionPolicy::EVICT_FIRST ||
+         evict == triton::EvictionPolicy::EVICT_LAST) &&
+        cache == triton::CacheModifier::CS)
+      return op.emitOpError(
+          "cache_modifier '.cs' is incompatible with eviction_policy "
+          "'evict_first'/'evict_last': .cs bypasses L1 cache");
+    if (evict == triton::EvictionPolicy::EVICT_FIRST &&
+        cache == triton::CacheModifier::CG)
+      return op.emitOpError(
+          "cache_modifier '.cg' is incompatible with eviction_policy "
+          "'evict_first': .cg bypasses L1 cache");
+
     const size_t dtsize =
         std::max<int>(1, valueElemTy.getIntOrFloatBitWidth() / 8);
     const size_t valueElemNBits = dtsize * 8;