Skip to content

Commit 152193e

Browse files
Adding changelog and other fixes.
1 parent 1366a26 commit 152193e

25 files changed

+2432
-71
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
55
## Composable Kernel 1.2.0 for ROCm 7.2.0
66

77
### Added
8+
* Added CK-Tile dispatcher - a unified kernel dispatch, code generation and architecture-based kernel filtering system with C++ and Python frontends starting with GEMM support.
89
* Added support for bf16 data type to grouped_gemm and grouped_gemm_preshuffle.
910
* Added Col-Col-Row-Col layout support for aquant mode in blockscale GEMM.
1011
* Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM.

dispatcher/bindings/ctypes/conv_bwdw_ctypes_lib.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ extern "C" {
3535
int conv_bwdw_init()
3636
{
3737
g_bwdw_initialized = true;
38-
return 1;
38+
return 0; // Return 0 on success (consistent with other init functions)
3939
}
4040

4141
void conv_bwdw_cleanup() { g_bwdw_initialized = false; }

dispatcher/bindings/ctypes/conv_ctypes_lib.cpp

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
*/
1818

1919
#include <cstring>
20+
#include <memory>
2021
#include <vector>
2122
#include <hip/hip_runtime.h>
2223

@@ -26,9 +27,9 @@
2627

2728
using namespace ck_tile::dispatcher;
2829

29-
// Global state
30-
static ConvRegistry* g_registry = nullptr;
31-
static ConvDispatcher* g_dispatcher = nullptr;
30+
// Global state (using shared_ptr for safe memory management)
31+
static std::shared_ptr<ConvRegistry> g_registry = nullptr;
32+
static std::shared_ptr<ConvDispatcher> g_dispatcher = nullptr;
3233
static std::vector<const ConvKernelInstance*> g_kernels;
3334

3435
extern "C" {
@@ -42,8 +43,8 @@ int conv_dispatcher_init()
4243
if(g_registry)
4344
return 0; // Already initialized
4445

45-
g_registry = new ConvRegistry();
46-
g_dispatcher = new ConvDispatcher(g_registry);
46+
g_registry = std::make_shared<ConvRegistry>();
47+
g_dispatcher = std::make_shared<ConvDispatcher>(g_registry.get());
4748

4849
// Register kernel configurations
4950
using namespace ck_tile::dispatcher::conv_decl;
@@ -94,10 +95,9 @@ int conv_dispatcher_init()
9495

9596
int conv_dispatcher_cleanup()
9697
{
97-
delete g_dispatcher;
98-
delete g_registry;
99-
g_dispatcher = nullptr;
100-
g_registry = nullptr;
98+
// shared_ptr automatically handles cleanup when reset
99+
g_dispatcher.reset();
100+
g_registry.reset();
101101
g_kernels.clear();
102102
return 0;
103103
}
@@ -343,11 +343,10 @@ float conv_dispatcher_run(const void* input_ptr,
343343

344344
#ifdef CONV_BWD_WEIGHT_AVAILABLE
345345
case 2: // Backward weight
346-
// Convention: caller passes (grad_output, input, grad_weight_buffer)
346+
// Convention: caller passes (input, grad_output, grad_weight_buffer)
347347
// in the (input_ptr, weight_ptr, output_ptr) slots respectively.
348-
// This is consistent with bwd_data where grad_output goes in input_ptr slot.
349348
// run_bwd_weight expects: (input, grad_output, grad_weight)
350-
return run_bwd_weight(weight_ptr, input_ptr, output_ptr, prob, stream);
349+
return run_bwd_weight(input_ptr, weight_ptr, output_ptr, prob, stream);
351350
#endif
352351

353352
default: return -1.0f;

dispatcher/bindings/ctypes/gemm_ctypes_lib.cpp

Lines changed: 56 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <cstdint>
1818
#include <cstring>
1919
#include <iostream>
20+
#include <memory>
2021
#include <sstream>
2122
#include <string>
2223

@@ -31,9 +32,9 @@ using namespace ck_tile::dispatcher;
3132
using namespace ck_tile::dispatcher::backends;
3233
using Priority = ck_tile::dispatcher::Registry::Priority;
3334

34-
// Global dispatcher (initialized once)
35-
static Dispatcher* g_dispatcher = nullptr;
36-
static bool g_initialized = false;
35+
// Global dispatcher (initialized once, managed via shared_ptr for safe cleanup)
36+
static std::shared_ptr<Dispatcher> g_dispatcher = nullptr;
37+
static bool g_initialized = false;
3738

3839
#define HIP_CHECK(call) \
3940
{ \
@@ -98,8 +99,8 @@ int dispatcher_initialize()
9899
Registry::instance().clear();
99100
Registry::instance().register_kernel(kernel, Priority::High);
100101

101-
// Create dispatcher
102-
g_dispatcher = new Dispatcher();
102+
// Create dispatcher (using shared_ptr for safe memory management)
103+
g_dispatcher = std::make_shared<Dispatcher>();
103104
g_initialized = true;
104105

105106
return 0;
@@ -294,19 +295,53 @@ int dispatcher_run_gemm(const void* A, // Host pointer
294295
const BDataType* B_host = static_cast<const BDataType*>(B);
295296
CDataType* C_host = static_cast<CDataType*>(C);
296297

297-
// Allocate GPU memory
298+
// Allocate GPU memory with proper cleanup on failure
298299
ADataType* A_dev = nullptr;
299300
BDataType* B_dev = nullptr;
300301
CDataType* C_dev = nullptr;
301302

302-
HIP_CHECK(hipMalloc(&A_dev, M * K * sizeof(ADataType)));
303-
HIP_CHECK(hipMalloc(&B_dev, K * N * sizeof(BDataType)));
304-
HIP_CHECK(hipMalloc(&C_dev, M * N * sizeof(CDataType)));
303+
// Helper lambda for cleanup
304+
auto cleanup_gpu_mem = [&]() {
305+
if(A_dev)
306+
(void)hipFree(A_dev);
307+
if(B_dev)
308+
(void)hipFree(B_dev);
309+
if(C_dev)
310+
(void)hipFree(C_dev);
311+
};
312+
313+
if(hipMalloc(&A_dev, M * K * sizeof(ADataType)) != hipSuccess)
314+
{
315+
cleanup_gpu_mem();
316+
return -1;
317+
}
318+
if(hipMalloc(&B_dev, K * N * sizeof(BDataType)) != hipSuccess)
319+
{
320+
cleanup_gpu_mem();
321+
return -1;
322+
}
323+
if(hipMalloc(&C_dev, M * N * sizeof(CDataType)) != hipSuccess)
324+
{
325+
cleanup_gpu_mem();
326+
return -1;
327+
}
305328

306329
// Copy input data to GPU
307-
HIP_CHECK(hipMemcpy(A_dev, A_host, M * K * sizeof(ADataType), hipMemcpyHostToDevice));
308-
HIP_CHECK(hipMemcpy(B_dev, B_host, K * N * sizeof(BDataType), hipMemcpyHostToDevice));
309-
HIP_CHECK(hipMemset(C_dev, 0, M * N * sizeof(CDataType)));
330+
if(hipMemcpy(A_dev, A_host, M * K * sizeof(ADataType), hipMemcpyHostToDevice) != hipSuccess)
331+
{
332+
cleanup_gpu_mem();
333+
return -1;
334+
}
335+
if(hipMemcpy(B_dev, B_host, K * N * sizeof(BDataType), hipMemcpyHostToDevice) != hipSuccess)
336+
{
337+
cleanup_gpu_mem();
338+
return -1;
339+
}
340+
if(hipMemset(C_dev, 0, M * N * sizeof(CDataType)) != hipSuccess)
341+
{
342+
cleanup_gpu_mem();
343+
return -1;
344+
}
310345

311346
// Run GEMM via dispatcher (kernel already selected, shouldn't throw)
312347
float exec_time;
@@ -317,14 +352,16 @@ int dispatcher_run_gemm(const void* A, // Host pointer
317352
catch(const std::exception& e)
318353
{
319354
// Unexpected error during execution
320-
(void)hipFree(A_dev);
321-
(void)hipFree(B_dev);
322-
(void)hipFree(C_dev);
355+
cleanup_gpu_mem();
323356
return -1;
324357
}
325358

326359
// Copy result back to host
327-
HIP_CHECK(hipMemcpy(C_host, C_dev, M * N * sizeof(CDataType), hipMemcpyDeviceToHost));
360+
if(hipMemcpy(C_host, C_dev, M * N * sizeof(CDataType), hipMemcpyDeviceToHost) != hipSuccess)
361+
{
362+
cleanup_gpu_mem();
363+
return -1;
364+
}
328365

329366
// Store timing if requested
330367
if(time_ms)
@@ -333,9 +370,7 @@ int dispatcher_run_gemm(const void* A, // Host pointer
333370
}
334371

335372
// Cleanup GPU memory
336-
(void)hipFree(A_dev);
337-
(void)hipFree(B_dev);
338-
(void)hipFree(C_dev);
373+
cleanup_gpu_mem();
339374

340375
return 0;
341376
}
@@ -434,11 +469,8 @@ const char* dispatcher_export_registry_json()
434469
*/
435470
void dispatcher_cleanup()
436471
{
437-
if(g_dispatcher)
438-
{
439-
delete g_dispatcher;
440-
g_dispatcher = nullptr;
441-
}
472+
// shared_ptr automatically handles cleanup when reset
473+
g_dispatcher.reset();
442474
g_initialized = false;
443475
}
444476

dispatcher/codegen/arch_filter.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ class OperatorType(Enum):
132132
ELEMENT_SIZE_MAP,
133133
WARP_SUPPORTED_COMBINATIONS,
134134
WARP_TILE_SUPPORTED_COMBINATIONS,
135+
PRESHUFFLE_WARP_TILE_SUPPORTED_COMBINATIONS,
136+
PRESHUFFLE_PIPELINES,
135137
LDS_CAPACITY_LIMITS,
136138
TRAIT_UNSUPPORTED_COMBINATIONS,
137139
DTYPE_COMBINATIONS,
@@ -179,6 +181,21 @@ class OperatorType(Enum):
179181
},
180182
}
181183

184+
# Preshuffle-specific warp tile combinations (no [4, 64, 16])
185+
PRESHUFFLE_WARP_TILE_SUPPORTED_COMBINATIONS = {
186+
"gfx942": {
187+
"fp16_fp16_fp32": [
188+
[32, 32, 8],
189+
[16, 16, 16],
190+
[32, 32, 16],
191+
[16, 16, 32],
192+
[64, 4, 16],
193+
],
194+
},
195+
}
196+
197+
PRESHUFFLE_PIPELINES = ["preshufflev2"]
198+
182199
LDS_CAPACITY_LIMITS = {"compv4": 32768, "preshufflev2": 32768, "default": 65536}
183200

184201
TRAIT_UNSUPPORTED_COMBINATIONS = {
@@ -566,9 +583,20 @@ def _validate_warp_config(self, config: KernelConfig, result: ValidationResult):
566583

567584
def _validate_warp_tile_combo(self, config: KernelConfig, result: ValidationResult):
568585
"""Validate warp tile combination against architecture and data types"""
569-
gpu_combos = WARP_TILE_SUPPORTED_COMBINATIONS.get(self.gpu_arch, {})
586+
# Use preshuffle-specific warp tiles for preshuffle operator
587+
if config.operator == OperatorType.GEMM_PRESHUFFLE:
588+
gpu_combos = PRESHUFFLE_WARP_TILE_SUPPORTED_COMBINATIONS.get(
589+
self.gpu_arch, {}
590+
)
591+
combo_source = "preshuffle"
592+
else:
593+
gpu_combos = WARP_TILE_SUPPORTED_COMBINATIONS.get(self.gpu_arch, {})
594+
combo_source = "standard"
595+
570596
if not gpu_combos:
571-
msg = f"No warp tile combinations defined for {self.gpu_arch}"
597+
msg = (
598+
f"No {combo_source} warp tile combinations defined for {self.gpu_arch}"
599+
)
572600
if self.strict_mode:
573601
result.add_error(msg)
574602
else:
@@ -579,19 +607,27 @@ def _validate_warp_tile_combo(self, config: KernelConfig, result: ValidationResu
579607
if not dtype_combos:
580608
# Data type combo not explicitly listed - may still be valid
581609
result.add_warning(
582-
f"No warp tile combinations defined for {config.dtype_key} on {self.gpu_arch}"
610+
f"No {combo_source} warp tile combinations defined for {config.dtype_key} on {self.gpu_arch}"
583611
)
584612
return
585613

586614
current = [config.warp_tile_m, config.warp_tile_n, config.warp_tile_k]
587615
if current not in dtype_combos:
588616
result.add_error(
589-
f"Invalid warp tile {current} for {config.dtype_key} on {self.gpu_arch}. "
617+
f"Invalid warp tile {current} for {config.dtype_key} on {self.gpu_arch} ({combo_source}). "
590618
f"Allowed: {dtype_combos}"
591619
)
592620

593621
def _validate_trait_combo(self, config: KernelConfig, result: ValidationResult):
594622
"""Validate trait (pipeline, epilogue, scheduler) combination"""
623+
# Preshuffle requires specific pipelines
624+
if config.operator == OperatorType.GEMM_PRESHUFFLE:
625+
if config.pipeline not in PRESHUFFLE_PIPELINES:
626+
result.add_error(
627+
f"Preshuffle GEMM requires pipeline in {PRESHUFFLE_PIPELINES}, "
628+
f"got {config.pipeline}"
629+
)
630+
595631
combo = (config.pipeline, config.epilogue, config.scheduler)
596632
if combo in TRAIT_UNSUPPORTED_COMBINATIONS:
597633
result.add_error(
@@ -769,7 +805,7 @@ def get_supported_archs() -> List[str]:
769805
def get_arch_family(gpu_arch: str) -> Optional[str]:
770806
"""Get the GPU family for an architecture"""
771807
family = ARCH_FAMILY_MAP.get(gpu_arch.lower())
772-
return family.value if family else None
808+
return family if family else None # ARCH_FAMILY_MAP contains strings, not Enums
773809

774810

775811
def create_filter_for_current_gpu() -> Optional[ArchFilter]:

dispatcher/codegen/arch_specs.json

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,5 +232,33 @@
232232
["compv4", "cshuffle", "interwave"],
233233
["compv4", "default", "interwave"]
234234
]
235+
},
236+
237+
"preshuffle_warp_tile_combos": {
238+
"_comment": "Preshuffle-specific warp tile combinations (subset of standard GEMM, no [4, 64, 16])",
239+
"gfx90a": {
240+
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
241+
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
242+
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
243+
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]]
244+
},
245+
"gfx942": {
246+
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
247+
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
248+
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
249+
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
250+
"int8_int8_int32": [[16, 16, 32], [32, 32, 16]]
251+
},
252+
"gfx950": {
253+
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
254+
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
255+
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
256+
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]]
257+
}
258+
},
259+
260+
"preshuffle_pipelines": {
261+
"_comment": "Pipelines supported for preshuffle GEMM variant",
262+
"supported": ["preshufflev2"]
235263
}
236264
}

0 commit comments

Comments
 (0)