46 changes: 46 additions & 0 deletions src/gemm.rs
@@ -0,0 +1,46 @@
#[cfg(feature = "cuda")]
mod cuda {
    use luminal::tensor::Tensor;

    /// Launch a CUDA Tensor Core GEMM computing `c = a * b`,
    /// where `a` is (M, K), `b` is (K, N), and `c` is (M, N).
    pub fn launch_tensor_core_gemm(a: &Tensor, b: &Tensor, c: &mut Tensor) {
        // Shape checks: inner dimensions must agree, and `c` must be (M, N).
        assert_eq!(a.shape()[1], b.shape()[0]);
        assert_eq!(a.shape()[0], c.shape()[0]);
        assert_eq!(b.shape()[1], c.shape()[1]);
        println!("Dispatched CUDA Tensor Core GEMM kernel!");
    }
}

#[cfg(feature = "metal")]
mod metal {
    use luminal::tensor::Tensor;

    /// Launch a fused Metal GEMM computing `c = a * b`,
    /// where `a` is (M, K), `b` is (K, N), and `c` is (M, N).
    pub fn launch_fused_gemm(a: &Tensor, b: &Tensor, c: &mut Tensor) {
        // Shape checks: inner dimensions must agree, and `c` must be (M, N).
        assert_eq!(a.shape()[1], b.shape()[0]);
        assert_eq!(a.shape()[0], c.shape()[0]);
        assert_eq!(b.shape()[1], c.shape()[1]);
        println!("Dispatched Metal GEMM kernel!");
    }
}

/// Computes `c = a * b`, dispatching to the fastest backend compiled in:
/// CUDA first, then Metal, with a naive triple-loop CPU GEMM as the fallback.
pub fn fast_gemm(
    a: &luminal::tensor::Tensor,
    b: &luminal::tensor::Tensor,
    c: &mut luminal::tensor::Tensor,
) {
    // Exactly one of the three arms below is compiled in, so no early
    // returns are needed. The `not(feature = "cuda")` guard on the Metal
    // arm keeps the arms mutually exclusive (and free of unreachable-code
    // warnings) when both GPU features are enabled at once.
    #[cfg(feature = "cuda")]
    cuda::launch_tensor_core_gemm(a, b, c);

    #[cfg(all(feature = "metal", not(feature = "cuda")))]
    metal::launch_fused_gemm(a, b, c);

    // CPU fallback, compiled only when no GPU backend is available.
    #[cfg(not(any(feature = "cuda", feature = "metal")))]
    {
        // Same shape checks the GPU paths perform.
        assert_eq!(a.shape()[1], b.shape()[0]);
        assert_eq!(a.shape()[0], c.shape()[0]);
        assert_eq!(b.shape()[1], c.shape()[1]);
        println!("Falling back to CPU GEMM.");
        for i in 0..a.shape()[0] {
            for j in 0..b.shape()[1] {
                let mut sum = 0.0;
                for k in 0..a.shape()[1] {
                    sum += a[[i, k]] * b[[k, j]];
                }
                c[[i, j]] = sum;
            }
        }
    }
}
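
For sanity-checking the fallback path, here is a minimal, dependency-free sketch of the same triple-loop GEMM over flat row-major f32 slices. The `naive_gemm` harness and its flat-slice layout are illustrative assumptions, not part of luminal or of this PR:

// Standalone sketch: naive GEMM over row-major `&[f32]` buffers.
fn naive_gemm(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) {
    assert_eq!(a.len(), m * k);
    assert_eq!(b.len(), k * n);
    assert_eq!(c.len(), m * n);
    for i in 0..m {
        for j in 0..n {
            let mut sum = 0.0_f32;
            for p in 0..k {
                // Row-major: element (r, s) of an (R, S) matrix lives at r * S + s.
                sum += a[i * k + p] * b[p * n + j];
            }
            c[i * n + j] = sum;
        }
    }
}

fn main() {
    // (2x2) * (2x2) identity check: A * I == A.
    let a = vec![1.0, 2.0, 3.0, 4.0];
    let b = vec![1.0, 0.0, 0.0, 1.0];
    let mut c = vec![0.0; 4];
    naive_gemm(&a, &b, &mut c, 2, 2, 2);
    assert_eq!(c, a);
}

Multiplying by the identity and asserting the input comes back unchanged gives a quick correctness check that mirrors what the CPU fallback in `fast_gemm` computes.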