benchmarks/benchmark_gemv.py (6 changes: 3 additions & 3 deletions)

@@ -30,9 +30,9 @@
     )
 )
 def benchmark_gemv(M, K, provider):
-    A = torch.randn((M, K), device=DEVICE, dtype=torch.float16)
Collaborator:

> It's probably worth keeping the float16 tests as well. Instead of overwriting the test, clone it and have a benchmark for fp16 and fp32.

Contributor (author):

> Will do 👍
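One way to do that is sketched below: parameterize the benchmark over a dtype axis so fp16 and fp32 each get their own plot. This is a hypothetical sketch, not code from this PR; the `x_names`, `x_vals`, and the `gemv` call are illustrative stand-ins for whatever the file actually uses.

```python
# Hypothetical sketch: one Benchmark per dtype instead of overwriting fp16.
# DEVICE and the gemv call stand in for the file's actual definitions.
import torch
import triton

configs = [
    triton.testing.Benchmark(
        x_names=["M", "K"],
        x_vals=[(2**i, 2**i) for i in range(8, 13)],  # illustrative sizes
        line_arg="provider",
        line_vals=["triton"],
        line_names=["Triton"],
        plot_name=f"gemv-performance-{name}",
        args={"dtype": dtype},
    )
    for name, dtype in [("fp16", torch.float16), ("fp32", torch.float32)]
]

@triton.testing.perf_report(configs)
def benchmark_gemv(M, K, provider, dtype):
    A = torch.randn((M, K), device=DEVICE, dtype=dtype)
    x = torch.randn(K, device=DEVICE, dtype=dtype)
    y = torch.randn(M, device=DEVICE, dtype=dtype)
    alpha, beta = 0.42, 10
    quantiles = [0.5, 0.2, 0.8]
    ms, min_ms, max_ms = triton.testing.do_bench(
        lambda: gemv(A, x, y, alpha, beta),  # placeholder for the real call
        quantiles=quantiles,
    )
    return ms, min_ms, max_ms
```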

-    x = torch.randn(K, device=DEVICE, dtype=torch.float16)
-    y = torch.randn(M, device=DEVICE, dtype=torch.float16)
+    A = torch.randn((M, K), device=DEVICE, dtype=torch.float32)
+    x = torch.randn(K, device=DEVICE, dtype=torch.float32)
+    y = torch.randn(M, device=DEVICE, dtype=torch.float32)
     alpha = 0.42
     beta = 10
     quantiles = [0.5, 0.2, 0.8]
tritonblas/level2/gemv.py (51 changes: 41 additions & 10 deletions)

@@ -7,38 +7,70 @@ def get_autotune_config():
     return [
         triton.Config(
             {
-                "BLOCK_SIZE_M": 128,
-                "BLOCK_SIZE_K": 64,
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_K": 128,
             },
         ),
         triton.Config(
             {
-                "BLOCK_SIZE_M": 64,
-                "BLOCK_SIZE_K": 32,
+                "BLOCK_SIZE_M": 32,
+                "BLOCK_SIZE_K": 128,
             },
         ),
         triton.Config(
             {
                 "BLOCK_SIZE_M": 128,
-                "BLOCK_SIZE_K": 32,
+                "BLOCK_SIZE_K": 128,
             },
         ),
         triton.Config(
             {
-                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_M": 256,
                 "BLOCK_SIZE_K": 64,
             },
         ),
         triton.Config(
             {
-                "BLOCK_SIZE_M": 32,
+                "BLOCK_SIZE_M": 256,
                 "BLOCK_SIZE_K": 32,
             },
         ),
+
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_K": 128,
+            },
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_K": 256,
+            },
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 8,
+                "BLOCK_SIZE_K": 128,
+            },
+        ),
+
         triton.Config(
             {
                 "BLOCK_SIZE_M": 32,
-                "BLOCK_SIZE_K": 64,
+                "BLOCK_SIZE_K": 256,
             },
         ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_K": 512,
+            },
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 8,
+                "BLOCK_SIZE_K": 256,
+            },
+        ),
     ]
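For orientation, here is a minimal sketch of how a config list like this is typically consumed: triton.autotune compiles and times the kernel once per triton.Config and caches the fastest choice per distinct key value. The autotune key and the full parameter list below are assumptions, not taken from this diff (only gemv_kernel, stride_ak, stride_x, and the two block-size constexprs appear in it).

```python
# Sketch only: the autotune key and exact signature are assumptions.
import triton
import triton.language as tl

@triton.autotune(configs=get_autotune_config(), key=["M", "K"])
@triton.jit
def gemv_kernel(
    a_ptr, x_ptr, y_ptr,
    M, K,
    stride_am, stride_ak, stride_x, stride_y,
    alpha, beta,
    BLOCK_SIZE_M: tl.constexpr,  # chosen per run from the configs above
    BLOCK_SIZE_K: tl.constexpr,
):
    ...  # kernel body as in the hunk below
```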
@@ -90,8 +122,7 @@ def gemv_kernel(
         tiled_x = tl.load(x_block, mask=k_mask, other=0.0)
 
         # Partial dot product
-        partial_result = tl.sum(tiled_a * tiled_x[None, :], axis=1)
-        accumulator += partial_result
+        accumulator += tl.sum(tiled_a * tiled_x[None, :], axis=1)
 
         a_block += BLOCK_SIZE_K * stride_ak
         x_block += BLOCK_SIZE_K * stride_x
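For intuition, here is a plain-PyTorch sketch of what this K-blocked loop computes, assuming the standard BLAS gemv update y = alpha * A @ x + beta * y (consistent with the alpha and beta in the benchmark). This is a reference model, not code from the PR.

```python
import torch

def gemv_reference(A, x, y, alpha, beta, BLOCK_SIZE_K=128):
    """Reference sketch mirroring the kernel: accumulate partial dot
    products over K-blocks, then apply the assumed alpha/beta update."""
    M, K = A.shape
    accumulator = torch.zeros(M, dtype=torch.float32, device=A.device)
    for k in range(0, K, BLOCK_SIZE_K):
        tiled_a = A[:, k:k + BLOCK_SIZE_K].float()
        tiled_x = x[k:k + BLOCK_SIZE_K].float()
        # Same operation as the fused kernel line above.
        accumulator += (tiled_a * tiled_x[None, :]).sum(dim=1)
    return alpha * accumulator + beta * y.float()
```

The fusion in the diff is cosmetic: it removes the partial_result temporary without changing the computed values.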