[BACKEND] remove workaround in fp4padded alloc size calculation (#6739)

ThomasRaoux · web-flow · commit 084bc477aff4 · 2025-05-07T18:27:09.000-07:00
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -316,12 +316,7 @@ SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
   if (auto sharedMMALayout = mlir::dyn_cast<NVMMASharedEncodingAttr>(layout)) {
     if (sharedMMALayout.getFp4Padded()) {
       auto packedAxis = getOrder(sharedMMALayout, shapeLogical)[0];
-      if (shape.size() == 3) {
-        // Take into account multi buffering
-        shape[1 + packedAxis] *= 2;
-      } else {
-        shape[packedAxis] *= 2;
-      }
+      shape[packedAxis] *= 2;
     }
   }
   return getShapePerCTA(layout, shape);
diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir
@@ -20,6 +20,8 @@
 #NVMMA_SHARED_32 = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = false, elementBitWidth = 16}>
 #NVMMA_SHARED_64 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16}>
 #NVMMA_SHARED_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#NVMMA_SHARED_FP4PADDED = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8, fp4Padded = true}>
+
 #smem = #ttg.shared_memory
 
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {
@@ -917,6 +919,8 @@ tt.func @tightly_packed_captures(%arg0: i8, %arg1: i64) {
 // expected-remark @below {{nvmma_alignment}}
 // expected-remark @below {{size = 1088}}
 tt.func @nvmma_alignment(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
+  // expected-remark @below {{offset = 0, size = 128}}
+  %fp4 = ttg.local_alloc : () -> !ttg.memdesc<8x8xi8, #NVMMA_SHARED_FP4PADDED, #ttg.shared_memory, mutable>
   // expected-remark @below {{offset = 0, size = 64}}
   %a = ttg.local_alloc : () -> !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
   // expected-remark @below {{offset = 128, size = 64}}

Original file line number	Diff line number	Diff line change
`@@ -316,12 +316,7 @@ SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,`
`316`	`316`	`if (auto sharedMMALayout = mlir::dyn_cast<NVMMASharedEncodingAttr>(layout)) {`
`317`	`317`	`if (sharedMMALayout.getFp4Padded()) {`
`318`	`318`	`auto packedAxis = getOrder(sharedMMALayout, shapeLogical)[0];`
`319`		`- if (shape.size() == 3) {`
`320`		`- // Take into account multi buffering`
`321`		`- shape[1 + packedAxis] *= 2;`
`322`		`- } else {`
`323`		`- shape[packedAxis] *= 2;`
`324`		`- }`
	`319`	`+ shape[packedAxis] *= 2;`
`325`	`320`	`}`
`326`	`321`	`}`
`327`	`322`	`return getShapePerCTA(layout, shape);`