Skip to content

Add support for single wave stream-k kernels #1975

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions tensilelite/Tensile/Components/StreamK.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edg
# AccVgpr read
# if kernel.enabledSetPrioSplitLDS:
# kStr += inst("s_setprio", "0", "")
if codeAccVgprRead is not None: # and writer.kernel["LocalSplitU"] == 1
if codeAccVgprRead is not None and kernel["LocalSplitU"] == 1:
regsPerScalar = writer.states.bpeCinternal // writer.states.bpr # register per scalar
# loop over store instructions within one batch
for elementIdx in range(0, len(batchElements)):
Expand Down Expand Up @@ -756,6 +756,11 @@ def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edg
module.add(VMovB32(vgpr(cvtVgprStruct.vgprBF8Max), "0x47600000", comment="BF8 Max value 57344 as float32" ))
module.add(VMovB32(vgpr(cvtVgprStruct.vgprBF8Min), "0xc7600000", comment="BF8 Min value -57344 as float32" ))

if kernel["EnableMatrixInstruction"]:
WaveNum = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1] * kernel["WorkGroup"][2]
else:
WaveNum = kernel["NumThreads"] // kernel["WavefrontSize"]

storeCode = Module("Partials GroupLoadStore")
for elementIdx in range(len(batchElements)):
element = batchElements[elementIdx]
Expand All @@ -771,7 +776,8 @@ def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edg
# kStr += inst("v_mul_lo_u32", , "Partials buffer address")
module.add(SMovB32(dst=sgpr(tmpS01), src=0, comment="Init sgpr offset"))
else:
increment = (kernel["WavefrontSize"] * 4) * storeWidth * writer.states.bpeCinternal
increment = (kernel["WavefrontSize"] * WaveNum) * storeWidth * writer.states.bpeCinternal
# module.addComment1("WavefrontSize={}, WaveNum={}, storeWidth={}, bpeC={}".format(kernel["WavefrontSize"], WaveNum, storeWidth, writer.states.bpeCinternal))
module.add(SAddU32(dst=sgpr(tmpS01), src0=sgpr(tmpS01), src1=increment, comment="Inc sgpr offset"))

# TODO StreamK need this packing code???
Expand Down Expand Up @@ -1089,7 +1095,6 @@ def fixupBatch(self, writer, kernel, ss, batchIdx, edge, gwvw, \
storesIssued = 0
tmpS01 = tmpSgpr # scratch sgprs

wavelen = kernel["WavefrontSize"]
# laneSGPRC = writer.states.laneSGPRCount
# always use gwvw for buffer load C for atomic_cmpswap
# bpm = self.bpeCexternal * atomicW
Expand Down Expand Up @@ -1126,6 +1131,11 @@ def fixupBatch(self, writer, kernel, ss, batchIdx, edge, gwvw, \
# accVgprRead = Code.Module("movaccVgpr")
# self.StoreCUnrollLoadCWaitComment = "waitcnt for LoadC" # this will be used later to identify waitcnt for loadC

if kernel["EnableMatrixInstruction"]:
WaveNum = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1] * kernel["WorkGroup"][2]
else:
WaveNum = kernel["NumThreads"] // kernel["WavefrontSize"]

for elementIdx in range(0, len(batchElements)):
element = batchElements[elementIdx]
addrCVgpr = ss.elementAddr[elementIdx].addrCVgpr
Expand All @@ -1146,7 +1156,8 @@ def fixupBatch(self, writer, kernel, ss, batchIdx, edge, gwvw, \
# kStr += inst("v_mul_lo_u32", , "Partials buffer address")
module.add(SMovB32(dst=sgpr(tmpS01), src=0, comment="Init sgpr offset"))
else:
increment = (kernel["WavefrontSize"] * 4) * storeWidth * writer.states.bpeCinternal
increment = (kernel["WavefrontSize"] * WaveNum) * storeWidth * writer.states.bpeCinternal
# module.addComment1("WavefrontSize={}, WaveNum={}, storeWidth={}, bpeC={}".format(kernel["WavefrontSize"], WaveNum, storeWidth, writer.states.bpeCinternal))
module.add(SAddU32(dst=sgpr(tmpS01), src0=sgpr(tmpS01), src1=increment, comment="Inc sgpr offset"))

module.add(writer.readInput(kernel, ss, 'WS', kernel["ProblemType"]["ComputeDataType"], addrCalc, vc0, data, gwvw, addrCVgpr, sgpr(tmpS01)))
Expand All @@ -1156,7 +1167,7 @@ def fixupBatch(self, writer, kernel, ss, batchIdx, edge, gwvw, \
# AccVgpr read
# if kernel.enabledSetPrioSplitLDS:
# kStr += inst("s_setprio", "0", "")
if codeAccVgprRead is not None:
if codeAccVgprRead is not None and kernel["LocalSplitU"] == 1:
regsPerScalar = writer.states.bpeCinternal // writer.states.bpr # register per scalar
# loop over store instructions within one batch
for elementIdx in range(0, len(batchElements)):
Expand Down
2 changes: 0 additions & 2 deletions tensilelite/Tensile/SolutionStructs/Solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,8 +1089,6 @@ def assignDerivedParameters(
state["GlobalSplitUAlgorithm"] = "MultipleBuffer" # Set default Algorithm
if state["ProblemType"]["DataType"].isDouble():
reject(state, printRejectionReason, "Type {} for DataType not yet supported with StreamK".format(state["ProblemType"]["DataType"].toChar()))
if state["MIWaveGroup"][0] * state["MIWaveGroup"][1] != 4:
reject(state, printRejectionReason, "Stream-K requires MIWaveGroup0*MIWaveGroup1=4")
if not state["EnableMatrixInstruction"]:
reject(state, printRejectionReason, "Stream-K requires MatrixInstruction")
if isaInfoMap[isa].asmCaps["HasWMMA"]:
Expand Down
139 changes: 139 additions & 0 deletions tensilelite/Tensile/Tests/common/streamk/sk_hgemm_lsu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch

GlobalParameters:
NumElementsToValidate: -1
BoundsCheck: False
KernelTime: False
DataInitTypeAlpha: 1
DataInitTypeBeta: 1
DataInitTypeA: 12
DataInitTypeB: 13
DataInitTypeC: 12
# DataInitTypeC: 1
# ValidationPrintValids: True
MaxWorkspaceSize: 134217728
# PrintSolutionRejectionReason: True
# ForceGenerateKernel: True
# GenerateSourcesAndExit: True
NumWarmups: 0
EnqueuesPerSync: 1
# NumBenchmarks: 10
SleepPercent: 50

BenchmarkProblems:

- # HGEMM NT
- # ProblemType
OperationType: GEMM
DataType: h
DestDataType: h
ComputeDataType: s
HighPrecisionAccumulate: True
TransposeA: False
TransposeB: True
UseBeta: True
Batched: True

- # HGEMM NT - Single wave
InitialSolutionParameters:
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
- PrefetchLocalRead: [True]
ForkParameters:
- MatrixInstruction:
- [16,16,16,1, 1, 1,1, 1,1]
# - [16,16,16,1, 1, 2,1, 1,1]
# - [16,16,16,1, 1, 4,1, 1,1]
# - [16,16,16,1, 1, 1,2, 1,1]
# - [16,16,16,1, 1, 1,4, 1,1]
# - [16,16,16,1, 1, 2,2, 1,1]
# - [16,16,16,1, 1, 1,1, 2,1]
# - [16,16,16,1, 1, 1,1, 1,2]
- DepthU: [256]
- 1LDSBuffer: [-1]
- ClusterLocalRead: [True]
- ExpandPointerSwap: [0]
# - LocalReadVectorWidth: [4, 8]
# - NumElementsPerBatchStore: [0, 16]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
- ScheduleIterAlg: [3]
- SourceSwap: [True]
- StaggerU: [0]
- StreamK: [3]
- WorkGroupMappingXCC: [8]
- TransposeLDS: [-1]
- UseSgprForGRO: [0]
- WorkGroupMapping: [6]
- VectorWidthA: [1]
- VectorWidthB: [1]

BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Exact: [512, 512, 1, 512]
- Exact: [1024, 1024, 1, 1024]
- Exact: [1031, 1031, 1, 1031]
# - Exact: [4096, 4096, 1, 1024]
# - Exact: [4103, 4096, 1, 1024]
# - Exact: [4096, 4103, 1, 1024]
# - Exact: [4096, 4096, 1, 1031]
- Exact: [4103, 4103, 1, 1031]

- # HGEMM NT - LSU
InitialSolutionParameters:
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
- PrefetchLocalRead: [True]
ForkParameters:
- MatrixInstruction:
- [16,16,16,1, 1, 1,1, 1,1]
# - [16,16,16,1, 1, 1,1, 1,2]
# - [16,16,16,1, 1, 1,1, 2,1]
- [16,16,16,1, 1, 2,1, 1,1]
- [16,16,16,1, 1, 4,1, 1,1]
- [16,16,16,1, 1, 1,2, 1,1]
- [16,16,16,1, 1, 1,4, 1,1]
- [16,16,16,1, 1, 2,2, 1,1]
# - [16,16,16,1, 1, 1,1, 2,1]
# - [16,16,16,1, 1, 1,1, 1,2]
# - [16,16,16,1, 1, 8,8, 1,1]
# - [16,16,16,1, 1, 8,8, 2,1]
# - [16,16,16,1, 1, 8,8, 1,2]
- DepthU: [256]
- WorkGroup:
- [4,4,4]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can add the LSU=2 case as well, but it looks good to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I tested LSU=2 locally, but left it out to cut down on time added to pre-checkin CI run since I thought LSU=4 is the more common use-case. If LSU=2 is also frequently used, I can add a test in a future change.

- 1LDSBuffer: [-1]
- ClusterLocalRead: [True]
- ExpandPointerSwap: [0]
# - LocalReadVectorWidth: [4, 8]
# - NumElementsPerBatchStore: [0, 16]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
- ScheduleIterAlg: [3]
- SourceSwap: [True]
- StaggerU: [0]
- StreamK: [3]
- WorkGroupMappingXCC: [8]
- TransposeLDS: [-1]
- UseSgprForGRO: [0]
- WorkGroupMapping: [6]
- VectorWidthA: [1]
- VectorWidthB: [1]

BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Exact: [512, 512, 1, 512]
- Exact: [1024, 1024, 1, 1024]
- Exact: [1031, 1031, 1, 1031]
# - Exact: [4096, 4096, 1, 1024]
# - Exact: [4103, 4096, 1, 1024]
# - Exact: [4096, 4103, 1, 1024]
# - Exact: [4096, 4096, 1, 1031]
- Exact: [4103, 4103, 1, 1031]
12 changes: 12 additions & 0 deletions tensilelite/Tensile/Tests/common/streamk/sk_hgemm_quick.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ BenchmarkProblems:
- [16, 16, 16, 1, 1, 1,4, 2,2]
- [16, 16, 16, 1, 1, 1,2, 2,2]
- [16, 16, 16, 1, 1, 1,1, 2,2]
- [16, 16, 16, 1, 1, 1,1, 2,1]
- [16, 16, 16, 1, 1, 1,1, 1,2]
- [16, 16, 16, 1, 1, 1,1, 1,1]
- MIArchVgpr: [0]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
Expand Down Expand Up @@ -265,6 +268,9 @@ BenchmarkProblems:
- [16, 16, 16, 1, 1, 1,4, 2,2]
- [16, 16, 16, 1, 1, 1,2, 2,2]
- [16, 16, 16, 1, 1, 1,1, 2,2]
- [16, 16, 16, 1, 1, 1,1, 2,1]
- [16, 16, 16, 1, 1, 1,1, 1,2]
- [16, 16, 16, 1, 1, 1,1, 1,1]
- MIArchVgpr: [0]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
Expand Down Expand Up @@ -461,6 +467,9 @@ BenchmarkProblems:
# - [16, 16, 16, 1, 1, 1,4, 2,2]
# - [16, 16, 16, 1, 1, 1,2, 2,2]
# - [16, 16, 16, 1, 1, 1,1, 2,2]
# - [16, 16, 16, 1, 1, 1,1, 2,1]
# - [16, 16, 16, 1, 1, 1,1, 1,2]
# - [16, 16, 16, 1, 1, 1,1, 1,1]
# - MIArchVgpr: [0]
# - PrefetchGlobalRead: [2]
# - PrefetchLocalRead: [1]
Expand Down Expand Up @@ -657,6 +666,9 @@ BenchmarkProblems:
# - [16, 16, 16, 1, 1, 1,4, 2,2]
# - [16, 16, 16, 1, 1, 1,2, 2,2]
# - [16, 16, 16, 1, 1, 1,1, 2,2]
# - [16, 16, 16, 1, 1, 1,1, 2,1]
# - [16, 16, 16, 1, 1, 1,1, 1,2]
# - [16, 16, 16, 1, 1, 1,1, 1,1]
# - MIArchVgpr: [0]
# - PrefetchGlobalRead: [2]
# - PrefetchLocalRead: [1]
Expand Down