Fix StreamK with LocalSplitU (LSU) and add a new test case

AlexBrownAMD · AlexBrownAMD · commit d8b6766ba316 · 2025-04-30T21:20:01.000Z
diff --git a/tensilelite/Tensile/Components/StreamK.py b/tensilelite/Tensile/Components/StreamK.py
@@ -688,7 +688,7 @@ def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edg
         # AccVgpr read
         # if kernel.enabledSetPrioSplitLDS:
         #     kStr += inst("s_setprio", "0", "")
-        if codeAccVgprRead is not None: # and writer.kernel["LocalSplitU"] == 1
+        if codeAccVgprRead is not None and kernel["LocalSplitU"] == 1:
             regsPerScalar = writer.states.bpeCinternal // writer.states.bpr # register per scalar
             # loop over store instructions within one batch
             for elementIdx in range(0, len(batchElements)):
@@ -757,7 +757,7 @@ def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edg
             module.add(VMovB32(vgpr(cvtVgprStruct.vgprBF8Min), "0xc7600000", comment="BF8 Min value -57344 as float32" ))
 
         if kernel["EnableMatrixInstruction"]:
-            WaveNum = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1]
+            WaveNum = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1] * kernel["WorkGroup"][2]
         else:
             WaveNum = kernel["NumThreads"] // kernel["WavefrontSize"]
 
@@ -777,6 +777,7 @@ def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edg
                 module.add(SMovB32(dst=sgpr(tmpS01), src=0, comment="Init sgpr offset"))
             else:
                 increment = (kernel["WavefrontSize"] * WaveNum) * storeWidth * writer.states.bpeCinternal
+                # module.addComment1("WavefrontSize={}, WaveNum={}, storeWidth={}, bpeC={}".format(kernel["WavefrontSize"], WaveNum, storeWidth, writer.states.bpeCinternal))
                 module.add(SAddU32(dst=sgpr(tmpS01), src0=sgpr(tmpS01), src1=increment, comment="Inc sgpr offset"))
 
             # TODO StreamK need this packing code???
@@ -1131,7 +1132,7 @@ def fixupBatch(self, writer, kernel, ss, batchIdx, edge, gwvw, \
         #     self.StoreCUnrollLoadCWaitComment = "waitcnt for LoadC" # this will be used later to identify waitcnt for loadC
 
         if kernel["EnableMatrixInstruction"]:
-            WaveNum = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1]
+            WaveNum = kernel["MIWaveGroup"][0] * kernel["MIWaveGroup"][1] * kernel["WorkGroup"][2]
         else:
             WaveNum = kernel["NumThreads"] // kernel["WavefrontSize"]
 
@@ -1156,6 +1157,7 @@ def fixupBatch(self, writer, kernel, ss, batchIdx, edge, gwvw, \
                 module.add(SMovB32(dst=sgpr(tmpS01), src=0, comment="Init sgpr offset"))
             else:
                 increment = (kernel["WavefrontSize"] * WaveNum) * storeWidth * writer.states.bpeCinternal
+                # module.addComment1("WavefrontSize={}, WaveNum={}, storeWidth={}, bpeC={}".format(kernel["WavefrontSize"], WaveNum, storeWidth, writer.states.bpeCinternal))
                 module.add(SAddU32(dst=sgpr(tmpS01), src0=sgpr(tmpS01), src1=increment, comment="Inc sgpr offset"))
 
             module.add(writer.readInput(kernel, ss, 'WS', kernel["ProblemType"]["ComputeDataType"], addrCalc, vc0, data, gwvw, addrCVgpr, sgpr(tmpS01)))
@@ -1165,7 +1167,7 @@ def fixupBatch(self, writer, kernel, ss, batchIdx, edge, gwvw, \
         # AccVgpr read
         # if kernel.enabledSetPrioSplitLDS:
         #     kStr += inst("s_setprio", "0", "")
-        if codeAccVgprRead is not None:
+        if codeAccVgprRead is not None and kernel["LocalSplitU"] == 1:
             regsPerScalar = writer.states.bpeCinternal // writer.states.bpr # register per scalar
             # loop over store instructions within one batch
             for elementIdx in range(0, len(batchElements)):
diff --git a/tensilelite/Tensile/Tests/common/streamk/sk_hgemm_lsu.yaml b/tensilelite/Tensile/Tests/common/streamk/sk_hgemm_lsu.yaml
@@ -0,0 +1,139 @@
+TestParameters:
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
+
+GlobalParameters:
+  NumElementsToValidate: -1
+  BoundsCheck: False
+  KernelTime: False
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeA: 12
+  DataInitTypeB: 13
+  DataInitTypeC: 12
+  # DataInitTypeC: 1
+  # ValidationPrintValids: True
+  MaxWorkspaceSize: 134217728
+  # PrintSolutionRejectionReason: True
+  # ForceGenerateKernel: True
+  # GenerateSourcesAndExit: True
+  NumWarmups: 0
+  EnqueuesPerSync: 1
+  # NumBenchmarks: 10
+  SleepPercent: 50
+
+BenchmarkProblems:
+
+  - # HGEMM NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # HGEMM NT - Single wave
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - MatrixInstruction:
+          - [16,16,16,1, 1, 1,1, 1,1]
+          # - [16,16,16,1, 1, 2,1, 1,1]
+          # - [16,16,16,1, 1, 4,1, 1,1]
+          # - [16,16,16,1, 1, 1,2, 1,1]
+          # - [16,16,16,1, 1, 1,4, 1,1]
+          # - [16,16,16,1, 1, 2,2, 1,1]
+          # - [16,16,16,1, 1, 1,1, 2,1]
+          # - [16,16,16,1, 1, 1,1, 1,2]
+        - DepthU: [256]
+        - 1LDSBuffer: [-1]
+        - ClusterLocalRead: [True]
+        - ExpandPointerSwap: [0]
+        # - LocalReadVectorWidth: [4, 8]
+        # - NumElementsPerBatchStore: [0, 16]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        - StaggerU: [0]
+        - StreamK: [3]
+        - WorkGroupMappingXCC: [8]
+        - TransposeLDS: [-1]
+        - UseSgprForGRO: [0]
+        - WorkGroupMapping: [6]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          - Exact: [1024, 1024, 1, 1024]
+          - Exact: [1031, 1031, 1, 1031]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]
+
+    - # HGEMM NT - LSU
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - MatrixInstruction:
+          - [16,16,16,1, 1, 1,1, 1,1]
+          # - [16,16,16,1, 1, 1,1, 1,2]
+          # - [16,16,16,1, 1, 1,1, 2,1]
+          - [16,16,16,1, 1, 2,1, 1,1]
+          - [16,16,16,1, 1, 4,1, 1,1]
+          - [16,16,16,1, 1, 1,2, 1,1]
+          - [16,16,16,1, 1, 1,4, 1,1]
+          - [16,16,16,1, 1, 2,2, 1,1]
+          # - [16,16,16,1, 1, 1,1, 2,1]
+          # - [16,16,16,1, 1, 1,1, 1,2]
+          # - [16,16,16,1, 1, 8,8, 1,1]
+          # - [16,16,16,1, 1, 8,8, 2,1]
+          # - [16,16,16,1, 1, 8,8, 1,2]
+        - DepthU: [256]
+        - WorkGroup:
+          - [4,4,4]
+        - 1LDSBuffer: [-1]
+        - ClusterLocalRead: [True]
+        - ExpandPointerSwap: [0]
+        # - LocalReadVectorWidth: [4, 8]
+        # - NumElementsPerBatchStore: [0, 16]
+        - PrefetchGlobalRead: [2]
+        - PrefetchLocalRead: [1]
+        - ScheduleIterAlg: [3]
+        - SourceSwap: [True]
+        - StaggerU: [0]
+        - StreamK: [3]
+        - WorkGroupMappingXCC: [8]
+        - TransposeLDS: [-1]
+        - UseSgprForGRO: [0]
+        - WorkGroupMapping: [6]
+        - VectorWidthA: [1]
+        - VectorWidthB: [1]
+
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [512, 512, 1, 512]
+          - Exact: [1024, 1024, 1, 1024]
+          - Exact: [1031, 1031, 1, 1031]
+          # - Exact: [4096, 4096, 1, 1024]
+          # - Exact: [4103, 4096, 1, 1024]
+          # - Exact: [4096, 4103, 1, 1024]
+          # - Exact: [4096, 4096, 1, 1031]
+          - Exact: [4103, 4103, 1, 1031]