Commit 540386b

[tuner] add use_direct_load (Global Load DMA) support to tuner
Signed-off-by: Bangtian Liu <liubangtian@gmail.com>
1 parent: a36ec8c

10 files changed: 439 additions & 32 deletions

amdsharktuner/amdsharktuner/candidate_gen.py

Lines changed: 4 additions & 1 deletion
@@ -95,8 +95,8 @@ def generate_solutions(
     num_subgroups: int = 4,  # GPU spec, used to determine candidate generation constraints.
     allowed_waves_per_eu: list[int] = [2],
     allowed_denorm_flushing: list[bool] = [False],
+    allowed_use_direct_load: list[bool] = [False],
     pipeline_options_search_space: rocm_dispatch_constraints.PipelineOptionsSearchSpace = rocm_dispatch_constraints.PipelineOptionsSearchSpace(),
-    codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute,
     conv_strategy: rocm_common.ConvolutionStrategy = rocm_common.ConvolutionStrategy.igemm
     | rocm_common.ConvolutionStrategy.direct,
 ) -> Iterator[list[common.TuningConfiguration]]:
@@ -112,6 +112,8 @@ def generate_solutions(
         target_info,
         num_subgroups=num_subgroups,
         allowed_waves_per_eu=allowed_waves_per_eu,
+        allowed_denorm_flushing=allowed_denorm_flushing,
+        allowed_use_direct_load=allowed_use_direct_load,
         pipeline_options_search_space=pipeline_options_search_space,
         conv_strategy=conv_strategy,
     )
@@ -122,6 +124,7 @@ def generate_solutions(
         num_subgroups=num_subgroups,
         allowed_waves_per_eu=allowed_waves_per_eu,
         allowed_denorm_flushing=allowed_denorm_flushing,
+        allowed_use_direct_load=allowed_use_direct_load,
         pipeline_options_search_space=pipeline_options_search_space,
     )
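Each allowed_* list above is one sweep axis: candidate generation enumerates combinations of the allowed values. A runnable toy showing the fan-out the new axis adds (illustrative only, not the tuner's actual enumeration code; the axis values are hypothetical):

    from itertools import product

    # Each allowed_* parameter of generate_solutions is one sweep axis.
    allowed_waves_per_eu = [1, 2]
    allowed_denorm_flushing = [False]
    allowed_use_direct_load = [False, True]  # the new axis added by this commit

    for waves, denorm, dma in product(
        allowed_waves_per_eu, allowed_denorm_flushing, allowed_use_direct_load
    ):
        print(f"waves_per_eu={waves} denorm_flushing={denorm} use_direct_load={dma}")
    # 4 combinations: sweeping use_direct_load doubles the candidate space.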

amdsharktuner/amdsharktuner/common.py

Lines changed: 18 additions & 0 deletions
@@ -337,6 +337,24 @@ def get_lowering_config(
                 assert (
                     False
                 ), f"Unsupported type for key '{key}': {type(value).__name__}"
+            case "promotion_types":
+                # Handle list of Attribute objects for use_direct_load.
+                if isinstance(value, Sequence):
+                    # Validate length matches promote_operands if present.
+                    if "promote_operands" in lowering_config_dict:
+                        promote_ops = lowering_config_dict["promote_operands"]
+                        if hasattr(promote_ops, "__len__") and len(value) != len(
+                            promote_ops
+                        ):
+                            assert False, (
+                                f"promotion_types length ({len(value)}) must match "
+                                f"promote_operands length ({len(promote_ops)})"
+                            )
+                    promoted_value = ir.ArrayAttr.get(list(value))
+                elif not isinstance(value, ir.ArrayAttr):
+                    assert (
+                        False
+                    ), f"Unsupported type for key '{key}': {type(value).__name__}"
             case _:
                 assert False, f"Unhandled key in lowering configuration: {key}"
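The new case enforces one promotion-type attribute per promoted operand. A standalone restatement of that length check, with plain strings standing in for MLIR ir.Attribute values (an assumption made for illustration):

    def check_promotion_types(promote_operands: list[int], promotion_types: list[str]) -> None:
        # Each entry in promote_operands must have a matching promotion type.
        assert len(promotion_types) == len(promote_operands), (
            f"promotion_types length ({len(promotion_types)}) must match "
            f"promote_operands length ({len(promote_operands)})"
        )

    check_promotion_types([0, 1], ["#iree_gpu.use_global_load_dma"] * 2)  # passes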

amdsharktuner/amdsharktuner/libtuner.py

Lines changed: 11 additions & 1 deletion
@@ -422,6 +422,16 @@ def parse_arguments(
         "denormals to zero. Only applicable to attention ops. "
         "Possible values: [True, False]",
     )
+    candidate_gen_args.add_argument(
+        "--use-direct-load-options",
+        type=lambda t: [s.strip().lower() == "true" for s in t.split(",")],
+        default=[False],
+        help="Comma-separated list of allowed values for use_direct_load. "
+        "When True, enables Global Load DMA mode for matmul operand loading. "
+        "Only supported on gfx950+ GPUs. Automatically sets "
+        "no_reduce_shared_memory_bank_conflicts=true. "
+        "Possible values: [True, False]. Default: [False].",
+    )
     candidate_gen_args.add_argument(
         "--codegen-pipeline",
         choices=[x.value for x in CodegenPipelines],
@@ -839,8 +849,8 @@ def generate_candidate_specs(
         num_subgroups=args.num_subgroups,
         allowed_waves_per_eu=args.waves_per_eu_options,
         allowed_denorm_flushing=allowed_denorm_flushing,
+        allowed_use_direct_load=args.use_direct_load_options,
         pipeline_options_search_space=pipeline_options_search_space,
-        codegen_pipeline=get_iree_codegen_pipeline(args.codegen_pipeline),
         conv_strategy=conv_strategy,
     )
     if args.enable_random_seed:
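The type= lambda converts the comma-separated flag value into a list of booleans; anything other than a case-insensitive "true" becomes False. A standalone run of the same parser (the lambda is copied from the diff; the sample inputs are hypothetical):

    parse_bool_list = lambda t: [s.strip().lower() == "true" for s in t.split(",")]

    print(parse_bool_list("false,true"))  # [False, True] -> tune both modes
    print(parse_bool_list(" True "))      # [True] (whitespace and case are ignored)
    print(parse_bool_list("yes"))         # [False] -- only "true" maps to True

So a run with --use-direct-load-options=false,true asks the tuner to generate candidates both with and without Global Load DMA.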

amdsharktuner/amdsharktuner/rocm/rocm_common.py

Lines changed: 70 additions & 0 deletions
@@ -25,6 +25,36 @@
 # List of tested ROCm architectures.
 ROCM_ARCHITECTURES = ["gfx942", "gfx950", "gfx1100", "gfx1201"]
 
+tune_logger = logging.getLogger("tune")
+
+
+def supports_global_load_dma(arch: str) -> bool:
+    """Check if architecture supports Global Load DMA (gfx950+).
+
+    CDNA4 is gfx950+ (majorVersion == 9 && minorVersion >= 5).
+    """
+    if not arch.startswith("gfx"):
+        return False
+    try:
+        version = int(arch[3:])
+        major = version // 100
+        minor = (version % 100) // 10
+        return major == 9 and minor >= 5
+    except ValueError:
+        return False
+
+
+def get_use_global_load_dma_attr() -> ir.Attribute:
+    """Get the UseGlobalLoadDMAAttr for direct load promotion."""
+    # TODO(Bangtian): Expose Python binding for iree_gpu.UseGlobalLoadDMAAttr instead of parsing string.
+    return ir.Attribute.parse("#iree_gpu.use_global_load_dma")
+
+
+def get_promotion_types_for_direct_load(num_operands: int) -> list[ir.Attribute]:
+    """Get promotion_types array for direct load (all operands use DMA)."""
+    dma_attr = get_use_global_load_dma_attr()
+    return [dma_attr] * num_operands
+
 
 class ConvolutionStrategy(IntFlag):
     """ROCm convolution lowering strategy for TileAndFuse pipeline."""
@@ -33,6 +63,46 @@ class ConvolutionStrategy(IntFlag):
     direct = 2
 
 
+def filter_use_direct_load(
+    allowed_use_direct_load: list[bool],
+    codegen_pipeline: "iree_codegen.DispatchLoweringPassPipeline",
+    arch: str,
+    conv_strategy: ConvolutionStrategy,
+) -> list[bool]:
+    """Filter use_direct_load options for unsupported configurations.
+
+    Returns filtered list with use_direct_load=True removed if unsupported.
+    Logs warnings explaining why filtering occurred.
+    """
+    from iree.compiler.dialects import iree_codegen  # type: ignore
+
+    if not any(opt is True for opt in allowed_use_direct_load):
+        return allowed_use_direct_load
+
+    if codegen_pipeline != iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse:
+        tune_logger.warning(
+            f"use_direct_load is only supported with TileAndFuse pipeline. "
+            f"Current pipeline: {codegen_pipeline}. Disabling use_direct_load."
+        )
+        return [False]
+
+    if not supports_global_load_dma(arch):
+        tune_logger.warning(
+            f"use_direct_load is only supported on gfx950+ architectures. "
+            f"Current architecture: {arch}. Disabling use_direct_load."
+        )
+        return [False]
+
+    if conv_strategy == ConvolutionStrategy.direct:
+        tune_logger.warning(
+            "use_direct_load is not supported for direct convolution strategy. "
+            "Disabling use_direct_load."
+        )
+        return [False]
+
+    return allowed_use_direct_load
+
+
 @dataclass
 class ConvToIgemmInfo:
     """

amdsharktuner/amdsharktuner/rocm/rocm_constraint_generators.py

Lines changed: 42 additions & 5 deletions
@@ -118,6 +118,17 @@ def generate_solutions(
         gpu_target_info: iree_gpu.TargetInfo,
         **pipeline_constraint_options,
     ) -> Iterator[list[common.TuningConfiguration]]:
+        # Filter use_direct_load for unsupported configurations.
+        codegen_pipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse
+        pipeline_constraint_options[
+            "allowed_use_direct_load"
+        ] = rocm_common.filter_use_direct_load(
+            pipeline_constraint_options.get("allowed_use_direct_load", [False]),
+            codegen_pipeline,
+            gpu_target_info.arch,
+            rocm_common.ConvolutionStrategy.igemm,  # Contraction uses IGEMM-like path.
+        )
+
         return rocm_solutions.generate_generic_contraction_solutions(
             tuner_ctx=tuner_context,
             gpu_target_info=gpu_target_info,
@@ -128,7 +139,7 @@ def generate_solutions(
             res_type=self.op_info.res_type,
             dispatch_kind=common.DispatchKind.contraction,
             indexing_maps=self.op_info.indexing_maps,
-            codegen_pipeline=iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse,
+            codegen_pipeline=codegen_pipeline,
             **pipeline_constraint_options,
         )
@@ -164,11 +175,25 @@ def generate_solutions(
             self.op_info.convolution_dims is not None
         ), "convolution_dims must be set for convolution operations"
 
+        codegen_pipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse
+
         # Generate IGEMM candidates.
         if conv_strategy & rocm_common.ConvolutionStrategy.igemm:
             tuner_context.logger.info(
                 "Generating convolution candidates using IGEMM strategy"
             )
+
+            # Filter use_direct_load for IGEMM strategy.
+            igemm_options = pipeline_constraint_options.copy()
+            igemm_options[
+                "allowed_use_direct_load"
+            ] = rocm_common.filter_use_direct_load(
+                igemm_options.get("allowed_use_direct_load", [False]),
+                codegen_pipeline,
+                gpu_target_info.arch,
+                rocm_common.ConvolutionStrategy.igemm,
+            )
+
             yield from rocm_solutions.generate_generic_contraction_solutions(
                 tuner_ctx=tuner_context,
                 gpu_target_info=gpu_target_info,
@@ -179,11 +204,11 @@ def generate_solutions(
                 res_type=self.op_info.res_type,
                 dispatch_kind=common.DispatchKind.conv,
                 indexing_maps=self.op_info.indexing_maps,
-                codegen_pipeline=iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse,
+                codegen_pipeline=codegen_pipeline,
                 igemm_details=self.op_info.igemm_details,
                 conv_to_igemm_info=self.op_info.conv_to_igemm_info,
                 convolution_dims=self.op_info.convolution_dims,
-                **pipeline_constraint_options,
+                **igemm_options,
             )
 
         # Generate direct convolution candidates if supported.
@@ -192,6 +217,18 @@ def generate_solutions(
             tuner_context.logger.info(
                 "Generating convolution candidates using direct strategy"
             )
+
+            # Filter use_direct_load for direct conv strategy.
+            direct_options = pipeline_constraint_options.copy()
+            direct_options[
+                "allowed_use_direct_load"
+            ] = rocm_common.filter_use_direct_load(
+                direct_options.get("allowed_use_direct_load", [False]),
+                codegen_pipeline,
+                gpu_target_info.arch,
+                rocm_common.ConvolutionStrategy.direct,
+            )
+
             direct_dims, direct_sizes = self._compute_direct_conv_dimensions()
             # Pass filter loop info so solution generator can add them with tile size 1.
             direct_conv_info: rocm_solutions.DirectConvInfo = {
@@ -210,11 +247,11 @@ def generate_solutions(
                 res_type=self.op_info.res_type,
                 dispatch_kind=common.DispatchKind.conv,
                 indexing_maps=self.op_info.indexing_maps,
-                codegen_pipeline=iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse,
+                codegen_pipeline=codegen_pipeline,
                 igemm_details=None,
                 conv_to_igemm_info=None,
                 direct_conv_info=direct_conv_info,
-                **pipeline_constraint_options,
+                **direct_options,
             )
 
     def _supports_direct_convolution(self, tuner_context: common.TunerContext) -> bool:
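Note the .copy() before each filter call: the IGEMM and direct paths share one pipeline_constraint_options dict, so filtering in place for the direct strategy would also disable DMA for the IGEMM strategy. A minimal sketch of the pattern (filter_stub stands in for rocm_common.filter_use_direct_load):

    def filter_stub(allowed: list[bool], is_direct_conv: bool) -> list[bool]:
        # Direct convolution never gets use_direct_load=True.
        return [False] if is_direct_conv else allowed

    shared_options = {"allowed_use_direct_load": [False, True]}

    igemm_options = shared_options.copy()
    igemm_options["allowed_use_direct_load"] = filter_stub(
        igemm_options["allowed_use_direct_load"], is_direct_conv=False
    )

    direct_options = shared_options.copy()
    direct_options["allowed_use_direct_load"] = filter_stub(
        direct_options["allowed_use_direct_load"], is_direct_conv=True
    )

    print(igemm_options["allowed_use_direct_load"])   # [False, True]
    print(direct_options["allowed_use_direct_load"])  # [False]
    print(shared_options["allowed_use_direct_load"])  # [False, True] -- untouched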

amdsharktuner/amdsharktuner/rocm/rocm_dispatch_constraints.py

Lines changed: 47 additions & 25 deletions
@@ -757,34 +757,56 @@ def generate_tile_and_fuse_compilation_infos(
     padding: Optional[list[int]] = None,
     padding_conv: Optional[list[int]] = None,
     allowed_denorm_flushing: list[bool] = [False],
+    allowed_use_direct_load: list[bool] = [False],
 ) -> list[iree_codegen.CompilationInfoAttr]:
     """Generate compilation infos for LLVMGPUTileAndFuse pipeline."""
-    lowering_config_args = {
-        "workgroup": workgroup_tile_sizes,
-        "reduction": reduction_tile_sizes,
-        "subgroup": subgroup_tile_sizes,
-        "promote_operands": promote_operands,
-    }
-
-    if mma_attr is not None:
-        lowering_config_args["mma_kind"] = mma_attr
-
-    if padding is not None:
-        lowering_config_args["padding"] = padding
-
-    if padding_conv is not None:
-        lowering_config_args["padding_conv"] = padding_conv
+    all_compilation_infos: list[iree_codegen.CompilationInfoAttr] = []
+
+    for use_direct_load in allowed_use_direct_load:
+        lowering_config_args = {
+            "workgroup": workgroup_tile_sizes,
+            "reduction": reduction_tile_sizes,
+            "subgroup": subgroup_tile_sizes,
+            "promote_operands": promote_operands,
+        }
+
+        # Add promotion_types when use_direct_load is enabled.
+        if use_direct_load:
+            # Defensive check: direct convolution should not reach here with use_direct_load=True.
+            is_direct_conv = (
+                pipeline_options_search_space.use_igemm_convolution is not None
+                and pipeline_options_search_space.use_igemm_convolution == [False]
+            )
+            assert not is_direct_conv, (
+                "use_direct_load=True is not supported for direct convolution. "
+                "This should have been filtered in ROCmConvolutionTileAndFuseConstraintGenerator."
+            )
+            lowering_config_args[
+                "promotion_types"
+            ] = rocm_common.get_promotion_types_for_direct_load(len(promote_operands))
+
+        if mma_attr is not None:
+            lowering_config_args["mma_kind"] = mma_attr
+
+        if padding is not None:
+            lowering_config_args["padding"] = padding
+
+        if padding_conv is not None:
+            lowering_config_args["padding_conv"] = padding_conv
+
+        compilation_infos = _build_compilation_infos(
+            tuner_ctx,
+            lowering_config_args,
+            workgroup_sizes,
+            subgroup_size,
+            iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse,
+            pipeline_options_search_space,
+            allowed_waves_per_eu,
+            allowed_denorm_flushing,
+        )
+        all_compilation_infos.extend(compilation_infos)
 
-    return _build_compilation_infos(
-        tuner_ctx,
-        lowering_config_args,
-        workgroup_sizes,
-        subgroup_size,
-        iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse,
-        pipeline_options_search_space,
-        allowed_waves_per_eu,
-        allowed_denorm_flushing,
-    )
+    return all_compilation_infos
 
 
 def generate_vector_distribute_compilation_infos(
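The new loop emits one batch of compilation infos per use_direct_load mode, and only the DMA-enabled batch carries promotion_types. A condensed sketch of the per-iteration argument assembly (plain strings stand in for the MLIR attributes built by rocm_common.get_promotion_types_for_direct_load):

    def build_lowering_args(use_direct_load: bool, promote_operands: list[int]) -> dict:
        args: dict = {"promote_operands": promote_operands}
        if use_direct_load:
            # One DMA promotion type per promoted operand.
            args["promotion_types"] = ["#iree_gpu.use_global_load_dma"] * len(promote_operands)
        return args

    for mode in [False, True]:
        print(mode, build_lowering_args(mode, promote_operands=[0, 1]))
    # False {'promote_operands': [0, 1]}
    # True  {'promote_operands': [0, 1], 'promotion_types': [...]}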

amdsharktuner/amdsharktuner/rocm/rocm_solutions.py

Lines changed: 2 additions & 0 deletions
@@ -125,6 +125,7 @@ def generate_generic_contraction_solutions(
     num_subgroups: int = 4,
     allowed_waves_per_eu: list[int] = [2],
     allowed_denorm_flushing: list[bool] = [False],
+    allowed_use_direct_load: list[bool] = [False],
     pipeline_options_search_space: rocm_dispatch_constraints.PipelineOptionsSearchSpace = rocm_dispatch_constraints.PipelineOptionsSearchSpace(),
     igemm_details: Optional[iree_codegen.IGEMMGenericConvDetails] = None,
     conv_to_igemm_info: Optional[rocm_common.ConvToIgemmInfo] = None,
@@ -378,6 +379,7 @@ def set_cdim_tile_sizes(tile_sizes, contraction_dims, csizes):
                     padding=padding,
                     padding_conv=padding_conv,
                     allowed_denorm_flushing=allowed_denorm_flushing,
+                    allowed_use_direct_load=allowed_use_direct_load,
                 )
             )
         case iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute:
