1 change: 0 additions & 1 deletion .github/workflows/_linux-test-mi350.yml
@@ -20,7 +20,6 @@ jobs:
      SETUP_SCRIPT: "/workspace/setup_instance.sh"
      CONDA_ENV: ${{ inputs.conda_env }}
      DOCKER_IMAGE: "ghcr.io/meta-pytorch/tritonbench:rocm-latest"
-      TRITON_HIP_USE_ASYNC_COPY: "0"
    steps:
      - name: Checkout Tritonbench
        uses: actions/checkout@v3
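
The deleted TRITON_HIP_USE_ASYNC_COPY: "0" line force-disabled HIP async copy for the entire CI job. With the override removed, Triton's own default applies, and the Python changes below query that default at run time via triton.knobs. A minimal sketch of the check the new helpers perform (knobs.amd.use_async_copy is the knob used throughout this PR; reading None as "enabled" mirrors the helper bodies added below):

from triton import knobs

# Tri-state knob: True/False when set explicitly, None when left at the backend default.
setting = knobs.amd.use_async_copy
hip_async_copy_enabled = True if setting is None else setting
# The tritonbench helpers additionally short-circuit to False on CUDA backends.
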
17 changes: 16 additions & 1 deletion tritonbench/kernels/triton_fused_attention.py
@@ -17,6 +17,7 @@
import torch
import triton
import triton.language as tl
+from triton import knobs

from .attention_utils import (
    HAS_EXPLICIT_WS,  # guard new tuning configs such as num_consumer_groups
@@ -28,6 +29,20 @@
)


+def is_cuda():
+    return triton.runtime.driver.active.get_current_target().backend == "cuda"
+
+
+def is_hip_async_copy_enabled():
+    if is_cuda():
+        return False
+
+    # default is enabled
+    if knobs.amd.use_async_copy is None:
+        return True
+    return knobs.amd.use_async_copy
+
+
if HAS_TMA_DESC:
    print(
        "TMA benchmarks will be running with experimental grid constant TMA descriptor.",
@@ -481,7 +496,7 @@ def get_fwd_config_space(
    bmList = [128] if enable_ws else [64, 128]
    bnList = [64, 128]  # To handle hDim of 64, we need BLOCK_N to be <= 64
    wList = [4] if enable_ws else [4, 8]
-    stageList = [2] if enable_ws else [3, 4, 7]
+    stageList = [2] if enable_ws else [3] if is_hip_async_copy_enabled() else [3, 4, 7]
    for BM in bmList:
        for BN in bnList:
            for sched in schedList:  # set in global scope
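
The new stageList expression chains two conditionals, which associate to the right: warp-specialized runs still get [2], and the async-copy check only narrows the remaining sweep from [3, 4, 7] to [3]. A small sketch of the resulting truth table (stage_list is a hypothetical stand-in for the inline expression):

def stage_list(enable_ws: bool, hip_async_copy: bool) -> list:
    # Equivalent to: [2] if enable_ws else ([3] if hip_async_copy else [3, 4, 7])
    return [2] if enable_ws else [3] if hip_async_copy else [3, 4, 7]

assert stage_list(True, True) == [2]          # warp specialization always pins 2 stages
assert stage_list(True, False) == [2]
assert stage_list(False, True) == [3]         # async copy on: only 3 stages is tried
assert stage_list(False, False) == [3, 4, 7]  # async copy off: original sweep
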
17 changes: 14 additions & 3 deletions tritonbench/operators/fp8_gemm/persistent.py
@@ -5,6 +5,7 @@
import triton
import triton.language as tl
from torch._inductor.kernel.mm import ScalingType
+from triton import knobs
from tritonbench.utils.env_utils import is_cuda
from tritonbench.utils.triton_utils import has_experimental_descriptor

@@ -24,6 +25,16 @@
    pass


+def is_hip_async_copy_enabled():
+    if is_cuda():
+        return False
+
+    # default is enabled
+    if knobs.amd.use_async_copy is None:
+        return True
+    return knobs.amd.use_async_copy
+
+
def _matmul_launch_metadata(grid, kernel, args):
    ret = {}
    M, N, K = args["M"], args["N"], args["K"]
@@ -135,23 +146,23 @@ def matmul_persistent(a, b):
            "BLOCK_SIZE_N": 256,
            "BLOCK_SIZE_K": 128,
            "GROUP_SIZE_M": 8,
-            "num_stages": 4,
+            "num_stages": 3 if is_hip_async_copy_enabled() else 4,
            "num_warps": 8,
        },
        torch.float16: {
            "BLOCK_SIZE_M": 128,
            "BLOCK_SIZE_N": 256,
            "BLOCK_SIZE_K": 64,
            "GROUP_SIZE_M": 8,
-            "num_stages": 3,
+            "num_stages": 2 if is_hip_async_copy_enabled() else 3,
            "num_warps": 8,
        },
        torch.bfloat16: {
            "BLOCK_SIZE_M": 128,
            "BLOCK_SIZE_N": 256,
            "BLOCK_SIZE_K": 64,
            "GROUP_SIZE_M": 8,
-            "num_stages": 3,
+            "num_stages": 2 if is_hip_async_copy_enabled() else 3,
            "num_warps": 8,
        },
    }
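
Each per-dtype config in matmul_persistent drops one pipeline stage when HIP async copy is enabled (4 to 3 for the fp8 entry, 3 to 2 for fp16 and bf16). A minimal sketch of that pattern, with pick_num_stages as a hypothetical helper (the diff writes each value out explicitly rather than deriving it):

def pick_num_stages(baseline: int, hip_async_copy: bool) -> int:
    # One fewer software-pipelining stage when HIP async copy is in play.
    return baseline - 1 if hip_async_copy else baseline

assert pick_num_stages(4, True) == 3   # fp8 entry (dtype key sits above the visible hunk)
assert pick_num_stages(3, True) == 2   # float16 / bfloat16 entries
assert pick_num_stages(4, False) == 4  # unchanged when async copy is off
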
16 changes: 13 additions & 3 deletions tritonbench/operators/grouped_gemm/kernels.py
@@ -32,6 +32,7 @@
import torch
import triton
import triton.language as tl
+from triton import knobs

try:
    # @manual=//triton:triton
@@ -47,9 +48,17 @@ def is_cuda():


def num_sms():
+    return torch.cuda.get_device_properties("cuda").multi_processor_count
+
+
+def is_hip_async_copy_enabled():
    if is_cuda():
-        return torch.cuda.get_device_properties("cuda").multi_processor_count
-    return 148
+        return False
+
+    # default is enabled
+    if knobs.amd.use_async_copy is None:
+        return True
+    return knobs.amd.use_async_copy


def torch_dtype_to_triton_dtype(dtype):
@@ -73,7 +82,8 @@ def torch_dtype_to_triton_dtype(dtype):
                "BLOCK_SIZE_N": BLOCK_N,
                "BLOCK_SIZE_K": BLOCK_K,
                "NUM_SMS": num_sms(),
-            }
+            },
+            num_stages=2 if is_hip_async_copy_enabled() else 3,
        )
        for BLOCK_M, BLOCK_N, BLOCK_K in itertools.product([128, 256], repeat=3)
    ],
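
With num_stages now passed explicitly, every autotune candidate for the grouped GEMM carries its stage count alongside the block-size dictionary. A minimal sketch of how that candidate list expands (names mirror the diff; NUM_SMS is left out so the sketch runs without a GPU, and the fixed num_stages=2 stands in for the is_hip_async_copy_enabled() branch):

import itertools

import triton

# Eight block-size combinations, each pinned to 2 pipeline stages
# (the value the diff uses when HIP async copy is enabled).
configs = [
    triton.Config(
        {"BLOCK_SIZE_M": bm, "BLOCK_SIZE_N": bn, "BLOCK_SIZE_K": bk},
        num_stages=2,
    )
    for bm, bn, bk in itertools.product([128, 256], repeat=3)
]
assert len(configs) == 8
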