
Commit 031ddf2

Trevor Morris authored and Lokiiiiii committed
[CUDA][TOPI] Fix CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES with NMS for certain GPUs (apache#7623)
* Use less threads for certain GPUs to avoid register limit
* Move util function to nvcc.py
* Fix lint
1 parent 9a9282a commit 031ddf2

File tree

2 files changed: +50 −0 lines changed


python/tvm/contrib/nvcc.py (+41 lines)
@@ -216,6 +216,47 @@ def callback_libdevice_path(arch):
     return ""
 
 
+def get_target_compute_version(target=None):
+    """Utility function to get the compute capability of the compilation target.
+
+    Looks for the arch in three different places: first in the target attributes, then the
+    global scope, and finally the GPU device (if it exists).
+
+    Parameters
+    ----------
+    target : tvm.target.Target, optional
+        The compilation target
+
+    Returns
+    -------
+    compute_version : str
+        compute capability of a GPU (e.g. "8.0")
+    """
+    # 1. Target
+    if target:
+        if "arch" in target.attrs:
+            compute_version = target.attrs["arch"]
+            major, minor = compute_version.split("_")[1]
+            return major + "." + minor
+
+    # 2. Global scope
+    from tvm.autotvm.env import AutotvmGlobalScope  # pylint: disable=import-outside-toplevel
+
+    if AutotvmGlobalScope.current.cuda_target_arch:
+        major, minor = AutotvmGlobalScope.current.cuda_target_arch.split("_")[1]
+        return major + "." + minor
+
+    # 3. GPU
+    if tvm.gpu(0).exist:
+        return tvm.gpu(0).compute_version
+
+    warnings.warn(
+        "No CUDA architecture was specified or GPU detected. "
+        "Try specifying it by adding '-arch=sm_xx' to your target."
+    )
+    return None
+
+
 def parse_compute_version(compute_version):
     """Parse compute capability string to divide major and minor version
 
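For context, a minimal usage sketch of the new helper (the "cuda -arch=sm_80" target string is an illustrative assumption, not part of this commit). The helper checks the target's arch attribute first, then AutotvmGlobalScope, then the local GPU:

    import tvm
    from tvm.contrib import nvcc

    # The arch attribute "sm_80" encodes compute capability 8.0.
    target = tvm.target.Target("cuda -arch=sm_80")
    print(nvcc.get_target_compute_version(target))  # -> "8.0"

Note the unpacking trick in the helper: "sm_80".split("_")[1] yields the two-character string "80", which major, minor = ... unpacks into "8" and "0"; this relies on the major and minor versions each being a single digit.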

python/tvm/topi/cuda/nms.py (+9 lines)
@@ -19,6 +19,7 @@
 """Non-maximum suppression operator"""
 import tvm
 from tvm import te
+from tvm.contrib import nvcc
 from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
 from tvm.tir import if_then_else
 from .sort import argsort, argsort_thrust
@@ -493,6 +494,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     nthread_by = batch_size
     nthread_tx = max_threads
 
+    # Some CUDA architectures have a smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock
+    # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will
+    # be exceeded with 1024 threads.
+    target = tvm.target.Target.current(allow_none=False)
+    if target.kind.name == "cuda":
+        if nvcc.get_target_compute_version(target) in ["3.2", "5.3", "6.2"]:
+            nthread_tx = 512
+
     by = te.thread_axis("blockIdx.y")
     tx = te.thread_axis("threadIdx.x")
     ib.scope_attr(by, "thread_extent", nthread_by)
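Why 512 threads: a back-of-the-envelope register budget, using the roughly 35 registers per thread cited in the comment above (the exact count varies with GPU and compiler version, so treat this as a sketch, not a guarantee):

    # Per-block register limit on sm_32 / sm_53 / sm_62 parts (32K),
    # vs 64K on most other CUDA GPUs.
    limit = 32 * 1024
    regs_per_thread = 35  # approximate figure from the commit's comment
    for nthread_tx in (1024, 512):
        total = nthread_tx * regs_per_thread
        print(nthread_tx, total, "fits" if total <= limit else "exceeds limit")
    # 1024 threads -> 35840 registers, over the 32768 limit
    # 512 threads  -> 17920 registers, comfortably under it

Exceeding the per-block register budget is what surfaces as CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES at launch time, hence halving nthread_tx on those architectures.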
