
Commit 031ddf2

Trevor Morris authored and Lokiiiiii committed
[CUDA][TOPI] Fix CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES with NMS for certain GPUs (apache#7623)
* Use less threads for certain GPUs to avoid register limit
* Move util function to nvcc.py
* Fix lint
1 parent 9a9282a commit 031ddf2

File tree

2 files changed: +50 −0 lines changed


python/tvm/contrib/nvcc.py (+41 lines)
@@ -216,6 +216,47 @@ def callback_libdevice_path(arch):
     return ""
 
 
+def get_target_compute_version(target=None):
+    """Utility function to get the compute capability of the compilation target.
+
+    Looks for the arch in three different places: first in the target attributes, then the
+    global scope, and finally the GPU device (if it exists).
+
+    Parameters
+    ----------
+    target : tvm.target.Target, optional
+        The compilation target
+
+    Returns
+    -------
+    compute_version : str
+        compute capability of a GPU (e.g. "8.0")
+    """
+    # 1. Target
+    if target:
+        if "arch" in target.attrs:
+            compute_version = target.attrs["arch"]
+            major, minor = compute_version.split("_")[1]
+            return major + "." + minor
+
+    # 2. Global scope
+    from tvm.autotvm.env import AutotvmGlobalScope  # pylint: disable=import-outside-toplevel
+
+    if AutotvmGlobalScope.current.cuda_target_arch:
+        major, minor = AutotvmGlobalScope.current.cuda_target_arch.split("_")[1]
+        return major + "." + minor
+
+    # 3. GPU
+    if tvm.gpu(0).exist:
+        return tvm.gpu(0).compute_version
+
+    warnings.warn(
+        "No CUDA architecture was specified or GPU detected. "
+        "Try specifying it by adding '-arch=sm_xx' to your target."
+    )
+    return None
+
+
 def parse_compute_version(compute_version):
     """Parse compute capability string to divide major and minor version
 
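For context, a minimal usage sketch of the new helper (the "cuda -arch=sm_80" target string is an illustrative assumption, not part of this commit). The helper checks the target's arch attribute first, then AutotvmGlobalScope, then the local GPU:

    import tvm
    from tvm.contrib import nvcc

    # The arch attribute "sm_80" encodes compute capability 8.0.
    target = tvm.target.Target("cuda -arch=sm_80")
    print(nvcc.get_target_compute_version(target))  # -> "8.0"

Note the unpacking trick in the helper: "sm_80".split("_")[1] yields the two-character string "80", which major, minor = ... unpacks into "8" and "0"; this relies on the major and minor versions each being a single digit.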

python/tvm/topi/cuda/nms.py (+9 lines)
@@ -19,6 +19,7 @@
 """Non-maximum suppression operator"""
 import tvm
 from tvm import te
+from tvm.contrib import nvcc
 from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
 from tvm.tir import if_then_else
 from .sort import argsort, argsort_thrust
@@ -493,6 +494,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     nthread_by = batch_size
     nthread_tx = max_threads
 
+    # Some CUDA architectures have a smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock
+    # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will
+    # be exceeded with 1024 threads.
+    target = tvm.target.Target.current(allow_none=False)
+    if target.kind.name == "cuda":
+        if nvcc.get_target_compute_version(target) in ["3.2", "5.3", "6.2"]:
+            nthread_tx = 512
+
     by = te.thread_axis("blockIdx.y")
     tx = te.thread_axis("threadIdx.x")
     ib.scope_attr(by, "thread_extent", nthread_by)
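Why 512 threads: a back-of-the-envelope register budget, using the roughly 35 registers per thread cited in the comment above (the exact count varies with GPU and compiler version, so treat this as a sketch, not a guarantee):

    # Per-block register limit on sm_32 / sm_53 / sm_62 parts (32K),
    # vs 64K on most other CUDA GPUs.
    limit = 32 * 1024
    regs_per_thread = 35  # approximate figure from the commit's comment
    for nthread_tx in (1024, 512):
        total = nthread_tx * regs_per_thread
        print(nthread_tx, total, "fits" if total <= limit else "exceeds limit")
    # 1024 threads -> 35840 registers, over the 32768 limit
    # 512 threads  -> 17920 registers, comfortably under it

Exceeding the per-block register budget is what surfaces as CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES at launch time, hence halving nthread_tx on those architectures.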
