Skip to content

Commit 7caa61f

Browse files
jiawenliu64 authored and facebook-github-bot committed
Optimize cudaGetDeviceProperties runtime overhead (#4209)
Summary: Pull Request resolved: #4209 X-link: facebookresearch/FBGEMM#1284 Further optimize FP8 kernels runtime overhead with `cudaGetDeviceProperties` by only triggering it once Before this Diff: [Trace](https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree%2Ftraces%2Fdynocli%2F0%2F1748487716%2Flocalhost%2Flibkineto_activities_3431969.json.gz&bucket=gpu_traces) After this Diff: [Trace](https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree%2Ftraces%2Fdynocli%2F0%2F1748488054%2Flocalhost%2Flibkineto_activities_3821152.json.gz&bucket=gpu_traces) Differential Revision: D75574880
1 parent 10bf7c1 commit 7caa61f

File tree

3 files changed

+48
-21
lines changed

3 files changed

+48
-21
lines changed

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise.cu

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include <ATen/ATen.h>
1010
#include <ATen/cuda/CUDAContext.h>
11+
#include <c10/cuda/CUDAGuard.h>
1112
// clang-format on
1213

1314
#include "f8f8bf16_rowwise/f8f8bf16_rowwise_manifest.cuh"
@@ -28,12 +29,21 @@ at::Tensor dispatch_fp8_rowwise_kernel(
2829
int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
2930
int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
3031
int K = XQ.size(-1);
31-
32-
int arch = 9;
33-
cudaDeviceProp prop;
34-
cudaGetDeviceProperties(&prop, 0);
35-
if (prop.major >= 10) {
36-
arch = 10;
32+
static int arch = -1;
33+
// Avoid expensive cudaGetDeviceProperties call.
34+
if (arch < 0) {
35+
cudaDeviceProp prop;
36+
cudaGetDeviceProperties(&prop, 0);
37+
if (prop.major >= 10) {
38+
arch = 10;
39+
int runtimeVersion;
40+
C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
41+
TORCH_CHECK(
42+
runtimeVersion >= 12080,
43+
"FP8 GEMM on sm100a or above requires cuda >= 12.8");
44+
} else {
45+
arch = 9;
46+
}
3747
}
3848

3949
// Use shape heuristics to dispatch to optimized kernel configuration.

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched.cu

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <c10/cuda/CUDAGuard.h>
910
#include <cute/tensor.hpp>
1011
#include "f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_manifest.cuh"
1112

@@ -29,11 +30,21 @@ at::Tensor dispatch_fp8_rowwise_batched_kernel(
2930
bool use_fast_accum = true,
3031
std::optional<at::Tensor> bias = std::nullopt,
3132
std::optional<at::Tensor> output = std::nullopt) {
32-
int arch = 9;
33-
cudaDeviceProp prop;
34-
cudaGetDeviceProperties(&prop, 0);
35-
if (prop.major >= 10) {
36-
arch = 10;
33+
static int arch = -1;
34+
// Avoid expensive cudaGetDeviceProperties call.
35+
if (arch < 0) {
36+
cudaDeviceProp prop;
37+
cudaGetDeviceProperties(&prop, 0);
38+
if (prop.major >= 10) {
39+
arch = 10;
40+
int runtimeVersion;
41+
C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
42+
TORCH_CHECK(
43+
runtimeVersion >= 12080,
44+
"FP8 batched GEMM on sm100a or above requires cuda >= 12.8");
45+
} else {
46+
arch = 9;
47+
}
3748
}
3849

3950
TORCH_CHECK(

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_grouped.cu

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include <ATen/ATen.h>
1010
#include <ATen/cuda/CUDAContext.h>
11+
#include <c10/cuda/CUDAGuard.h>
1112
// clang-format on
1213

1314
#include "f8f8bf16_rowwise_grouped/f8f8bf16_rowwise_grouped_manifest.cuh"
@@ -30,16 +31,21 @@ at::Tensor dispatch_fp8_grouped_kernel(
3031
at::Tensor output,
3132
std::optional<at::Tensor> zero_start_index_M = std::nullopt,
3233
std::optional<at::Tensor> M_sizes = std::nullopt) {
33-
int arch = 9;
34-
cudaDeviceProp prop;
35-
cudaGetDeviceProperties(&prop, 0);
36-
if (prop.major >= 10) {
37-
arch = 10;
38-
int runtimeVersion;
39-
cudaRuntimeGetVersion(&runtimeVersion);
40-
TORCH_CHECK(
41-
runtimeVersion >= 12080,
42-
"FP8 grouped GEMM on blackwell sm100a requires cuda >= 12.8");
34+
static int arch = -1;
35+
// Avoid expensive cudaGetDeviceProperties call.
36+
if (arch < 0) {
37+
cudaDeviceProp prop;
38+
cudaGetDeviceProperties(&prop, 0);
39+
if (prop.major >= 10) {
40+
arch = 10;
41+
int runtimeVersion;
42+
C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
43+
TORCH_CHECK(
44+
runtimeVersion >= 12080,
45+
"FP8 grouped GEMM on sm100a or above requires cuda >= 12.8");
46+
} else {
47+
arch = 9;
48+
}
4349
}
4450

4551
// Use heuristics to pick the best kernel implementation.

0 commit comments

Comments (0)