Commit eeb6898

jiawenliu64 authored and facebook-github-bot committed
Optimize cudaGetDeviceProperties runtime overhead
Summary:
X-link: facebookresearch/FBGEMM#1284

Further reduce the runtime overhead of the FP8 kernels by calling `cudaGetDeviceProperties` only once and caching the result, instead of querying the device on every dispatch.

Before this Diff: [Trace](https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree%2Ftraces%2Fdynocli%2F0%2F1748487716%2Flocalhost%2Flibkineto_activities_3431969.json.gz&bucket=gpu_traces)

After this Diff: [Trace](https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree%2Ftraces%2Fdynocli%2F0%2F1748488054%2Flocalhost%2Flibkineto_activities_3821152.json.gz&bucket=gpu_traces)

Differential Revision: D75574880
1 parent 10bf7c1 · commit eeb6898

File tree

3 files changed: +45, -21 lines changed


fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise.cu

Lines changed: 15 additions & 6 deletions
@@ -28,12 +28,21 @@ at::Tensor dispatch_fp8_rowwise_kernel(
   int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
   int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
   int K = XQ.size(-1);
-
-  int arch = 9;
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, 0);
-  if (prop.major >= 10) {
-    arch = 10;
+  static int arch = -1;
+  // Avoid expensive cudaGetDeviceProperties call.
+  if (arch < 0) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+    if (prop.major >= 10) {
+      arch = 10;
+      int runtimeVersion;
+      cudaRuntimeGetVersion(&runtimeVersion);
+      TORCH_CHECK(
+          runtimeVersion >= 12080,
+          "FP8 GEMM on sm100a or above requires cuda >= 12.8");
+    } else {
+      arch = 9;
+    }
   }
 
   // Use shape heuristics to dispatch to optimized kernel configuration.
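
All three dispatchers in this commit apply the same memoization pattern: probe the device once, cache the result in a function-local static, and skip the probe on every later call. For reference, here is the pattern distilled into a standalone helper (a minimal sketch; the name get_fp8_dispatch_arch is hypothetical and not part of this commit):

#include <cuda_runtime.h>
#include <c10/util/Exception.h> // TORCH_CHECK

// Cache the SM major version so cudaGetDeviceProperties runs only once
// per process instead of on every kernel dispatch.
int get_fp8_dispatch_arch() {
  static int arch = -1;
  if (arch < 0) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    if (prop.major >= 10) {
      arch = 10;
      int runtimeVersion;
      cudaRuntimeGetVersion(&runtimeVersion);
      TORCH_CHECK(
          runtimeVersion >= 12080,
          "FP8 GEMM on sm100a or above requires cuda >= 12.8");
    } else {
      arch = 9;
    }
  }
  return arch;
}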

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched.cu

Lines changed: 15 additions & 5 deletions
@@ -29,11 +29,21 @@ at::Tensor dispatch_fp8_rowwise_batched_kernel(
     bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
-  int arch = 9;
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, 0);
-  if (prop.major >= 10) {
-    arch = 10;
+  static int arch = -1;
+  // Avoid expensive cudaGetDeviceProperties call.
+  if (arch < 0) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+    if (prop.major >= 10) {
+      arch = 10;
+      int runtimeVersion;
+      cudaRuntimeGetVersion(&runtimeVersion);
+      TORCH_CHECK(
+          runtimeVersion >= 12080,
+          "FP8 batched GEMM on sm100a or above requires cuda >= 12.8");
+    } else {
+      arch = 9;
+    }
   }
 
   TORCH_CHECK(

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_grouped.cu

Lines changed: 15 additions & 10 deletions
@@ -30,16 +30,21 @@ at::Tensor dispatch_fp8_grouped_kernel(
     at::Tensor output,
     std::optional<at::Tensor> zero_start_index_M = std::nullopt,
     std::optional<at::Tensor> M_sizes = std::nullopt) {
-  int arch = 9;
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, 0);
-  if (prop.major >= 10) {
-    arch = 10;
-    int runtimeVersion;
-    cudaRuntimeGetVersion(&runtimeVersion);
-    TORCH_CHECK(
-        runtimeVersion >= 12080,
-        "FP8 grouped GEMM on blackwell sm100a requires cuda >= 12.8");
+  static int arch = -1;
+  // Avoid expensive cudaGetDeviceProperties call.
+  if (arch < 0) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+    if (prop.major >= 10) {
+      arch = 10;
+      int runtimeVersion;
+      cudaRuntimeGetVersion(&runtimeVersion);
+      TORCH_CHECK(
+          runtimeVersion >= 12080,
+          "FP8 grouped GEMM on sm100a or above requires cuda >= 12.8");
+    } else {
+      arch = 9;
+    }
   }
 
   // Use heuristics to pick the best kernel implementation.
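
A note on the design choice: the `if (arch < 0)` guard is not synchronized, so two threads dispatching concurrently may both run the probe and, strictly speaking, race on the write to `arch`, although both would store the same value. If once-only initialization ever matters, C++11 guarantees thread-safe initialization of a function-local static, which folds the probe into the initializer. An alternative sketch of that variant, not what this commit does:

#include <cuda_runtime.h>

// Initializer for a function-local static: C++11 guarantees it runs
// exactly once, even with concurrent callers.
static int query_arch() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  return prop.major >= 10 ? 10 : 9;
}

int get_arch_thread_safe() {
  static const int arch = query_arch(); // once-only, thread-safe init
  return arch;
}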

0 commit comments
