Commit 1a112a2
Add blackwell check in TRTRTX EP unit tests for FP4/FP8 Custom ops (#26832)
### Description

- Follow-up to [PR-26555](#26555): add a Blackwell check in the TRTRTX EP unit tests for the FP4/FP8 custom ops, so they only run on Blackwell-class GPUs.

### Motivation and Context

- The NVFP4 recipe (a combination of FP4 and FP8) is primarily intended for Blackwell+ GPUs, as they have Tensor Cores for the FP4 data type.
1 parent aab5661 commit 1a112a2
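For context, here is a minimal standalone sketch (not part of this commit) of the compute-capability query the new check relies on; it assumes the CUDA runtime API and mirrors the `major * 100 + minor * 10` encoding and the 1200 (SM 12.0) Blackwell threshold used in the changes below:

```cpp
// Illustrative sketch only; not repository code. Mirrors the encoding used by
// GetCudaArchitecture() in cuda_op_test_utils.cc and the SM 12.0 threshold
// used by the new IsBlackwellOrAbove() test helper.
#include <cuda_runtime_api.h>
#include <iostream>

int main() {
  int device = 0;
  cudaDeviceProp prop{};
  if (cudaGetDevice(&device) != cudaSuccess ||
      cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
    std::cout << "WARNING: CUDA is not available or failed to initialize" << std::endl;
    return 1;
  }
  const int cuda_arch = prop.major * 100 + prop.minor * 10;  // e.g. SM 12.0 -> 1200
  constexpr int kBlackwellMinCapability = 1200;              // Blackwell threshold
  std::cout << "GPU Compute Capability: SM " << prop.major << "." << prop.minor
            << (cuda_arch >= kBlackwellMinCapability ? " (Blackwell+)" : " (pre-Blackwell)")
            << std::endl;
  return 0;
}
```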

File tree

2 files changed: 37 additions, 2 deletions

onnxruntime/test/common/cuda_op_test_utils.cc

Lines changed: 13 additions & 2 deletions
@@ -1,7 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-#ifdef USE_CUDA
+#include <iostream>
+
+#if defined(USE_CUDA) || defined(USE_NV)
 #include "cuda_runtime_api.h"
 #endif

@@ -13,7 +15,7 @@ int GetCudaArchitecture() {
   // Usually, we test on a single GPU or multiple GPUs of same architecture, so it's fine to cache the result.
   static int cuda_arch = -1;

-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_NV)
   if (cuda_arch == -1) {
     int current_device_id = 0;
     cudaGetDevice(&current_device_id);
@@ -26,6 +28,15 @@ int GetCudaArchitecture() {
     if (cudaSuccess == cudaGetDeviceProperties(&prop, current_device_id)) {
       cuda_arch = prop.major * 100 + prop.minor * 10;
     }
+
+    // Log GPU compute capability
+    if (cuda_arch == -1) {
+      std::cout << "WARNING: CUDA is not available or failed to initialize" << std::endl;
+    } else {
+      std::cout << "GPU Compute Capability: SM "
+                << cuda_arch / 100 << "." << (cuda_arch % 100) / 10
+                << " (value: " << cuda_arch << ")" << std::endl;
+    }
   }
 #endif
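As a quick sanity check on the value logged above (not part of the commit): the cached architecture follows the `major * 100 + minor * 10` encoding, so SM 8.9 is reported as 890 and SM 12.0 as 1200. A tiny hypothetical round-trip check:

```cpp
// Hypothetical sanity check, not repository code: verifies the major*100 +
// minor*10 encoding used by GetCudaArchitecture() and its new log line.
#include <cassert>

constexpr int EncodeArch(int major, int minor) { return major * 100 + minor * 10; }

int main() {
  assert(EncodeArch(8, 9) == 890);    // SM 8.9
  assert(EncodeArch(12, 0) == 1200);  // SM 12.0, the Blackwell threshold used in the tests
  assert(1200 / 100 == 12 && (1200 % 100) / 10 == 0);  // decode back to major.minor
  return 0;
}
```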

onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc

Lines changed: 24 additions & 0 deletions
@@ -9,6 +9,7 @@
 #include "test/util/include/scoped_env_vars.h"
 #include "test/common/trt_op_test_utils.h"
 #include "test/common/random_generator.h"
+#include "test/common/cuda_op_test_utils.h"
 #include "test/providers/nv_tensorrt_rtx/test_nv_trt_rtx_ep_util.h"

 #include <thread>
@@ -22,6 +23,21 @@ namespace onnxruntime {

 namespace test {

+// Helper function to check if GPU is Blackwell (SM 12.0+) or above
+// Returns true if requirement is met
+// Returns false if CUDA is unavailable or GPU is below SM 12.0
+static bool IsBlackwellOrAbove() {
+  constexpr int kBlackwellMinCapability = 1200;  // SM 12.0 = 12 * 100 + 0 * 10
+  int cuda_arch = GetCudaArchitecture();
+
+  // Check if CUDA is available
+  if (cuda_arch == -1) {
+    return false;
+  }
+
+  return cuda_arch >= kBlackwellMinCapability;
+}
+
 TEST(NvExecutionProviderTest, ContextEmbedAndReload) {
   PathString model_name = ORT_TSTR("nv_execution_provider_test.onnx");
   PathString model_name_ctx = ORT_TSTR("nv_execution_provider_test_ctx.onnx");
@@ -442,6 +458,10 @@ TEST(NvExecutionProviderTest, DataTransfer) {
 }

 TEST(NvExecutionProviderTest, FP8CustomOpModel) {
+  if (!IsBlackwellOrAbove()) {
+    GTEST_SKIP() << "Test requires SM 12.0+ GPU (Blackwell+)";
+  }
+
   PathString model_name = ORT_TSTR("nv_execution_provider_fp8_quantize_dequantize_test.onnx");
   clearFileIfExists(model_name);
   std::string graph_name = "nv_execution_provider_fp8_quantize_dequantize_graph";
@@ -509,6 +529,10 @@ TEST(NvExecutionProviderTest, FP8CustomOpModel) {
 }

 TEST(NvExecutionProviderTest, FP4CustomOpModel) {
+  if (!IsBlackwellOrAbove()) {
+    GTEST_SKIP() << "Test requires SM 12.0+ GPU (Blackwell+)";
+  }
+
   PathString model_name = ORT_TSTR("nv_execution_provider_fp4_dynamic_quantize_test.onnx");
   clearFileIfExists(model_name);
   std::string graph_name = "nv_execution_provider_fp4_dynamic_quantize_graph";
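The new guards rely on GoogleTest's `GTEST_SKIP()`, which marks a test as skipped rather than failed when the hardware requirement is not met. A minimal self-contained sketch of the same pattern (the test name and the stub helper are placeholders, not repository code):

```cpp
// Minimal sketch of the hardware-gated test pattern added in this commit.
// IsBlackwellOrAboveStub() stands in for the real IsBlackwellOrAbove(),
// which queries GetCudaArchitecture().
#include <gtest/gtest.h>

static bool IsBlackwellOrAboveStub() {
  // Real helper: return GetCudaArchitecture() >= 1200;  (SM 12.0+)
  return false;  // pretend we are on pre-Blackwell hardware
}

TEST(ExampleSuite, Fp4Fp8PathRequiresBlackwell) {
  if (!IsBlackwellOrAboveStub()) {
    GTEST_SKIP() << "Test requires SM 12.0+ GPU (Blackwell+)";
  }
  // Exercise the FP4/FP8 custom-op path here; reached only on SM 12.0+ devices.
  SUCCEED();
}
```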
