Reflect review comments: add HasDP4ADeviceSupport() and skip the DP4A test on unsupported adapters

HectorSVC · HectorSVC · commit 2ec6c08c7877 · 2026-02-13T10:23:19.000-08:00
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_common.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_common.cc
@@ -4,6 +4,7 @@
 #include "contrib_ops/webgpu/quantization/matmul_nbits_common.h"
 #include <sstream>
 #include "core/common/common.h"
+#include "core/providers/webgpu/webgpu_context.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -54,6 +55,12 @@ fn mm_read_zero(row : u32, col : u32, r_dim: u32, c_dim: u32) -> )"
   return ss.str();
 }
 
+bool HasDP4ADeviceSupport(int context_id) {  // DP4A gate: Subgroups feature and a non-"apple" adapter vendor.
+  auto& webgpu_ctx = onnxruntime::webgpu::WebGpuContextFactory::GetContext(context_id);
+  const bool has_subgroups = webgpu_ctx.DeviceHasFeature(wgpu::FeatureName::Subgroups);
+  return has_subgroups && webgpu_ctx.AdapterInfo().vendor != std::string_view{"apple"};
+}
+
 }  // namespace webgpu
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_common.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits_common.h
@@ -21,6 +21,11 @@ namespace webgpu {
 std::string GenerateZeroPointReadingCode(uint32_t nbits, bool has_zero_points,
                                          const std::string& output_type = "output_element_t");
 
+/// Returns true when the WebGPU device of context \p context_id can use the DP4A
+/// kernel path: the device has the Subgroups feature and the adapter vendor is not "apple".
+/// \p context_id selects the WebGpuContext to query (defaults to 0, the default context).
+bool HasDP4ADeviceSupport(int context_id = 0);
+
 }  // namespace webgpu
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/contrib_ops/matmul_2bits_test.cc b/onnxruntime/test/contrib_ops/matmul_2bits_test.cc
@@ -27,6 +27,9 @@
 #include "core/session/ort_env.h"
 #include "core/util/qmath.h"
 #include "core/providers/webgpu/webgpu_provider_options.h"
+#ifdef USE_WEBGPU
+#include "contrib_ops/webgpu/quantization/matmul_nbits_common.h"
+#endif
 
 extern std::unique_ptr<Ort::Env> ort_env;
 
@@ -543,7 +546,15 @@ TEST(MatMul2BitsWebGpu, Float32_ZeroPoint_LargerK) {
 // DP4A path tests (accuracy_level=4) — exercises the 1024-entry LUT / dequantization
 // path for 2-bit weights with zero_points.
 // DP4A constraints: accuracy_level==4, block_size%32==0, K%128==0, N%16==0.
+// Skipped when the adapter lacks Subgroups support or is Apple (Metal): the op would
+// then silently fall back to the default path and this test would not cover the DP4A kernel.
 TEST(MatMul2BitsWebGpu, Float32_ZeroPoint_DP4A) {
+  // Ensure the WebGPU context is initialized so we can query adapter capabilities.
+  auto ep = DefaultWebGpuExecutionProvider();
+  if (!contrib::webgpu::HasDP4ADeviceSupport(ep->GetDeviceId())) {
+    GTEST_SKIP() << "DP4A requires Subgroups support on a non-Apple adapter";
+  }
+
   TestOptions2Bits opts{};
   opts.accuracy_level = 4;
   opts.has_zero_point = true;