[webgpu] Support int64 for range (#26673)

qjia7 · web-flow · commit da9b0a54f737 · 2026-01-23T17:02:59.000-08:00
### Description  
- Add new enable_int64 option to WebGpuExecutionProviderConfig (config key: enableInt64)
- Int64 support is now enabled when enable_graph_capture OR
enable_int64 is true
- Refactor Range kernel registration to support conditional int64
registration
- Update kernel registry caching to handle the three reachable flag combinations (enable_graph_capture always implies int64 support)
- Rename parameters from enable_graph_capture to enable_int64 for
clarity
- Add config parsing in webgpu_provider_factory.cc for the enableInt64
option

### Motivation
Needed to update the position ids with an ONNX model in GenAI.

Continuous decoding mode: `position_ids[i] = i + total_length -
new_kv_length`

We can use an ONNX model that includes a Range op to update
the position ids:
- Inputs: start (total_length - new_kv_length), limit (total_length),
delta (1)
- Output: position_ids (1D tensor of size new_kv_length)
diff --git a/onnxruntime/core/providers/webgpu/generator/range.cc b/onnxruntime/core/providers/webgpu/generator/range.cc
@@ -24,53 +24,93 @@ Status Range<T>::ComputeInternal(ComputeContext& context) const {
   }
 
   uint32_t output_size = onnxruntime::narrow<uint32_t>(n);
-  RangeProgram program{};
-#if defined(__GNUC__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#endif
+  RangeProgram program{output_tensor->GetElementType()};
+
+  // For int64, we need to ensure values fit in int32 range since we use 4 bytes in uniforms
+  uint32_t start_u32, delta_u32;
+  if constexpr (std::is_same_v<T, int64_t>) {
+    // Check if values fit in int32 range
+    ORT_ENFORCE(start >= std::numeric_limits<int32_t>::min() && start <= std::numeric_limits<int32_t>::max(),
+                "Range start value ", start, " is out of int32 range");
+    ORT_ENFORCE(delta >= std::numeric_limits<int32_t>::min() && delta <= std::numeric_limits<int32_t>::max(),
+                "Range delta value ", delta, " is out of int32 range");
+    int32_t start_i32 = static_cast<int32_t>(start);
+    int32_t delta_i32 = static_cast<int32_t>(delta);
+    start_u32 = std::bit_cast<uint32_t>(start_i32);
+    delta_u32 = std::bit_cast<uint32_t>(delta_i32);
+  } else {
+    start_u32 = std::bit_cast<uint32_t>(start);
+    delta_u32 = std::bit_cast<uint32_t>(delta);
+  }
 
   program.AddOutput({output_tensor, ProgramTensorMetadataDependency::Type})
       .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
       .AddUniformVariables({
           output_size,
-          *reinterpret_cast<uint32_t*>(&start),
-          *reinterpret_cast<uint32_t*>(&delta),
+          start_u32,
+          delta_u32,
       });
 
-#if defined(__GNUC__)
-#pragma GCC diagnostic pop
-#endif
-
   return context.RunProgram(program);
 }
 
 Status RangeProgram::GenerateShaderCode(ShaderHelper& sh) const {
   const auto& output = sh.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
 
-  sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
-                        << "  let value = bitcast<output_value_t>(uniforms.start) + output_value_t(global_idx) * bitcast<output_value_t>(uniforms.delta);\n"
-                        << output.SetByOffset("global_idx", "value");
+  sh.MainFunctionBody() << sh.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size");
+
+  // For int64, we need to cast to i32 first, then assign to output (which handles vec2<u32> conversion)
+  // For int32 and float, we can use output_value_t directly
+  if (data_type_ == ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+    // int64 case: bitcast to i32, compute with i32, then assign (automatic conversion to vec2<u32>)
+    sh.MainFunctionBody() << "  let value = bitcast<i32>(uniforms.start) + i32(global_idx) * bitcast<i32>(uniforms.delta);\n"
+                          << output.SetByOffset("global_idx", "value");
+  } else {
+    // float or int32 case: use output_value_t
+    sh.MainFunctionBody() << "  let value = bitcast<output_value_t>(uniforms.start) + output_value_t(global_idx) * bitcast<output_value_t>(uniforms.delta);\n"
+                          << output.SetByOffset("global_idx", "value");
+  }
 
   return Status();
 }
 
-#define WEBGPU_RANGE_KERNEL(TYPE)                                   \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                    \
-      Range,                                                        \
-      kOnnxDomain,                                                  \
-      11,                                                           \
-      TYPE,                                                         \
-      kWebGpuExecutionProvider,                                     \
-      KernelDefBuilder()                                            \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<TYPE>()) \
-          .InputMemoryType(OrtMemTypeCPU, 0)                        \
-          .InputMemoryType(OrtMemTypeCPU, 1)                        \
-          .InputMemoryType(OrtMemTypeCPU, 2),                       \
-      Range<TYPE>);
-
-WEBGPU_RANGE_KERNEL(float)
-WEBGPU_RANGE_KERNEL(int32_t)
+// Explicit template instantiations (needed for linking)
+template class Range<float>;
+template class Range<int32_t>;
+template class Range<int64_t>;
+
+void RegisterRangeKernels(KernelRegistry& kernel_registry, bool enable_int64) {
+  // Helper lambda to create kernel
+  auto create_range_kernel_info = [](auto type_tag) {
+    using T = decltype(type_tag);
+    KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {
+      out = std::make_unique<Range<T>>(info);
+      return Status::OK();
+    };
+
+    return KernelCreateInfo(
+        KernelDefBuilder()
+            .SetName("Range")
+            .SetDomain(kOnnxDomain)
+            .SinceVersion(11)
+            .Provider(kWebGpuExecutionProvider)
+            .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())
+            .InputMemoryType(OrtMemTypeCPU, 0)
+            .InputMemoryType(OrtMemTypeCPU, 1)
+            .InputMemoryType(OrtMemTypeCPU, 2)
+            .Build(),
+        kernel_create_fn);
+  };
+
+  // Always register float and int32_t
+  ORT_THROW_IF_ERROR(kernel_registry.Register(create_range_kernel_info(float{})));
+  ORT_THROW_IF_ERROR(kernel_registry.Register(create_range_kernel_info(int32_t{})));
+
+  // Register int64_t only if int64 support is enabled
+  if (enable_int64) {
+    ORT_THROW_IF_ERROR(kernel_registry.Register(create_range_kernel_info(int64_t{})));
+  }
+}
 
 }  // namespace webgpu
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/generator/range.h b/onnxruntime/core/providers/webgpu/generator/range.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "core/framework/kernel_registry.h"
 #include "core/providers/webgpu/webgpu_kernel.h"
 
 namespace onnxruntime {
@@ -19,13 +20,20 @@ class Range : public WebGpuKernel {
 class RangeProgram : public Program<RangeProgram> {
  public:
   RangeProgram() : Program{"Range"} {}
+  RangeProgram(int32_t data_type) : Program{"Range"}, data_type_(data_type) {}
 
   Status GenerateShaderCode(ShaderHelper& sh) const override;
 
   WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
                                           {"start", ProgramUniformVariableDataType::Uint32},
                                           {"delta", ProgramUniformVariableDataType::Uint32});
+
+ private:
+  int32_t data_type_{0};
 };
 
+// Register Range kernels with conditional int64 support
+void RegisterRangeKernels(KernelRegistry& kernel_registry, bool enable_int64);
+
 }  // namespace webgpu
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc
@@ -110,8 +110,8 @@ Status CastProgram::GenerateShaderCode(ShaderHelper& sh) const {
 }
 
 template <int StartVersion, int EndVersion>
-KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture) {
-  const auto& type_constraints = CastOpTypeConstraints(enable_graph_capture);
+KernelCreateInfo CreateCastKernelInfo(bool enable_int64) {
+  const auto& type_constraints = CastOpTypeConstraints(enable_int64);
 
   KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {
     out = std::make_unique<Cast>(info);
diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h
@@ -40,9 +40,9 @@ class Cast final : public WebGpuKernel {
   int32_t to_;
 };
 
-// Create Cast kernel info with appropriate type constraints based on graph capture support
+// Create Cast kernel info with appropriate type constraints based on int64 support
 template <int StartVersion, int EndVersion = StartVersion>
-KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture);
+KernelCreateInfo CreateCastKernelInfo(bool enable_int64);
 
 }  // namespace webgpu
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -29,6 +29,7 @@
 #include "core/providers/webgpu/external_data_loader.h"
 #include "core/providers/webgpu/webgpu_profiler.h"
 #include "core/providers/webgpu/tensor/cast.h"
+#include "core/providers/webgpu/generator/range.h"
 
 namespace onnxruntime {
 
@@ -390,9 +391,6 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInt
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 22, InstanceNormalization);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 22, InstanceNormalization);
 
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, float, Range);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, int32_t, Range);
-
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, Einsum);
 
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 2, 10, Pad);
@@ -436,7 +434,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, 17, ScatterElements);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ScatterElements);
 
-std::unique_ptr<KernelRegistry> RegisterKernels(bool enable_graph_capture = false) {
+std::unique_ptr<KernelRegistry> RegisterKernels(bool enable_graph_capture = false, bool enable_int64 = false) {
   auto kernel_registry = std::make_unique<onnxruntime::KernelRegistry>();
 
   static const BuildKernelCreateInfoFn function_table[] = {
@@ -746,9 +744,6 @@ std::unique_ptr<KernelRegistry> RegisterKernels(bool enable_graph_capture = fals
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 22, InstanceNormalization)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 22, InstanceNormalization)>,
 
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, float, Range)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, int32_t, Range)>,
-
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, Einsum)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 2, 10, Pad)>,
@@ -801,13 +796,16 @@ std::unique_ptr<KernelRegistry> RegisterKernels(bool enable_graph_capture = fals
     }
   }
 
-  // Register Cast kernels with conditional int64 support based on graph capture
-  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<6, 8>(enable_graph_capture)));
-  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<9, 12>(enable_graph_capture)));
-  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<13, 18>(enable_graph_capture)));
-  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<19, 20>(enable_graph_capture)));
-  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<21, 22>(enable_graph_capture)));
-  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<23>(enable_graph_capture)));
+  // Register Cast kernels with conditional int64 support
+  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<6, 8>(enable_int64)));
+  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<9, 12>(enable_int64)));
+  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<13, 18>(enable_int64)));
+  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<19, 20>(enable_int64)));
+  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<21, 22>(enable_int64)));
+  ORT_THROW_IF_ERROR(kernel_registry->Register(CreateCastKernelInfo<23>(enable_int64)));
+
+  // Register Range kernels with conditional int64 support
+  RegisterRangeKernels(*kernel_registry, enable_int64);
 
 #ifndef DISABLE_CONTRIB_OPS
   Status status = ::onnxruntime::contrib::webgpu::RegisterWebGpuContribKernels(*kernel_registry, enable_graph_capture);
@@ -830,6 +828,7 @@ WebGpuExecutionProvider::WebGpuExecutionProvider(int context_id,
       preferred_data_layout_{config.data_layout},
       force_cpu_node_names_{std::move(config.force_cpu_node_names)},
       enable_graph_capture_{config.enable_graph_capture},
+      enable_int64_{config.enable_graph_capture || config.enable_int64},
       prepack_allocator_{std::make_shared<webgpu::GpuBufferAllocator>(context_.InitializerBufferManager(), false)} {
   // If graph capture is enabled, create a dedicated buffer manager for graph mode
   if (enable_graph_capture_) {
@@ -952,11 +951,16 @@ std::vector<std::unique_ptr<ComputeCapability>> WebGpuExecutionProvider::GetCapa
 }
 
 std::shared_ptr<KernelRegistry> WebGpuExecutionProvider::GetKernelRegistry() const {
+  // Cache registries based on enable_graph_capture_ and enable_int64_ flags
+  // Note: enable_int64_ is always true when enable_graph_capture_ is true
   if (enable_graph_capture_) {
-    static std::shared_ptr<KernelRegistry> registry = webgpu::RegisterKernels(true);
+    static std::shared_ptr<KernelRegistry> registry = webgpu::RegisterKernels(true, true);
+    return registry;
+  } else if (enable_int64_) {
+    static std::shared_ptr<KernelRegistry> registry = webgpu::RegisterKernels(false, true);
     return registry;
   } else {
-    static std::shared_ptr<KernelRegistry> registry = webgpu::RegisterKernels(false);
+    static std::shared_ptr<KernelRegistry> registry = webgpu::RegisterKernels(false, false);
     return registry;
   }
 }
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
@@ -34,6 +34,7 @@ struct WebGpuExecutionProviderConfig {
   DataLayout data_layout{DataLayout::NHWC};  // preferred layout is NHWC by default
   bool enable_graph_capture{false};          // graph capture feature is disabled by default
   bool enable_pix_capture{false};            // PIX capture is disabled by default
+  bool enable_int64{false};                  // int64 ops are not enabled by default
   std::vector<std::string> force_cpu_node_names{};
 };
 
@@ -92,6 +93,7 @@ class WebGpuExecutionProvider : public IExecutionProvider {
   DataLayout preferred_data_layout_;
   std::vector<std::string> force_cpu_node_names_;
   bool enable_graph_capture_ = false;
+  bool enable_int64_ = false;
   bool is_graph_captured_ = false;
   int regular_run_count_before_graph_capture_ = 0;
   const int min_num_runs_before_cuda_graph_capture_ = 1;  // required min regular runs before graph capture for the necessary memory allocations.
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
@@ -61,6 +61,17 @@ WebGpuExecutionProviderConfig ParseEpConfig(const ConfigOptions& config_options)
     }
   }
 
+  std::string enable_int64_str;
+  if (config_options.TryGetConfigEntry(kEnableInt64, enable_int64_str)) {
+    if (enable_int64_str == kEnableInt64_ON) {
+      webgpu_ep_config.enable_int64 = true;
+    } else if (enable_int64_str == kEnableInt64_OFF) {
+      webgpu_ep_config.enable_int64 = false;
+    } else {
+      ORT_THROW("Invalid enableInt64 value: ", enable_int64_str);
+    }
+  }
+
   // parse force CPU node names
   // The force CPU node names are separated by EOL (\n or \r\n) in the config entry.
   // each line is a node name that will be forced to run on CPU.
@@ -96,6 +107,7 @@ WebGpuExecutionProviderConfig ParseEpConfig(const ConfigOptions& config_options)
   LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_config.enable_graph_capture;
   LOGS_DEFAULT(VERBOSE) << "WebGPU EP force CPU node count: " << webgpu_ep_config.force_cpu_node_names.size();
   LOGS_DEFAULT(VERBOSE) << "WebGPU EP pix capture enable: " << webgpu_ep_config.enable_pix_capture;
+  LOGS_DEFAULT(VERBOSE) << "WebGPU EP enable int64: " << webgpu_ep_config.enable_int64;
 
   return webgpu_ep_config;
 }
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h
@@ -11,6 +11,7 @@ namespace options {
 
 constexpr const char* kPreferredLayout = "ep.webgpuexecutionprovider.preferredLayout";
 constexpr const char* kEnableGraphCapture = "ep.webgpuexecutionprovider.enableGraphCapture";
+constexpr const char* kEnableInt64 = "ep.webgpuexecutionprovider.enableInt64";
 
 constexpr const char* kDawnProcTable = "ep.webgpuexecutionprovider.dawnProcTable";
 
@@ -49,6 +50,9 @@ constexpr const char* kPreferredLayout_NHWC = "NHWC";
 constexpr const char* kEnableGraphCapture_ON = "1";
 constexpr const char* kEnableGraphCapture_OFF = "0";
 
+constexpr const char* kEnableInt64_ON = "1";
+constexpr const char* kEnableInt64_OFF = "0";
+
 constexpr const char* kEnablePIXCapture_ON = "1";
 constexpr const char* kEnablePIXCapture_OFF = "0";
 

Original file line number	Diff line number	Diff line change
`@@ -110,8 +110,8 @@ Status CastProgram::GenerateShaderCode(ShaderHelper& sh) const {`
`110`	`110`	`}`
`111`	`111`
`112`	`112`	`template <int StartVersion, int EndVersion>`
`113`		`-KernelCreateInfo CreateCastKernelInfo(bool enable_graph_capture) {`
`114`		`- const auto& type_constraints = CastOpTypeConstraints(enable_graph_capture);`
	`113`	`+KernelCreateInfo CreateCastKernelInfo(bool enable_int64) {`
	`114`	`+ const auto& type_constraints = CastOpTypeConstraints(enable_int64);`
`115`	`115`
`116`	`116`	`KernelCreateFn kernel_create_fn = [](FuncManager&, const OpKernelInfo& info, std::unique_ptr<OpKernel>& out) -> Status {`
`117`	`117`	`out = std::make_unique<Cast>(info);`