WebGPU: Use dedicated prepack allocator for kernel prepacking (#26857)

jchen10 · web-flow · commit 38355ba07c5b · 2025-12-27T10:15:38.000Z
Remove the CreateUnmappedGPUTensor workaround. Kernel prepacking now uses a
dedicated prepack allocator that creates unmapped GPU buffers directly, so
buffers no longer need to be unmapped manually after allocation.
diff --git a/onnxruntime/core/providers/webgpu/compute_context.cc b/onnxruntime/core/providers/webgpu/compute_context.cc
@@ -20,22 +20,6 @@ const webgpu::BufferManager& ComputeContextBase::BufferManagerAccessor::Get(cons
   return context.ep_.BufferManager();
 }
 
-Status ComputeContextBase::CreateUnmappedGPUTensor(AllocatorPtr alloc, MLDataType data_type, const TensorShape& shape, std::unique_ptr<Tensor>& tensor) const {
-  ORT_RETURN_IF_NOT(alloc != nullptr, "Allocator must not be null when creating GPU tensor.");
-
-  tensor = std::make_unique<Tensor>(data_type, shape, alloc);
-  ORT_RETURN_IF_NOT(tensor != nullptr, "Failed to allocate GPU tensor.");
-
-  void* data = tensor->MutableDataRaw();
-  ORT_RETURN_IF_NOT(data != nullptr, "Failed to get GPU tensor buffer.");
-
-  auto buffer = reinterpret_cast<WGPUBuffer>(data);
-  if (wgpuBufferGetMapState(buffer) != WGPUBufferMapState_Unmapped) {
-    wgpuBufferUnmap(buffer);
-  }
-  return Status::OK();
-}
-
 ComputeContext::ComputeContext(WebGpuContext& webgpu_context,
                                const WebGpuExecutionProvider& ep,
                                const OpKernel& op_kernel,
diff --git a/onnxruntime/core/providers/webgpu/compute_context.h b/onnxruntime/core/providers/webgpu/compute_context.h
@@ -56,9 +56,6 @@ class ComputeContextBase {
     return op_kernel_.Node().Name();
   }
 
-  Status CreateUnmappedGPUTensor(AllocatorPtr alloc, MLDataType data_type, const TensorShape& shape,
-                                 std::unique_ptr<Tensor>& tensor) const;
-
   //
   // Get the operator type.
   //
diff --git a/onnxruntime/core/providers/webgpu/nn/conv.cc b/onnxruntime/core/providers/webgpu/nn/conv.cc
@@ -354,11 +354,9 @@ Status Conv<is_channels_last, is_fused>::PrePackInternal(ComputeContextBase& con
   }
   TensorShape transposed_kernel_shape(transposed_kernel_shape_vector);
 
-  ORT_ENFORCE(alloc != nullptr, "Allocator must be provided for WebGPU pre-pack.");
-
-  // Create the transposed kernel tensor using the WebGPU allocator.
-  // Both input tensor and output tensor are GPU tensors, ready for GPU operations.
-  ORT_RETURN_IF_ERROR(context.CreateUnmappedGPUTensor(alloc, tensor.DataType(), transposed_kernel_shape, transposed_kernel_));
+  // Create the transposed kernel tensor using the prepack allocator.
+  // This allocator creates GPU buffers without mapping, suitable for GPU-based operations.
+  transposed_kernel_ = std::make_unique<Tensor>(tensor.DataType(), transposed_kernel_shape, alloc);
 
   // Perform GPU-based transpose directly from the input GPU tensor
   ORT_RETURN_IF_ERROR(Transpose::DoTranspose(context, perm, tensor, *transposed_kernel_));
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -799,7 +799,8 @@ WebGpuExecutionProvider::WebGpuExecutionProvider(int context_id,
       context_{context},
       preferred_data_layout_{config.data_layout},
       force_cpu_node_names_{std::move(config.force_cpu_node_names)},
-      enable_graph_capture_{config.enable_graph_capture} {
+      enable_graph_capture_{config.enable_graph_capture},
+      prepack_allocator_{std::make_shared<webgpu::GpuBufferAllocator>(context_.InitializerBufferManager(), false)} {
   // If graph capture is enabled, create a dedicated buffer manager for graph mode
   if (enable_graph_capture_) {
     // Create buffer manager for graph capture mode with appropriate cache modes
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
@@ -84,6 +84,7 @@ class WebGpuExecutionProvider : public IExecutionProvider {
   bool IsGraphCaptured(int graph_annotation_id) const override;
   Status ReplayGraph(int graph_annotation_id) override;
   webgpu::BufferManager& BufferManager() const;
+  AllocatorPtr PrepackAllocator() const { return prepack_allocator_; }
 
  private:
   bool IsGraphCaptureAllowed() const;
@@ -105,6 +106,9 @@ class WebGpuExecutionProvider : public IExecutionProvider {
 
   // Store captured commands directly in the EP instead of in WebGpuContext
   std::vector<webgpu::CapturedCommandInfo> captured_commands_;
+
+  // Allocator for prepacked weights (uses buffers without mapping)
+  AllocatorPtr prepack_allocator_;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_kernel.cc b/onnxruntime/core/providers/webgpu/webgpu_kernel.cc
@@ -34,7 +34,7 @@ Status WebGpuKernel::Compute(OpKernelContext* p_op_kernel_context) const {
   return s;
 }
 
-Status WebGpuKernel::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+Status WebGpuKernel::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr /*alloc*/,
                              /*out*/ bool& is_packed, /*out*/ PrePackedWeights* /* prepacked_weights */) {
   ComputeContextBase context{webgpu_context_, ep_, *this};
 
@@ -45,8 +45,9 @@ Status WebGpuKernel::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr a
   // Currently, ORT does not allow using prepacked weights in non-CPU EPs.
   // So we do not pass prepacked_weights to PrePackInternal.
   // Kernel implementation that supports prepacking should manage its own storage.
+  // Use the EP's prepack allocator which creates unmapped GPU buffers.
 
-  Status s = PrePackInternal(context, tensor, input_idx, alloc, is_packed);
+  Status s = PrePackInternal(context, tensor, input_idx, ep_.PrepackAllocator(), is_packed);
 
   if (webgpu_context_.ValidationMode() >= ValidationMode::Full) {
     ORT_RETURN_IF_ERROR(webgpu_context_.PopErrorScope());
diff --git a/onnxruntime/core/providers/webgpu/webgpu_kernel.h b/onnxruntime/core/providers/webgpu/webgpu_kernel.h
@@ -44,7 +44,7 @@ class WebGpuKernel : public OpKernel {
   // @param context       The WebGPU compute context base providing access to the execution environment.
   // @param tensor        The constant tensor to potentially pre-process.
   // @param input_idx     The index of this input in the kernel's input list.
-  // @param alloc         The allocator to use for any new tensor allocations.
+  // @param alloc         The allocator to use for any new tensor allocations (prepack allocator).
   // @param is_packed     Output parameter. Set to true if the tensor was pre-packed/processed,
   //                      false otherwise. The default implementation sets this to false.
   //

Original file line number	Diff line number	Diff line change
`@@ -56,9 +56,6 @@ class ComputeContextBase {`
`56`	`56`	`return op_kernel_.Node().Name();`
`57`	`57`	`}`
`58`	`58`
`59`		`- Status CreateUnmappedGPUTensor(AllocatorPtr alloc, MLDataType data_type, const TensorShape& shape,`
`60`		`- std::unique_ptr<Tensor>& tensor) const;`
`61`		`-`
`62`	`59`	`//`
`63`	`60`	`// Get the operator type.`
`64`	`61`	`//`