Skip to content

Commit 38355ba

Browse files
authored
WebGPU: Use dedicated prepack allocator for kernel prepacking (#26857)
Remove the CreateUnmappedGPUTensor workaround. Kernel prepacking now uses a dedicated prepack allocator that creates unmapped GPU buffers directly, so buffers no longer need to be manually unmapped after allocation.
1 parent 2d1ed5b commit 38355ba

File tree

7 files changed

+13
-28
lines changed

7 files changed

+13
-28
lines changed

onnxruntime/core/providers/webgpu/compute_context.cc

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,6 @@ const webgpu::BufferManager& ComputeContextBase::BufferManagerAccessor::Get(cons
2020
return context.ep_.BufferManager();
2121
}
2222

23-
Status ComputeContextBase::CreateUnmappedGPUTensor(AllocatorPtr alloc, MLDataType data_type, const TensorShape& shape, std::unique_ptr<Tensor>& tensor) const {
24-
ORT_RETURN_IF_NOT(alloc != nullptr, "Allocator must not be null when creating GPU tensor.");
25-
26-
tensor = std::make_unique<Tensor>(data_type, shape, alloc);
27-
ORT_RETURN_IF_NOT(tensor != nullptr, "Failed to allocate GPU tensor.");
28-
29-
void* data = tensor->MutableDataRaw();
30-
ORT_RETURN_IF_NOT(data != nullptr, "Failed to get GPU tensor buffer.");
31-
32-
auto buffer = reinterpret_cast<WGPUBuffer>(data);
33-
if (wgpuBufferGetMapState(buffer) != WGPUBufferMapState_Unmapped) {
34-
wgpuBufferUnmap(buffer);
35-
}
36-
return Status::OK();
37-
}
38-
3923
ComputeContext::ComputeContext(WebGpuContext& webgpu_context,
4024
const WebGpuExecutionProvider& ep,
4125
const OpKernel& op_kernel,

onnxruntime/core/providers/webgpu/compute_context.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,6 @@ class ComputeContextBase {
5656
return op_kernel_.Node().Name();
5757
}
5858

59-
Status CreateUnmappedGPUTensor(AllocatorPtr alloc, MLDataType data_type, const TensorShape& shape,
60-
std::unique_ptr<Tensor>& tensor) const;
61-
6259
//
6360
// Get the operator type.
6461
//

onnxruntime/core/providers/webgpu/nn/conv.cc

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -354,11 +354,9 @@ Status Conv<is_channels_last, is_fused>::PrePackInternal(ComputeContextBase& con
354354
}
355355
TensorShape transposed_kernel_shape(transposed_kernel_shape_vector);
356356

357-
ORT_ENFORCE(alloc != nullptr, "Allocator must be provided for WebGPU pre-pack.");
358-
359-
// Create the transposed kernel tensor using the WebGPU allocator.
360-
// Both input tensor and output tensor are GPU tensors, ready for GPU operations.
361-
ORT_RETURN_IF_ERROR(context.CreateUnmappedGPUTensor(alloc, tensor.DataType(), transposed_kernel_shape, transposed_kernel_));
357+
// Create the transposed kernel tensor using the prepack allocator.
358+
// This allocator creates GPU buffers without mapping, suitable for GPU-based operations.
359+
transposed_kernel_ = std::make_unique<Tensor>(tensor.DataType(), transposed_kernel_shape, alloc);
362360

363361
// Perform GPU-based transpose directly from the input GPU tensor
364362
ORT_RETURN_IF_ERROR(Transpose::DoTranspose(context, perm, tensor, *transposed_kernel_));

onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,8 @@ WebGpuExecutionProvider::WebGpuExecutionProvider(int context_id,
799799
context_{context},
800800
preferred_data_layout_{config.data_layout},
801801
force_cpu_node_names_{std::move(config.force_cpu_node_names)},
802-
enable_graph_capture_{config.enable_graph_capture} {
802+
enable_graph_capture_{config.enable_graph_capture},
803+
prepack_allocator_{std::make_shared<webgpu::GpuBufferAllocator>(context_.InitializerBufferManager(), false)} {
803804
// If graph capture is enabled, create a dedicated buffer manager for graph mode
804805
if (enable_graph_capture_) {
805806
// Create buffer manager for graph capture mode with appropriate cache modes

onnxruntime/core/providers/webgpu/webgpu_execution_provider.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ class WebGpuExecutionProvider : public IExecutionProvider {
8484
bool IsGraphCaptured(int graph_annotation_id) const override;
8585
Status ReplayGraph(int graph_annotation_id) override;
8686
webgpu::BufferManager& BufferManager() const;
87+
AllocatorPtr PrepackAllocator() const { return prepack_allocator_; }
8788

8889
private:
8990
bool IsGraphCaptureAllowed() const;
@@ -105,6 +106,9 @@ class WebGpuExecutionProvider : public IExecutionProvider {
105106

106107
// Store captured commands directly in the EP instead of in WebGpuContext
107108
std::vector<webgpu::CapturedCommandInfo> captured_commands_;
109+
110+
// Allocator for prepacked weights (uses buffers without mapping)
111+
AllocatorPtr prepack_allocator_;
108112
};
109113

110114
} // namespace onnxruntime

onnxruntime/core/providers/webgpu/webgpu_kernel.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ Status WebGpuKernel::Compute(OpKernelContext* p_op_kernel_context) const {
3434
return s;
3535
}
3636

37-
Status WebGpuKernel::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
37+
Status WebGpuKernel::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr /*alloc*/,
3838
/*out*/ bool& is_packed, /*out*/ PrePackedWeights* /* prepacked_weights */) {
3939
ComputeContextBase context{webgpu_context_, ep_, *this};
4040

@@ -45,8 +45,9 @@ Status WebGpuKernel::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr a
4545
// Currently, ORT does not allow using prepacked weights in non-CPU EPs.
4646
// So we do not pass prepacked_weights to PrePackInternal.
4747
// Kernel implementation that supports prepacking should manage its own storage.
48+
// Use the EP's prepack allocator which creates unmapped GPU buffers.
4849

49-
Status s = PrePackInternal(context, tensor, input_idx, alloc, is_packed);
50+
Status s = PrePackInternal(context, tensor, input_idx, ep_.PrepackAllocator(), is_packed);
5051

5152
if (webgpu_context_.ValidationMode() >= ValidationMode::Full) {
5253
ORT_RETURN_IF_ERROR(webgpu_context_.PopErrorScope());

onnxruntime/core/providers/webgpu/webgpu_kernel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class WebGpuKernel : public OpKernel {
4444
// @param context The WebGPU compute context base providing access to the execution environment.
4545
// @param tensor The constant tensor to potentially pre-process.
4646
// @param input_idx The index of this input in the kernel's input list.
47-
// @param alloc The allocator to use for any new tensor allocations.
47+
// @param alloc The allocator to use for any new tensor allocations (prepack allocator).
4848
// @param is_packed Output parameter. Set to true if the tensor was pre-packed/processed,
4949
// false otherwise. The default implementation sets this to false.
5050
//

0 commit comments

Comments
 (0)