Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions onnxruntime/core/providers/webgpu/webgpu_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -792,11 +792,20 @@
if (indirect_dispatch_tensor != nullptr) {
indirect_buffer = reinterpret_cast<WGPUBuffer>(const_cast<void*>(indirect_dispatch_tensor->DataRaw()));
}

// Store profiling info if profiling is enabled
std::optional<std::tuple<std::string, std::string, std::vector<TensorShape>, std::vector<TensorShape>>> profiling_data;

Check warning on line 797 in onnxruntime/core/providers/webgpu/webgpu_context.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <string> for string [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/webgpu/webgpu_context.cc:797: Add #include <string> for string [build/include_what_you_use] [4]
if (is_profiling_ && !pending_kernels_.empty()) {
const auto& kernel_info = pending_kernels_.back();
profiling_data = std::make_tuple(kernel_info.name, kernel_info.cache_key, kernel_info.input_shapes, kernel_info.output_shapes);
}

external_captured_commands_->push_back({program_artifact.compute_pipeline,
bind_group,
bind_group_layout,
{x, y, z},
indirect_buffer});
indirect_buffer,
profiling_data});
} else {
compute_pass_encoder.SetPipeline(program_artifact.compute_pipeline);
wgpuComputePassEncoderSetBindGroup(compute_pass_encoder.Get(), 0, bind_group, 0, nullptr);
Expand Down Expand Up @@ -827,9 +836,6 @@
external_captured_commands_->clear();
}

// TODO: support profiling with graph capture.
ORT_ENFORCE(!is_profiling_, "profiling is not supported yet under graph capture mode");

graph_capture_state_ = GraphCaptureState::Capturing;
}

Expand All @@ -842,6 +848,13 @@
auto& command = captured_commands[i];
const auto& compute_pass_encoder = GetComputePassEncoder();
WriteTimestamp(num_pending_dispatches_ * 2);

// Restore profiling info if available and profiling is enabled
if (is_profiling_ && command.pending_kernel_info.has_value()) {
const auto& [name, cache_key, input_shapes, output_shapes] = command.pending_kernel_info.value();
pending_kernels_.emplace_back(name, cache_key, input_shapes, output_shapes);
}

compute_pass_encoder.SetPipeline(command.compute_pipeline);
wgpuComputePassEncoderSetBindGroup(compute_pass_encoder.Get(), 0, command.bind_group, 0, nullptr);

Expand Down
17 changes: 16 additions & 1 deletion onnxruntime/core/providers/webgpu/webgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include <memory>
#include <mutex>
#include <optional>
#include <tuple>

#include "core/providers/webgpu/webgpu_external_header.h"

Expand All @@ -31,7 +33,10 @@ struct CapturedCommandInfo {
WGPUBindGroup bind_group;
WGPUBindGroupLayout bind_group_layout;
std::array<uint32_t, 3> dispatch_group;
WGPUBuffer indirect_buffer; // WGPUBuffer for indirect dispatch, nullptr if not using indirect dispatch
// WGPUBuffer for indirect dispatch, nullptr if not using indirect dispatch
WGPUBuffer indirect_buffer;
// Optional profiling data: (name, cache_key, input_shapes, output_shapes)
std::optional<std::tuple<std::string, std::string, std::vector<TensorShape>, std::vector<TensorShape>>> pending_kernel_info;
};

struct WebGpuBufferCacheConfig {
Expand Down Expand Up @@ -280,6 +285,16 @@ class WebGpuContext final {
}
}

// Replay-path constructor: the caller supplies the kernel name, cache key and
// the input/output shapes directly. Every argument is taken by value and then
// moved into the corresponding member.
PendingKernelInfo(std::string name_in,
                  std::string cache_key_in,
                  std::vector<TensorShape> input_shapes_in,
                  std::vector<TensorShape> output_shapes_in)
    : name(std::move(name_in)),
      cache_key(std::move(cache_key_in)),
      input_shapes(std::move(input_shapes_in)),
      output_shapes(std::move(output_shapes_in)) {
}

PendingKernelInfo(PendingKernelInfo&&) = default;
PendingKernelInfo& operator=(PendingKernelInfo&&) = default;
ORT_DISALLOW_COPY_AND_ASSIGNMENT(PendingKernelInfo);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,13 @@ bool WebGpuExecutionProvider::IsGraphCaptured(int graph_annotation_id) const {

// Replays the captured command list through the WebGPU context.
//
// Enforces that the graph identified by `graph_annotation_id` has been
// captured. When the profiler reports that it is enabled, profiling is started
// on the context before the replay and the profiling data is collected into
// the profiler's events after the replay. Returns Status::OK().
Status WebGpuExecutionProvider::ReplayGraph(int graph_annotation_id) {
  ORT_ENFORCE(IsGraphCaptured(graph_annotation_id));

  // Start profiling first so the replayed dispatches fall inside the profiled region.
  if (profiler_->Enabled()) {
    context_.StartProfiling();
  }

  auto& replay_buffer_manager = *graph_buffer_mgr_;
  context_.Replay(captured_commands_, replay_buffer_manager);

  // The enabled state is queried again here, exactly as before the replay.
  if (profiler_->Enabled()) {
    context_.CollectProfilingData(profiler_->Events());
  }

  return Status::OK();
}

Expand Down
Loading