[webgpu] Enable profiling for graph capture

qjia7 · qjia7 · commit df7d5ace08ac · 2026-01-16T16:06:24.000+08:00
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -792,11 +792,20 @@ void WebGpuContext::LaunchComputePipeline(const wgpu::ComputePassEncoder& comput
     if (indirect_dispatch_tensor != nullptr) {
       indirect_buffer = reinterpret_cast<WGPUBuffer>(const_cast<void*>(indirect_dispatch_tensor->DataRaw()));
     }
+
+    // Store profiling info if profiling is enabled
+    std::optional<std::tuple<std::string, std::string, std::vector<TensorShape>, std::vector<TensorShape>>> profiling_data;
+    if (is_profiling_ && !pending_kernels_.empty()) {
+      const auto& kernel_info = pending_kernels_.back();
+      profiling_data = std::make_tuple(kernel_info.name, kernel_info.cache_key, kernel_info.input_shapes, kernel_info.output_shapes);
+    }
+
     external_captured_commands_->push_back({program_artifact.compute_pipeline,
                                             bind_group,
                                             bind_group_layout,
                                             {x, y, z},
-                                            indirect_buffer});
+                                            indirect_buffer,
+                                            profiling_data});
   } else {
     compute_pass_encoder.SetPipeline(program_artifact.compute_pipeline);
     wgpuComputePassEncoderSetBindGroup(compute_pass_encoder.Get(), 0, bind_group, 0, nullptr);
@@ -827,9 +836,6 @@ void WebGpuContext::CaptureBegin(std::vector<webgpu::CapturedCommandInfo>* captu
     external_captured_commands_->clear();
   }
 
-  // TODO: support profiling with graph capture.
-  ORT_ENFORCE(!is_profiling_, "profiling is not supported yet under graph capture mode");
-
   graph_capture_state_ = GraphCaptureState::Capturing;
 }
 
@@ -842,6 +848,13 @@ void WebGpuContext::Replay(const std::vector<webgpu::CapturedCommandInfo>& captu
     auto& command = captured_commands[i];
     const auto& compute_pass_encoder = GetComputePassEncoder();
     WriteTimestamp(num_pending_dispatches_ * 2);
+
+    // Restore profiling info if available and profiling is enabled
+    if (is_profiling_ && command.pending_kernel_info.has_value()) {
+      const auto& [name, cache_key, input_shapes, output_shapes] = command.pending_kernel_info.value();
+      pending_kernels_.emplace_back(name, cache_key, input_shapes, output_shapes);
+    }
+
     compute_pass_encoder.SetPipeline(command.compute_pipeline);
     wgpuComputePassEncoderSetBindGroup(compute_pass_encoder.Get(), 0, command.bind_group, 0, nullptr);
 
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -5,6 +5,8 @@
 
 #include <memory>
 #include <mutex>
+#include <optional>
+#include <tuple>
 
 #include "core/providers/webgpu/webgpu_external_header.h"
 
@@ -31,7 +33,8 @@ struct CapturedCommandInfo {
   WGPUBindGroup bind_group;
   WGPUBindGroupLayout bind_group_layout;
   std::array<uint32_t, 3> dispatch_group;
-  WGPUBuffer indirect_buffer;  // WGPUBuffer for indirect dispatch, nullptr if not using indirect dispatch
+  WGPUBuffer indirect_buffer;                                                                                                   // WGPUBuffer for indirect dispatch, nullptr if not using indirect dispatch
+  std::optional<std::tuple<std::string, std::string, std::vector<TensorShape>, std::vector<TensorShape>>> pending_kernel_info;  // Optional profiling data: (name, cache_key, input_shapes, output_shapes)
 };
 
 struct WebGpuBufferCacheConfig {
@@ -280,6 +283,16 @@ class WebGpuContext final {
       }
     }
 
+    // Constructor for replay - takes the kernel name, cache key and input/output shapes by value and moves each into the matching member
+    PendingKernelInfo(std::string name_in,
+                      std::string cache_key_in,
+                      std::vector<TensorShape> input_shapes_in,
+                      std::vector<TensorShape> output_shapes_in)
+        : name{std::move(name_in)},
+          cache_key{std::move(cache_key_in)},
+          input_shapes{std::move(input_shapes_in)},
+          output_shapes{std::move(output_shapes_in)} {}
+
     PendingKernelInfo(PendingKernelInfo&&) = default;
     PendingKernelInfo& operator=(PendingKernelInfo&&) = default;
     ORT_DISALLOW_COPY_AND_ASSIGNMENT(PendingKernelInfo);
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -1050,7 +1050,13 @@ bool WebGpuExecutionProvider::IsGraphCaptured(int graph_annotation_id) const {
 
 Status WebGpuExecutionProvider::ReplayGraph(int graph_annotation_id) {
   ORT_ENFORCE(IsGraphCaptured(graph_annotation_id));
+  if (profiler_->Enabled()) {  // profiler is on: start context profiling before the captured commands are replayed
+    context_.StartProfiling();
+  }
   context_.Replay(captured_commands_, *graph_buffer_mgr_);
+  if (profiler_->Enabled()) {  // after the replay call, pass the profiler's event list to the context's CollectProfilingData
+    context_.CollectProfilingData(profiler_->Events());
+  }
   return Status::OK();
 }