microsoft
diff --git a/‎cmake/onnxruntime_providers_webgpu.cmake‎
Lines changed: 6 additions & 0 deletions b/‎cmake/onnxruntime_providers_webgpu.cmake‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cmake/onnxruntime_unittests.cmake‎
Lines changed: 13 additions & 0 deletions b/‎cmake/onnxruntime_unittests.cmake‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎onnxruntime/core/providers/cpu/tensor/upsamplebase.h‎
Lines changed: 16 additions & 2 deletions b/‎onnxruntime/core/providers/cpu/tensor/upsamplebase.h‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎onnxruntime/core/providers/webgpu/compute_context.h‎
Lines changed: 4 additions & 0 deletions b/‎onnxruntime/core/providers/webgpu/compute_context.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎onnxruntime/core/providers/webgpu/controlflow/if.cc‎
Lines changed: 15 additions & 1 deletion b/‎onnxruntime/core/providers/webgpu/controlflow/if.cc‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎onnxruntime/core/providers/webgpu/controlflow/if.h‎
Lines changed: 14 additions & 1 deletion b/‎onnxruntime/core/providers/webgpu/controlflow/if.h‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎onnxruntime/core/providers/webgpu/data_transfer.cc‎
Lines changed: 27 additions & 14 deletions b/‎onnxruntime/core/providers/webgpu/data_transfer.cc‎
Lines changed: 27 additions & 14 deletions
diff --git a/‎onnxruntime/core/providers/webgpu/data_transfer.h‎
Lines changed: 6 additions & 0 deletions b/‎onnxruntime/core/providers/webgpu/data_transfer.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎onnxruntime/core/providers/webgpu/ep/api.cc‎
Lines changed: 78 additions & 0 deletions b/‎onnxruntime/core/providers/webgpu/ep/api.cc‎
Lines changed: 78 additions & 0 deletions
@@ -122,6 +122,12 @@
     if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
       message(FATAL_ERROR "WebGPU EP shared library build is not supported on Emscripten. Please use static library build.")
     endif()
+
+    # Configure precompiled headers for shared library build
+    # PCH ensures ep/_pch.h is included first and improves compilation speed
+    target_precompile_headers(onnxruntime_providers_webgpu PRIVATE
+      "${REPO_ROOT}/include/onnxruntime/ep/_pch.h"
+    )
   endif()
 
   set_target_properties(onnxruntime_providers_webgpu PROPERTIES CXX_STANDARD_REQUIRED ON)
 
@@ -1038,6 +1038,18 @@ function(onnxruntime_apply_test_target_workarounds target)
   endif()
 endfunction()
 
+# Set environment variables for plugin EP tests when run via CTest.
+#
+# Arguments:
+#   target - name of the CTest test whose ENVIRONMENT property is set.
+#
+# When the WebGPU EP is built as a shared library (onnxruntime_USE_WEBGPU is on and
+# onnxruntime_BUILD_WEBGPU_EP_STATIC_LIB is off), the test receives the variable
+# ORT_UNIT_TEST_MAIN_DYNAMIC_PLUGIN_EP_CONFIG_JSON holding the plugin EP JSON configuration.
+# In every other configuration this function does nothing.
+function(onnxruntime_set_plugin_ep_test_environment target)
+  if(onnxruntime_USE_WEBGPU AND NOT onnxruntime_BUILD_WEBGPU_EP_STATIC_LIB)
+    # Build the library file name from the platform's shared library prefix/suffix instead of
+    # hard-coding the Windows-only ".dll" name.
+    # NOTE(review): assumes the onnxruntime_providers_webgpu target keeps CMake's default output name.
+    set(ORT_PLUGIN_EP_LIBRARY_NAME
+      "${CMAKE_SHARED_LIBRARY_PREFIX}onnxruntime_providers_webgpu${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    set(ORT_PLUGIN_EP_JSON_CONFIG "{\"ep_library_registration_name\": \"WebGPU_PluginEP\", \"ep_library_path\": \"${ORT_PLUGIN_EP_LIBRARY_NAME}\", \"selected_ep_name\": \"WebGpuExecutionProvider\"}")
+    set_tests_properties(${target} PROPERTIES
+      ENVIRONMENT "ORT_UNIT_TEST_MAIN_DYNAMIC_PLUGIN_EP_CONFIG_JSON=${ORT_PLUGIN_EP_JSON_CONFIG}"
+    )
+  # TODO: add for other plugin EPs if needed
+  endif()
+endfunction()
+
 function(onnxruntime_apply_emscripten_test_link_settings target)
   if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
     set_target_properties(${target} PROPERTIES LINK_DEPENDS ${TEST_SRC_DIR}/wasm/onnxruntime_test_adapter.js)
@@ -1239,6 +1251,7 @@ block()
   )
 
   onnxruntime_apply_test_target_workarounds(onnxruntime_provider_test)
+  onnxruntime_set_plugin_ep_test_environment(onnxruntime_provider_test)
 
   # Expose QNN SDK headers to unit tests via an interface target
   if(onnxruntime_USE_QNN)
 
@@ -219,8 +219,22 @@ class UpsampleBase {
     if (scales_input_idx_ > 0) {
       const Tensor* scale;
       bool get_scale = info.TryGetConstantInput(scales_input_idx_, &scale);
-      auto x_shape = node.InputDefs()[0]->Shape();
-      int64_t rank = x_shape ? x_shape->dim_size() : -1;
+      int64_t rank = -1;
+      if constexpr (std::is_same_v<KernelInfoType, onnxruntime::OpKernelInfo>) {
+        auto x_shape = node.InputDefs()[0]->Shape();
+        if (x_shape != nullptr) {
+          rank = x_shape->dim_size();
+        }
+      } else {
+        int is_const;
+        auto tensor = info.GetKernelInfo().GetTensorConstantInput(0, &is_const);
+        if (is_const) {
+          auto type_and_shape_info = tensor.GetTensorTypeAndShapeInfo();
+          if (type_and_shape_info.HasShape()) {
+            rank = static_cast<int64_t>(type_and_shape_info.GetShape().size());
+          }
+        }
+      }
       if (get_scale && scale->Shape().Size() > 0 && ((opset < 18) || (rank > 0 && opset >= 18))) {
         ORT_THROW_IF_ERROR(ParseScalesData(scale, scales_, rank));
         scales_cached_ = true;
 
@@ -100,7 +100,11 @@ class ComputeContextBase {
   // Get the logger.
   //
   inline const logging::Logger& Logger() const {
+    // The static-library build dereferences the pointer returned by GetLogger();
+    // every other build returns the result of GetEpLogger() directly.
+#if defined(BUILD_WEBGPU_EP_STATIC_LIB)
     return *ep_.GetLogger();
+#else
+    return ep_.GetEpLogger();
+#endif
   }
 
   //
 
@@ -3,6 +3,10 @@
 
 #include "core/providers/webgpu/controlflow/if.h"
 
+#if !defined(BUILD_WEBGPU_EP_STATIC_LIB)
+#include "core/framework/error_code_helper.h"
+#endif
+
 using namespace ONNX_NAMESPACE;
 using namespace onnxruntime::common;
 
@@ -68,10 +72,20 @@ ONNX_OPERATOR_KERNEL_EX(If,
                             .TypeConstraint("V", DataTypeImpl::AllFixedSizeTensorTypes()),
                         If);
 
+#if defined(BUILD_WEBGPU_EP_STATIC_LIB)
+// Static-library build: If is computed by the base CPU implementation.
 Status If::Compute(OpKernelContext* ctx) const {
   // call the base CPU version.
   return onnxruntime::If::Compute(ctx);
 }
+#else
+// Creates the kernel implementation for If by calling CreateIfKernel on the EP API and
+// converting the returned status with ToStatusAndRelease.
+Status If::CreateControlFlowKernelImpl(const OrtKernelInfo* info, OrtKernelImpl** impl) {
+  return ToStatusAndRelease(ep::Api().ep.CreateIfKernel(info, impl));
+}
+
+// Always returns an EP_FAIL status: in this build the If operator is not computed by this kernel.
+// The parameter is intentionally left unnamed because it is unused; naming it would raise
+// unused-parameter warnings.
+Status If::Compute(OpKernelContext* /*ctx*/) const {
+  return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "If operator should be handled by ORT core.");
+}
+#endif
 
 }  // namespace webgpu
-}  // namespace onnxruntime
+}  // namespace onnxruntime
@@ -10,6 +10,8 @@
 namespace onnxruntime {
 namespace webgpu {
 
+#if defined(BUILD_WEBGPU_EP_STATIC_LIB)
+
 // Use the CPU implementation for the logic
 class If final : public onnxruntime::If {
  public:
@@ -18,5 +20,16 @@ class If final : public onnxruntime::If {
   Status Compute(OpKernelContext* ctx) const override;
 };
 
+#else
+
+// Variant of the WebGPU If kernel that is compiled when BUILD_WEBGPU_EP_STATIC_LIB is not defined.
+// It derives directly from OpKernel and additionally overrides CreateControlFlowKernelImpl.
+class If final : public OpKernel {
+ public:
+  // Forwards the kernel info to the OpKernel base class.
+  If(const OpKernelInfo& info) : OpKernel(info) {}
+
+  // Produces the control flow kernel implementation for this node through the `impl` output argument.
+  Status CreateControlFlowKernelImpl(const OrtKernelInfo* info, OrtKernelImpl** impl) override;
+  // OpKernel compute entry point; see if.cc for the behavior in this build.
+  Status Compute(OpKernelContext* ctx) const override;
+};
+#endif
+
 }  // namespace webgpu
-}  // namespace onnxruntime
+}  // namespace onnxruntime
@@ -13,32 +13,45 @@ bool DataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_dev
          (dst_device.Type() == OrtDevice::CPU && src_device.Type() == OrtDevice::GPU);
 }
 
-common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
-  size_t bytes = src.SizeInBytes();
+// Copies `bytes` bytes from src_data to dst_data using the buffer manager.
+//
+// src_is_gpu / dst_is_gpu state whether the corresponding pointer is a WGPUBuffer handle
+// (true) or CPU memory (false). Supported directions are GPU to GPU, CPU to GPU and GPU to CPU.
+// Returns OK without doing anything when `bytes` is zero, and an INVALID_ARGUMENT status when
+// both sides are CPU memory.
+common::Status DataTransfer::CopyTensorImpl(void const* src_data,
+                                            bool src_is_gpu,
+                                            void* dst_data,
+                                            bool dst_is_gpu,
+                                            size_t bytes) const {
   if (bytes > 0) {
-    void const* src_data = src.DataRaw();
-    void* dst_data = dst.MutableDataRaw();
-
-    auto& src_device = src.Location().device;
-    auto& dst_device = dst.Location().device;
-
-    if (dst_device.Type() == OrtDevice::GPU) {
-      if (src_device.Type() == OrtDevice::GPU) {
+    if (dst_is_gpu) {
+      if (src_is_gpu) {
         // copy from GPU to GPU
         buffer_manager_.MemCpy(static_cast<WGPUBuffer>(const_cast<void*>(src_data)),
-                               static_cast<WGPUBuffer>(dst_data), bytes);
+                               static_cast<WGPUBuffer>(dst_data),
+                               bytes);
       } else {
         // copy from CPU to GPU
-        buffer_manager_.Upload(const_cast<void*>(src_data), static_cast<WGPUBuffer>(dst_data), bytes);
+        buffer_manager_.Upload(const_cast<void*>(src_data),
+                               static_cast<WGPUBuffer>(dst_data),
+                               bytes);
       }
-    } else /* if (src_device.Type() == OrtDevice::GPU) */ {
+    } else if (src_is_gpu) {
       // copy from GPU to CPU
-      buffer_manager_.Download(static_cast<WGPUBuffer>(const_cast<void*>(src_data)), dst_data, bytes);
+      buffer_manager_.Download(static_cast<WGPUBuffer>(const_cast<void*>(src_data)),
+                               dst_data,
+                               bytes);
+    } else {
+      // Neither side is a GPU buffer: a host pointer must never be reinterpreted as a WGPUBuffer.
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "WebGPU data transfer does not support CPU to CPU copies.");
     }
   }
 
   return Status::OK();
 }
 
+// Copies the contents of `src` into `dst` by handing the raw data pointers, a GPU flag for each
+// side and the byte size of `src` to CopyTensorImpl.
+common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
+  // Classify each side by the device type recorded in the tensor's location.
+  const bool src_on_gpu = src.Location().device.Type() == OrtDevice::GPU;
+  const bool dst_on_gpu = dst.Location().device.Type() == OrtDevice::GPU;
+
+  return CopyTensorImpl(src.DataRaw(), src_on_gpu, dst.MutableDataRaw(), dst_on_gpu, src.SizeInBytes());
+}
+
 }  // namespace webgpu
 }  // namespace onnxruntime
@@ -20,6 +20,12 @@ class DataTransfer : public IDataTransfer {
 
   common::Status CopyTensor(const Tensor& src, Tensor& dst) const override;
 
+  // Copies `bytes` bytes from `src_data` to `dst_data`.
+  // `src_is_gpu` / `dst_is_gpu` tell whether the matching pointer refers to a GPU buffer or to
+  // CPU memory. Supported directions are GPU to GPU, CPU to GPU and GPU to CPU.
+  // A byte count of zero performs no copy and returns OK.
+  common::Status CopyTensorImpl(void const* src_data,
+                                bool src_is_gpu,
+                                void* dst_data,
+                                bool dst_is_gpu,
+                                size_t bytes) const;
+
  private:
   const BufferManager& buffer_manager_;
 };
 
@@ -0,0 +1,78 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#define ORT_API_MANUAL_INIT
+#include "onnxruntime_cxx_api.h"
+#undef ORT_API_MANUAL_INIT
+
+#include <memory>
+
+#include "core/providers/webgpu/ep/factory.h"
+
+// To make symbols visible on macOS/iOS
+#ifdef __APPLE__
+#define EXPORT_SYMBOL __attribute__((visibility("default")))
+#else
+#define EXPORT_SYMBOL
+#endif
+
+namespace onnxruntime {
+namespace webgpu {
+void CleanupWebGpuContexts();
+void CleanupKernelRegistries();
+}  // namespace webgpu
+}  // namespace onnxruntime
+
+namespace google {
+namespace protobuf {
+void ShutdownProtobufLibrary();
+}  // namespace protobuf
+}  // namespace google
+
+extern "C" {
+//
+// Public symbols
+//
+// Library entry point that creates the WebGPU EP factory.
+//
+// Initializes the C++ API from `ort_api_base`, creates the global default logger from
+// `default_logger`, then stores one newly allocated factory in `factories[0]` and sets
+// `*num_factories` to 1. Ownership of the factory passes to the caller.
+//
+// Returns nullptr on success, or an ORT_INVALID_ARGUMENT status when an output pointer is null
+// or `max_factories` is smaller than 1.
+EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const OrtApiBase* ort_api_base,
+                                           const OrtLogger* default_logger,
+                                           OrtEpFactory** factories, size_t max_factories, size_t* num_factories) {
+  // The factory is constructed without arguments, so the registration name is not consumed here.
+  // The cast keeps the parameter name while avoiding an unused-parameter warning.
+  static_cast<void>(registration_name);
+
+  // Manual init for the C++ API
+  onnxruntime::ep::ApiInit(ort_api_base);
+
+  // Validate the output arguments before anything is written through them.
+  if (factories == nullptr || num_factories == nullptr) {
+    return onnxruntime::ep::Api().ort.CreateStatus(ORT_INVALID_ARGUMENT,
+                                                   "Output arguments 'factories' and 'num_factories' must not be null.");
+  }
+
+  if (max_factories < 1) {
+    return onnxruntime::ep::Api().ort.CreateStatus(ORT_INVALID_ARGUMENT,
+                                                   "Not enough space to return EP factory. Need at least one.");
+  }
+
+  // Initialize the global default logger
+  ::onnxruntime::ep::adapter::Logger::CreateDefaultLogger(default_logger);
+
+  // Factory could use registration_name or define its own EP name.
+  std::unique_ptr<OrtEpFactory> factory = std::make_unique<onnxruntime::webgpu::ep::Factory>();
+
+  factories[0] = factory.release();
+  *num_factories = 1;
+
+  return nullptr;
+}
+
+// Library entry point that destroys `factory` and then tears down library-wide state: cached
+// kernel registries, WebGPU contexts, the default logger wrapper and the protobuf library.
+// Always returns nullptr.
+// NOTE(review): `factory` is cast to webgpu::ep::Factory, so it is presumably expected to be the
+// object created by CreateEpFactories; the teardown is global, so it presumably also expects
+// that no other factory from this library is still in use. Confirm against the caller.
+EXPORT_SYMBOL OrtStatus* ReleaseEpFactory(OrtEpFactory* factory) {
+  // STEP.1 - Release the factory
+  delete static_cast<onnxruntime::webgpu::ep::Factory*>(factory);
+
+  // STEP.2 - Clean up cached kernel registries
+  onnxruntime::webgpu::CleanupKernelRegistries();
+
+  // STEP.3 - Clean up WebGPU contexts
+  onnxruntime::webgpu::CleanupWebGpuContexts();
+
+  // STEP.4 - Destroy the global default logger wrapper
+  ::onnxruntime::ep::adapter::Logger::DestroyDefaultLogger();
+
+  // STEP.5 - Shutdown protobuf library
+  google::protobuf::ShutdownProtobufLibrary();
+
+  return nullptr;
+}
+
+}  // extern "C"