Skip to content

Commit 101353c

Browse files
authored
enable WebGPU EP in WebAssembly build (#23913)
### Description This PR is the first step in migrating the WebGPU backend of onnxruntime-web from JSEP-based to WebGPU EP-based. This change enables building the WebGPU EP in a wasm build (i.e., with `--build_wasm` `--use_webgpu` `--use_jsep`). However, the old build flags should keep their previous behavior.
1 parent ccf8fdd commit 101353c

29 files changed

+1245
-395
lines changed

cmake/external/onnxruntime_external_deps.cmake

+23-1
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,29 @@ if (onnxruntime_USE_WEBGPU)
725725
# # if we need to apply patches in the future, we can uncomment the following line.
726726
#
727727
# The dawn.patch contains the following changes:
728-
# - https://dawn-review.googlesource.com/c/dawn/+/225514
728+
#
729+
# - (public) CMake fix to support Emscripten v4.0.3+
730+
# This change allows Dawn to find the file "gen_struct_info.py" in the correct location.
731+
# https://dawn-review.googlesource.com/c/dawn/+/225514
732+
#
733+
# - (public) Fix emwgpu C++ implementation for buffer destroy
734+
# In the native implementation, wgpuBufferRelease triggers the buffer destroy (if the refcount decreases to 0). But
735+
# in the emwgpu implementation, the buffer destroy does not happen. This change fixes the bug.
736+
# https://dawn-review.googlesource.com/c/dawn/+/226315
737+
#
738+
# - (private) Allow "external" buffer in emwgpu C++ implementation
739+
# This change makes WGPUBufferImpl destroy the buffer when the refcount decreases to 0, but only for non-external
740+
# buffers.
741+
# An "external buffer" is a GPUBuffer instance created in JavaScript and imported into C++ by `importJsBuffer`.
742+
#
743+
# - (private) Remove hard-coded CMAKE_OSX_DEPLOYMENT_TARGET in Dawn's CMake files
744+
# https://github.com/microsoft/onnxruntime/pull/23729
745+
#
746+
# - (private) Fix external ref count for "external" device in emwgpu C++ implementation
747+
# This change fixes the incorrect external ref count of class WGPUDeviceImpl when it is used with an "external" device.
748+
# An "external device" is a GPUDevice instance created in JavaScript and imported into C++ by `importJsDevice`.
749+
#
750+
#
729751
PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
730752
EXCLUDE_FROM_ALL
731753
)

cmake/onnxruntime_webassembly.cmake

+28-9
Original file line numberDiff line numberDiff line change
@@ -211,10 +211,14 @@ else()
211211
target_link_libraries(onnxruntime_webassembly PRIVATE tensorboard)
212212
endif()
213213

214+
set(onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre.js")
215+
216+
set(EXPORTED_FUNCTIONS "_malloc,_free")
214217
if (onnxruntime_USE_JSEP)
215-
set(EXPORTED_FUNCTIONS "_malloc,_free,_JsepOutput,_JsepGetNodeName")
216-
else()
217-
set(EXPORTED_FUNCTIONS "_malloc,_free")
218+
string(APPEND EXPORTED_FUNCTIONS ",_JsepOutput,_JsepGetNodeName")
219+
endif()
220+
if (onnxruntime_USE_WEBGPU)
221+
string(APPEND EXPORTED_FUNCTIONS ",_wgpuBufferRelease,_wgpuCreateInstance")
218222
endif()
219223

220224
if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
@@ -312,13 +316,15 @@ else()
312316
target_compile_options(noexcep_operators PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
313317
endif()
314318
target_link_options(onnxruntime_webassembly PRIVATE
315-
--post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js"
319+
"SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js\""
316320
)
321+
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js")
317322
else ()
318323
set(MAXIMUM_MEMORY "4294967296")
319324
target_link_options(onnxruntime_webassembly PRIVATE
320-
--post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js"
325+
"SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js.js\""
321326
)
327+
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js")
322328
endif ()
323329

324330
target_link_options(onnxruntime_webassembly PRIVATE
@@ -372,7 +378,6 @@ jsepDownload:_pp_")
372378
"SHELL:-s SIGNATURE_CONVERSIONS='${SIGNATURE_CONVERSIONS}'"
373379
)
374380
endif ()
375-
set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre.js)
376381

377382
if (onnxruntime_USE_JSEP)
378383
# NOTE: "-s ASYNCIFY=1" is required for JSEP to work with WebGPU
@@ -382,10 +387,8 @@ jsepDownload:_pp_")
382387
target_compile_definitions(onnxruntime_webassembly PRIVATE USE_JSEP=1)
383388
target_link_options(onnxruntime_webassembly PRIVATE
384389
"SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\""
385-
"SHELL:-s ASYNCIFY=1"
386-
"SHELL:-s ASYNCIFY_STACK_SIZE=65536"
387390
)
388-
set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js)
391+
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js")
389392

390393
if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
391394
target_link_options(onnxruntime_webassembly PRIVATE
@@ -397,6 +400,20 @@ jsepDownload:_pp_")
397400

398401
if (onnxruntime_USE_WEBGPU)
399402
target_compile_definitions(onnxruntime_webassembly PRIVATE USE_WEBGPU=1)
403+
target_link_options(onnxruntime_webassembly PRIVATE
404+
"SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js\""
405+
)
406+
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js")
407+
endif()
408+
409+
if (onnxruntime_USE_JSEP OR onnxruntime_USE_WEBGPU OR onnxruntime_USE_WEBNN)
410+
# If any of the above is enabled, Asyncify must be turned on at link time.
411+
target_link_options(onnxruntime_webassembly PRIVATE
412+
"SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-async.js\""
413+
"SHELL:-s ASYNCIFY=1"
414+
"SHELL:-s ASYNCIFY_STACK_SIZE=65536"
415+
)
416+
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-async.js")
400417
endif()
401418

402419
if (onnxruntime_EMSCRIPTEN_SETTINGS)
@@ -458,6 +475,8 @@ jsepDownload:_pp_")
458475
)
459476
endif()
460477

478+
set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS "${onnxruntime_webassembly_script_deps}")
479+
461480
set(target_name_list ort)
462481

463482
if (onnxruntime_ENABLE_TRAINING_APIS)

cmake/patches/dawn/dawn.patch

+112-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ index 6e8ae37593..633af91eef 100644
1818
@@ -77,9 +77,17 @@ if (${DAWN_ENABLE_EMSCRIPTEN})
1919
"${arg_UNPARSED_ARGUMENTS}")
2020
endif()
21-
21+
2222
+ # since Emscripten 4.0.3, file gen_struct_info.py is moved to outside of directory maint.
2323
+ if (EXISTS "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/gen_struct_info.py")
2424
+ set(EM_GEN_STRUCT_INFO_SCRIPT "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/gen_struct_info.py")
@@ -34,3 +34,114 @@ index 6e8ae37593..633af91eef 100644
3434
-q
3535
"${EM_BUILD_GEN_DIR}/struct_info_webgpu.json"
3636
"-I=${EM_BUILD_GEN_DIR}/include"
37+
diff --git a/src/emdawnwebgpu/README.md b/src/emdawnwebgpu/README.md
38+
index efd6491cd6..8ebc5d28b6 100644
39+
--- a/src/emdawnwebgpu/README.md
40+
+++ b/src/emdawnwebgpu/README.md
41+
@@ -56,7 +56,7 @@ Set up the build directory using emcmake
42+
mkdir out/cmake-wasm
43+
cd out/cmake-wasm
44+
45+
-# Make sure the path is to the source checkout of Emscripten, not emsdk's release.
46+
+# If using Emscripten v4.0.2 or lower, make sure the path is to the source checkout of Emscripten, not emsdk's release.
47+
emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../..
48+
49+
ninja
50+
diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp
51+
index f1c5a7d50e..16f2495712 100644
52+
--- a/third_party/emdawnwebgpu/webgpu.cpp
53+
+++ b/third_party/emdawnwebgpu/webgpu.cpp
54+
@@ -131,7 +131,6 @@ class RefCounted : NonMovable {
55+
bool Release() {
56+
if (mRefCount.fetch_sub(1u, std::memory_order_release) == 1u) {
57+
std::atomic_thread_fence(std::memory_order_acquire);
58+
- emwgpuDelete(this);
59+
return true;
60+
}
61+
return false;
62+
@@ -234,6 +233,7 @@ class Ref {
63+
static void Release(T value) {
64+
if (value != nullptr && value->RefCounted::Release()) {
65+
delete value;
66+
+ emwgpuDelete(value);
67+
}
68+
}
69+
70+
@@ -641,7 +641,8 @@ struct WGPUAdapterImpl final : public EventSource, public RefCounted {
71+
struct WGPUBufferImpl final : public EventSource,
72+
public RefCountedWithExternalCount {
73+
public:
74+
- WGPUBufferImpl(const EventSource* source, bool mappedAtCreation);
75+
+ WGPUBufferImpl(const EventSource* source, bool mappedAtCreation, bool isExternal);
76+
+ ~WGPUBufferImpl();
77+
78+
void Destroy();
79+
const void* GetConstMappedRange(size_t offset, size_t size);
80+
@@ -671,6 +672,7 @@ struct WGPUBufferImpl final : public EventSource,
81+
};
82+
MapRequest mPendingMapRequest;
83+
WGPUBufferMapState mMapState;
84+
+ bool mIsExternal;
85+
};
86+
87+
struct WGPUQueueImpl final : public EventSource, public RefCounted {
88+
@@ -1164,11 +1166,15 @@ WGPUAdapter emwgpuCreateAdapter(const EventSource* source) {
89+
90+
WGPUBuffer emwgpuCreateBuffer(const EventSource* source,
91+
bool mappedAtCreation = false) {
92+
- return new WGPUBufferImpl(source, mappedAtCreation);
93+
+ return new WGPUBufferImpl(source, mappedAtCreation, true);
94+
}
95+
96+
WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) {
97+
- return new WGPUDeviceImpl(source, queue);
98+
+ // This function is only called from JS via `importJsDevice()`, which
99+
+ // needs to increment the external ref count to fix the behavior.
100+
+ WGPUDeviceImpl* device = new WGPUDeviceImpl(source, queue);
101+
+ device->AddExternalRef();
102+
+ return device;
103+
}
104+
105+
WGPUQueue emwgpuCreateQueue(const EventSource* source) {
106+
@@ -1275,15 +1281,22 @@ WGPUAdapterImpl::WGPUAdapterImpl(const EventSource* source)
107+
// WGPUBuffer implementations.
108+
// ----------------------------------------------------------------------------
109+
110+
-WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation)
111+
+WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation, bool isExternal)
112+
: EventSource(source),
113+
mMapState(mappedAtCreation ? WGPUBufferMapState_Mapped
114+
- : WGPUBufferMapState_Unmapped) {
115+
+ : WGPUBufferMapState_Unmapped),
116+
+ mIsExternal(isExternal) {
117+
if (mappedAtCreation) {
118+
mPendingMapRequest = {kNullFutureId, WGPUMapMode_Write};
119+
}
120+
}
121+
122+
+WGPUBufferImpl::~WGPUBufferImpl() {
123+
+ if (!mIsExternal) {
124+
+ Destroy();
125+
+ }
126+
+}
127+
+
128+
void WGPUBufferImpl::Destroy() {
129+
emwgpuBufferDestroy(this);
130+
AbortPendingMap("Buffer was destroyed before mapping was resolved.");
131+
@@ -1504,6 +1517,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo(
132+
void wgpu##Name##Release(WGPU##Name o) { \
133+
if (o->Release()) { \
134+
delete o; \
135+
+ emwgpuDelete(o); \
136+
} \
137+
}
138+
WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE)
139+
@@ -1638,7 +1652,7 @@ void wgpuBufferUnmap(WGPUBuffer buffer) {
140+
141+
WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device,
142+
const WGPUBufferDescriptor* descriptor) {
143+
- WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation);
144+
+ WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation, false);
145+
emwgpuDeviceCreateBuffer(device, descriptor, buffer);
146+
return buffer;
147+
}

js/build_webgpu.bat

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
@echo off
2+
3+
rem build_webgpu.bat --- build onnxruntime-web with WebGPU EP
4+
rem
5+
rem Usage:
6+
rem build_webgpu.bat config [clean]
7+
rem
8+
rem Options:
9+
rem config Build configuration, "d" or "r"
10+
rem clean Perform a clean build, "clean" or empty
11+
12+
setlocal enabledelayedexpansion
13+
14+
set ROOT=%~dp0..\
15+
set BUILD_DIR=%ROOT%build_webgpu
16+
17+
:arg1
18+
if ["%~1"]==["d"] (
19+
set CONFIG=Debug
20+
set CONFIG_EXTRA_FLAG=
21+
@rem --enable_wasm_profiling --wasm_run_tests_in_browser
22+
@rem --cmake_extra_defines onnxruntime_ENABLE_WEBASSEMBLY_OUTPUT_OPTIMIZED_MODEL=1
23+
@rem --enable_wasm_debug_info
24+
goto :arg2
25+
)
26+
if ["%~1"]==["r"] (
27+
set CONFIG=Release
28+
set CONFIG_EXTRA_FLAG=
29+
@rem --enable_wasm_api_exception_catching --disable_rtti
30+
goto :arg2
31+
)
32+
echo Invalid configuration "%~1", must be "d"(Debug) or "r"(Release)
33+
exit /b 1
34+
35+
:arg2
36+
if ["%~2"]==["clean"] (
37+
goto :clean
38+
)
39+
if not exist "%ROOT%js\web\dist" (
40+
goto :npm_ci
41+
)
42+
43+
goto :build_wasm
44+
45+
:clean
46+
if exist "%BUILD_DIR%" (
47+
rd /s /q %BUILD_DIR%
48+
)
49+
50+
pushd %ROOT%
51+
git submodule sync --recursive
52+
git submodule update --init --recursive
53+
popd
54+
55+
:npm_ci
56+
pushd %ROOT%js
57+
call npm ci
58+
popd
59+
pushd %ROOT%js\common
60+
call npm ci
61+
popd
62+
pushd %ROOT%js\web
63+
call npm ci
64+
call npm run pull:wasm
65+
popd
66+
67+
:build_wasm
68+
69+
set PATH=C:\Program Files\Git\usr\bin;%PATH%
70+
71+
call %ROOT%build.bat --config %CONFIG% %CONFIG_EXTRA_FLAG% --skip_submodule_sync --build_wasm --target onnxruntime_webassembly --skip_tests^
72+
--enable_wasm_simd --enable_wasm_threads --use_jsep --use_webnn --use_webgpu --build_dir %BUILD_DIR%
73+
74+
IF NOT "%ERRORLEVEL%" == "0" (
75+
exit /b %ERRORLEVEL%
76+
)
77+
78+
copy /Y %BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.wasm %ROOT%js\web\dist\
79+
copy /Y %BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.mjs %ROOT%js\web\dist\

js/web/lib/build-def.d.ts

+7
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@ interface BuildDefinitions {
4040
*/
4141
readonly ENABLE_BUNDLE_WASM_JS: boolean;
4242

43+
/**
44+
* Defines whether to use the WebGPU EP instead of JSEP for the WebGPU backend.
45+
*
46+
* This flag requires the corresponding WebAssembly artifact to be built with the `--use_webgpu` flag.
47+
*/
48+
readonly USE_WEBGPU_EP: boolean;
49+
4350
// #endregion
4451

4552
// #region Build definitions for ESM

0 commit comments

Comments
 (0)