Skip to content

Commit 8bf94b3

Browse files
Merge branch 'main' into adrianl/BaseTester_SupportCompileAndKernelPluginEps
2 parents a4f505f + a3749f1 commit 8bf94b3

File tree

95 files changed

+4186
-883
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+4186
-883
lines changed

.vscode/settings.json

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,23 @@
1414
"-build/include_subdir",
1515
"-runtime/references"
1616
],
17-
"C_Cpp.autoAddFileAssociations": false
17+
"C_Cpp.autoAddFileAssociations": false,
18+
19+
// Exclude build directories and non-essential folders from C++ parsing
20+
"C_Cpp.files.exclude": {
21+
"**/build/**": true,
22+
"**/build_*/**": true,
23+
"**/cmake/external/**": true,
24+
"**/node_modules/**": true,
25+
"**/.git/**": true
26+
},
27+
28+
// Exclude from search but keep in explorer
29+
"search.exclude": {
30+
"**/build/**": true,
31+
"**/build_*/**": true,
32+
"**/cmake/external/**": true,
33+
"**/node_modules/**": true,
34+
"**/.git/**": true
35+
}
1836
}

cmake/CMakeLists.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1441,7 +1441,7 @@ get_property(onnxruntime_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_
14411441
if (onnxruntime_USE_CUDA)
14421442
set(CMAKE_CUDA_STANDARD 17)
14431443
if(onnxruntime_CUDA_HOME)
1444-
file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME})
1444+
file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
14451445
endif()
14461446
find_package(CUDAToolkit REQUIRED)
14471447

@@ -1801,8 +1801,11 @@ if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
18011801
)
18021802
endif()
18031803

1804-
if(NOT onnxruntime_BUILD_SHARED_LIB AND onnxruntime_USE_WEBGPU)
1805-
message(WARNING "CMake target files will not be generated for static onnxruntime builds with webgpu support")
1804+
if (NOT onnxruntime_BUILD_SHARED_LIB AND
1805+
(onnxruntime_USE_WEBGPU OR (CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_USE_XNNPACK)))
1806+
message(WARNING
1807+
"CMake target files will not be generated for static onnxruntime builds "
1808+
"with WebGPU or Emscripten+XNNPACK support")
18061809
else()
18071810
# Install
18081811
include(CMakePackageConfigHelpers)

cmake/external/abseil-cpp.cmake

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,8 @@ set(ABSL_USE_EXTERNAL_GOOGLETEST ON)
1212

1313
# Both abseil and xnnpack create a target called memory, which
1414
# results in a duplicate target if ABSL_ENABLE_INSTALL is on.
15-
if (onnxruntime_USE_XNNPACK)
16-
set(ABSL_ENABLE_INSTALL OFF)
17-
else()
18-
if (NOT CMAKE_SYSTEM_NAME MATCHES "AIX")
15+
if (NOT CMAKE_SYSTEM_NAME MATCHES "AIX")
1916
set(ABSL_ENABLE_INSTALL ON)
20-
endif()
2117
endif()
2218

2319
if(Patch_FOUND)

cmake/external/cuda_configuration.cmake

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ macro(setup_cuda_architectures)
8585
# * Always use accelerated (`-a` suffix) target for supported real architectures.
8686
# cmake-format: on
8787

88+
# Allow override via CUDAARCHS environment variable (standard CMake variable)
89+
if(NOT CMAKE_CUDA_ARCHITECTURES AND DEFINED ENV{CUDAARCHS})
90+
set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
91+
endif()
92+
8893
if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
8994
# Detect highest available compute capability
9095
set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
@@ -142,12 +147,12 @@ macro(setup_cuda_architectures)
142147
continue()
143148
endif()
144149

145-
if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
150+
if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
146151
set(CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL ${CUDA_ARCH})
147-
elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?-real$")
148-
list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
149-
elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?$")
152+
elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
150153
list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
154+
elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
155+
list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1}${CMAKE_MATCH_4})
151156
else()
152157
message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
153158
endif()
@@ -159,7 +164,7 @@ macro(setup_cuda_architectures)
159164
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
160165
message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")
161166

162-
set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "120")
167+
set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
163168
foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
164169
if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
165170
add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
@@ -168,10 +173,13 @@ macro(setup_cuda_architectures)
168173
endforeach()
169174

170175
# Enable accelerated features (like WGMMA, TMA and setmaxnreg) for SM >= 90.
171-
set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "120")
176+
set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "110" "120")
172177
unset(CMAKE_CUDA_ARCHITECTURES_NORMALIZED)
173178
foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
174-
if("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
179+
if(CUDA_ARCH MATCHES "^([0-9]+)f$")
180+
# Family code, no -real suffix
181+
list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}")
182+
elseif("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
175183
list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
176184
else()
177185
list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")

cmake/external/onnxruntime_external_deps.cmake

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -764,7 +764,11 @@ if (onnxruntime_USE_WEBGPU)
764764
# - (private) Fix compatibility issues with Safari. Contains the following changes:
765765
# - Polyfill for `device.AdapterInfo` (returns `undefined` in Safari v26.0)
766766
#
767-
${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/safari_polyfill.patch)
767+
${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/safari_polyfill.patch &&
768+
769+
# Remove the test folder to speed up potential file scan operations (70k+ files not needed for build).
770+
# Using <SOURCE_DIR> token ensures the correct absolute path regardless of working directory.
771+
${CMAKE_COMMAND} -E rm -rf <SOURCE_DIR>/test)
768772

769773
onnxruntime_fetchcontent_declare(
770774
dawn

cmake/external/xnnpack.cmake

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
6262
SET(ORT_TARGET_PROCESSOR "arm64")
6363
ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
6464
SET(ORT_TARGET_PROCESSOR "ppc64")
65+
ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
66+
SET(ORT_TARGET_PROCESSOR "wasm")
6567
ELSEIF(NOT ORT_TARGET_PROCESSOR MATCHES "^(x86(_64)?|arm64|riscv(32|64|128)|Hexagon|ppc64)$")
6668
SET(ORT_TARGET_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}")
6769
ELSE()
@@ -90,18 +92,21 @@ onnxruntime_fetchcontent_makeavailable(googlexnnpack)
9092
set(XNNPACK_DIR ${googlexnnpack_SOURCE_DIR})
9193
set(XNNPACK_INCLUDE_DIR ${XNNPACK_DIR}/include)
9294

93-
set(onnxruntime_EXTERNAL_LIBRARIES_XNNPACK XNNPACK xnnpack-microkernels-prod pthreadpool)
95+
set(onnxruntime_EXTERNAL_LIBRARIES_XNNPACK XNNPACK pthreadpool)
9496
if(ORT_TARGET_PROCESSOR MATCHES "^arm64.*" AND NOT CMAKE_C_COMPILER_ID STREQUAL "MSVC")
9597
list(APPEND onnxruntime_EXTERNAL_LIBRARIES_XNNPACK kleidiai)
9698
endif()
99+
if(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
100+
list(APPEND onnxruntime_EXTERNAL_LIBRARIES_XNNPACK xnnpack-microkernels-prod)
101+
endif()
97102

98103
# the XNNPACK CMake setup doesn't include the WASM kernels so we have to manually set those up
99104
if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
100105
# See source lists in _deps/googlexnnpack-src/BUILD.bazel for wasm_prod_microkernels
101106
message("Adding WebAssembly Source Files to XNNPACK")
102107
set(wasm_srcs "")
103108

104-
file(READ "${XNNPACK_DIR}/BUILD.bazel" xnnpack_bazel_config)
109+
file(READ "${XNNPACK_DIR}/build_srcs.bzl" xnnpack_bazel_config)
105110

106111
# Replace newlines with semicolon so that it is treated as a list by CMake
107112
# Also replace '[' and ']' so the bazel source lists don't get parsed as a nested list by cmake
@@ -139,19 +144,26 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
139144
GetSrcListFromBazel("TABLE_SRCS" table_srcs)
140145
list(APPEND wasm_srcs ${operator_srcs} ${table_srcs})
141146

142-
# kernels
143-
list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/scalar.c)
144-
list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasm.c)
147+
set(microkernel_src "")
148+
149+
include(${XNNPACK_DIR}/cmake/gen/scalar_microkernels.cmake)
150+
list(APPEND microkernel_src ${PROD_SCALAR_MICROKERNEL_SRCS})
151+
list(APPEND microkernel_src ${PROD_WASM_MICROKERNEL_SRCS})
145152

146153
if(onnxruntime_ENABLE_WEBASSEMBLY_RELAXED_SIMD)
147-
list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmsimd.c)
148-
list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmrelaxedsimd.c)
154+
include(${XNNPACK_DIR}/cmake/gen/wasmsimd_microkernels.cmake)
155+
include(${XNNPACK_DIR}/cmake/gen/wasmrelaxedsimd_microkernels.cmake)
156+
list(APPEND microkernel_src ${PROD_WASMSIMD_MICROKERNEL_SRCS})
157+
list(APPEND microkernel_src ${PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS})
149158
target_compile_options(XNNPACK PRIVATE "-msimd128")
150159
target_compile_options(XNNPACK PRIVATE "-mrelaxed-simd")
151160
elseif(onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
152-
list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmsimd.c)
161+
include(${XNNPACK_DIR}/cmake/gen/wasmsimd_microkernels.cmake)
162+
list(APPEND microkernel_src ${PROD_WASMSIMD_MICROKERNEL_SRCS})
153163
target_compile_options(XNNPACK PRIVATE "-msimd128")
154164
endif()
165+
list(TRANSFORM microkernel_src PREPEND "${XNNPACK_DIR}/")
166+
list(APPEND wasm_srcs ${microkernel_src})
155167

156168
message(DEBUG "wasm_srcs: ${wasm_srcs}\n")
157169
target_sources(XNNPACK PRIVATE ${wasm_srcs})

cmake/onnxruntime_providers_nv.cmake

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
# Copyright (c) Microsoft Corporation. All rights reserved.
22
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# Licensed under the MIT License.
4-
find_package(CUDAToolkit REQUIRED 12.8)
4+
if(onnxruntime_CUDA_HOME)
5+
file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
6+
endif()
7+
find_package(CUDAToolkit REQUIRED)
58
enable_language(CUDA)
69
if(onnxruntime_DISABLE_CONTRIB_OPS)
710
message( FATAL_ERROR "To compile TensorRT execution provider contrib ops have to be enabled to dump an engine using com.microsoft:EPContext node." )
@@ -146,9 +149,9 @@ endif ()
146149
target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE Eigen3::Eigen onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface Eigen3::Eigen)
147150
add_dependencies(onnxruntime_providers_nv_tensorrt_rtx onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
148151
if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
149-
target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
152+
target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart CUDA::cuda_driver)
150153
else()
151-
target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
154+
target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart CUDA::cuda_driver)
152155
endif()
153156
target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${TENSORRT_RTX_INCLUDE_DIR} ${onnx_tensorrt_SOURCE_DIR}
154157
PUBLIC ${CUDAToolkit_INCLUDE_DIRS})

docs/Versioning.md

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,67 @@ The version number of the current stable release can be found
1111
## Release cadence
1212
See [Release Management](ReleaseManagement.md)
1313

14+
## Updating the Version for a Release
15+
16+
When preparing a release, follow these steps to update the version number across the codebase. This applies both when creating an initial release branch (updating `main`) and when preparing patch releases on release branches:
17+
18+
### Prerequisites
19+
- Node.js (check [js/.nvmrc](../js/.nvmrc) for the required version)
20+
- npm (comes with Node.js)
21+
- Python 3
22+
23+
Verify your setup:
24+
```bash
25+
node --version # Should match the version in js/.nvmrc
26+
npm --version # Should be v8.0 or newer
27+
```
28+
29+
### Steps
30+
31+
1. **Update the VERSION_NUMBER file**
32+
33+
Edit [VERSION_NUMBER](../VERSION_NUMBER) in the repository root to reflect the new version (e.g., `1.23.3`).
34+
35+
2. **Run the version update script**
36+
37+
From the repository root, run:
38+
```bash
39+
python tools/python/update_version.py
40+
```
41+
42+
This script automatically updates version numbers in:
43+
- `docs/Versioning.md` - Adds a new row to the version table
44+
- `docs/python/README.rst` - Adds release notes entry
45+
- `onnxruntime/__init__.py` - Python package version
46+
- `js/` packages - All NPM package versions and lock files
47+
48+
3. **Update the C API static_assert (Manual Step)**
49+
50+
The script does **not** update the version check in the C API. You must manually update the `static_assert` in [onnxruntime/core/session/onnxruntime_c_api.cc](../onnxruntime/core/session/onnxruntime_c_api.cc).
51+
52+
Search for `static_assert(std::string_view(ORT_VERSION)` and update the version string:
53+
```cpp
54+
static_assert(std::string_view(ORT_VERSION) == "X.Y.Z",
55+
"ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly");
56+
```
57+
58+
Replace `X.Y.Z` with your new version number. The comments following this assert explain additional steps if new APIs were added to this release.
59+
60+
4. **Review all changes**
61+
62+
Review all modified files. Verify:
63+
- Version numbers are correct in all updated files
64+
- The release notes URL format is correct (e.g., `https://github.com/Microsoft/onnxruntime/releases/tag/vX.Y.Z`)
65+
66+
5. **Commit and create PR**
67+
68+
Commit all changes and create a PR targeting `main` or a release branch as appropriate.
69+
70+
### Notes
71+
72+
- The version table in this file and the ONNX opset compatibility information on [onnxruntime.ai](https://onnxruntime.ai/docs/reference/compatibility.html#onnx-opset-support) are the canonical sources for version compatibility information.
73+
- For ONNX version/opset/IR reference numbers, see the [ONNX Versioning documentation](https://github.com/onnx/onnx/blob/main/docs/Versioning.md#released-versions).
74+
1475
# Compatibility
1576
1677
## Backwards compatibility

js/web/docs/webnn-operators.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-s
5252
| GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 |
5353
| Greater | ai.onnx(7-8, 9-12, 13+) | greater | |
5454
| GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | |
55-
| GroupQueryAttention | com.microsoft(1+) | add, cast, concat, constant, cumulativeSum, div, expand, lesser, matmul, reshape, scatterND, softmax, transpose, where | Only supports input total_sequence_length is constant and past_sequence_length of past kv equals to present_sequence_length of present kv. Does not support cos_cache and sin_cache inputs |
55+
| GroupQueryAttention | com.microsoft(1+) | add, cast, concat, constant, cumulativeSum, div, expand, lesser, matmul, reshape, scatterND, softmax, transpose, where | Only supports the case where input total_sequence_length is constant and past_sequence_length of past kv equals present_sequence_length of present kv. |
5656
| GRU | ai.onnx(7-13, 14-21, 22+) | gru | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
5757
| HardSigmoid | ai.onnx(7+) | hardSigmoid | |
5858
| HardSwish | ai.onnx(14+) | hardSwish | |

js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ export const parseConvTransposeAttributes = (attributes: Record<string, unknown>
132132
typeof attributes.autoPad == 'undefined' ? 0 : (attributes.autoPad as number)
133133
];
134134
const dilations = attributes.dilations as [number, number];
135-
const group = attributes.group as number;
135+
const group = (attributes.group as number) ?? 1; // default to 1 per ONNX spec
136136
const kernelShape = attributes.kernelShape as [number, number];
137137
const pads = attributes.pads as [number, number, number, number];
138138
const strides = attributes.strides as [number, number];

0 commit comments

Comments
 (0)