bytedeco
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/onnxruntime/README.md b/onnxruntime/README.md
Lines changed: 3 additions & 3 deletions
diff --git a/onnxruntime/cppbuild.sh b/onnxruntime/cppbuild.sh
Lines changed: 13 additions & 6 deletions
diff --git a/onnxruntime/onnxruntime-cuda13.patch b/onnxruntime/onnxruntime-cuda13.patch
Lines changed: 96 additions & 0 deletions
diff --git a/onnxruntime/platform/gpu/pom.xml b/onnxruntime/platform/gpu/pom.xml
Lines changed: 1 addition & 1 deletion
diff --git a/onnxruntime/platform/pom.xml b/onnxruntime/platform/pom.xml
Lines changed: 1 addition & 1 deletion
diff --git a/onnxruntime/pom.xml b/onnxruntime/pom.xml
Lines changed: 1 addition & 1 deletion
diff --git a/onnxruntime/samples/pom.xml b/onnxruntime/samples/pom.xml
Lines changed: 2 additions & 2 deletions
diff --git a/onnxruntime/src/gen/java/org/bytedeco/onnxruntime/AllocatorImpl.java b/onnxruntime/src/gen/java/org/bytedeco/onnxruntime/AllocatorImpl.java
Lines changed: 7 additions & 0 deletions
@@ -4,7 +4,7 @@
  * Add new `SampleOnnxMNIST` in samples for TensorRT ([pull #1742](https://github.com/bytedeco/javacpp-presets/pull/1742))
  * Fix loading issues with `libomp.dylib` and `libiomp5.dylib` for DNNL and PyTorch on Mac
  * Include `model_package_loader.h` header file in presets for PyTorch ([issue #1729](https://github.com/bytedeco/javacpp-presets/issues/1729))
- * Upgrade presets for FFmpeg 8.1, OpenBLAS 0.3.32, CUDA 13.2.1, cuDNN 9.21.1.3, NCCL 2.30.4, nvCOMP 5.2.0.10, CPython 3.14.4, NumPy 2.4.4, SciPy 1.17.1, LLVM 22.1.1, PyTorch 2.11.0, TensorFlow Lite 2.21.0, TensorRT 10.16.1.11, Triton Inference Server 2.68.0, ONNX 1.21.0, ONNX Runtime 1.24.4 ([pull #1750](https://github.com/bytedeco/javacpp-presets/pull/1750)), and their dependencies
+ * Upgrade presets for FFmpeg 8.1, OpenBLAS 0.3.32, CUDA 13.2.1, cuDNN 9.21.1.3, NCCL 2.30.4, nvCOMP 5.2.0.10, CPython 3.14.4, NumPy 2.4.4, SciPy 1.17.1, LLVM 22.1.1, PyTorch 2.11.0, TensorFlow Lite 2.21.0, TensorRT 10.16.1.11, Triton Inference Server 2.68.0, ONNX 1.21.0, ONNX Runtime 1.25.1 ([pull #1753](https://github.com/bytedeco/javacpp-presets/pull/1753)), and their dependencies
  * Compile classes with `parameters` bumping minimum requirements to Java SE 8 and Android 7.0 ([issue #1739](https://github.com/bytedeco/javacpp-presets/issues/1739))
 
 ### February 22, 2026 version 1.5.13
 
@@ -234,7 +234,7 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip
  * DepthAI 2.24.x  https://github.com/luxonis/depthai-core
  * ONNX 1.20.x  https://github.com/onnx/onnx
  * nGraph 0.26.0  https://github.com/NervanaSystems/ngraph
- * ONNX Runtime 1.24.x  https://github.com/microsoft/onnxruntime
+ * ONNX Runtime 1.25.x  https://github.com/microsoft/onnxruntime
  * TVM 0.18.x  https://github.com/apache/tvm
  * Bullet Physics SDK 3.25  https://pybullet.org
  * LiquidFun  http://google.github.io/liquidfun/
 
@@ -9,7 +9,7 @@ Introduction
 ------------
 This directory contains the JavaCPP Presets module for:
 
- * ONNX Runtime 1.24.4  https://microsoft.github.io/onnxruntime/
+ * ONNX Runtime 1.25.1  https://microsoft.github.io/onnxruntime/
 
 Please refer to the parent README.md file for more detailed information about the JavaCPP Presets.
 
@@ -46,14 +46,14 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic
         <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>onnxruntime-platform</artifactId>
-            <version>1.24.4-1.5.14-SNAPSHOT</version>
+            <version>1.25.1-1.5.14-SNAPSHOT</version>
         </dependency>
 
         <!-- Additional dependencies required to use CUDA and cuDNN -->
         <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>onnxruntime-platform-gpu</artifactId>
-            <version>1.24.4-1.5.14-SNAPSHOT</version>
+            <version>1.25.1-1.5.14-SNAPSHOT</version>
         </dependency>
 
         <!-- Additional dependencies to use bundled CUDA and cuDNN -->
 
@@ -12,6 +12,7 @@ export DNNL_FLAGS="--use_dnnl"
 export CMAKE_ARGS=
 export COREML_FLAGS=
 export OPENMP_FLAGS= # "--use_openmp"
+export TRAINING_FLAGS= # --enable_training_apis --enable_training_ops
 export CUDAFLAGS="-v"
 export CUDACXX="/usr/local/cuda/bin/nvcc"
 export CUDA_HOME="/usr/local/cuda"
@@ -25,7 +26,7 @@ if [[ "$EXTENSION" == *gpu ]]; then
     GPU_FLAGS="--use_cuda"
 fi
 
-ONNXRUNTIME=1.24.4
+ONNXRUNTIME=1.25.1
 
 mkdir -p "$PLATFORM$EXTENSION"
 cd "$PLATFORM$EXTENSION"
@@ -69,7 +70,7 @@ case $PLATFORM in
         ;;
 esac
 
-patch -Np1 < ../../../onnxruntime-cuda13.patch
+patch -Np1 < ../../../onnxruntime-cuda13.patch || true
 
 #if [[ -n "$ARCH_FLAGS" ]]; then
 #    # build host version of protoc
@@ -107,6 +108,10 @@ sedinplace 's/Darwin|iOS/iOS/g' cmake/onnxruntime_providers_cpu.cmake cmake/onnx
 sedinplace 's/-fvisibility=hidden//g' cmake/CMakeLists.txt cmake/adjust_global_compile_flags.cmake cmake/onnxruntime_providers_cpu.cmake cmake/onnxruntime_providers.cmake
 sedinplace 's:/Yucuda_pch.h /FIcuda_pch.h::g' cmake/onnxruntime_providers_cuda.cmake cmake/onnxruntime_providers.cmake
 sedinplace 's/${PROJECT_SOURCE_DIR}\/external\/cub//g' cmake/onnxruntime_providers_cuda.cmake cmake/onnxruntime_providers.cmake
+sedinplace 's/-Xcompiler \/Zc:__cplusplus/-Xcompiler \/Zc:__cplusplus -Xcompiler \/Zc:preprocessor/g' cmake/onnxruntime_providers_cuda.cmake cmake/onnxruntime_providers_cuda_plugin.cmake
+sedinplace '/CXX>:\/permissive/a\
+      "$<$<COMPILE_LANGUAGE:CXX>:/Zc:preprocessor>"
+' cmake/onnxruntime_providers_cuda.cmake cmake/onnxruntime_providers_cuda_plugin.cmake
 sedinplace 's/ONNXRUNTIME_PROVIDERS_SHARED)/ONNXRUNTIME_PROVIDERS_SHARED onnxruntime_providers_shared)/g' cmake/onnxruntime_providers_cpu.cmake cmake/onnxruntime_providers.cmake
 sedinplace 's/DNNL_TAG v.*)/DNNL_TAG v3.11)/g' cmake/external/dnnl.cmake
 sedinplace 's/DNNL_SHARED_LIB libdnnl.1.dylib/DNNL_SHARED_LIB libdnnl.2.dylib/g' cmake/external/dnnl.cmake
@@ -132,7 +137,7 @@ sedinplace '/cvtfp16Avx/d' cmake/onnxruntime_mlas.cmake
 sedinplace 's/MlasCastF16ToF32KernelAvx;/MlasCastF16ToF32KernelAvx2;/g' onnxruntime/core/mlas/lib/platform.cpp
 
 # compile for all CUDA archs instead of using PTX to reduce load time
-sedinplace 's/"60;70;75;80;86;89;90;100;120"/"75;80;90;100;120"/g' cmake/external/cuda_configuration.cmake
+sedinplace 's/75;80;86;89;90;100;120/75;80;90;100;120/g' cmake/external/cuda_configuration.cmake
 sedinplace 's/"all"/"50-real;60-real;70-real;80-real;90-real;100-real;120-real"/g' cmake/CMakeLists.txt
 sedinplace 's/-gencode=arch=compute_52,code=sm_52/-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90/g' cmake/CMakeLists.txt
 sedinplace '/-gencode=arch=compute_..,code=sm_../d' cmake/CMakeLists.txt
@@ -187,10 +192,12 @@ sedinplace 's/devicePtrs = allocarray/devicePtrs = (const OrtEpDevice**)allocarr
 sedinplace 's/UTFChars(javaNameStrings/UTFChars((jstring)javaNameStrings/g' java/src/main/native/ai_onnxruntime_OrtSession_SessionOptions.cpp
 sedinplace 's/initializers = allocarray/initializers = (const OrtValue**)allocarray/g' java/src/main/native/ai_onnxruntime_OrtSession_SessionOptions.cpp
 
+sedinplace 's/SoftMaxComputeHelper<T, TOut, true>(ctx->GetComputeStream()/SoftMaxComputeHelper<T, TOut, true>((CUstream_st*)ctx->GetComputeStream()->GetHandle()/g' orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc
+sedinplace 's/SoftMaxComputeHelper<T, T, true>(ctx->GetComputeStream()/SoftMaxComputeHelper<T, T, true>((CUstream_st*)ctx->GetComputeStream()->GetHandle()/g' orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc
+sedinplace 's/PrepareCompute<TIndex>(context->GetComputeStream()/PrepareCompute<TIndex>(context->GetComputeStream()->GetHandle(), (CUstream_st*)context->GetComputeStream()->GetHandle()/g' orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc
+
 which ctest3 &> /dev/null && CTEST="ctest3" || CTEST="ctest"
-for i in {1..2}; do
-  "$PYTHON_BIN_PATH" tools/ci_build/build.py --build_dir ../build --config Release --parallel $MAKEJ --enable_training_apis --enable_training_ops --cmake_path "$CMAKE" --ctest_path "$CTEST" --build_shared_lib $ARCH_FLAGS $DNNL_FLAGS $COREML_FLAGS $OPENMP_FLAGS $GPU_FLAGS || sedinplace 's/5ea4d05e62d7f954a46b3213f9b2535bdd866803/51982be81bbe52572b54180454df11a3ece9a934/g' cmake/deps.txt
-done
+"$PYTHON_BIN_PATH" tools/ci_build/build.py --build_dir ../build --config Release --parallel $MAKEJ --cmake_path "$CMAKE" --ctest_path "$CTEST" --build_shared_lib $ARCH_FLAGS $DNNL_FLAGS $COREML_FLAGS $OPENMP_FLAGS $TRAINING_FLAGS $GPU_FLAGS
 
 # install headers and libraries in standard directories
 cp -r include/* ../include
 
@@ -1,3 +1,99 @@
+From 712fbe0f6e491a2edd7388f99ea4124f25cda774 Mon Sep 17 00:00:00 2001
+From: "M. Chornyi" <99709299+mc-nv@users.noreply.github.com>
+Date: Fri, 1 May 2026 21:48:54 +0000
+Subject: [PATCH] Fix CUDA 13.2 (CUB 3.2.0) build failure: invalid C++ in
+ device_transform.cuh
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+CUB 3.2.0 ships device_transform.cuh with an invalid template specialisation:
+  struct ::cuda::proclaims_copyable_arguments<...> : ::cuda::std::true_type {};
+A globally-qualified class name in a specialisation is rejected by the compiler
+under -std=c++20.  device_copy.cuh transitively pulls device_transform.cuh in
+via dispatch_copy_mdspan.cuh, so it fails for the same reason.
+
+Fix: two shadow stubs under onnxruntime/cub/device/, resolved first via -I
+ahead of the -isystem CUDA toolkit path.
+
+  device_transform.cuh — re-emits the parts Thrust uses internally
+    (cub::detail::__return_constant and the proclaims_copyable_arguments
+    specialisation) with the specialisation written inside the cuda namespace
+    so the class name is unqualified. cub::DeviceTransform is omitted.
+
+  device_copy.cuh — empty stub. ORT does not use cub::DeviceCopy.
+
+cub.cuh is unchanged.
+---
+ onnxruntime/cub/device/device_copy.cuh      |  9 +++++
+ onnxruntime/cub/device/device_transform.cuh | 42 +++++++++++++++++++++
+ 2 files changed, 51 insertions(+)
+ create mode 100644 onnxruntime/cub/device/device_copy.cuh
+ create mode 100644 onnxruntime/cub/device/device_transform.cuh
+
+diff --git a/onnxruntime/cub/device/device_copy.cuh b/onnxruntime/cub/device/device_copy.cuh
+new file mode 100644
+index 0000000000000..14e9f1772a3ef
+--- /dev/null
++++ b/onnxruntime/cub/device/device_copy.cuh
+@@ -0,0 +1,9 @@
++// Copyright (c) Microsoft Corporation. All rights reserved.
++// Licensed under the MIT License.
++
++// Shadow stub for <cub/device/device_copy.cuh>. The real header transitively
++// includes dispatch_copy_mdspan.cuh, which references cub::DeviceTransform — a
++// type our device_transform.cuh stub intentionally omits. ORT does not use
++// cub::DeviceCopy, so this empty stub is sufficient.
++
++#pragma once
+diff --git a/onnxruntime/cub/device/device_transform.cuh b/onnxruntime/cub/device/device_transform.cuh
+new file mode 100644
+index 0000000000000..378bd8f0b5be8
+--- /dev/null
++++ b/onnxruntime/cub/device/device_transform.cuh
+@@ -0,0 +1,42 @@
++// Copyright (c) Microsoft Corporation. All rights reserved.
++// Licensed under the MIT License.
++
++// Shadow stub for <cub/device/device_transform.cuh>. Resolved first via -I,
++// ahead of the -isystem CUDA toolkit path.
++//
++// CUB 3.2.0 (CUDA 13.2) ships an invalid template specialisation:
++//   struct ::cuda::proclaims_copyable_arguments<...> : ::cuda::std::true_type {};
++// A globally-qualified class name in a specialisation is rejected by the compiler.
++// We re-emit the parts Thrust needs internally with the fixed syntax (the
++// specialisation is written inside the cuda namespace so the name is unqualified).
++// cub::DeviceTransform itself is not used by ORT and is intentionally omitted.
++
++#pragma once
++
++#include <cub/version.cuh>
++
++#if CUB_VERSION >= 300200
++
++#include <cub/device/dispatch/dispatch_transform.cuh>  // cub::detail::transform::dispatch_t (Thrust)
++#include <cuda/__functional/address_stability.h>       // cuda::proclaims_copyable_arguments primary
++
++CUB_NAMESPACE_BEGIN
++namespace detail
++{
++template <typename T>
++struct __return_constant
++{
++  T value;
++  template <typename... Args>
++  _CCCL_HOST_DEVICE T operator()(Args&&...) const { return value; }
++};
++} // namespace detail
++CUB_NAMESPACE_END
++
++_CCCL_BEGIN_NAMESPACE_CUDA
++template <typename T>
++struct proclaims_copyable_arguments<CUB_NS_QUALIFIER::detail::__return_constant<T>>
++    : ::cuda::std::true_type {};
++_CCCL_END_NAMESPACE_CUDA
++
++#endif  // CUB_VERSION >= 300200
 diff --git a/orttraining/orttraining/training_ops/cuda/reduction/all_impl.cu b/orttraining/orttraining/training_ops/cuda/reduction/all_impl.cu
 index 638c7d6637..73063765d7 100644
 --- a/orttraining/orttraining/training_ops/cuda/reduction/all_impl.cu
 
@@ -12,7 +12,7 @@
 
   <groupId>org.bytedeco</groupId>
   <artifactId>onnxruntime-platform-gpu</artifactId>
-  <version>1.24.4-${project.parent.version}</version>
+  <version>1.25.1-${project.parent.version}</version>
   <name>JavaCPP Presets Platform GPU for ONNX Runtime</name>
 
   <properties>
 
@@ -12,7 +12,7 @@
 
   <groupId>org.bytedeco</groupId>
   <artifactId>onnxruntime-platform</artifactId>
-  <version>1.24.4-${project.parent.version}</version>
+  <version>1.25.1-${project.parent.version}</version>
   <name>JavaCPP Presets Platform for ONNX Runtime</name>
 
   <properties>
 
@@ -11,7 +11,7 @@
 
   <groupId>org.bytedeco</groupId>
   <artifactId>onnxruntime</artifactId>
-  <version>1.24.4-${project.parent.version}</version>
+  <version>1.25.1-${project.parent.version}</version>
   <name>JavaCPP Presets for ONNX Runtime</name>
 
   <properties>
 
@@ -12,14 +12,14 @@
         <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>onnxruntime-platform</artifactId>
-            <version>1.24.4-1.5.14-SNAPSHOT</version>
+            <version>1.25.1-1.5.14-SNAPSHOT</version>
         </dependency>
 
         <!-- Additional dependencies required to use CUDA and cuDNN -->
         <dependency>
             <groupId>org.bytedeco</groupId>
             <artifactId>onnxruntime-platform-gpu</artifactId>
-            <version>1.24.4-1.5.14-SNAPSHOT</version>
+            <version>1.25.1-1.5.14-SNAPSHOT</version>
         </dependency>
 
         <!-- Additional dependencies to use bundled CUDA and cuDNN -->
 
@@ -34,6 +34,7 @@ public class AllocatorImpl extends BaseAllocator {
 
 
   public native Pointer Alloc(@Cast("size_t") long size);
+  public native Pointer Reserve(@Cast("size_t") long size);
   public native @ByVal MemoryAllocation GetAllocation(@Cast("size_t") long size);
   public native void Free(Pointer p);
   public native @ByVal @Cast("Ort::ConstMemoryInfo*") MemoryInfoImpl GetInfo();
@@ -43,4 +44,10 @@ public class AllocatorImpl extends BaseAllocator {
    * @return A pointer to a KeyValuePairs object that will be filled with the allocator statistics.
    */
   public native @ByVal KeyValuePairs GetStats();
+
+  /** \brief Release unused memory held by the allocator.
+   *
+   * Calls the optional Shrink function pointer if available; does nothing otherwise.
+   */
+  public native void Shrink();
 }