17,624 changes: 17,624 additions & 0 deletions 187.txt

Large diffs are not rendered by default.

Empty file added 2.0
Empty file.
22,163 changes: 22,163 additions & 0 deletions 203.txt

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions build_cpu.cmd
@@ -0,0 +1,8 @@
IF "%VCToolsInstallDir%"=="" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"

build.bat --cmake_generator "Visual Studio 17 2022" --config Release --build_dir build\cpu --build_wheel ^
--parallel 4 --nvcc_threads 1 --build_shared_lib ^
--skip_tests ^
--use_binskim_compliant_compile_flags ^
--cmake_extra_defines "onnxruntime_BUILD_UNIT_TESTS=OFF" ^
--cmake_extra_defines "FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER"
11 changes: 11 additions & 0 deletions build_cu118.cmd
@@ -0,0 +1,11 @@
IF "VCToolsInstallDir"=="" call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars64.bat"

build.bat --cmake_generator "Visual Studio 16 2019" --config Release --build_dir build --build_wheel ^
--parallel 4 --nvcc_threads 1 --build_shared_lib ^
--use_cuda --cuda_version "11.8" --cuda_home "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8" ^
--cudnn_home "D:\cudnn\8.9.7.29_cuda11" ^
--cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=61;86" ^
--use_binskim_compliant_compile_flags ^
--cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF --skip_tests

REM onnxruntime_ENABLE_NVTX_PROFILE=ON --enable_cuda_line_info
17 changes: 17 additions & 0 deletions build_cu128.cmd
@@ -0,0 +1,17 @@
IF "%VCToolsInstallDir%"=="" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"

build.bat --cmake_generator "Visual Studio 17 2022" --config Release --build_dir build\cuda128 --build_wheel ^
--parallel 4 --nvcc_threads 1 --build_shared_lib ^
--use_cuda --cuda_version "12.8" --cuda_home "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8" ^
--cudnn_home "D:\cudnn\9.13.0.50_cuda12" ^
--build_nuget ^
--skip_tests ^
--use_binskim_compliant_compile_flags ^
--cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;80;86;89;90;120" ^
--cmake_extra_defines "onnxruntime_USE_FPA_INTB_GEMM=ON" ^
--cmake_extra_defines "onnxruntime_BUILD_UNIT_TESTS=OFF" ^
--cmake_extra_defines "FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER"

REM --use_vcpkg

REM onnxruntime_ENABLE_NVTX_PROFILE=ON --enable_cuda_line_info
16 changes: 16 additions & 0 deletions build_cu129.cmd
@@ -0,0 +1,16 @@
IF "%VCToolsInstallDir%"=="" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"

build.bat --cmake_generator "Visual Studio 17 2022" --config Release --build_dir build\cuda129 --build_wheel ^
--parallel 4 --nvcc_threads 1 --build_shared_lib ^
--use_cuda --cuda_version "12.9" --cuda_home "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" ^
--cudnn_home "D:\cudnn\9.13.0.50_cuda12" ^
--build_nuget ^
--skip_tests ^
--use_binskim_compliant_compile_flags ^
--cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=120" ^
--cmake_extra_defines "onnxruntime_USE_FPA_INTB_GEMM=ON" ^
--cmake_extra_defines "onnxruntime_BUILD_UNIT_TESTS=OFF" ^
--cmake_extra_defines "FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER"


REM onnxruntime_ENABLE_NVTX_PROFILE=ON --enable_cuda_line_info
17 changes: 17 additions & 0 deletions build_cu130.cmd
@@ -0,0 +1,17 @@
IF "%VCToolsInstallDir%"=="" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"

build.bat --cmake_generator "Visual Studio 17 2022" --config Release --build_dir build\cuda130 --build_wheel ^
--parallel 4 --nvcc_threads 1 --build_shared_lib ^
--use_cuda --cuda_version "13.0" --cuda_home "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" ^
--cudnn_home "D:\cudnn\9.13.0.50_cuda13" ^
--cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=native" ^
--build_nuget ^
--skip_tests ^
--use_binskim_compliant_compile_flags ^
--cmake_extra_defines "onnxruntime_BUILD_UNIT_TESTS=ON" ^
--cmake_extra_defines "onnxruntime_ENABLE_NVTX_PROFILE=ON" ^
--cmake_extra_defines "onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=ON" ^
--cmake_extra_defines "FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER"

REM --use_vcpkg
REM onnxruntime_ENABLE_NVTX_PROFILE=ON --enable_cuda_line_info
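The CMAKE_CUDA_ARCHITECTURES values in the scripts above (61;86, 75;80;86;89;90;120, 120, native) must cover the compute capability of every GPU the resulting wheel will run on. When in doubt, a small host program against the CUDA runtime API reports the capability of each installed device; this is an illustrative sketch, not part of the change.

// list_sm.cc - sketch, not part of this PR. Build with: nvcc list_sm.cc -o list_sm
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
    std::printf("no CUDA devices found\n");
    return 1;
  }
  for (int i = 0; i < count; ++i) {
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, i);
    // prop.major/prop.minor map directly to a CMAKE_CUDA_ARCHITECTURES entry,
    // e.g. capability 8.6 -> 86, capability 12.0 -> 120.
    std::printf("device %d: %s, sm_%d%d\n", i, prop.name, prop.major, prop.minor);
  }
  return 0;
}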
17,624 changes: 17,624 additions & 0 deletions build_log.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cmake/external/onnx
Submodule onnx updated 852 files
115 changes: 115 additions & 0 deletions cuda_pad_diff.txt
@@ -0,0 +1,115 @@
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index bdd6567d2e..656890e796 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -94,7 +94,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToCudaType<T>::MappedType CudaT;
const auto& input_tensor = *ctx->Input<Tensor>(0);
auto const& input_shape = input_tensor.Shape();
- int32_t dimension_count = static_cast<int32_t>(input_shape.NumDimensions());
+ const size_t dimension_count = input_shape.NumDimensions();

const PadsVector* p_pads = &pads_;
const PadsVector* p_slices = &slices_;
@@ -134,26 +134,85 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
TArray<int64_t> input_strides(input_pitches);

auto output_dims(input_shape.AsShapeVector());
- ORT_ENFORCE(static_cast<size_t>(dimension_count) * 2 == p_pads->size(), "'pads' attribute has wrong number of values");
+ ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values");

// Calculate output dimensions, and handle any negative padding
TArray<int64_t> lower_pads(dimension_count);
TArray<int64_t> upper_pads(dimension_count);
- for (auto i = 0; i < dimension_count; i++) {
- lower_pads[i] = (*p_pads)[i] + (*p_slices)[i];
- upper_pads[i] = (*p_pads)[static_cast<int64_t>(i) + dimension_count] + (*p_slices)[static_cast<int64_t>(i) + dimension_count];
- output_dims[i] += lower_pads[i] + upper_pads[i];
+ for (size_t i = 0; i < dimension_count; i++) {
+ lower_pads[i] = SafeInt<int64_t>((*p_pads)[i]) + (*p_slices)[i];
+ upper_pads[i] = SafeInt<int64_t>((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count];
+ output_dims[i] += SafeInt<int64_t>(lower_pads[i]) + upper_pads[i];
+ }
+
+ TensorShapeVector effective_input_extents;
+ effective_input_extents.reserve(dimension_count);
+ for (size_t i = 0; i < dimension_count; i++) {
+ int64_t extent = std::max<int64_t>(SafeInt<int64_t>(input_dims[i]) +
+ (*p_slices)[i] + (*p_slices)[i + dimension_count],
+ 0LL);
+ effective_input_extents.push_back(extent);
}

TensorShape output_shape(output_dims);
+ auto& output_tensor = *ctx->Output(0, output_shape);

- // special case when there is a dim value of 0 in the shape. behavior depends on mode
+ // If the input size is zero but the output shape is not, only padding is needed.
+ // This is expected for Constant mode only; for all other modes the output is empty
+ // and no error is raised.
if (input_shape.Size() == 0) {
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
+ if (mode_ == Mode::Constant) {
+ const int64_t output_size = output_shape.Size();
+ if (output_size > 0) {
+ Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableData<T>()), value,
+ output_size);
+ }
+ }
+ // For the other modes, no error is raised (preserving historical CPU behavior),
+ // but no output data should be expected either.
+ return Status::OK();
}

- auto& output_tensor = *ctx->Output(0, output_shape);
+ // Early constant-fill: the input itself is not empty (handled above),
+ // but if any effective input extent is zero there is no data to copy;
+ // only the padding, if any, remains to be written.
+ const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(),
+ [](int64_t v) { return v == 0; });
+
+ if (no_effective_data_to_copy) {
+ if (mode_ == Mode::Constant) {
+ // For Constant mode, fill the output with the pad value if it is not empty;
+ // all other modes are an error.
+ const int64_t output_size = output_shape.Size();
+ if (output_size > 0) {
+ Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableData<T>()), value,
+ output_size);
+ }
+ return Status::OK();
+ }
+ return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+ "Pad: invalid mode: ", static_cast<int>(mode_), " with zero effective input extent");
+ }
+
+ // Special case for Reflect mode: every padded axis must have an extent >= 2 after
+ // slicing, otherwise reflection is not possible. This matches numpy behavior; ONNX
+ // only implies the constraint: reflection needs distinct start and end positions,
+ // and extent 0 has none, while with extent 1 reflection degenerates into ambiguity.
+ if (mode_ == Mode::Reflect) {
+ for (size_t i = 0; i < dimension_count; ++i) {
+ const int64_t extent = effective_input_extents[i]; // length after slicing
+ const bool reflect_on_axis =
+ (*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0;
+ if (reflect_on_axis && extent < 2) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Pad reflect requires axis length >= 2 after slicing. Input shape:",
+ input_shape);
+ }
+ }
+ }

+ // Case of all pads and slices being zero: just copy input to output
if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
output_shape.Size() > 0) {
@@ -164,7 +223,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
return Status::OK();
}

- if (IsNCHWInputWithPaddingAlongHAndW(static_cast<size_t>(dimension_count), lower_pads, upper_pads)) {
+ if (IsNCHWInputWithPaddingAlongHAndW(dimension_count, lower_pads, upper_pads)) {
// If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW)

// NCHW input
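The core idea in the diff above is the "effective extent": the per-axis input length after negative pads (slices) are applied, clamped at zero. A standalone sketch of that arithmetic and of the Reflect-mode validity check, with the ORT types (PadsVector, TArray, SafeInt) replaced by plain vectors and the function name invented for illustration:

// pad_extents.cc - illustrative sketch of the diff's logic, not ORT code.
#include <algorithm>
#include <cstdint>
#include <vector>

// pads holds the begin pads for each axis followed by the end pads; slices
// holds the negative-padding amounts in the same layout.
bool ReflectPadIsValid(const std::vector<int64_t>& input_dims,
                       const std::vector<int64_t>& pads,
                       const std::vector<int64_t>& slices) {
  const size_t rank = input_dims.size();
  for (size_t i = 0; i < rank; ++i) {
    // Effective extent: axis length after slicing, clamped at zero.
    const int64_t extent =
        std::max<int64_t>(input_dims[i] + slices[i] + slices[i + rank], 0);
    const bool reflect_on_axis = pads[i] > 0 || pads[i + rank] > 0;
    // Reflection needs two distinct border positions, hence extent >= 2.
    if (reflect_on_axis && extent < 2) return false;
  }
  return true;
}

The kernel performs the same per-axis computation but wraps the additions in SafeInt<int64_t> so that an overflow fails loudly instead of wrapping around.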