File tree Expand file tree Collapse file tree 4 files changed +10
-12
lines changed
Expand file tree Collapse file tree 4 files changed +10
-12
lines changed Original file line number Diff line number Diff line change 2929 dockerfile_path : tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
3030 docker_build_args : ' --build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
3131 docker_image_repo : onnxruntimecuda12manylinuxbuild
32- extra_build_flags : ' --use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
32+ extra_build_flags : ' --use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 1 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
3333 python_path_prefix : ' PATH=/opt/python/cp310-cp310/bin:$PATH'
3434 run_tests : false # <<< Do not run tests in this job
3535 upload_build_output : true # <<< Upload the build/Release directory
Original file line number Diff line number Diff line change @@ -115,7 +115,7 @@ jobs:
115115 exit $lastExitCode
116116 }
117117 # Execute the build process
118- python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
118+ python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
119119 if ($lastExitCode -ne 0) {
120120 exit $lastExitCode
121121 }
@@ -235,7 +235,7 @@ jobs:
235235 exit $lastExitCode
236236 }
237237
238- python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
238+ python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
239239 if ($lastExitCode -ne 0) {
240240 exit $lastExitCode
241241 }
Original file line number Diff line number Diff line change @@ -121,7 +121,7 @@ jobs:
121121 exit $lastExitCode
122122 }
123123 # Execute the build process
124- python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
124+ python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
125125 if ($lastExitCode -ne 0) {
126126 exit $lastExitCode
127127 }
@@ -247,7 +247,7 @@ jobs:
247247 exit $lastExitCode
248248 }
249249
250- python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
250+ python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
251251 if ($lastExitCode -ne 0) {
252252 exit $lastExitCode
253253 }
Original file line number Diff line number Diff line change @@ -217,16 +217,14 @@ def number_of_nvcc_threads(args):
217217
218218 available_memory = psutil .virtual_memory ().available
219219 if isinstance (available_memory , int ) and available_memory > 0 :
220- if available_memory > 60 * 1024 * 1024 * 1024 :
220+ if available_memory >= 64 * 1024 * 1024 * 1024 :
221221 # When available memory is large enough, chance of OOM is small.
222- nvcc_threads = 4
222+ nvcc_threads = min ( 4 , int ( available_memory / ( 8 * 4 * 1024 * 1024 * 1024 )))
223223 else :
224- # NVCC need a lot of memory to compile 8 flash attention cu files in Linux or 4 cutlass fmha cu files in Windows.
225- # Here we select number of threads to ensure each thread has enough memory (>= 4 GB). For example,
226- # Standard_NC4as_T4_v3 has 4 CPUs and 28 GB memory. When parallel=4 and nvcc_threads=2,
227- # total nvcc threads is 4 * 2, which is barely able to build in 28 GB memory so we will use nvcc_threads=1.
224+ # NVCC needs a lot of memory to compile 48 flash attention cu files.
225+ # Here we select number of threads to ensure each thread has enough memory (>= 4 GB).
228226 memory_per_thread = 4 * 1024 * 1024 * 1024
229- fmha_cu_files = 4 if is_windows () else 16
227+ fmha_cu_files = 48
230228 fmha_parallel_jobs = min (fmha_cu_files , number_of_parallel_jobs (args ))
231229 nvcc_threads = max (1 , int (available_memory / (memory_per_thread * fmha_parallel_jobs )))
232230 print (
You can’t perform that action at this time.
0 commit comments