
Commit 333890b

nWEIdia authored and pytorchmergebot committed
Enable CUDA 12.4.1 (pytorch#132202)
Trying to keep a record of the steps before I lose track of them.

- 1st commit: similar to pytorch/builder#1720
- 2nd commit: update the CUDA 12.4 CI versions from 12.4.0 to 12.4.1, mapping to the changes in https://github.com/pytorch/pytorch/pull/125944/files
- 3rd commit: update the aarch64 install_cuda_aarch64.sh docker step
- 4th commit: pytorch@aaa456e, related to pytorch#121684
- Synchronization point: Meta helps upload the PyPI CUDA dependencies specified in .github/scripts/generate_binary_build_matrix.py
- The above PyPI upload is done (thanks Andrey!); restarted jobs like https://github.com/pytorch/pytorch/actions/runs/10188203670/job/28369471321
- 5th commit: pytorch@7753234, use temporary docker containers (generated from a previous successful container build). If merged, these containers would be rebuilt, so testing them now.
- 6th commit: pytorch@5f93c62, revert the 5th commit.

Update: done, but still debugging seemingly unrelated failures (rocm/xpu/mps).

Pull Request resolved: pytorch#132202
Approved by: https://github.com/Skylion007, https://github.com/eqy, https://github.com/atalman
1 parent e41b520 commit 333890b

7 files changed: +41 -42 lines

.ci/docker/build.sh (+5 -5)

@@ -92,7 +92,7 @@ _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
 # from scratch
 case "$image" in
   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
+    CUDA_VERSION=12.4.1
     CUDNN_VERSION=9
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
@@ -120,7 +120,7 @@ case "$image" in
     TRITON=yes
     ;;
   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.0
+    CUDA_VERSION=12.4.1
     CUDNN_VERSION=9
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
@@ -165,7 +165,7 @@ case "$image" in
     INDUCTOR_BENCHMARKS=yes
     ;;
   pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.0
+    CUDA_VERSION=12.4.1
     CUDNN_VERSION=9
     ANACONDA_PYTHON_VERSION=3.12
     GCC_VERSION=9
@@ -194,7 +194,7 @@ case "$image" in
     TRITON=yes
     ;;
   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
+    CUDA_VERSION=12.4.1
     CUDNN_VERSION=9
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
@@ -222,7 +222,7 @@ case "$image" in
     TRITON=yes
     ;;
   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
+    CUDA_VERSION=12.4.1
     CUDNN_VERSION=9
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
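
build.sh maps each Docker image name to the environment variables (CUDA_VERSION, CUDNN_VERSION, ...) used when that image is built, so the bump is a matter of repinning every cuda12.4 case arm. Below is a minimal sketch of a helper (hypothetical, not part of this commit) that scans build.sh and flags any cuda12.4 image still pinned to something other than 12.4.1:

# check_cuda_pins.py -- hypothetical helper, not part of this commit.
# Scans .ci/docker/build.sh and reports any cuda12.4 image entry whose
# CUDA_VERSION is not pinned to 12.4.1.
import re
import sys

def find_stale_pins(path: str) -> list[str]:
    text = open(path).read()
    stale = []
    # Pair each "pytorch-linux-...cuda12.4..." case label with the first
    # CUDA_VERSION assignment that follows it.
    for match in re.finditer(r"(pytorch-linux-\S*cuda12\.4\S*)\)\s*\n\s*CUDA_VERSION=(\S+)", text):
        image, version = match.groups()
        if version != "12.4.1":
            stale.append(f"{image}: CUDA_VERSION={version}")
    return stale

if __name__ == "__main__":
    stale = find_stale_pins(sys.argv[1] if len(sys.argv) > 1 else ".ci/docker/build.sh")
    print("\n".join(stale) or "all cuda12.4 images pinned to 12.4.1")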

.ci/docker/common/install_cuda.sh (+6 -6)

@@ -94,13 +94,13 @@ function install_121 {
 }

 function install_124 {
-  echo "Installing CUDA 12.4 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
   rm -rf /usr/local/cuda-12.4 /usr/local/cuda
-  # install CUDA 12.4.0 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
-  chmod +x cuda_12.4.0_550.54.14_linux.run
-  ./cuda_12.4.0_550.54.14_linux.run --toolkit --silent
-  rm -f cuda_12.4.0_550.54.14_linux.run
+  # install CUDA 12.4.1 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
+  chmod +x cuda_12.4.1_550.54.15_linux.run
+  ./cuda_12.4.1_550.54.15_linux.run --toolkit --silent
+  rm -f cuda_12.4.1_550.54.15_linux.run
   rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda

   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
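
The runfile is invoked with --toolkit --silent, so only the toolkit is installed into /usr/local/cuda-12.4 and the /usr/local/cuda symlink is repointed at it. A minimal verification sketch (not part of the commit), assuming the symlink puts nvcc on PATH inside the container:

# verify_cuda_toolkit.py -- a minimal sketch, not part of this commit.
# Assumes the /usr/local/cuda symlink created above puts nvcc on PATH.
import re
import subprocess

def toolkit_release() -> str:
    # `nvcc --version` prints a line such as "Cuda compilation tools, release 12.4, ..."
    out = subprocess.run(["nvcc", "--version"], capture_output=True, text=True, check=True).stdout
    match = re.search(r"release (\d+\.\d+)", out)
    if not match:
        raise RuntimeError(f"could not parse nvcc output:\n{out}")
    return match.group(1)

if __name__ == "__main__":
    release = toolkit_release()
    assert release == "12.4", f"expected a CUDA 12.4 toolkit, found {release}"
    print(f"CUDA toolkit release {release} found")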

.ci/docker/common/install_cuda_aarch64.sh (+6 -6)

@@ -17,13 +17,13 @@ function install_cusparselt_052 {
 }

 function install_124 {
-  echo "Installing CUDA 12.4 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+  echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
   rm -rf /usr/local/cuda-12.4 /usr/local/cuda
-  # install CUDA 12.4.0 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux_sbsa.run
-  chmod +x cuda_12.4.0_550.54.14_linux_sbsa.run
-  ./cuda_12.4.0_550.54.14_linux_sbsa.run --toolkit --silent
-  rm -f cuda_12.4.0_550.54.14_linux_sbsa.run
+  # install CUDA 12.4.1 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
+  chmod +x cuda_12.4.1_550.54.15_linux_sbsa.run
+  ./cuda_12.4.1_550.54.15_linux_sbsa.run --toolkit --silent
+  rm -f cuda_12.4.1_550.54.15_linux_sbsa.run
   rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda

   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
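
The aarch64 script mirrors the x86_64 one; the only material difference is the _sbsa (server base system architecture) runfile. A hedged sketch of deriving the installer URL from the machine architecture; the helper and constants are illustrative, since the CI scripts above simply hard-code the URLs:

# cuda_runfile_url.py -- illustrative only; the CI scripts hard-code these URLs.
import platform

CUDA_BUILD = "12.4.1"
DRIVER_BUILD = "550.54.15"  # driver version embedded in the 12.4.1 runfile name

def runfile_url(machine: str = platform.machine()) -> str:
    # aarch64 hosts need the SBSA (server) runfile; x86_64 uses the plain linux one.
    suffix = "linux_sbsa" if machine == "aarch64" else "linux"
    return (
        "https://developer.download.nvidia.com/compute/cuda/"
        f"{CUDA_BUILD}/local_installers/cuda_{CUDA_BUILD}_{DRIVER_BUILD}_{suffix}.run"
    )

if __name__ == "__main__":
    print(runfile_url())           # URL for the current machine
    print(runfile_url("aarch64"))  # .../cuda_12.4.1_550.54.15_linux_sbsa.run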

.ci/pytorch/test.sh (+1 -2)

@@ -657,8 +657,7 @@ test_inductor_torchbench_smoketest_perf() {
   # https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
   # and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
   # we switch to use some other model.
-  # lowering threshold from 4.9 to 4.7 for cu124. Will bump it up after cuda 12.4.0->12.4.1 update
-  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.7
+  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9

   # Check memory compression ratio for a few models
   for test in hf_Albert timm_vision_transformer; do
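
With 12.4.1 in place, the temporary cu124 threshold of 4.7 is restored to 4.9, so check_perf_csv.py again fails the smoketest if the measured speedup drops below 4.9. A rough sketch of that kind of gate (illustrative only; the real script is benchmarks/dynamo/check_perf_csv.py, and the "speedup"/"name" column names here are assumptions):

# A minimal sketch of the kind of gate check_perf_csv.py applies; this
# re-implementation and its CSV column names are assumptions, not the real script.
import csv
import sys

def check_speedup(csv_path: str, threshold: float) -> int:
    failed = []
    with open(csv_path, newline="") as f:
        for row in csv.DictReader(f):
            # assumed columns: a model "name" and a measured "speedup"
            speedup = float(row["speedup"])
            if speedup < threshold:
                failed.append((row.get("name", "<unknown>"), speedup))
    for name, speedup in failed:
        print(f"{name}: speedup {speedup:.2f} below threshold {threshold}")
    return 1 if failed else 0

if __name__ == "__main__":
    sys.exit(check_speedup(sys.argv[1], float(sys.argv[2])))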

.github/scripts/generate_binary_build_matrix.py (+11 -11)

@@ -18,7 +18,7 @@
 CUDA_ARCHES = ["11.8", "12.1", "12.4"]


-CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}
+CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.1"}


 CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
@@ -68,18 +68,18 @@
         "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
     ),
     "12.4": (
-        "nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
         "nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | "
         "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
+        "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"
     ),
 }
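
Each value in this mapping is a single string of pinned PEP 508 requirements joined with " | ". A small sketch (assuming only that join convention; the variable name below is made up, and the string is truncated) that splits the 12.4 entry so the pins can be compared against what was uploaded to PyPI:

# split_pins.py -- a minimal sketch; assumes only the " | " join convention above.
PYTORCH_CU124_EXTRA_INSTALL_REQUIREMENTS = (
    "nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"
)  # truncated: the full string carries all of the pins shown in the diff above

def split_requirements(joined: str) -> list[str]:
    # each " | "-separated piece is a standalone PEP 508 requirement with an environment marker
    return [req.strip() for req in joined.split(" | ") if req.strip()]

if __name__ == "__main__":
    for req in split_requirements(PYTORCH_CU124_EXTRA_INSTALL_REQUIREMENTS):
        package, _, marker = req.partition(";")
        print(f"{package.strip():45s} {marker.strip()}")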

.github/workflows/generated-linux-binary-manywheel-main.yml (+2 -2)

(Generated file; diff not rendered.)
