From f876226375a3f0ca62413fdc4be2882465857b84 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 10 Nov 2025 17:11:30 -0800 Subject: [PATCH 01/10] update EFA version in base image --- base/x86_64/gpu/cu130/ubuntu22.04/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/x86_64/gpu/cu130/ubuntu22.04/Dockerfile b/base/x86_64/gpu/cu130/ubuntu22.04/Dockerfile index 3d8964dd3f6e..b5bb35a4d101 100644 --- a/base/x86_64/gpu/cu130/ubuntu22.04/Dockerfile +++ b/base/x86_64/gpu/cu130/ubuntu22.04/Dockerfile @@ -3,7 +3,7 @@ ARG PYTHON_VERSION="3.12.10" ARG PYTHON_SHORT_VERSION="3.12" ARG CUDA_MAJOR="13" ARG CUDA_MINOR="0" -ARG EFA_VERSION="1.43.3" +ARG EFA_VERSION="1.44.0" ARG OS_VERSION="ubuntu22.04" FROM nvidia/cuda:13.0.0-base-${OS_VERSION} AS base-builder From 2d8d0769f53d6e11369cbb8349f91313e4d2c224 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 10 Nov 2025 17:22:48 -0800 Subject: [PATCH 02/10] rebuild after efa version update --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index ce4cad98d4e8..2a76d7a74bf9 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,7 +37,7 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["base"] # By default we build both training and inference containers. Set true/false values to determine which to build. 
@@ -121,7 +121,7 @@ nightly_pr_test_mode = false ### TRAINING PR JOBS ### # Base -dlc-pr-base = "" +dlc-pr-base = "base/buildspec-cu130-ubuntu22.yml" # Standard Framework Training dlc-pr-pytorch-training = "" From fa9874c2517062f5e7bbc8017d9e1d76f999693d Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 10 Nov 2025 18:16:56 -0800 Subject: [PATCH 03/10] renamed the NCCL OFI plugin library --- scripts/install_efa.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install_efa.sh b/scripts/install_efa.sh index 75cbc6e93116..6c497f75829f 100644 --- a/scripts/install_efa.sh +++ b/scripts/install_efa.sh @@ -18,7 +18,7 @@ esac function check_libnccl_net_so { OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}" - NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net.so" + NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so" # Check if file exists if [ ! -f "$NCCL_NET_SO" ]; then From ccfff58bd8e72b1fe7f1e577c6762f451972c52e Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 10 Nov 2025 18:52:21 -0800 Subject: [PATCH 04/10] fix NCCL OFI plugin library issue --- scripts/install_efa.sh | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/install_efa.sh b/scripts/install_efa.sh index 6c497f75829f..966d465c3ced 100644 --- a/scripts/install_efa.sh +++ b/scripts/install_efa.sh @@ -18,13 +18,22 @@ esac function check_libnccl_net_so { OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}" - NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so" + #NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so" - # Check if file exists - if [ ! -f "$NCCL_NET_SO" ]; then - echo "ERROR: $NCCL_NET_SO does not exist" + # Check for either the new or old filename + if [ -f "$OFI_LIB_DIR/libnccl-net-ofi.so" ] || [ -f "$OFI_LIB_DIR/libnccl-net.so" ]; then + echo "NCCL OFI plugin found" + return 0 + else + echo "ERROR: NCCL OFI plugin not found" return 1 fi + + # Check if file exists + #if [ ! 
-f "$NCCL_NET_SO" ]; then + # echo "ERROR: $NCCL_NET_SO does not exist" + # return 1 + #fi } function install_efa { From 9d9c007b695db42c6252b89bd81e6c83d41e3c37 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 11 Nov 2025 09:27:24 -0800 Subject: [PATCH 05/10] temporarily disable NCCL check --- scripts/install_efa.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install_efa.sh b/scripts/install_efa.sh index 966d465c3ced..19ad179c3ab1 100644 --- a/scripts/install_efa.sh +++ b/scripts/install_efa.sh @@ -95,7 +95,7 @@ function install_efa { apt-get autoremove -y rm -rf /var/lib/apt/lists/* ldconfig - check_libnccl_net_so + #check_libnccl_net_so } # idiomatic parameter and option handling in sh From 6457c39e8ebaa28691010beb953ae46b80a404fa Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 11 Nov 2025 15:02:59 -0800 Subject: [PATCH 06/10] Enable NCCL check and build --- scripts/install_efa.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/install_efa.sh b/scripts/install_efa.sh index 19ad179c3ab1..7a5653f340f3 100644 --- a/scripts/install_efa.sh +++ b/scripts/install_efa.sh @@ -17,7 +17,7 @@ case $ARCH in esac function check_libnccl_net_so { - OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}" + OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/" #NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so" # Check for either the new or old filename @@ -95,7 +95,7 @@ function install_efa { apt-get autoremove -y rm -rf /var/lib/apt/lists/* ldconfig - #check_libnccl_net_so + check_libnccl_net_so } # idiomatic parameter and option handling in sh From 70722f3517509a52a333bc2c8680079850ae58f5 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 11 Nov 2025 15:59:06 -0800 Subject: [PATCH 07/10] Enable efa version based NCCL check --- scripts/install_efa.sh | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/scripts/install_efa.sh b/scripts/install_efa.sh index 
7a5653f340f3..291df0f41447 100644 --- a/scripts/install_efa.sh +++ b/scripts/install_efa.sh @@ -17,23 +17,27 @@ case $ARCH in esac function check_libnccl_net_so { - OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/" - #NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so" - - # Check for either the new or old filename - if [ -f "$OFI_LIB_DIR/libnccl-net-ofi.so" ] || [ -f "$OFI_LIB_DIR/libnccl-net.so" ]; then - echo "NCCL OFI plugin found" - return 0 + + if [[ "$EFA_VERSION" >= "1.44.0" ]]; then # version threshold + # Newer EFA version - no ARCH_DIR, different filename + OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/" + NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so" + echo "Using newer EFA path structure" else - echo "ERROR: NCCL OFI plugin not found" - return 1 + # Older EFA version - uses ARCH_DIR + OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}" + NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net.so" + echo "Using older EFA path structure with ARCH_DIR: $ARCH_DIR" fi # Check if file exists - #if [ ! -f "$NCCL_NET_SO" ]; then - # echo "ERROR: $NCCL_NET_SO does not exist" - # return 1 - #fi + if [ ! 
-f "$NCCL_NET_SO" ]; then + echo "ERROR: $NCCL_NET_SO does not exist" + return 1 + else + echo "NCCL OFI plugin found at: $NCCL_NET_SO" + return 0 + fi } function install_efa { From a9faf7c8ad56c78029f1d71cc29a213b5668efb0 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 11 Nov 2025 16:23:14 -0800 Subject: [PATCH 08/10] fix typo --- scripts/install_efa.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install_efa.sh b/scripts/install_efa.sh index 291df0f41447..c9a913afa509 100644 --- a/scripts/install_efa.sh +++ b/scripts/install_efa.sh @@ -18,7 +18,7 @@ esac function check_libnccl_net_so { - if [[ "$EFA_VERSION" >= "1.44.0" ]]; then # version threshold + if dpkg --compare-versions "$EFA_VERSION" ge "1.44.0"; then # version threshold; dpkg compares versions numerically, whereas [[ > ]] is lexicographic (1.100.0 would sort below 1.44.0) # Newer EFA version - no ARCH_DIR, different filename OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/" NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so" From 3ed4b379ad03afb8d080a93095b0a83c1bb052f7 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 11 Nov 2025 20:41:09 -0800 Subject: [PATCH 09/10] test build for cuda129 --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2a76d7a74bf9..d79693b8012d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -121,7 +121,7 @@ nightly_pr_test_mode = false ### TRAINING PR JOBS ### # Base -dlc-pr-base = "base/buildspec-cu130-ubuntu22.yml" +dlc-pr-base = "base/buildspec-cu129-ubuntu22.yml" # Standard Framework Training dlc-pr-pytorch-training = "" From eecb9f23521d1ea2843201c6b3e17ccf9c8743d9 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 12 Nov 2025 10:10:30 -0800 Subject: [PATCH 10/10] revert back toml file --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 
a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,7 +37,7 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["base"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. @@ -121,7 +121,7 @@ nightly_pr_test_mode = false ### TRAINING PR JOBS ### # Base -dlc-pr-base = "base/buildspec-cu129-ubuntu22.yml" +dlc-pr-base = "" # Standard Framework Training dlc-pr-pytorch-training = ""