# Returns 0 when dotted version $1 is numerically greater than or equal to
# dotted version $2, and 1 otherwise.
#
# Components are compared left to right as base-10 integers, so "1.100.0" is
# newer than "1.44.0" and "1.9.0" is older. A plain string comparison gets
# both of those cases wrong. Missing or non-numeric components count as 0,
# and only the leading digits of a component are used ("0-rc1" -> 0).
#
# Arguments: $1 - version to test
#            $2 - minimum version
function _efa_version_ge {
    local -a have want
    IFS='.' read -r -a have <<< "$1"
    IFS='.' read -r -a want <<< "$2"

    local count=${#have[@]}
    if (( ${#want[@]} > count )); then
        count=${#want[@]}
    fi

    local idx lhs rhs
    for (( idx = 0; idx < count; idx++ )); do
        lhs="${have[idx]:-0}"
        rhs="${want[idx]:-0}"
        # Keep only the leading run of digits of each component.
        lhs="${lhs%%[!0-9]*}"
        rhs="${rhs%%[!0-9]*}"
        # The 10# prefix forces base 10 so a component such as "08" is not
        # rejected as an invalid octal literal.
        if (( 10#${lhs:-0} > 10#${rhs:-0} )); then
            return 0
        fi
        if (( 10#${lhs:-0} < 10#${rhs:-0} )); then
            return 1
        fi
    done

    # Every component compared equal.
    return 0
}

# Checks that the NCCL OFI plugin shared object exists where the installed
# EFA version is expected to place it.
#
# Globals:
#   EFA_VERSION (read)     - EFA installer version, e.g. "1.44.0"
#   ARCH_DIR    (read)     - library sub-directory used by EFA < 1.44.0
#   OFI_LIB_DIR (written)  - directory that is searched
#   NCCL_NET_SO (written)  - full path of the plugin that is checked
# Outputs:
#   Progress and error messages on stdout.
# Returns:
#   0 if the plugin file exists, 1 otherwise.
function check_libnccl_net_so {
    # OFI_LIB_DIR and NCCL_NET_SO are deliberately left global; code outside
    # this function may read them after the call.
    if _efa_version_ge "$EFA_VERSION" "1.44.0"; then
        # EFA >= 1.44.0: no ARCH_DIR sub-directory, and a different filename.
        OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib"
        NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net-ofi.so"
        echo "Using newer EFA path structure"
    else
        # EFA < 1.44.0: the plugin lives under the ARCH_DIR sub-directory.
        OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}"
        NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net.so"
        echo "Using older EFA path structure with ARCH_DIR: $ARCH_DIR"
    fi

    # Check if file exists
    if [ ! -f "$NCCL_NET_SO" ]; then
        echo "ERROR: $NCCL_NET_SO does not exist"
        return 1
    fi

    echo "NCCL OFI plugin found at: $NCCL_NET_SO"
    return 0
}