Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion base/x86_64/gpu/cu130/ubuntu22.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ ARG PYTHON_VERSION="3.12.10"
ARG PYTHON_SHORT_VERSION="3.12"
ARG CUDA_MAJOR="13"
ARG CUDA_MINOR="0"
ARG EFA_VERSION="1.43.3"
ARG EFA_VERSION="1.44.0"
ARG OS_VERSION="ubuntu22.04"
FROM nvidia/cuda:13.0.0-base-${OS_VERSION} AS base-builder

Expand Down
2 changes: 1 addition & 1 deletion dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []


Expand Down
17 changes: 15 additions & 2 deletions scripts/install_efa.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,26 @@ case $ARCH in
esac

#######################################
# Report whether version $1 is greater than or equal to version $2.
# Dotted numeric versions (e.g. 1.9.0, 1.44.0, 1.100.0) are compared
# component by component as integers, so 1.9.0 < 1.44.0 < 1.100.0.
# Missing components count as 0 (1.44 == 1.44.0).
# Arguments: $1 - version to test, $2 - threshold version
# Returns:   0 if $1 >= $2, 1 otherwise
#######################################
function _efa_version_at_least {
  local dotted_numeric='^[0-9]+(\.[0-9]+)*$'
  local -a have=() want=()
  local i h w

  # Anything that is not a plain dotted number (empty string, a tag such as
  # "latest", a suffixed build) keeps the legacy string ordering so existing
  # callers see the same outcome as before for those values.
  if [[ ! "$1" =~ $dotted_numeric || ! "$2" =~ $dotted_numeric ]]; then
    [[ "$1" > "$2" || "$1" == "$2" ]]
    return
  fi

  IFS=. read -r -a have <<<"$1"
  IFS=. read -r -a want <<<"$2"

  for ((i = 0; i < ${#have[@]} || i < ${#want[@]}; i++)); do
    h=${have[i]:-0}
    w=${want[i]:-0}
    # 10# forces base 10 so components like "08" are not parsed as octal.
    if ((10#$h > 10#$w)); then
      return 0
    fi
    if ((10#$h < 10#$w)); then
      return 1
    fi
  done

  return 0
}

#######################################
# Locate the NCCL OFI plugin shared object installed by the EFA installer.
# Globals:
#   EFA_VERSION (read)    - EFA installer version being checked
#   ARCH_DIR    (read)    - architecture sub-directory, used by the old layout
#   OFI_LIB_DIR (written) - directory expected to contain the plugin
#   NCCL_NET_SO (written) - full path of the plugin file that was checked
# Outputs:   progress and error messages on stdout
# Returns:   0 if the plugin file exists, 1 otherwise
#######################################
function check_libnccl_net_so {
  if _efa_version_at_least "$EFA_VERSION" "1.44.0"; then # version threshold
    # Newer EFA version - no ARCH_DIR, different filename
    OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/"
    # Strip the trailing slash before joining to avoid "lib//libnccl-net-ofi.so".
    NCCL_NET_SO="${OFI_LIB_DIR%/}/libnccl-net-ofi.so"
    echo "Using newer EFA path structure"
  else
    # Older EFA version - uses ARCH_DIR
    OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}"
    NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net.so"
    echo "Using older EFA path structure with ARCH_DIR: $ARCH_DIR"
  fi

  # Check if file exists
  if [ ! -f "$NCCL_NET_SO" ]; then
    echo "ERROR: $NCCL_NET_SO does not exist"
    return 1
  else
    echo "NCCL OFI plugin found at: $NCCL_NET_SO"
    return 0
  fi
}

Expand Down