Skip to content

Commit aaad5b0

Browse files
authored
Merge pull request #26 from NVIDIA/feat/nvidia-setup-ofi
feat(nvidia-setup): add ofi setup
2 parents 3e56590 + ffca29a commit aaad5b0

File tree

7 files changed

+38
-11
lines changed

7 files changed

+38
-11
lines changed

nvidia-setup/VERSION_OVERVIEW.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1-
# 0.1.0
1+
# 0.1.x
22

3-
| service | accelerator | kernel | efa | chrony | raid0 |
4-
|---------|-------------|---------------------|-------------|--------|-------|
5-
| eks | h100 | 6.14.0-1018-aws | 1.47.0 | Y | Y |
6-
| eks | gb200 | 6.14.0-1018-aws | 1.47.0 | Y | Y |
3+
| service | accelerator | kernel | efa | chrony | raid0 | OFI |
4+
|---------|-------------|---------------------|-------------|--------|-------|-----|
5+
| eks | h100 | 6.14.0-1018-aws | 1.47.0 | Y | Y | N |
6+
| eks | gb200 | 6.14.0-1018-aws | 1.47.0 | Y | Y | N |
7+
8+
# 0.2.x
9+
10+
| service | accelerator | kernel | efa | chrony | raid0 | OFI |
11+
|---------|-------------|---------------------|-------------|--------|-------|-----|
12+
| eks | h100 | 6.14.0-1018-aws | 1.47.0 | Y | Y | Y |
13+
| eks | gb200 | 6.14.0-1018-aws | 1.47.0 | Y | Y | Y |

nvidia-setup/config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "v1",
33
"package_name": "nvidia_setup",
4-
"package_version": "0.1.0",
4+
"package_version": "0.2.0",
55
"expected_config_files": ["service", "accelerator"],
66
"modes": {
77
"apply": [

nvidia-setup/skyhook_dir/apply.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ fi
2020
run_eks_h100() {
2121
"${STEPS_DIR}/upgrade.sh"
2222
"${STEPS_DIR}/install-efa-driver.sh" "${EFA}"
23+
"${STEPS_DIR}/install_ofi.sh"
2324
# "${STEPS_DIR}/install-lustre.sh" "${KERNEL}" "${LUSTRE}"
2425
"${STEPS_DIR}/configure-chrony.sh"
2526
"${STEPS_DIR}/setup_local_disks.sh" raid0
@@ -28,6 +29,7 @@ run_eks_h100() {
2829
run_eks_gb200() {
2930
"${STEPS_DIR}/upgrade.sh"
3031
"${STEPS_DIR}/install-efa-driver.sh" "${EFA}"
32+
"${STEPS_DIR}/install_ofi.sh"
3133
# "${STEPS_DIR}/install-lustre.sh" "${KERNEL}" "${LUSTRE}"
3234
"${STEPS_DIR}/configure-chrony.sh"
3335
"${STEPS_DIR}/setup_local_disks.sh" raid0

nvidia-setup/skyhook_dir/apply_check.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ fi
1313
check_eks_h100() {
1414
"${STEPS_CHECK_DIR}/upgrade_check.sh"
1515
"${STEPS_CHECK_DIR}/install_efa_driver_check.sh"
16+
"${STEPS_CHECK_DIR}/install_ofi_check.sh"
1617
# "${STEPS_CHECK_DIR}/install_lustre_check.sh" "${KERNEL}"
1718
"${STEPS_CHECK_DIR}/configure_chrony_check.sh"
1819
"${STEPS_CHECK_DIR}/setup_local_disks_check.sh"
@@ -21,6 +22,7 @@ check_eks_h100() {
2122
check_eks_gb200() {
2223
"${STEPS_CHECK_DIR}/upgrade_check.sh"
2324
"${STEPS_CHECK_DIR}/install_efa_driver_check.sh"
25+
"${STEPS_CHECK_DIR}/install_ofi_check.sh"
2426
# "${STEPS_CHECK_DIR}/install_lustre_check.sh" "${KERNEL}"
2527
"${STEPS_CHECK_DIR}/configure_chrony_check.sh"
2628
"${STEPS_CHECK_DIR}/setup_local_disks_check.sh"

nvidia-setup/skyhook_dir/steps/install-efa-driver.sh

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,6 @@ export DEBIAN_FRONTEND=noninteractive
66

77
# Function to install EFA with retry logic
88
install_efa() {
9-
local install_dir
10-
11-
install_dir="$(mktemp -d)"
12-
cd "${install_dir}"
13-
149
echo "Downloading EFA installer version ${EFA_VERSION}..."
1510
curl -sSfO "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_VERSION}.tar.gz"
1611
tar -xf "aws-efa-installer-${EFA_VERSION}.tar.gz"
install_ofi.sh (new file; invoked from apply.sh as "${STEPS_DIR}/install_ofi.sh")

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash
#
# Expose the OFI NCCL plugin directory (${OFI_PREFIX}) system-wide:
#   - adds ${OFI_PREFIX}/bin to PATH through a /etc/profile.d snippet
#   - registers ${OFI_PREFIX}/lib with the dynamic linker through
#     /etc/ld.so.conf.d and refreshes the linker cache
#
# NOTE(review): this script installs nothing itself; presumably ${OFI_PREFIX}
# is populated by the EFA installer step that runs before it — confirm.

# Strict mode lives here instead of the shebang so it also applies when the
# script is started as `bash install_ofi.sh`.
set -euo pipefail

readonly OFI_PREFIX=/opt/amazon/ofi-nccl

# \$PATH is escaped on purpose: it must be expanded when the profile snippet
# is sourced by a login shell, not while this script runs.
echo "PATH=\$PATH:${OFI_PREFIX}/bin" > /etc/profile.d/ofi-aws.sh
echo "${OFI_PREFIX}/lib" > /etc/ld.so.conf.d/000_ofi_aws.conf

# A new file in ld.so.conf.d has no effect until the linker cache is rebuilt.
ldconfig
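install_ofi_check.sh (new file; invoked from apply_check.sh as "${STEPS_CHECK_DIR}/install_ofi_check.sh")

Lines changed: 16 additions & 0 deletions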
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
#
# Check that the OFI setup artifacts are present: the profile.d snippet,
# the ld.so.conf.d entry and the OFI NCCL directory. The first missing item
# is reported and the script exits with status 1.

# require TEST_OP PATH
#   TEST_OP: unary `test` operator to apply (-f for a file, -d for a directory)
#   PATH:    filesystem path that must satisfy TEST_OP
require() {
  if ! [ "$1" "$2" ]; then
    echo "ERROR: $2 not found"
    exit 1
  fi
}

require -f /etc/profile.d/ofi-aws.sh
require -f /etc/ld.so.conf.d/000_ofi_aws.conf
require -d /opt/amazon/ofi-nccl

0 commit comments

Comments
 (0)