File tree Expand file tree Collapse file tree 7 files changed +38
-11
lines changed
Expand file tree Collapse file tree 7 files changed +38
-11
lines changed Original file line number Diff line number Diff line change 1- # 0.1.0
1+ # 0.1.x
22
3- | service | accelerator | kernel | efa | chrony | raid0 |
4- | ---------| -------------| ---------------------| -------------| --------| -------|
5- | eks | h100 | 6.14.0-1018-aws | 1.47.0 | Y | Y |
6- | eks | gb200 | 6.14.0-1018-aws | 1.47.0 | Y | Y |
3+ | service | accelerator | kernel | efa | chrony | raid0 | OFI |
4+ | ---------| -------------| ---------------------| -------------| --------| -------| -----|
5+ | eks | h100 | 6.14.0-1018-aws | 1.47.0 | Y | Y | N |
6+ | eks | gb200 | 6.14.0-1018-aws | 1.47.0 | Y | Y | N |
7+
8+ # 0.2.x
9+
10+ | service | accelerator | kernel | efa | chrony | raid0 | OFI |
11+ | ---------| -------------| ---------------------| -------------| --------| -------| -----|
12+ | eks | h100 | 6.14.0-1018-aws | 1.47.0 | Y | Y | Y |
13+ | eks | gb200 | 6.14.0-1018-aws | 1.47.0 | Y | Y | Y |
Original file line number Diff line number Diff line change 11{
22 "schema_version" : " v1" ,
33 "package_name" : " nvidia_setup" ,
4- "package_version": "0.1.0",
4+ "package_version": "0.2.0",
55 "expected_config_files" : [" service" , " accelerator" ],
66 "modes" : {
77 "apply" : [
Original file line number Diff line number Diff line change 2020run_eks_h100 () {
2121 " ${STEPS_DIR} /upgrade.sh"
2222 " ${STEPS_DIR} /install-efa-driver.sh" " ${EFA} "
23+ " ${STEPS_DIR} /install_ofi.sh"
2324 # "${STEPS_DIR}/install-lustre.sh" "${KERNEL}" "${LUSTRE}"
2425 " ${STEPS_DIR} /configure-chrony.sh"
2526 " ${STEPS_DIR} /setup_local_disks.sh" raid0
@@ -28,6 +29,7 @@ run_eks_h100() {
2829run_eks_gb200 () {
2930 " ${STEPS_DIR} /upgrade.sh"
3031 " ${STEPS_DIR} /install-efa-driver.sh" " ${EFA} "
32+ " ${STEPS_DIR} /install_ofi.sh"
3133 # "${STEPS_DIR}/install-lustre.sh" "${KERNEL}" "${LUSTRE}"
3234 " ${STEPS_DIR} /configure-chrony.sh"
3335 " ${STEPS_DIR} /setup_local_disks.sh" raid0
Original file line number Diff line number Diff line change 1313check_eks_h100 () {
1414 " ${STEPS_CHECK_DIR} /upgrade_check.sh"
1515 " ${STEPS_CHECK_DIR} /install_efa_driver_check.sh"
16+ " ${STEPS_CHECK_DIR} /install_ofi_check.sh"
1617 # "${STEPS_CHECK_DIR}/install_lustre_check.sh" "${KERNEL}"
1718 " ${STEPS_CHECK_DIR} /configure_chrony_check.sh"
1819 " ${STEPS_CHECK_DIR} /setup_local_disks_check.sh"
@@ -21,6 +22,7 @@ check_eks_h100() {
2122check_eks_gb200 () {
2223 " ${STEPS_CHECK_DIR} /upgrade_check.sh"
2324 " ${STEPS_CHECK_DIR} /install_efa_driver_check.sh"
25+ " ${STEPS_CHECK_DIR} /install_ofi_check.sh"
2426 # "${STEPS_CHECK_DIR}/install_lustre_check.sh" "${KERNEL}"
2527 " ${STEPS_CHECK_DIR} /configure_chrony_check.sh"
2628 " ${STEPS_CHECK_DIR} /setup_local_disks_check.sh"
Original file line number Diff line number Diff line change @@ -6,11 +6,6 @@ export DEBIAN_FRONTEND=noninteractive
66
77# Function to install EFA with retry logic
88install_efa () {
9- local install_dir
10-
11- install_dir=" $( mktemp -d) "
12- cd " ${install_dir} "
13-
149 echo " Downloading EFA installer version ${EFA_VERSION} ..."
1510 curl -sSfO " https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_VERSION} .tar.gz"
1611 tar -xf " aws-efa-installer-${EFA_VERSION} .tar.gz"
#!/bin/bash
#
# install_ofi.sh - expose the OFI NCCL plugin under ${OFI_PREFIX} system-wide.
#
# Writes two configuration files:
#   /etc/profile.d/ofi-aws.sh            appends ${OFI_PREFIX}/bin to PATH
#   /etc/ld.so.conf.d/000_ofi_aws.conf   registers ${OFI_PREFIX}/lib with ld.so
# and then refreshes the dynamic linker cache.
#
# This script only writes configuration; it does not install the plugin.
# NOTE(review): presumably an earlier step (the EFA installer) provides
# /opt/amazon/ofi-nccl - confirm against the pipeline order.
# Writes under /etc, so it needs to run with sufficient privileges.

# Options are set here rather than in the shebang so that they still apply
# when the script is invoked as `bash install_ofi.sh`.
set -euo pipefail

readonly OFI_PREFIX=/opt/amazon/ofi-nccl

# \$PATH is escaped on purpose: it must stay literal in the generated file so
# that it is expanded when the profile snippet is sourced, not now.
printf '%s\n' "PATH=\$PATH:${OFI_PREFIX}/bin" > /etc/profile.d/ofi-aws.sh
printf '%s\n' "${OFI_PREFIX}/lib" > /etc/ld.so.conf.d/000_ofi_aws.conf

# A new ld.so.conf.d entry has no effect until the cache is rebuilt.
ldconfig
#!/bin/bash
#
# install_ofi_check.sh - verify the artifacts expected after install_ofi.sh.
#
# Checks that the following exist:
#   /etc/profile.d/ofi-aws.sh            (regular file)
#   /etc/ld.so.conf.d/000_ofi_aws.conf   (regular file)
#   /opt/amazon/ofi-nccl                 (directory)
#
# Exit status: 0 when everything is present, 1 otherwise.
# Every missing item is reported (one "ERROR: ... not found" line each on
# stderr) instead of stopping at the first one, so a single run shows the
# full picture.

status=0

# Report a missing path and remember that the check failed.
# Arguments: $1 - path that was expected to exist
report_missing() {
  printf 'ERROR: %s not found\n' "$1" >&2
  status=1
}

[[ -f /etc/profile.d/ofi-aws.sh ]] || report_missing /etc/profile.d/ofi-aws.sh
[[ -f /etc/ld.so.conf.d/000_ofi_aws.conf ]] || report_missing /etc/ld.so.conf.d/000_ofi_aws.conf
[[ -d /opt/amazon/ofi-nccl ]] || report_missing /opt/amazon/ofi-nccl

exit "${status}"
You can’t perform that action at this time.
0 commit comments