Skip to content

Commit 6d508b8

Browse files
committed
feat(nvidia-setup): change expected service from eks to aws to align with nvidia-tuned
1 parent 2b83722 commit 6d508b8

File tree

7 files changed

+44
-33
lines changed

7 files changed

+44
-33
lines changed

nvidia-setup/skyhook_dir/apply.sh

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ fi
1717
# Otherwise: ensure current kernel is >= required, then run full apply
1818
"${STEPS_DIR}/ensure_kernel.sh"
1919

20-
run_eks_h100() {
20+
run_aws_h100() {
2121
"${STEPS_DIR}/upgrade.sh"
2222
"${STEPS_DIR}/install-efa-driver.sh" "${EFA}"
2323
"${STEPS_DIR}/install_ofi.sh"
@@ -26,7 +26,7 @@ run_eks_h100() {
2626
"${STEPS_DIR}/setup_local_disks.sh" raid0
2727
}
2828

29-
run_eks_gb200() {
29+
run_aws_gb200() {
3030
"${STEPS_DIR}/upgrade.sh"
3131
"${STEPS_DIR}/install-efa-driver.sh" "${EFA}"
3232
"${STEPS_DIR}/install_ofi.sh"
@@ -36,8 +36,8 @@ run_eks_gb200() {
3636
}
3737

3838
case "${COMBINATION}" in
39-
eks-h100) run_eks_h100 ;;
40-
eks-gb200) run_eks_gb200 ;;
39+
aws-h100) run_aws_h100 ;;
40+
aws-gb200) run_aws_gb200 ;;
4141
*)
4242
echo "Unsupported combination: ${COMBINATION}" >&2
4343
echo "Supported: $(find "${DEFAULTS_DIR}" -maxdepth 1 -name '*.conf' -exec basename {} .conf \; 2>/dev/null | tr '\n' ' ')" >&2

nvidia-setup/skyhook_dir/apply_check.sh

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ if [ "${NVIDIA_SETUP_INSTALL_KERNEL}" = "true" ]; then
1010
exit 0
1111
fi
1212

13-
check_eks_h100() {
13+
check_aws_h100() {
1414
"${STEPS_CHECK_DIR}/upgrade_check.sh"
1515
"${STEPS_CHECK_DIR}/install_efa_driver_check.sh"
1616
"${STEPS_CHECK_DIR}/install_ofi_check.sh"
@@ -19,7 +19,7 @@ check_eks_h100() {
1919
"${STEPS_CHECK_DIR}/setup_local_disks_check.sh"
2020
}
2121

22-
check_eks_gb200() {
22+
check_aws_gb200() {
2323
"${STEPS_CHECK_DIR}/upgrade_check.sh"
2424
"${STEPS_CHECK_DIR}/install_efa_driver_check.sh"
2525
"${STEPS_CHECK_DIR}/install_ofi_check.sh"
@@ -29,8 +29,8 @@ check_eks_gb200() {
2929
}
3030

3131
case "${COMBINATION}" in
32-
eks-h100) check_eks_h100 ;;
33-
eks-gb200) check_eks_gb200 ;;
32+
aws-h100) check_aws_h100 ;;
33+
aws-gb200) check_aws_gb200 ;;
3434
*)
3535
echo "Unsupported combination: ${COMBINATION}" >&2
3636
echo "Supported: $(find "${DEFAULTS_DIR}" -maxdepth 1 -name '*.conf' -exec basename {} .conf \; 2>/dev/null | tr '\n' ' ')" >&2
File renamed without changes.
File renamed without changes.

nvidia-setup/skyhook_dir/steps/install-efa-driver.sh

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,17 @@ set -e
33
EFA_VERSION="${1:?EFA version required}"
44
export DEBIAN_FRONTEND=noninteractive
55

6+
# Skip if EFA is already installed (same criteria as install_efa_driver_check.sh)
7+
efa_already_installed() {
8+
[ -d /opt/amazon/efa ] && return 0
9+
ldconfig -p 2>/dev/null | grep -q libfabric && return 0
10+
dkms status 2>/dev/null | grep -q 'efa.*installed' && return 0
11+
return 1
12+
}
13+
if efa_already_installed; then
14+
echo "EFA already installed, skipping."
15+
exit 0
16+
fi
617

718
# Function to install EFA with retry logic
819
install_efa() {

tests/integration/nvidia_setup/test_apply.py

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -28,14 +28,14 @@ def test_unsupported_combination():
2828
finally:
2929
runner.cleanup()
3030

31-
@pytest.mark.skip(reason="Skipping test_apply_eks_h100. Kernel is flaky based on where it is run.")
32-
def test_apply_eks_h100(base_image):
33-
"""Test apply.sh with eks-h100 combination."""
31+
@pytest.mark.skip(reason="Skipping test_apply_aws_h100. Kernel is flaky based on where it is run.")
32+
def test_apply_aws_h100(base_image):
33+
"""Test apply.sh with aws-h100 combination."""
3434
runner = DockerTestRunner(package="nvidia-setup", base_image=base_image)
3535
try:
3636
result = runner.run_script(
3737
script="apply.sh",
38-
configmaps={"service": "eks", "accelerator": "h100"},
38+
configmaps={"service": "aws", "accelerator": "h100"},
3939
skip_system_operations=True
4040
)
4141

@@ -44,14 +44,14 @@ def test_apply_eks_h100(base_image):
4444
finally:
4545
runner.cleanup()
4646

47-
@pytest.mark.skip(reason="Skipping test_apply_eks_gb200. Kernel is flaky based on where it is run.")
48-
def test_apply_eks_gb200(base_image):
49-
"""Test apply.sh with eks-gb200 combination."""
47+
@pytest.mark.skip(reason="Skipping test_apply_aws_gb200. Kernel is flaky based on where it is run.")
48+
def test_apply_aws_gb200(base_image):
49+
"""Test apply.sh with aws-gb200 combination."""
5050
runner = DockerTestRunner(package="nvidia-setup", base_image=base_image)
5151
try:
5252
result = runner.run_script(
5353
script="apply.sh",
54-
configmaps={"service": "eks", "accelerator": "gb200"},
54+
configmaps={"service": "aws", "accelerator": "gb200"},
5555
skip_system_operations=True
5656
)
5757

@@ -66,7 +66,7 @@ def test_apply_with_env_overrides(base_image):
6666
try:
6767
result = runner.run_script(
6868
script="apply.sh",
69-
configmaps={"service": "eks", "accelerator": "h100"},
69+
configmaps={"service": "aws", "accelerator": "h100"},
7070
env_vars={
7171
"NVIDIA_KERNEL": "6.8.0",
7272
"NVIDIA_SETUP_KERNEL_ALLOW_NEWER": "true", # container kernel may be newer than override
@@ -108,9 +108,9 @@ def test_apply_dynamic_supported_listing(base_image):
108108
)
109109

110110
assert_exit_code(result, 1)
111-
# Should contain at least eks-h100 and eks-gb200 in the supported list
112-
assert_output_contains(result.stdout, "eks-h100")
113-
assert_output_contains(result.stdout, "eks-gb200")
111+
# Should contain at least aws-h100 and aws-gb200 in the supported list
112+
assert_output_contains(result.stdout, "aws-h100")
113+
assert_output_contains(result.stdout, "aws-gb200")
114114
finally:
115115
runner.cleanup()
116116

@@ -121,7 +121,7 @@ def test_apply_install_kernel_only_skips_actual_install(base_image):
121121
try:
122122
result = runner.run_script(
123123
script="apply.sh",
124-
configmaps={"service": "eks", "accelerator": "h100"},
124+
configmaps={"service": "aws", "accelerator": "h100"},
125125
env_vars={"NVIDIA_SETUP_INSTALL_KERNEL": "true"},
126126
skip_system_operations=True,
127127
)
@@ -131,13 +131,13 @@ def test_apply_install_kernel_only_skips_actual_install(base_image):
131131
runner.cleanup()
132132

133133

134-
def test_apply_install_kernel_only_eks_gb200_skips_actual_install(base_image):
135-
"""Kernel-only path with eks-gb200; skips actual install when SKIP_SYSTEM_OPERATIONS set."""
134+
def test_apply_install_kernel_only_aws_gb200_skips_actual_install(base_image):
135+
"""Kernel-only path with aws-gb200; skips actual install when SKIP_SYSTEM_OPERATIONS set."""
136136
runner = DockerTestRunner(package="nvidia-setup", base_image=base_image)
137137
try:
138138
result = runner.run_script(
139139
script="apply.sh",
140-
configmaps={"service": "eks", "accelerator": "gb200"},
140+
configmaps={"service": "aws", "accelerator": "gb200"},
141141
env_vars={"NVIDIA_SETUP_INSTALL_KERNEL": "true"},
142142
skip_system_operations=True,
143143
)

tests/integration/nvidia_setup/test_apply_check.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,13 @@ def test_apply_check_unsupported_combination():
2323
runner.cleanup()
2424

2525

26-
def test_apply_check_eks_h100(base_image):
27-
"""Test apply_check.sh with eks-h100 combination."""
26+
def test_apply_check_aws_h100(base_image):
27+
"""Test apply_check.sh with aws-h100 combination."""
2828
runner = DockerTestRunner(package="nvidia-setup", base_image=base_image)
2929
try:
3030
result = runner.run_script(
3131
script="apply_check.sh",
32-
configmaps={"service": "eks", "accelerator": "h100"}
32+
configmaps={"service": "aws", "accelerator": "h100"}
3333
)
3434

3535
# apply_check.sh will likely fail if packages aren't installed,
@@ -39,13 +39,13 @@ def test_apply_check_eks_h100(base_image):
3939
runner.cleanup()
4040

4141

42-
def test_apply_check_eks_gb200(base_image):
43-
"""Test apply_check.sh with eks-gb200 combination."""
42+
def test_apply_check_aws_gb200(base_image):
43+
"""Test apply_check.sh with aws-gb200 combination."""
4444
runner = DockerTestRunner(package="nvidia-setup", base_image=base_image)
4545
try:
4646
result = runner.run_script(
4747
script="apply_check.sh",
48-
configmaps={"service": "eks", "accelerator": "gb200"}
48+
configmaps={"service": "aws", "accelerator": "gb200"}
4949
)
5050

5151
assert result.exit_code is not None
@@ -59,7 +59,7 @@ def test_apply_check_with_env_overrides(base_image):
5959
try:
6060
result = runner.run_script(
6161
script="apply_check.sh",
62-
configmaps={"service": "eks", "accelerator": "h100"},
62+
configmaps={"service": "aws", "accelerator": "h100"},
6363
env_vars={
6464
"NVIDIA_KERNEL": "5.15.0-1025-aws",
6565
"NVIDIA_EFA": "1.31.0",
@@ -77,7 +77,7 @@ def test_post_interrupt_check_with_install_kernel_true_fails_when_kernel_mismatc
7777
try:
7878
result = runner.run_script(
7979
script="post_interrupt_check.sh",
80-
configmaps={"service": "eks", "accelerator": "h100"},
80+
configmaps={"service": "aws", "accelerator": "h100"},
8181
env_vars={"NVIDIA_SETUP_INSTALL_KERNEL": "true"},
8282
)
8383
assert_exit_code(result, 1)
@@ -92,7 +92,7 @@ def test_post_interrupt_check_with_install_kernel_false_exits_success(base_image
9292
try:
9393
result = runner.run_script(
9494
script="post_interrupt_check.sh",
95-
configmaps={"service": "eks", "accelerator": "h100"},
95+
configmaps={"service": "aws", "accelerator": "h100"},
9696
env_vars={"NVIDIA_SETUP_INSTALL_KERNEL": "false"},
9797
)
9898
assert_exit_code(result, 0)

0 commit comments

Comments
 (0)