Skip to content

Commit 3e56590

Browse files
authored
feat(nvidia-setup): can use NVIDIA_SETUP_KERNEL_ALLOW_NEWER to set if… (#25)
* feat(nvidia-setup): can use NVIDIA_SETUP_KERNEL_ALLOW_NEWER to set if kernel is exact or not
1 parent 0c2dea1 commit 3e56590

File tree

6 files changed

+116
-28
lines changed

6 files changed

+116
-28
lines changed

nvidia-setup/README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ Defaults are defined in `skyhook_dir/defaults/eks-h100.conf` and `eks-gb200.conf
3434

3535
Set these on the package spec in the Skyhook Custom Resource (`spec.packages.<name>.env`):
3636

37-
- `NVIDIA_SETUP_INSTALL_KERNEL` – `true` or `false` (default: `false`). If `true`, apply **only** installs the exact kernel from the defaults file (via `downgrade_kernel.sh`) and then exits; a reboot is required. After reboot, the **post-interrupt-check** verifies the running kernel matches the expected version. If `false`, apply verifies the current kernel is >= the required version and errors otherwise, then continues with the full apply
37+
- `NVIDIA_SETUP_INSTALL_KERNEL` – `true` or `false` (default: `false`). If `true`, apply **only** installs the exact kernel from the defaults file (via `downgrade_kernel.sh`) and then exits; a reboot is required. After reboot, the **post-interrupt-check** verifies the running kernel matches the expected version. If `false`, apply verifies the current kernel meets the requirement (see `NVIDIA_SETUP_KERNEL_ALLOW_NEWER`) and errors otherwise, then continues with the full apply.
38+
- `NVIDIA_SETUP_KERNEL_ALLOW_NEWER` – `true` or `false` (default: `false`). When `NVIDIA_SETUP_INSTALL_KERNEL=false`, this controls the kernel check: if `false`, the running kernel must match the required upstream version exactly; if `true`, the running kernel may be newer (current >= required).
3839
- `NVIDIA_PIN_KERNEL` – `true` or `false` (default: `false`). If `true`, pin the kernel to the exact version in the package so that it will not upgrade in the future.
3940
- `NVIDIA_KERNEL` – kernel version (overrides default from defaults file)
4041
- `NVIDIA_EFA` – EFA installer version
@@ -43,7 +44,7 @@ Set these on the package spec in the Skyhook Custom Resource (`spec.packages.<na
4344

4445
For `service=eks` the apply step currently runs, in order:
4546

46-
1. **ensure_kernel** – if `NVIDIA_SETUP_INSTALL_KERNEL=false`: verify running kernel is >= required; if `true`: install exact kernel only (then exit; reboot required).
47+
1. **ensure_kernel** – if `NVIDIA_SETUP_INSTALL_KERNEL=false`: verify running kernel meets requirement (exact match by default; allow newer if `NVIDIA_SETUP_KERNEL_ALLOW_NEWER=true`); if `true`: install exact kernel only (then exit; reboot required).
4748
2. **upgrade** – `apt-get update && apt-get upgrade -y`
4849
3. **install-efa-driver** – download and run AWS EFA installer
4950

nvidia-setup/skyhook_dir/apply_check.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ STEPS_CHECK_DIR="${SKYHOOK_DIR}/skyhook_dir/steps_check"
55
# shellcheck source=load_defaults.sh
66
. "${SKYHOOK_DIR}/skyhook_dir/load_defaults.sh"
77

8+
# Skip checks if only installing kernel as we need to reboot before any check would work
9+
if [ "${NVIDIA_SETUP_INSTALL_KERNEL}" = "true" ]; then
10+
exit 0
11+
fi
12+
813
check_eks_h100() {
914
"${STEPS_CHECK_DIR}/upgrade_check.sh"
1015
"${STEPS_CHECK_DIR}/install_efa_driver_check.sh"

nvidia-setup/skyhook_dir/steps/ensure_kernel.sh

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
#!/bin/bash
22
# ensure_kernel.sh: install exact kernel (if NVIDIA_SETUP_INSTALL_KERNEL=true) or
3-
# verify current kernel is >= required (if false).
3+
# verify current kernel meets requirement (see NVIDIA_SETUP_KERNEL_ALLOW_NEWER).
44
set -e
5+
#
6+
# NVIDIA_SETUP_KERNEL_ALLOW_NEWER (default: false). When false, the running kernel
7+
# must match the required upstream version exactly. When true, the running kernel
8+
# may be newer (current >= required).
59

610
STEPS_DIR="${SKYHOOK_DIR}/skyhook_dir/steps"
711

@@ -37,18 +41,35 @@ check_kernel_at_least() {
3741
return 1
3842
}
3943

44+
# Returns 0 if current upstream version equals required upstream version (exact match).
45+
check_kernel_exact() {
46+
local required="$1"
47+
local current
48+
current=$(uname -r)
49+
local required_upstream="${required%%-*}"
50+
local current_upstream="${current%%-*}"
51+
[ "${current_upstream}" = "${required_upstream}" ]
52+
}
53+
4054
# When TEST_CHECK_KERNEL_AT_LEAST is set, skip normal execution so tests can source this file and call check_kernel_at_least.
4155
if [ -z "${TEST_CHECK_KERNEL_AT_LEAST:-}" ]; then
4256
if [ "${NVIDIA_SETUP_INSTALL_KERNEL:-false}" = "true" ]; then
4357
install_kernel
4458
exit 0
4559
fi
4660

47-
# Check current kernel is >= required
61+
# Check current kernel meets requirement (exact or at-least depending on env)
4862
required_full="$(resolve_full_kernel "${KERNEL}")"
49-
if ! check_kernel_at_least "${required_full}"; then
50-
echo "Error: current kernel $(uname -r) is not >= required ${required_full}. Set NVIDIA_SETUP_INSTALL_KERNEL=true to install the exact kernel, or boot with a compatible kernel." >&2
51-
exit 1
63+
if [ "${NVIDIA_SETUP_KERNEL_ALLOW_NEWER:-false}" = "true" ]; then
64+
if ! check_kernel_at_least "${required_full}"; then
65+
echo "Error: current kernel $(uname -r) is not >= required ${required_full}. Set NVIDIA_SETUP_INSTALL_KERNEL=true to install the exact kernel, or boot with a compatible kernel." >&2
66+
exit 1
67+
fi
68+
else
69+
if ! check_kernel_exact "${required_full}"; then
70+
echo "Error: current kernel $(uname -r) does not match required ${required_full} (exact match required). Set NVIDIA_SETUP_KERNEL_ALLOW_NEWER=true to allow a newer kernel, or NVIDIA_SETUP_INSTALL_KERNEL=true to install the exact kernel." >&2
71+
exit 1
72+
fi
5273
fi
5374
fi
5475

tests/integration/nvidia_setup/run_check_kernel_at_least_test.sh

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
#!/bin/bash
2-
# Test harness for check_kernel_at_least. Set CURRENT_KERNEL and REQUIRED_KERNEL,
3-
# then source ensure_kernel.sh (with uname mocked) and run the check. Exit code
4-
# 0 = current >= required, 1 = current < required.
2+
# Test harness for check_kernel_at_least and check_kernel_exact. Set CURRENT_KERNEL,
3+
# REQUIRED_KERNEL, and optionally KERNEL_CHECK_MODE=at_least|exact (default: at_least).
4+
# Source ensure_kernel.sh (with uname mocked) and run the chosen check.
5+
# Exit code 0 = pass, 1 = fail.
56
set -e
67

78
CURRENT_KERNEL="${CURRENT_KERNEL:?CURRENT_KERNEL must be set}"
89
REQUIRED_KERNEL="${REQUIRED_KERNEL:?REQUIRED_KERNEL must be set}"
10+
KERNEL_CHECK_MODE="${KERNEL_CHECK_MODE:-at_least}"
911
[ -n "${SKYHOOK_DIR:-}" ] || { echo "SKYHOOK_DIR must be set" >&2; exit 1; }
1012

1113
uname() {
@@ -19,5 +21,9 @@ export TEST_CHECK_KERNEL_AT_LEAST=1
1921
# shellcheck source=ensure_kernel.sh
2022
. "${SKYHOOK_DIR}/skyhook_dir/steps/ensure_kernel.sh"
2123

22-
check_kernel_at_least "$REQUIRED_KERNEL"
24+
case "${KERNEL_CHECK_MODE}" in
25+
at_least) check_kernel_at_least "$REQUIRED_KERNEL" ;;
26+
exact) check_kernel_exact "$REQUIRED_KERNEL" ;;
27+
*) echo "KERNEL_CHECK_MODE must be at_least or exact" >&2; exit 1 ;;
28+
esac
2329
exit $?

tests/integration/nvidia_setup/test_apply.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def test_apply_with_env_overrides(base_image):
6969
configmaps={"service": "eks", "accelerator": "h100"},
7070
env_vars={
7171
"NVIDIA_KERNEL": "6.8.0",
72+
"NVIDIA_SETUP_KERNEL_ALLOW_NEWER": "true", # container kernel may be newer than override
7273
"NVIDIA_EFA": "1.31.0",
7374
"NVIDIA_LUSTRE": "aws"
7475
},
Lines changed: 71 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/usr/bin/env python3
22
"""
3-
Tests for check_kernel_at_least in ensure_kernel.sh.
3+
Tests for check_kernel_at_least and check_kernel_exact in ensure_kernel.sh.
44
5-
The check compares upstream kernel versions (before first '-') so that
6-
e.g. 6.17.0-1007-aws is correctly considered >= 6.14.0-1018-aws (6.17 >= 6.14).
5+
- at_least: compares upstream versions (before first '-'); current >= required passes.
6+
- exact: current upstream must equal required upstream (NVIDIA_SETUP_KERNEL_ALLOW_NEWER=false behavior).
77
"""
88

99
from pathlib import Path
@@ -15,55 +15,109 @@
1515
_CHECK_SCRIPT_DEST = "skyhook_dir/steps/run_check_kernel_at_least_test.sh"
1616

1717

18-
def _run_check(runner: DockerTestRunner, current_kernel: str, required_kernel: str) -> int:
19-
"""Run the check script; return exit code."""
18+
def _run_check(
19+
runner: DockerTestRunner,
20+
current_kernel: str,
21+
required_kernel: str,
22+
mode: str = "at_least",
23+
) -> int:
24+
"""Run the check script; return exit code. mode is 'at_least' or 'exact'."""
25+
env = {
26+
"CURRENT_KERNEL": current_kernel,
27+
"REQUIRED_KERNEL": required_kernel,
28+
}
29+
if mode != "at_least":
30+
env["KERNEL_CHECK_MODE"] = mode
2031
result = runner.run_script(
2132
script="steps/run_check_kernel_at_least_test.sh",
2233
configmaps={},
23-
env_vars={
24-
"CURRENT_KERNEL": current_kernel,
25-
"REQUIRED_KERNEL": required_kernel,
26-
},
34+
env_vars=env,
2735
extra_files=[(_CHECK_SCRIPT_SOURCE, _CHECK_SCRIPT_DEST)],
2836
)
2937
return result.exit_code
3038

3139

32-
def test_current_newer_upstream_passes():
40+
# --- check_kernel_at_least (allow newer: current >= required) ---
41+
42+
43+
def test_at_least_current_newer_upstream_passes():
3344
"""6.17.0-1007-aws >= 6.14.0-1018-aws (upstream 6.17 >= 6.14); was previously failing with sort -V on full string."""
3445
runner = DockerTestRunner(package="nvidia-setup")
3546
try:
36-
exit_code = _run_check(runner, "6.17.0-1007-aws", "6.14.0-1018-aws")
47+
exit_code = _run_check(runner, "6.17.0-1007-aws", "6.14.0-1018-aws", mode="at_least")
3748
assert exit_code == 0
3849
finally:
3950
runner.cleanup()
4051

4152

42-
def test_current_same_upstream_passes():
53+
def test_at_least_current_same_upstream_passes():
4354
"""6.14.0-1000-aws >= 6.14.0-1018-aws (same upstream)."""
4455
runner = DockerTestRunner(package="nvidia-setup")
4556
try:
46-
exit_code = _run_check(runner, "6.14.0-1000-aws", "6.14.0-1018-aws")
57+
exit_code = _run_check(runner, "6.14.0-1000-aws", "6.14.0-1018-aws", mode="at_least")
4758
assert exit_code == 0
4859
finally:
4960
runner.cleanup()
5061

5162

52-
def test_current_older_upstream_fails():
63+
def test_at_least_current_older_upstream_fails():
5364
"""6.13.0-1000-aws < 6.14.0-1018-aws (upstream 6.13 < 6.14)."""
5465
runner = DockerTestRunner(package="nvidia-setup")
5566
try:
56-
exit_code = _run_check(runner, "6.13.0-1000-aws", "6.14.0-1018-aws")
67+
exit_code = _run_check(runner, "6.13.0-1000-aws", "6.14.0-1018-aws", mode="at_least")
5768
assert exit_code == 1
5869
finally:
5970
runner.cleanup()
6071

6172

62-
def test_current_exact_required_passes():
73+
def test_at_least_current_exact_required_passes():
6374
"""6.14.0-1018-aws >= 6.14.0-1018-aws (equal)."""
6475
runner = DockerTestRunner(package="nvidia-setup")
6576
try:
66-
exit_code = _run_check(runner, "6.14.0-1018-aws", "6.14.0-1018-aws")
77+
exit_code = _run_check(runner, "6.14.0-1018-aws", "6.14.0-1018-aws", mode="at_least")
78+
assert exit_code == 0
79+
finally:
80+
runner.cleanup()
81+
82+
83+
# --- check_kernel_exact (exact upstream match; NVIDIA_SETUP_KERNEL_ALLOW_NEWER=false) ---
84+
85+
86+
def test_exact_current_newer_upstream_fails():
87+
"""6.17.0-1007-aws vs 6.14.0-1018-aws: exact requires same upstream, so fails."""
88+
runner = DockerTestRunner(package="nvidia-setup")
89+
try:
90+
exit_code = _run_check(runner, "6.17.0-1007-aws", "6.14.0-1018-aws", mode="exact")
91+
assert exit_code == 1
92+
finally:
93+
runner.cleanup()
94+
95+
96+
def test_exact_current_same_upstream_passes():
97+
"""6.14.0-1000-aws vs 6.14.0-1018-aws: same upstream 6.14.0, exact passes."""
98+
runner = DockerTestRunner(package="nvidia-setup")
99+
try:
100+
exit_code = _run_check(runner, "6.14.0-1000-aws", "6.14.0-1018-aws", mode="exact")
101+
assert exit_code == 0
102+
finally:
103+
runner.cleanup()
104+
105+
106+
def test_exact_current_older_upstream_fails():
107+
"""6.13.0-1000-aws vs 6.14.0-1018-aws: different upstream, exact fails."""
108+
runner = DockerTestRunner(package="nvidia-setup")
109+
try:
110+
exit_code = _run_check(runner, "6.13.0-1000-aws", "6.14.0-1018-aws", mode="exact")
111+
assert exit_code == 1
112+
finally:
113+
runner.cleanup()
114+
115+
116+
def test_exact_current_exact_required_passes():
117+
"""6.14.0-1018-aws vs 6.14.0-1018-aws: exact match passes."""
118+
runner = DockerTestRunner(package="nvidia-setup")
119+
try:
120+
exit_code = _run_check(runner, "6.14.0-1018-aws", "6.14.0-1018-aws", mode="exact")
67121
assert exit_code == 0
68122
finally:
69123
runner.cleanup()

0 commit comments

Comments
 (0)