Skip to content

Commit d32bbea

Browse files
committed
feat(nvidia-tuned): change containerd drop to be a script
feat(nvidia-tuned): update so final profile is {service}-{accelerator}-{intent}
1 parent aaad5b0 commit d32bbea

File tree

4 files changed

+119
-32
lines changed

4 files changed

+119
-32
lines changed
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
# TuneD script plugin hook that manages a systemd drop-in for containerd.
# Lifecycle arguments: start | stop [full_rollback] | verify [ignore_missing]
# https://github.com/redhat-performance/tuned/blob/v2.21.0/tuned/plugins/plugin_script.py

# Strict mode: abort on a failing command, on use of an unset variable, and
# on a failure in any stage of a pipeline. Every positional parameter in this
# script is read with a ${N:-} default, so -u does not affect optional args.
set -euo pipefail

# Location of the drop-in file this script creates, removes and verifies.
readonly DROPIN_DIR=/etc/systemd/system/containerd.service.d
readonly DROPIN_FILE=containerd.conf

# Exact text of the drop-in: sets LimitSTACK to 67108864 bytes (64 MiB).
# The value ends with a newline, matching the file as written to disk.
readonly EXPECTED_CONTENT='[Service]
LimitSTACK=67108864
'
12+
13+
#######################################
# Install the containerd drop-in and make systemd pick it up.
# Globals:   DROPIN_DIR, DROPIN_FILE, EXPECTED_CONTENT (read)
# Arguments: none
# Outputs:   writes $DROPIN_DIR/$DROPIN_FILE
#######################################
apply_dropin() {
  mkdir -p "$DROPIN_DIR"

  # Write the file from EXPECTED_CONTENT so the text applied here and the
  # text checked by "verify" come from a single definition. Any trailing
  # newline in the constant is normalised to exactly one on disk.
  printf '%s\n' "${EXPECTED_CONTENT%$'\n'}" > "$DROPIN_DIR/$DROPIN_FILE"

  # systemd only re-reads unit drop-ins after a daemon-reload.
  systemctl daemon-reload
}
21+
22+
#######################################
# Delete the containerd drop-in and reload systemd.
# Globals:   DROPIN_DIR, DROPIN_FILE (read)
# Arguments: none
#######################################
remove_dropin() {
  local dropin_path="$DROPIN_DIR/$DROPIN_FILE"
  local leftovers

  rm -f "$dropin_path"

  # Prune the drop-in directory only when nothing else remains in it, so
  # drop-ins that other tools placed there are left untouched.
  if [ -d "$DROPIN_DIR" ]; then
    leftovers=$(ls -A "$DROPIN_DIR" 2>/dev/null) || true
    if [ -z "$leftovers" ]; then
      rmdir "$DROPIN_DIR"
    fi
  fi

  systemctl daemon-reload
}
29+
30+
#######################################
# Check that the containerd drop-in exists with the expected text.
# Terminates the script with the verification result instead of returning.
# Globals:   DROPIN_DIR, DROPIN_FILE, EXPECTED_CONTENT (read)
# Arguments: $1 - lifecycle command (unused here)
#            $2 - optional "ignore_missing": a missing file counts as success
# Exits:     0 when the drop-in matches (or is missing and ignore_missing
#            was given), 1 otherwise
#######################################
verify_dropin() {
  local ignore_missing=false
  local dropin_path="$DROPIN_DIR/$DROPIN_FILE"
  local actual expected

  if [ "${2:-}" = "ignore_missing" ]; then
    ignore_missing=true
  fi

  if [ ! -f "$dropin_path" ]; then
    if [ "$ignore_missing" = true ]; then
      exit 0
    fi
    exit 1
  fi

  # Command substitution drops trailing newlines. Pass BOTH sides through it
  # so the newline that terminates EXPECTED_CONTENT cannot make a correct
  # file look different from the expected text.
  actual=$(cat "$dropin_path") || exit 1
  expected=$(printf '%s' "$EXPECTED_CONTENT")

  if [ "$actual" != "$expected" ]; then
    exit 1
  fi
  exit 0
}
42+
43+
# Dispatch on the lifecycle command given as the first argument.
cmd="${1:-}"
if [ "$cmd" = "start" ]; then
  apply_dropin
elif [ "$cmd" = "stop" ]; then
  # An optional second argument "full_rollback" may follow; removing the
  # drop-in is the same operation either way, so it is not inspected.
  remove_dropin
elif [ "$cmd" = "verify" ]; then
  # Forward every argument so verify_dropin sees the optional
  # "ignore_missing" flag in its $2.
  verify_dropin "$@"
else
  echo "Usage: $0 start | stop [full_rollback] | verify [ignore_missing]" >&2
  exit 1
fi

nvidia-tuned/profiles/os/common/nvidia-gb200-performance/tuned.conf

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,6 @@ vm.max_map_count=262144
2323
vm.min_free_kbytes=65536
2424
vm.overcommit_memory=1
2525

26-
[service]
27-
service.containerd=start,enable,file:/etc/tuned/nvidia-gb200-performance/service_containerd.conf
26+
[script]
27+
# Workaround: TuneD fails to apply the containerd drop-in when the drop-in directory already exists, so a script manages it instead
28+
script=${i:PROFILE_DIR}/containerd_service.sh

nvidia-tuned/skyhook_dir/prepare_nvidia_profiles.sh

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@
1818

1919
# Prepares NVIDIA tuned profiles by:
2020
# 1. Reading intent, accelerator, and service from configmap
21-
# 2. Constructing the profile name as nvidia-{accelerator}-{intent}
22-
# 3. Copying common base profiles to /usr/lib/tuned/
23-
# 4. Selecting the appropriate OS-specific workload profiles
24-
# 5. Setting up the service profile with dynamic include
21+
# 2. Constructing the workload profile name as nvidia-{accelerator}-{intent}
22+
# 3. Final profile name: {service}-{accelerator}-{intent} when service is set, else workload profile name
23+
# 4. Copying common base profiles to /usr/lib/tuned/
24+
# 5. Selecting the appropriate OS-specific workload profiles
25+
# 6. Setting up the service profile with dynamic include
2526

2627
set -xe
2728
set -u
@@ -132,10 +133,23 @@ validate_profile() {
132133
echo "Validated profile exists: $profile"
133134
}
134135

135-
# Deploy service profile with dynamic include
136+
# Print the final profile name.
# Arguments: $1 - service (may be empty), $2 - accelerator, $3 - intent
# Outputs:   "{service}-{accelerator}-{intent}" when service is non-empty,
#            otherwise "nvidia-{accelerator}-{intent}"
build_final_profile_name() {
  local service=$1
  local accelerator=$2
  local intent=$3

  # An empty service falls back to the "nvidia" prefix.
  printf '%s-%s-%s\n' "${service:-nvidia}" "$accelerator" "$intent"
}
147+
148+
# Deploy service profile with dynamic include (into directory named {service}-{accelerator}-{intent})
136149
deploy_service_profile() {
137150
local service=$1
138151
local profile=$2
152+
local final_profile_name=$3
139153
local service_dir="$PROFILES_DIR/service/$service"
140154

141155
if [ ! -d "$service_dir" ]; then
@@ -158,15 +172,15 @@ deploy_service_profile() {
158172
local profile_to_include="$profile"
159173
fi
160174

161-
# Create service profile directory
162-
mkdir -p "$TUNED_USER_DIR/$service"
175+
# Create service profile directory (final profile name = {service}-{accelerator}-{intent})
176+
mkdir -p "$TUNED_USER_DIR/$final_profile_name"
163177

164178
# Copy template and inject include line
165179
local template="$service_dir/tuned.conf.template"
166180
if [ -f "$template" ]; then
167181
# Insert include= line after [main]
168-
sed "s/^\[main\]/[main]\ninclude=$profile_to_include/" "$template" | tee "$TUNED_USER_DIR/$service/tuned.conf" > /dev/null
169-
echo "Created service profile: $service with include=$profile_to_include"
182+
sed "s/^\[main\]/[main]\ninclude=$profile_to_include/" "$template" | tee "$TUNED_USER_DIR/$final_profile_name/tuned.conf" > /dev/null
183+
echo "Created service profile: $final_profile_name with include=$profile_to_include"
170184
else
171185
echo "ERROR: Service template not found: $template"
172186
exit 1
@@ -178,8 +192,8 @@ deploy_service_profile() {
178192
filename=$(basename "$file")
179193
[ "$filename" = "tuned.conf.template" ] && continue
180194
[[ "$filename" == *.conf ]] && continue # Skip .conf files (they're service-specific profiles)
181-
cp "$file" "$TUNED_USER_DIR/$service/$filename"
182-
chmod +x "$TUNED_USER_DIR/$service/$filename" 2>/dev/null || true
195+
cp "$file" "$TUNED_USER_DIR/$final_profile_name/$filename"
196+
chmod +x "$TUNED_USER_DIR/$final_profile_name/$filename" 2>/dev/null || true
183197
echo "Copied service file: $filename"
184198
done
185199
}
@@ -247,9 +261,10 @@ main() {
247261
SERVICE=$(cat "$SERVICE_FILE" | xargs)
248262
if [ -n "$SERVICE" ]; then
249263
echo "Requested service: $SERVICE"
250-
deploy_service_profile "$SERVICE" "$PROFILE"
251-
# Active profile is the service (which includes the workload profile)
252-
write_tuned_profile "$SERVICE"
264+
FINAL_PROFILE=$(build_final_profile_name "$SERVICE" "$ACCELERATOR" "$INTENT")
265+
echo "Final profile name: $FINAL_PROFILE (service=$SERVICE, accelerator=$ACCELERATOR, intent=$INTENT)"
266+
deploy_service_profile "$SERVICE" "$PROFILE" "$FINAL_PROFILE"
267+
write_tuned_profile "$FINAL_PROFILE"
253268
else
254269
# No service, use workload profile directly
255270
write_tuned_profile "$PROFILE"

nvidia-tuned/skyhook_dir/prepare_nvidia_profiles_check.sh

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,25 @@ ACCELERATOR_FILE="$CONFIGMAP_DIR/accelerator"
3131
SERVICE_FILE="$CONFIGMAP_DIR/service"
3232
TUNED_PROFILE_FILE="$CONFIGMAP_DIR/tuned_profile"
3333

34-
# Build the profile name from configmap fields
34+
# Print the workload profile name built from the configmap fields.
# Arguments: $1 - intent, $2 - accelerator
# Outputs:   "nvidia-{accelerator}-{intent}"
build_profile_name() {
  local intent=$1
  local accelerator=$2

  printf 'nvidia-%s-%s\n' "$accelerator" "$intent"
}
4040

41+
# Print the final profile name expected for this node.
# Arguments: $1 - service (may be empty), $2 - accelerator, $3 - intent
# Outputs:   "{service}-{accelerator}-{intent}" when service is non-empty,
#            otherwise "nvidia-{accelerator}-{intent}"
build_final_profile_name() {
  local service=$1
  local accelerator=$2
  local intent=$3
  local prefix=nvidia

  # A non-empty service replaces the default "nvidia" prefix.
  if [ -n "$service" ]; then
    prefix=$service
  fi

  echo "${prefix}-${accelerator}-${intent}"
}
52+
4153
# Verify common profiles are deployed
4254
verify_common_profiles() {
4355
echo "Verifying common profiles in $TUNED_SYSTEM_DIR..."
@@ -125,31 +137,31 @@ verify_constructed_profile() {
125137
echo "Verified constructed profile: $profile"
126138
}
127139

128-
# Verify service profile is deployed with correct include
140+
# Verify service profile is deployed with correct include (final profile name = {service}-{accelerator}-{intent})
129141
verify_service_profile() {
130-
local service=$1
131-
local expected_profile=$2
142+
local final_profile_name=$1
143+
local expected_workload_profile=$2
132144

133-
if [ ! -d "$TUNED_USER_DIR/$service" ]; then
134-
echo "ERROR: Service profile directory missing: $TUNED_USER_DIR/$service"
145+
if [ ! -d "$TUNED_USER_DIR/$final_profile_name" ]; then
146+
echo "ERROR: Service profile directory missing: $TUNED_USER_DIR/$final_profile_name"
135147
exit 1
136148
fi
137149

138-
local service_conf="$TUNED_USER_DIR/$service/tuned.conf"
150+
local service_conf="$TUNED_USER_DIR/$final_profile_name/tuned.conf"
139151
if [ ! -f "$service_conf" ]; then
140-
echo "ERROR: tuned.conf missing for service profile: $service"
152+
echo "ERROR: tuned.conf missing for service profile: $final_profile_name"
141153
exit 1
142154
fi
143155

144-
# Verify include line points to correct profile
145-
if ! grep -q "^include=$expected_profile" "$service_conf"; then
146-
echo "ERROR: Service profile $service does not include $expected_profile"
156+
# Verify include line points to workload profile
157+
if ! grep -q "^include=$expected_workload_profile" "$service_conf"; then
158+
echo "ERROR: Service profile $final_profile_name does not include $expected_workload_profile"
147159
echo "Contents of $service_conf:"
148160
cat "$service_conf"
149161
exit 1
150162
fi
151163

152-
echo "Verified service profile: $service includes $expected_profile"
164+
echo "Verified service profile: $final_profile_name includes $expected_workload_profile"
153165
}
154166

155167
# Verify tuned_profile file exists and is correct
@@ -206,10 +218,10 @@ main() {
206218
if [ -f "$SERVICE_FILE" ]; then
207219
SERVICE=$(cat "$SERVICE_FILE" | xargs)
208220
if [ -n "$SERVICE" ]; then
209-
# Verify service profile
210-
verify_service_profile "$SERVICE" "$PROFILE"
211-
# Expected active profile is the service
212-
verify_tuned_profile_file "$SERVICE"
221+
FINAL_PROFILE=$(build_final_profile_name "$SERVICE" "$ACCELERATOR" "$INTENT")
222+
# Verify service profile (final name = {service}-{accelerator}-{intent})
223+
verify_service_profile "$FINAL_PROFILE" "$PROFILE"
224+
verify_tuned_profile_file "$FINAL_PROFILE"
213225
else
214226
# No service, active profile is the workload profile
215227
verify_tuned_profile_file "$PROFILE"

0 commit comments

Comments
 (0)