Skip to content

Commit ddbcdcc

Browse files
refactor: trim GPU provisioning critical path (skip redundant pull, async cleanup, defer DCGM) (#8615)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 452494e commit ddbcdcc

2 files changed

Lines changed: 65 additions & 4 deletions

File tree

parts/linux/cloud-init/artifacts/cse_config.sh

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,14 +1014,21 @@ configGPUDrivers() {
10141014
if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
10151015
waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL
10161016
mkdir -p /opt/{actions,gpu}
1017-
ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
1017+
# The driver image is normally pre-pulled into the VHD; only hit the registry when it is
1018+
# actually missing so provisioning doesn't pay a redundant manifest/layer round trip.
1019+
# Use containerd's native exact-name filter rather than text-matching `images ls` output.
1020+
if [ -z "$(ctr -n k8s.io images ls -q "name==${NVIDIA_DRIVER_IMAGE}:${NVIDIA_DRIVER_IMAGE_TAG}")" ]; then
1021+
ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
1022+
fi
10181023
retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install"
10191024
ret=$?
10201025
if [ "$ret" -ne 0 ]; then
10211026
echo "Failed to install GPU driver, exiting..."
10221027
exit $ERR_GPU_DRIVERS_START_FAIL
10231028
fi
1024-
ctr -n k8s.io images rm --sync $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
1029+
# Drop the driver image reference so containerd can reclaim its space, but skip --sync so
1030+
# garbage collection runs asynchronously instead of blocking node provisioning.
1031+
ctr -n k8s.io images rm $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
10251032
elif isMarinerOrAzureLinux "$OS" && ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then
10261033
downloadGPUDrivers
10271034
installNvidiaContainerToolkit
@@ -1647,7 +1654,9 @@ EOF
16471654
logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin 30" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL
16481655

16491656
# 2. Start the nvidia-dcgm service.
1650-
logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStart nvidia-dcgm 30" || exit $ERR_NVIDIA_DCGM_FAIL
1657+
# DCGM is monitoring/telemetry and does not gate GPU workload scheduling, so start it without
1658+
# blocking node provisioning and treat a slow/failed start as non-fatal.
1659+
logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStartNoBlock nvidia-dcgm 30" || echo "warning: nvidia-dcgm could not be enqueued; GPU monitoring will start asynchronously"
16511660

16521661
# 3. Start the nvidia-dcgm-exporter service.
16531662
# Create systemd drop-in directory for nvidia-dcgm-exporter service
@@ -1669,7 +1678,9 @@ EOF
16691678
systemctl daemon-reload
16701679

16711680
# Start the nvidia-dcgm-exporter service.
1672-
logs_to_events "AKS.CSE.start.nvidia-dcgm-exporter" "systemctlEnableAndStart nvidia-dcgm-exporter 30" || exit $ERR_NVIDIA_DCGM_EXPORTER_FAIL
1681+
# The exporter is telemetry only and does not gate scheduling, so start it off the critical
1682+
# path and treat a slow/failed start as non-fatal.
1683+
logs_to_events "AKS.CSE.start.nvidia-dcgm-exporter" "systemctlEnableAndStartNoBlock nvidia-dcgm-exporter 30" || echo "warning: nvidia-dcgm-exporter could not be enqueued; GPU metrics will start asynchronously"
16731684
}
16741685

16751686
get_compute_sku() {

spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1678,4 +1678,54 @@ SETUP_EOF
16781678
The output should include "rm -f /opt/azure/containers/managed-gpu-experience.enabled"
16791679
End
16801680
End
1681+
1682+
Describe 'startNvidiaManagedExpServices'
  # Every helper below is a function-based mock: it prints its own name and
  # arguments to stdout so the examples can assert on which helper was invoked
  # and with what arguments. "$*" (not "$@") is used inside the strings so the
  # arguments are joined into the single message word (ShellCheck SC2145).

  # Prints the event name, then runs the wrapped command string so the mocked
  # start helpers below are actually invoked and their output is captured.
  logs_to_events() {
    echo "logs_to_events $1"
    eval "$2"
  }

  # Blocking start helper (mock).
  systemctlEnableAndStart() {
    echo "systemctlEnableAndStart $*"
  }

  # Non-blocking start helper (mock); succeeds by default.
  systemctlEnableAndStartNoBlock() {
    echo "systemctlEnableAndStartNoBlock $*"
  }

  # Filesystem/systemd commands are mocked so the example never touches the host.
  mkdir() {
    echo "mkdir $*"
  }

  # Drains stdin (the content piped in by the caller) before reporting the
  # call, so the writer side of the pipe is fully consumed.
  tee() {
    cat > /dev/null
    echo "tee $*"
  }

  systemctl() {
    echo "systemctl $*"
  }

  BeforeEach 'MIG_NODE="false"'

  It 'starts the device-plugin blocking but dcgm and dcgm-exporter off the critical path'
    When call startNvidiaManagedExpServices

    # device-plugin gates GPU scheduling, so it must stay blocking.
    The output should include "systemctlEnableAndStart nvidia-device-plugin 30"
    # dcgm/dcgm-exporter are telemetry only and must not block provisioning.
    The output should include "systemctlEnableAndStartNoBlock nvidia-dcgm 30"
    The output should include "systemctlEnableAndStartNoBlock nvidia-dcgm-exporter 30"
    # The blocking helper must no longer be used for the telemetry services.
    The output should not include "systemctlEnableAndStart nvidia-dcgm 30"
    The output should not include "systemctlEnableAndStart nvidia-dcgm-exporter 30"
  End

  It 'does not fail when dcgm telemetry services cannot be enqueued'
    # Override the non-blocking helper for this example only so it reports
    # failure; the function under test is expected to warn and carry on.
    systemctlEnableAndStartNoBlock() {
      echo "systemctlEnableAndStartNoBlock $*"
      return 1
    }

    When call startNvidiaManagedExpServices

    The status should be success
    The output should include "warning: nvidia-dcgm could not be enqueued"
    The output should include "warning: nvidia-dcgm-exporter could not be enqueued"
  End
End
16811731
End

0 commit comments

Comments
 (0)