Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 125 additions & 16 deletions ubuntu22.04/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ _mount_rootfs() {
mount --make-private /sys
mkdir -p ${RUN_DIR}/driver
mount --rbind / ${RUN_DIR}/driver
echo "Driver container rootfs mounted at ${RUN_DIR}/driver"
}

# Unmount the driver rootfs from the run directory.
Expand Down Expand Up @@ -638,11 +639,134 @@ _start_vgpu_topology_daemon() {
nvidia-topologyd
}

# Print the version string of the NVIDIA kernel module that is currently
# loaded, as exposed in sysfs.
# Outputs: the contents of /sys/module/nvidia/version on stdout.
# Returns: 0 on success; 1 (with no error output) if the file cannot be read.
_read_loaded_version() {
  local version_file=/sys/module/nvidia/version

  if ! cat "${version_file}" 2>/dev/null; then
    return 1
  fi
}

# Check whether findmnt reports a mount for ${RUN_DIR}/driver.
# Globals: RUN_DIR (read)
# Outputs: nothing; all findmnt output is discarded.
# Returns: the exit status of findmnt (0 when a matching mount is found).
_is_rootfs_mounted() {
  local driver_root="${RUN_DIR}/driver"

  findmnt --raw --noheadings --output TARGET "${driver_root}" &>/dev/null
}

# Mount the driver rootfs only when it is not mounted yet, so that repeated
# calls do not stack additional mounts on top of each other.
# Returns: 0 if the rootfs was already mounted; otherwise the status of
#          _mount_rootfs.
_ensure_rootfs_mounted_idempotent() {
  if ! _is_rootfs_mounted; then
    _mount_rootfs
  fi
}

# Make sure nvidia-persistenced is running (best effort).
#
# If the PID file names a live process, nothing is done. Otherwise the daemon
# is started when the binary is available; a missing binary or a failed start
# is logged and tolerated so that the caller can continue.
#
# Arguments: $1 - optional path to the daemon's PID file
#                 (default: /var/run/nvidia-persistenced/nvidia-persistenced.pid)
# Outputs:   a diagnostic line on stdout when the daemon is missing or fails
#            to start.
# Returns:   always 0.
_ensure_persistence_running() {
  local pid_file=${1:-/var/run/nvidia-persistenced/nvidia-persistenced.pid}
  local pid=""

  # Read the PID with cat rather than pid=$(<"${pid_file}" 2>/dev/null): the
  # $(<file) shortcut is only reliably honoured when the input redirection
  # stands alone. With a second redirection added, bash releases such as 5.1
  # run an empty command and yield an empty string, so the PID was never seen
  # and the daemon was relaunched on every call.
  pid=$(cat -- "${pid_file}" 2>/dev/null) || pid=""

  case "${pid}" in
    '' | 0 | *[!0-9]*)
      # Missing, empty or malformed PID file: treat as "not running". This
      # also keeps kill from interpreting values like 0 or -1 as a process
      # group / "all processes" selector.
      ;;
    *)
      if kill -0 "${pid}" 2>/dev/null; then
        return 0
      fi
      ;;
  esac

  if command -v nvidia-persistenced >/dev/null 2>&1; then
    # Deliberately non-fatal: persistence mode is an optimisation.
    nvidia-persistenced --persistence-mode \
      || echo "Failed to start nvidia-persistenced; continuing without persistence"
  else
    echo "nvidia-persistenced not found; continuing without persistence"
  fi

  return 0
}

# Print a textual fingerprint of the current driver configuration.
#
# The fingerprint consists of the driver version, the running kernel release,
# a few feature toggles, and the verbatim contents of any module option files
# found in the config directory. Two fingerprints are meant to be compared as
# plain strings, so the output format must stay byte-stable.
#
# Globals:   DRIVER_VERSION, GPU_DIRECT_RDMA_ENABLED, USE_HOST_MOFED,
#            KERNEL_MODULE_TYPE (read)
# Arguments: $1 - optional directory holding the *.conf files
#                 (default: /drivers)
# Outputs:   the fingerprint on stdout.
# Returns:   0 on success; 1 if the kernel release cannot be determined.
_build_driver_config() {
  local conf_dir=${1:-/drivers}
  # conf_file is declared local so the loop variable does not leak into the
  # caller's scope when this function runs in the current shell.
  local kernel_release config conf_file conf_path

  # Assigned separately from the declaration so a uname failure is not
  # masked by the exit status of 'local'.
  kernel_release=$(uname -r) || return 1

  config="DRIVER_VERSION=${DRIVER_VERSION}
KERNEL_VERSION=${kernel_release}
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED}
USE_HOST_MOFED=${USE_HOST_MOFED}
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE}"

  # Append each module option file that exists, in a fixed order. Command
  # substitution strips trailing newlines, which keeps the result stable
  # regardless of how the files are terminated.
  for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
    conf_path="${conf_dir}/${conf_file}"
    if [ -f "${conf_path}" ]; then
      config="${config}
$(cat "${conf_path}")"
    fi
  done

  printf '%s\n' "${config}"
}

# Persist the current driver configuration fingerprint to a state file.
#
# The file holds the output of _build_driver_config. On failure no partial
# file is left behind, so a later comparison cannot pick up truncated state,
# and success is only reported when the write actually succeeded.
#
# Arguments: $1 - optional path of the state file
#                 (default: /run/nvidia/driver-config.state)
# Outputs:   progress messages on stdout; an error message on stderr.
# Returns:   0 on success; 1 if the configuration could not be written.
_store_driver_config() {
  local config_file=${1:-/run/nvidia/driver-config.state}

  echo "Storing driver configuration state..."
  if ! _build_driver_config > "${config_file}"; then
    echo "Failed to store driver configuration at ${config_file}" >&2
    # Drop whatever was written before the failure.
    rm -f -- "${config_file}"
    return 1
  fi
  echo "Driver configuration stored at $config_file"
}

init() {
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
fi

echo -e "\n========== NVIDIA Software Installer ==========\n"
echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"

exec 3> ${PID_FILE}
if ! flock -n 3; then
echo "An instance of the NVIDIA driver is already running, aborting"
exit 1
fi
echo $$ >&3

trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
trap "_shutdown" EXIT

# Fast path: if the NVIDIA kernel modules are already loaded and driver config matches,
# skip kernel module build/load but install userspace components.
# This handles non-clean restarts where modules are in use and can't be unloaded.
if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then
current_config=$(_build_driver_config)
stored_config=$(cat /run/nvidia/driver-config.state)

if [ "${current_config}" = "${stored_config}" ]; then
echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"

# Skip kernel module unload since they're already loaded with correct version
# Unmount any existing rootfs
_unmount_rootfs

# Update package cache for userspace install
_update_package_cache
_resolve_kernel_version || exit 1
_install_prerequisites

# Install userspace components only (libraries, binaries)
# The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install
echo "Installing userspace components (libraries and binaries)..."
cd /drivers
# Extract the driver first
sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x
cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}
./nvidia-installer \
--silent \
--no-kernel-module \
--no-nouveau-check \
--no-nvidia-modprobe \
--no-drm \
--no-peermem

# Mount the driver rootfs to make components available
_mount_rootfs

# Ensure persistence daemon is running
_ensure_persistence_running

# Write kernel update hook
_write_kernel_update_hook

# Store driver configuration
_store_driver_config

echo "Userspace-only install complete, now waiting for signal"
sleep infinity &
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
trap - EXIT
while true; do wait $! || continue; done
exit 0
fi
fi

_unload_driver || exit 1
_unmount_rootfs

# Install the userspace components
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
Expand All @@ -668,22 +792,6 @@ init() {
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest

echo -e "\n========== NVIDIA Software Installer ==========\n"
echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"

exec 3> ${PID_FILE}
if ! flock -n 3; then
echo "An instance of the NVIDIA driver is already running, aborting"
exit 1
fi
echo $$ >&3

trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
trap "_shutdown" EXIT

_unload_driver || exit 1
_unmount_rootfs

if _kernel_requires_package; then
_update_ca_certificates
_update_package_cache
Expand All @@ -699,6 +807,7 @@ init() {
_load_driver || exit 1
_mount_rootfs
_write_kernel_update_hook
_store_driver_config

echo "Done, now waiting for signal"
sleep infinity &
Expand Down