diff --git a/templates/mps-control-daemon.tmpl.yaml b/templates/mps-control-daemon.tmpl.yaml
index 15af97e33..315e1b494 100644
--- a/templates/mps-control-daemon.tmpl.yaml
+++ b/templates/mps-control-daemon.tmpl.yaml
@@ -23,23 +23,38 @@ spec:
         image: {{ .MpsImageName }}
         securityContext:
           privileged: true
-        command: [chroot, /driver-root, sh, -c]
+        command: [sh, -c]
         args:
         - |-
           set -e
-          rm -f /var/log/nvidia-mps/startup.log
+          rm -f /driver-root/var/log/nvidia-mps/startup.log
 
-          nvidia-cuda-mps-control -d
+          if [ -x /driver-root/bin/sh ] || [ -x /driver-root/usr/bin/sh ]; then
+            # The driver root has its own shell, so run the MPS commands inside a chroot; this avoids a library mismatch between the container and the host.
+            # This applies when the driver root is / (the default value) or /run/nvidia/driver (the default location where GPU Operator installs the driver).
+            RUN="chroot /driver-root sh -c"
+          else
+            # No shell in the driver root (e.g. GKE COS): run the commands directly from the container, with PATH and LD_LIBRARY_PATH extended to the driver root's bin and lib directories.
+            export PATH="/driver-root/usr/bin:/driver-root/bin:/driver-root/usr/local/bin:$PATH"
+            export LD_LIBRARY_PATH="/driver-root/lib64:/driver-root/lib:/driver-root/usr/lib64:/driver-root/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+            RUN="sh -c"
+
+            # Point MPS at the mounted paths. These are exported here rather than set in the Deployment spec because, with the chroot approach, /driver-root/... paths don't exist inside the chroot.
+            export CUDA_MPS_PIPE_DIRECTORY=/driver-root/tmp/nvidia-mps
+            export CUDA_MPS_LOG_DIRECTORY=/driver-root/var/log/nvidia-mps
+          fi
+
+          $RUN "nvidia-cuda-mps-control -d"
           {{- if .DefaultActiveThreadPercentage }}
-          echo set_default_active_thread_percentage {{.DefaultActiveThreadPercentage}} | nvidia-cuda-mps-control
+          $RUN "echo set_default_active_thread_percentage {{.DefaultActiveThreadPercentage}} | nvidia-cuda-mps-control"
           {{- end}}
           {{- range $id, $limit := .DefaultPinnedDeviceMemoryLimits }}
-          echo set_default_device_pinned_mem_limit {{ $id }} {{ $limit }} | nvidia-cuda-mps-control
+          $RUN "echo set_default_device_pinned_mem_limit {{ $id }} {{ $limit }} | nvidia-cuda-mps-control"
           {{- end}}
 
-          echo "startup complete" > /var/log/nvidia-mps/startup.log
+          echo "startup complete" > /driver-root/var/log/nvidia-mps/startup.log
 
-          tail -n +1 -f /var/log/nvidia-mps/control.log
+          tail -n +1 -f /driver-root/var/log/nvidia-mps/control.log
         startupProbe:
           exec:
             command: