@@ -20,6 +20,7 @@ RHEL_MAJOR_VERSION=9
2020RHEL_MINOR_VERSION=${RHEL_MINOR_VERSION:- " " }
2121KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:- auto}
2222MODPROBE_CONFIG_DIR=" /etc/modprobe.d"
23+ FABRIC_MANAGER_FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE:- 0}
2324
2425DRIVER_ARCH=${TARGETARCH/ amd64/ x86_64} && DRIVER_ARCH=${DRIVER_ARCH/ arm64/ aarch64}
2526echo " DRIVER_ARCH is $DRIVER_ARCH "
@@ -305,6 +306,86 @@ _ensure_nvlink5_prerequisites() (
305306 done
306307)
307308
#######################################
# Rewrite the fabric manager configuration for the requested fabric mode and
# make it use a UNIX socket. The file is left untouched unless
# FABRIC_MANAGER_FABRIC_MODE is exactly "1".
# Globals:   FABRIC_MANAGER_FABRIC_MODE (read)
# Arguments: $1 - path to the fabric manager config file (edited in place)
#            $2 - socket path written to UNIX_SOCKET_PATH and
#                 FM_CMD_UNIX_SOCKET_PATH
# Outputs:   progress messages on stdout
# Returns:   0 when no edit was needed or the edit succeeded, otherwise the
#            exit status of sed
#######################################
_configure_fabric_manager_config() {
    local fm_config_file="$1"
    local fmpm_socket_path="$2"

    if [ "${FABRIC_MANAGER_FABRIC_MODE}" != "1" ]; then
        return 0
    fi

    echo "Updating NVIDIA fabric manager configuration to fabric mode ${FABRIC_MANAGER_FABRIC_MODE}..."
    echo "Updating NVIDIA fabric manager configuration to use a UNIX socket instead of TCP: ${fmpm_socket_path}"

    # Every key is anchored at the start of the line so that commented-out
    # examples and keys that merely end in the same name are not rewritten.
    # A single sed run applies all three edits in one in-place pass.
    sed -i \
        -e "s|^FABRIC_MODE=.*|FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE}|" \
        -e "s|^UNIX_SOCKET_PATH=.*|UNIX_SOCKET_PATH=${fmpm_socket_path}|" \
        -e "s|^FM_CMD_UNIX_SOCKET_PATH=.*|FM_CMD_UNIX_SOCKET_PATH=${fmpm_socket_path}|" \
        -- "${fm_config_file}"
}
322+
#######################################
# Configure and start the NVIDIA fabric manager when the system has NVSwitch
# hardware. NVLink5+ systems are started through the
# nvidia-fabricmanager-start.sh wrapper; systems for which only
# _assert_nvswitch_system succeeds start nv-fabricmanager directly. When
# neither check succeeds nothing is started.
# Arguments: $1 - UNIX socket path handed to the config step; when empty or
#                 omitted, /run/nvidia-fabricmanager/fmpm.sock is used
# Outputs:   progress messages on stdout
# Returns:   1 when the NVLink5 prerequisites are not met, otherwise the
#            status of the last command run (0 when nothing was started)
#######################################
_setup_fabric_manager() {
    # Fall back to the default socket location so that an empty argument
    # never results in an empty UNIX_SOCKET_PATH being written to the config.
    local fmpm_socket_path="${1:-/run/nvidia-fabricmanager/fmpm.sock}"
    local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
    # Declared local so these paths do not leak into the caller's scope.
    local fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
    local nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
    local nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1

        _configure_fabric_manager_config "${fm_config_file}" "${fmpm_socket_path}"

        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        _configure_fabric_manager_config "${fm_config_file}" "${fmpm_socket_path}"

        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c "${fm_config_file}"
    fi
}
352+
#######################################
# Capture GPU PCI address to physical module ID mapping and persist to JSON file.
#
# Filters `nvidia-smi -q` output down to its "Module ... Id" and "Bus ... Id"
# lines, pairs every "Module Id" value with the next "Bus Id" value that
# follows it, and writes the pairs as one JSON object of the form
# {"<pci bus id>":"<module id>",...}.
# Arguments: $1 - (optional) output file; defaults to
#                 /run/nvidia-fabricmanager/gpu-pci-module-mapping.json
# Outputs:   progress, the filtered nvidia-smi lines, and warnings on stdout
# Returns:   0 also when nvidia-smi is unavailable or yields no mapping
#            (a warning is printed and no file is written)
#######################################
_capture_gpu_mapping() {
    local mapping_file="${1:-/run/nvidia-fabricmanager/gpu-pci-module-mapping.json}"
    # Regexes are kept in variables so the [[ =~ ]] operands need no escaping.
    local module_re='Module Id.*: ([0-9]+)'
    local bus_re='Bus Id.*: ([0-9A-Fa-f:.]+)'
    local gpu_mapping line
    local module_id="" pci_id="" json_entries=""

    echo "Capturing GPU PCI to Module ID mapping..."
    if ! command -v nvidia-smi > /dev/null 2>&1; then
        echo "Warning: nvidia-smi not available for GPU mapping"
        return 0
    fi

    # grep exits non-zero when nothing matches; tolerate that so the warning
    # below is reached instead of the shell aborting under `set -e`.
    gpu_mapping=$(nvidia-smi -q | grep -E "(Module|Bus).*Id") || true
    if [ -z "$gpu_mapping" ]; then
        echo "Warning: Could not retrieve GPU PCI to Module ID mapping"
        return 0
    fi

    echo "$gpu_mapping"

    # Parse and convert to JSON format
    while IFS= read -r line; do
        if [[ "$line" =~ $module_re ]]; then
            module_id="${BASH_REMATCH[1]}"
        elif [[ "$line" =~ $bus_re ]] && [ -n "$module_id" ]; then
            pci_id="${BASH_REMATCH[1]}"
            # Prefix a comma only when an earlier entry already exists.
            json_entries="${json_entries:+${json_entries},}\"${pci_id}\":\"${module_id}\""
            # A module id is consumed by exactly one bus id.
            module_id=""
        fi
    done <<< "$gpu_mapping"

    mkdir -p -- "$(dirname -- "$mapping_file")"
    echo "{${json_entries}}" > "$mapping_file"
    echo "GPU mapping saved to ${mapping_file}"
}
388+
308389# For each kernel module configuration file mounted into the container,
309390# parse the file contents and extract the custom module parameters that
310391# are to be passed as input to 'modprobe'.
@@ -380,6 +461,7 @@ _load_driver() {
380461 local nv_fw_search_path=" $RUN_DIR /driver/lib/firmware"
381462 local set_fw_path=" true"
382463 local fw_path_config_file=" /sys/module/firmware_class/parameters/path"
464+ local fmpm_socket_path=" /run/nvidia-fabricmanager/fmpm.sock"
383465 for param in " ${NVIDIA_MODULE_PARAMS[@]} " ; do
384466 if [[ " $param " == " NVreg_EnableGpuFirmware=0" ]]; then
385467 set_fw_path=" false"
@@ -696,8 +778,12 @@ _start_vgpu_topology_daemon() {
696778}
697779
698780_start_daemons () {
699- echo " Starting NVIDIA persistence daemon..."
700- nvidia-persistenced --persistence-mode
781+ if [ " ${FABRIC_MANAGER_FABRIC_MODE} " = " 1" ]; then
782+ echo " Skipping NVIDIA persistence daemon..."
783+ else
784+ echo " Starting NVIDIA persistence daemon..."
785+ nvidia-persistenced --persistence-mode
786+ fi
701787
702788 if [ " ${DRIVER_TYPE} " = " vgpu" ]; then
703789 echo " Copying gridd.conf..."
@@ -715,25 +801,7 @@ _start_daemons() {
715801 _start_vgpu_topology_daemon
716802 fi
717803
718- if _assert_nvlink5_system; then
719- _ensure_nvlink5_prerequisites || return 1
720- echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
721-
722- fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
723- fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
724- nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
725- nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
726- /usr/bin/nvidia-fabricmanager-start.sh --mode start \
727- --fm-config-file $fm_config_file \
728- --fm-pid-file $fm_pid_file \
729- --nvlsm-config-file $nvlsm_config_file \
730- --nvlsm-pid-file $nvlsm_pid_file
731-
732- # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
733- elif _assert_nvswitch_system; then
734- echo " Starting NVIDIA fabric manager daemon..."
735- nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
736- fi
804+ _setup_fabric_manager " ${fmpm_socket_path} "
737805}
738806
739807_store_driver_digest () {
@@ -817,6 +885,11 @@ _build() {
817885
818886_load () {
819887 _load_driver
888+
889+ if [ " ${FABRIC_MANAGER_FABRIC_MODE} " = " 1" ]; then
890+ _capture_gpu_mapping
891+ fi
892+
820893 _mount_rootfs
821894 _write_kernel_update_hook
822895 _store_driver_digest
0 commit comments