Skip to content

Commit 078ef34

Browse files
author
Michail Resvanis
committed
Add support for fabric manager shared-nvswitch mode
The changes include: - add the `FABRIC_MANAGER_FABRIC_MODE` env var, which configures FM with either the full-passthrough (0) or the shared-nvswitch (1) fabric mode. It defaults to 0. - when the fabric mode is set to 0, the flow is unchanged, i.e. the fabric manager daemon is executed with its default configuration. - when the fabric mode is set to 1: - edit the fabric manager configuration file and set `FABRIC_MODE=1`. - persist the mapping of physical GPU module IDs to their PCIe addresses by creating a JSON file on disk (the physical GPU module IDs are available through nvidia-smi). - disable `nvidia-persistenced`, as the GPU devices should be unbound from the NVIDIA driver and bound to vfio-pci (a step executed by the vfio-manager). Signed-off-by: Michail Resvanis <mresvani@redhat.com>
1 parent 4aa2a8e commit 078ef34

2 files changed

Lines changed: 111 additions & 22 deletions

File tree

rhel9/Dockerfile

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ ENV DISABLE_VGPU_VERSION_CHECK=$DISABLE_VGPU_VERSION_CHECK
5353
# Avoid dependency of container-toolkit for driver container
5454
ENV NVIDIA_VISIBLE_DEVICES=void
5555

56+
# Fabric manager fabric mode, default is 0 (full-passthrough)
57+
ARG FABRIC_MANAGER_FABRIC_MODE=0
58+
ENV FABRIC_MANAGER_FABRIC_MODE=$FABRIC_MANAGER_FABRIC_MODE
59+
5660
ADD install.sh /tmp/
5761

5862
RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \
@@ -74,7 +78,19 @@ RUN if [ "$DRIVER_TYPE" != "vgpu" ]; then \
7478
cd drivers && \
7579
DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} && \
7680
curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run && \
77-
chmod +x NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run; fi
81+
chmod +x NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run && \
82+
versionArray=(${DRIVER_VERSION//./ }); \
83+
DRIVER_BRANCH=${versionArray[0]}; \
84+
dnf install git -y && \
85+
dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
86+
dnf module enable -y nvidia-driver:${DRIVER_BRANCH}-dkms && \
87+
dnf install -y nvidia-fabric-manager-${DRIVER_VERSION}-1 nvidia-fabric-manager-devel-${DRIVER_VERSION}-1 libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION}-1 jsoncpp-devel gcc-c++ make && \
88+
git clone https://github.com/mresvanis/Fabric-Manager-Client.git && \
89+
cd Fabric-Manager-Client && \
90+
git checkout fix-ignoring-unix-socket && \
91+
make fmpm && \
92+
cp fmpm /usr/bin/ && \
93+
chmod +x /usr/bin/fmpm; fi
7894

7995
# Fetch the installer, fabricmanager, libnvidia-nscq, libnvsdm, imex packages
8096
RUN sh /tmp/install.sh extrapkgsinstall

rhel9/nvidia-driver

Lines changed: 94 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ RHEL_MAJOR_VERSION=9
2020
RHEL_MINOR_VERSION=${RHEL_MINOR_VERSION:-""}
2121
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
2222
MODPROBE_CONFIG_DIR="/etc/modprobe.d"
23+
FABRIC_MANAGER_FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE:-0}
2324

2425
DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
2526
echo "DRIVER_ARCH is $DRIVER_ARCH"
@@ -305,6 +306,86 @@ _ensure_nvlink5_prerequisites() (
305306
done
306307
)
307308

309+
# Apply the shared-nvswitch settings to the fabric manager configuration file.
#
# Nothing is changed unless FABRIC_MANAGER_FABRIC_MODE is "1". In that mode the
# FABRIC_MODE key is set to the requested mode and both UNIX socket keys
# (UNIX_SOCKET_PATH and FM_CMD_UNIX_SOCKET_PATH) are pointed at the given
# socket path. Only lines that start with the key are rewritten, so comment
# lines mentioning a key are left untouched.
#
# Globals:   FABRIC_MANAGER_FABRIC_MODE (read)
# Arguments: $1 - path to the fabric manager configuration file
#            $2 - UNIX socket path to write into the configuration
# Returns:   0 on success or when no change is required; non-zero when the
#            configuration file is missing or cannot be edited
_configure_fabric_manager_config() {
    local fm_config_file="$1"
    local fmpm_socket_path="$2"
    local escaped_socket_path

    if [ "${FABRIC_MANAGER_FABRIC_MODE}" != "1" ]; then
        return 0
    fi

    if [ ! -f "${fm_config_file}" ]; then
        echo "NVIDIA fabric manager configuration file not found: ${fm_config_file}" >&2
        return 1
    fi

    echo "Updating NVIDIA fabric manager configuration to fabric mode ${FABRIC_MANAGER_FABRIC_MODE}..."
    # Anchor the key at the start of the line, consistent with the socket keys below.
    sed -i "s/^FABRIC_MODE=.*/FABRIC_MODE=${FABRIC_MANAGER_FABRIC_MODE}/" "${fm_config_file}" || return 1

    echo "Updating NVIDIA fabric manager configuration to use a UNIX socket instead of TCP: ${fmpm_socket_path}"
    # '&', '|' and '\' are special in the replacement part of the sed
    # expressions below ('|' being the delimiter), so escape them first.
    escaped_socket_path=$(printf '%s' "${fmpm_socket_path}" | sed 's/[&|\\]/\\&/g') || return 1
    sed -i "s|^UNIX_SOCKET_PATH=.*|UNIX_SOCKET_PATH=${escaped_socket_path}|" "${fm_config_file}" || return 1
    sed -i "s|^FM_CMD_UNIX_SOCKET_PATH=.*|FM_CMD_UNIX_SOCKET_PATH=${escaped_socket_path}|" "${fm_config_file}" || return 1
}
322+
323+
_setup_fabric_manager() {
324+
local fmpm_socket_path="$1"
325+
local fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
326+
327+
if _assert_nvlink5_system; then
328+
_ensure_nvlink5_prerequisites || return 1
329+
330+
_configure_fabric_manager_config "${fm_config_file}" "${fmpm_socket_path}"
331+
332+
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
333+
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
334+
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
335+
336+
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
337+
338+
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
339+
--fm-config-file $fm_config_file \
340+
--fm-pid-file $fm_pid_file \
341+
--nvlsm-config-file $nvlsm_config_file \
342+
--nvlsm-pid-file $nvlsm_pid_file
343+
344+
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
345+
elif _assert_nvswitch_system; then
346+
_configure_fabric_manager_config "${fm_config_file}" "${fmpm_socket_path}"
347+
348+
echo "Starting NVIDIA fabric manager daemon..."
349+
nv-fabricmanager -c $fm_config_file
350+
fi
351+
}
352+
353+
# Capture GPU PCI address to physical module ID mapping and persist to JSON file.
354+
_capture_gpu_mapping() {
355+
local gpu_mapping
356+
357+
echo "Capturing GPU PCI to Module ID mapping..."
358+
if command -v nvidia-smi >/dev/null 2>&1; then
359+
gpu_mapping=$(nvidia-smi -q | egrep "(Module|Bus).*Id")
360+
if [ -n "$gpu_mapping" ]; then
361+
echo "$gpu_mapping"
362+
# Parse and convert to JSON format
363+
json_entries=""
364+
module_id=""
365+
while IFS= read -r line; do
366+
if [[ "$line" =~ Module\ Id.*:\ ([0-9]+) ]]; then
367+
module_id="${BASH_REMATCH[1]}"
368+
elif [[ "$line" =~ Bus\ Id.*:\ ([0-9A-Fa-f:\.]+) ]] && [ -n "$module_id" ]; then
369+
pci_id="${BASH_REMATCH[1]}"
370+
if [ -n "$json_entries" ]; then
371+
json_entries="${json_entries},"
372+
fi
373+
json_entries="${json_entries}\"${pci_id}\": \"${module_id}\""
374+
module_id=""
375+
fi
376+
done <<< "$gpu_mapping"
377+
378+
mkdir -p /run/nvidia-fabricmanager
379+
echo "{${json_entries}}" > /run/nvidia-fabricmanager/gpu-pci-module-mapping.json
380+
echo "GPU mapping saved to /run/nvidia-fabricmanager/gpu-pci-module-mapping.json"
381+
else
382+
echo "Warning: Could not retrieve GPU PCI to Module ID mapping"
383+
fi
384+
else
385+
echo "Warning: nvidia-smi not available for GPU mapping"
386+
fi
387+
}
388+
308389
# For each kernel module configuration file mounted into the container,
309390
# parse the file contents and extract the custom module parameters that
310391
# are to be passed as input to 'modprobe'.
@@ -380,6 +461,7 @@ _load_driver() {
380461
local nv_fw_search_path="$RUN_DIR/driver/lib/firmware"
381462
local set_fw_path="true"
382463
local fw_path_config_file="/sys/module/firmware_class/parameters/path"
464+
local fmpm_socket_path="/run/nvidia-fabricmanager/fmpm.sock"
383465
for param in "${NVIDIA_MODULE_PARAMS[@]}"; do
384466
if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then
385467
set_fw_path="false"
@@ -696,8 +778,12 @@ _start_vgpu_topology_daemon() {
696778
}
697779

698780
_start_daemons() {
699-
echo "Starting NVIDIA persistence daemon..."
700-
nvidia-persistenced --persistence-mode
781+
if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
782+
echo "Skipping NVIDIA persistence daemon..."
783+
else
784+
echo "Starting NVIDIA persistence daemon..."
785+
nvidia-persistenced --persistence-mode
786+
fi
701787

702788
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
703789
echo "Copying gridd.conf..."
@@ -715,25 +801,7 @@ _start_daemons() {
715801
_start_vgpu_topology_daemon
716802
fi
717803

718-
if _assert_nvlink5_system; then
719-
_ensure_nvlink5_prerequisites || return 1
720-
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
721-
722-
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
723-
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
724-
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
725-
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
726-
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
727-
--fm-config-file $fm_config_file \
728-
--fm-pid-file $fm_pid_file \
729-
--nvlsm-config-file $nvlsm_config_file \
730-
--nvlsm-pid-file $nvlsm_pid_file
731-
732-
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
733-
elif _assert_nvswitch_system; then
734-
echo "Starting NVIDIA fabric manager daemon..."
735-
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
736-
fi
804+
_setup_fabric_manager "${fmpm_socket_path}"
737805
}
738806

739807
_store_driver_digest() {
@@ -817,6 +885,11 @@ _build() {
817885

818886
_load() {
819887
_load_driver
888+
889+
if [ "${FABRIC_MANAGER_FABRIC_MODE}" = "1" ]; then
890+
_capture_gpu_mapping
891+
fi
892+
820893
_mount_rootfs
821894
_write_kernel_update_hook
822895
_store_driver_digest

0 commit comments

Comments
 (0)