fix(nvml-mock): use realistic per-link NVLink speeds in profiles

giuliocalzo · giuliocalzo · commit b1643646509d · 2026-06-22T09:07:22.000+02:00
The a100/h100/b200 profiles previously used the bidirectional-per-link
marketing figure (50/50/100 GB/s) rather than the per-link unidirectional
line rate that `nvidia-smi nvlink -s` actually reports. Align them with the
gb200/gb300 convention:

- a100 (NVLink3): 25781 MB/s (25.781 GB/s/link)
- h100 (NVLink4): 26562 MB/s (26.562 GB/s/link)
- b200 (NVLink5): 53125 MB/s (53.125 GB/s/link, same silicon as gb200)

Per-link x 2 x links still reproduces the marketed bidirectional aggregates
(~600 GB/s, ~900 GB/s, ~1.8 TB/s). Update the docs example to the NVLink4
rate and regenerate Helm snapshots.

Signed-off-by: Giulio Calzolari <gcalzolari@nvidia.com>
diff --git a/deployments/nvml-mock/helm/nvml-mock/profiles/a100.yaml b/deployments/nvml-mock/helm/nvml-mock/profiles/a100.yaml
@@ -373,7 +373,7 @@ devices:
 nvlink:
   version: 3
   links_per_gpu: 12
-  bandwidth_per_link_mbps: 50000    # 50 GB/s per link = 600 GB/s total per GPU
+  bandwidth_per_link_mbps: 25781    # NVLink3 25.781 GB/s/link unidirectional (`nvlink -s`); 12 links ~= 309 GB/s/GPU (600 GB/s bidir)
   c2c_enabled: false                # no Grace C2C on DGX A100
   # DGX A100: 6x NVSwitch -> NV12 all-to-all. Switch-link auto-expansion
   # (switches declared + links_per_gpu > 0, and no explicit per-device links)
diff --git a/deployments/nvml-mock/helm/nvml-mock/profiles/b200.yaml b/deployments/nvml-mock/helm/nvml-mock/profiles/b200.yaml
@@ -380,7 +380,7 @@ devices:
 nvlink:
   version: 5
   links_per_gpu: 18
-  bandwidth_per_link_mbps: 100000   # 100 GB/s per link = 1.8 TB/s total per GPU
+  bandwidth_per_link_mbps: 53125    # NVLink5 53.125 GB/s/link unidirectional (`nvlink -s`); 18 links ~= 0.95 TB/s/GPU (1.8 TB/s bidir)
   c2c_enabled: false                # no Grace C2C
   # Standalone B200: negative-control profile -- no NVSwitch, no fabricmanager.
   # topo -m shows PCIe paths only (no NV#); nvmlDeviceGetGpuFabricInfo returns
diff --git a/deployments/nvml-mock/helm/nvml-mock/profiles/h100.yaml b/deployments/nvml-mock/helm/nvml-mock/profiles/h100.yaml
@@ -389,7 +389,7 @@ devices:
 nvlink:
   version: 4
   links_per_gpu: 18
-  bandwidth_per_link_mbps: 50000    # 50 GB/s per link = 900 GB/s total per GPU
+  bandwidth_per_link_mbps: 26562    # NVLink4 26.562 GB/s/link unidirectional (`nvlink -s`); 18 links ~= 478 GB/s/GPU (900 GB/s bidir)
   c2c_enabled: false                # no Grace C2C on HGX H100
   # Switch-link auto-expansion: switches declared + links_per_gpu > 0 (and no
   # explicit per-device links) makes the engine synthesize 18 active links per
diff --git a/deployments/nvml-mock/helm/nvml-mock/tests/__snapshot__/configmap_test.yaml.snap b/deployments/nvml-mock/helm/nvml-mock/tests/__snapshot__/configmap_test.yaml.snap
@@ -385,7 +385,7 @@ should match snapshot with b200 profile:
         nvlink:
           version: 5
           links_per_gpu: 18
-          bandwidth_per_link_mbps: 100000   # 100 GB/s per link = 1.8 TB/s total per GPU
+          bandwidth_per_link_mbps: 53125    # NVLink5 53.125 GB/s/link unidirectional (`nvlink -s`); 18 links ~= 0.95 TB/s/GPU (1.8 TB/s bidir)
           c2c_enabled: false                # no Grace C2C
           # Standalone B200: negative-control profile -- no NVSwitch, no fabricmanager.
           # topo -m shows PCIe paths only (no NV#); nvmlDeviceGetGpuFabricInfo returns
@@ -819,7 +819,7 @@ should match snapshot with default a100 profile:
         nvlink:
           version: 3
           links_per_gpu: 12
-          bandwidth_per_link_mbps: 50000    # 50 GB/s per link = 600 GB/s total per GPU
+          bandwidth_per_link_mbps: 25781    # NVLink3 25.781 GB/s/link unidirectional (`nvlink -s`); 12 links ~= 309 GB/s/GPU (600 GB/s bidir)
           c2c_enabled: false                # no Grace C2C on DGX A100
           # DGX A100: 6x NVSwitch -> NV12 all-to-all. Switch-link auto-expansion
           # (switches declared + links_per_gpu > 0, and no explicit per-device links)
@@ -2275,7 +2275,7 @@ should match snapshot with h100 profile:
         nvlink:
           version: 4
           links_per_gpu: 18
-          bandwidth_per_link_mbps: 50000    # 50 GB/s per link = 900 GB/s total per GPU
+          bandwidth_per_link_mbps: 26562    # NVLink4 26.562 GB/s/link unidirectional (`nvlink -s`); 18 links ~= 478 GB/s/GPU (900 GB/s bidir)
           c2c_enabled: false                # no Grace C2C on HGX H100
           # Switch-link auto-expansion: switches declared + links_per_gpu > 0 (and no
           # explicit per-device links) makes the engine synthesize 18 active links per
diff --git a/deployments/nvml-mock/helm/nvml-mock/tests/__snapshot__/daemonset_test.yaml.snap b/deployments/nvml-mock/helm/nvml-mock/tests/__snapshot__/daemonset_test.yaml.snap
@@ -18,7 +18,7 @@ should match snapshot with all overrides:
       template:
         metadata:
           annotations:
-            checksum/config: 08de8a7b2e29b1c08e048f7ccdec437e023ba73a8f05ab3b915406fb3bcb1588
+            checksum/config: 55bb2f5a4208bf44b344cc0227800acef90b155558045d62d3f2bbf1c55b2184
           labels:
             app.kubernetes.io/instance: custom
             app.kubernetes.io/name: nvml-mock
@@ -143,7 +143,7 @@ should match snapshot with b200 profile:
       template:
         metadata:
           annotations:
-            checksum/config: a6a14db699978d788fdc32ee04278a4be30d980dfa36a8862e2c9810ec99840f
+            checksum/config: 3f424b3ca8461088c7b01c5753e4f75f59b7ed0bd48cb6875a4c7786d6b05a91
           labels:
             app.kubernetes.io/instance: RELEASE-NAME
             app.kubernetes.io/name: nvml-mock
@@ -247,7 +247,7 @@ should match snapshot with default values:
       template:
         metadata:
           annotations:
-            checksum/config: d0268a92daa1ffc7755ffb80e1f7e55577cecf07ea4793e2fc793a61e1c85c64
+            checksum/config: 9b3eb20285d8ff1481993ed8af7a3f69d2999cbf8e9097c308c13cc191a79c44
           labels:
             app.kubernetes.io/instance: RELEASE-NAME
             app.kubernetes.io/name: nvml-mock
diff --git a/docs/configuration.md b/docs/configuration.md
@@ -336,7 +336,7 @@ devices:
 nvlink:
   version: 4
   links_per_gpu: 18
-  bandwidth_per_link_mbps: 25000
+  bandwidth_per_link_mbps: 26562
   c2c_enabled: false
   links:
     - link: 0
diff --git a/pkg/gpu/mocknvml/configs/mock-nvml-config-a100.yaml b/pkg/gpu/mocknvml/configs/mock-nvml-config-a100.yaml
@@ -371,7 +371,7 @@ devices:
 nvlink:
   version: 3
   links_per_gpu: 12
-  bandwidth_per_link_mbps: 50000    # 50 GB/s per link = 600 GB/s total per GPU
+  bandwidth_per_link_mbps: 25781    # NVLink3 25.781 GB/s/link unidirectional (`nvlink -s`); 12 links ~= 309 GB/s/GPU (600 GB/s bidir)
   c2c_enabled: false                # no Grace C2C on DGX A100
   # DGX A100: 6x NVSwitch -> NV12 all-to-all. Switch-link auto-expansion
   # (switches declared + links_per_gpu > 0, and no explicit per-device links)
diff --git a/pkg/gpu/mocknvml/configs/mock-nvml-config-b200.yaml b/pkg/gpu/mocknvml/configs/mock-nvml-config-b200.yaml
@@ -380,7 +380,7 @@ devices:
 nvlink:
   version: 5
   links_per_gpu: 18
-  bandwidth_per_link_mbps: 100000   # 100 GB/s per link = 1.8 TB/s total per GPU
+  bandwidth_per_link_mbps: 53125    # NVLink5 53.125 GB/s/link unidirectional (`nvlink -s`); 18 links ~= 0.95 TB/s/GPU (1.8 TB/s bidir)
   c2c_enabled: false                # no Grace C2C
   # Standalone B200: negative-control profile -- no NVSwitch, no fabricmanager.
   # topo -m shows PCIe paths only (no NV#); nvmlDeviceGetGpuFabricInfo returns
diff --git a/pkg/gpu/mocknvml/configs/mock-nvml-config-h100.yaml b/pkg/gpu/mocknvml/configs/mock-nvml-config-h100.yaml
@@ -389,7 +389,7 @@ devices:
 nvlink:
   version: 4
   links_per_gpu: 18
-  bandwidth_per_link_mbps: 50000    # 50 GB/s per link = 900 GB/s total per GPU
+  bandwidth_per_link_mbps: 26562    # NVLink4 26.562 GB/s/link unidirectional (`nvlink -s`); 18 links ~= 478 GB/s/GPU (900 GB/s bidir)
   c2c_enabled: false                # no Grace C2C on HGX H100
   # Switch-link auto-expansion: switches declared + links_per_gpu > 0 (and no
   # explicit per-device links) makes the engine synthesize 18 active links per