From 9dfd36b142f4b573b3d896e1a9109b122bacace6 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Mon, 24 Mar 2025 12:25:35 -0700 Subject: [PATCH] update topology labels Signed-off-by: Dmitry Shmulevich --- .../nwtopo/templates/jobset/jobset-acc.yaml | 2 +- .../nwtopo/templates/jobset/jobset.yaml | 4 +- .../nwtopo/templates/runai/mpijob.yaml | 4 +- .../nwtopo/workflows/config-nodes-acc.yaml | 96 +++++++++---------- .../nwtopo/workflows/config-nodes.yaml | 72 +++++++------- 5 files changed, 89 insertions(+), 89 deletions(-) diff --git a/resources/benchmarks/nwtopo/templates/jobset/jobset-acc.yaml b/resources/benchmarks/nwtopo/templates/jobset/jobset-acc.yaml index b6ec7cdc..6fa36fb0 100644 --- a/resources/benchmarks/nwtopo/templates/jobset/jobset-acc.yaml +++ b/resources/benchmarks/nwtopo/templates/jobset/jobset-acc.yaml @@ -51,7 +51,7 @@ spec: operator: In values: - {{._NAME_}} - topologyKey: network.topology.kubernetes.io/accelerator + topologyKey: network.topology.nvidia.com/accelerator containers: - name: test image: ubuntu diff --git a/resources/benchmarks/nwtopo/templates/jobset/jobset.yaml b/resources/benchmarks/nwtopo/templates/jobset/jobset.yaml index 413e4ca7..4cfcbd0d 100644 --- a/resources/benchmarks/nwtopo/templates/jobset/jobset.yaml +++ b/resources/benchmarks/nwtopo/templates/jobset/jobset.yaml @@ -53,7 +53,7 @@ spec: operator: In values: - {{._NAME_}} - topologyKey: network.topology.kubernetes.io/spine + topologyKey: network.topology.nvidia.com/spine - weight: 90 podAffinityTerm: labelSelector: @@ -62,7 +62,7 @@ spec: operator: In values: - {{._NAME_}} - topologyKey: network.topology.kubernetes.io/block + topologyKey: network.topology.nvidia.com/block containers: - name: test image: ubuntu diff --git a/resources/benchmarks/nwtopo/templates/runai/mpijob.yaml b/resources/benchmarks/nwtopo/templates/runai/mpijob.yaml index 0ac015a2..1285513e 100644 --- a/resources/benchmarks/nwtopo/templates/runai/mpijob.yaml +++ b/resources/benchmarks/nwtopo/templates/runai/mpijob.yaml @@ -51,7 +51,7 @@ spec: operator: In values: - {{._NAME_}} - topologyKey: network.topology.kubernetes.io/spine + topologyKey: network.topology.nvidia.com/spine - weight: 90 podAffinityTerm: labelSelector: @@ -60,7 +60,7 @@ spec: operator: In values: - {{._NAME_}} - topologyKey: network.topology.kubernetes.io/block + topologyKey: network.topology.nvidia.com/block schedulerName: runai-scheduler containers: - image: runai/mpi-worker:latest diff --git a/resources/benchmarks/nwtopo/workflows/config-nodes-acc.yaml b/resources/benchmarks/nwtopo/workflows/config-nodes-acc.yaml index 632d501c..f6313ce6 100644 --- a/resources/benchmarks/nwtopo/workflows/config-nodes-acc.yaml +++ b/resources/benchmarks/nwtopo/workflows/config-nodes-acc.yaml @@ -38,108 +38,108 @@ tasks: count: 1 labels: node-id: n1 - network.topology.kubernetes.io/accelerator: nvl1 - network.topology.kubernetes.io/block: sw11 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl1 + network.topology.nvidia.com/block: sw11 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n2 - network.topology.kubernetes.io/accelerator: nvl1 - network.topology.kubernetes.io/block: sw11 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl1 + network.topology.nvidia.com/block: sw11 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n3 - network.topology.kubernetes.io/accelerator: nvl1 - network.topology.kubernetes.io/block: sw11 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl1 + network.topology.nvidia.com/block: sw11 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n4 - network.topology.kubernetes.io/accelerator: nvl1 - network.topology.kubernetes.io/block: sw12 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl1 + network.topology.nvidia.com/block: sw12 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n5 - network.topology.kubernetes.io/accelerator: nvl1 - network.topology.kubernetes.io/block: sw12 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl1 + network.topology.nvidia.com/block: sw12 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n6 - network.topology.kubernetes.io/accelerator: nvl1 - network.topology.kubernetes.io/block: sw12 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl1 + network.topology.nvidia.com/block: sw12 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n7 - network.topology.kubernetes.io/accelerator: nvl2 - network.topology.kubernetes.io/block: sw13 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl2 + network.topology.nvidia.com/block: sw13 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n8 - network.topology.kubernetes.io/accelerator: nvl2 - network.topology.kubernetes.io/block: sw13 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl2 + network.topology.nvidia.com/block: sw13 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n9 - network.topology.kubernetes.io/accelerator: nvl2 - network.topology.kubernetes.io/block: sw13 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl2 + network.topology.nvidia.com/block: sw13 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n10 - network.topology.kubernetes.io/accelerator: nvl2 - network.topology.kubernetes.io/block: sw14 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl2 + network.topology.nvidia.com/block: sw14 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n11 - network.topology.kubernetes.io/accelerator: nvl2 - network.topology.kubernetes.io/block: sw14 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl2 + network.topology.nvidia.com/block: sw14 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n12 - network.topology.kubernetes.io/accelerator: nvl2 - network.topology.kubernetes.io/block: sw14 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/accelerator: nvl2 + network.topology.nvidia.com/block: sw14 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" timeout: 5m diff --git a/resources/benchmarks/nwtopo/workflows/config-nodes.yaml b/resources/benchmarks/nwtopo/workflows/config-nodes.yaml index 25735fc3..c715cb29 100644 --- a/resources/benchmarks/nwtopo/workflows/config-nodes.yaml +++ b/resources/benchmarks/nwtopo/workflows/config-nodes.yaml @@ -35,100 +35,100 @@ tasks: count: 1 labels: node-id: n1 - network.topology.kubernetes.io/block: sw11 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw11 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n2 - network.topology.kubernetes.io/block: sw11 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw11 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n3 - network.topology.kubernetes.io/block: sw12 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw12 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n4 - network.topology.kubernetes.io/block: sw12 - network.topology.kubernetes.io/spine: sw21 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw12 + network.topology.nvidia.com/spine: sw21 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n5 - network.topology.kubernetes.io/block: sw13 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw13 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 net-optimal: true nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n6 - network.topology.kubernetes.io/block: sw13 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw13 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n7 - network.topology.kubernetes.io/block: sw14 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw14 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 net-optimal: true nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n8 - network.topology.kubernetes.io/block: sw14 - network.topology.kubernetes.io/spine: sw22 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw14 + network.topology.nvidia.com/spine: sw22 + network.topology.nvidia.com/datacenter: sw31 net-optimal: true nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n9 - network.topology.kubernetes.io/block: sw15 - network.topology.kubernetes.io/spine: sw23 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw15 + network.topology.nvidia.com/spine: sw23 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n10 - network.topology.kubernetes.io/block: sw15 - network.topology.kubernetes.io/spine: sw23 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw15 + network.topology.nvidia.com/spine: sw23 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n11 - network.topology.kubernetes.io/block: sw16 - network.topology.kubernetes.io/spine: sw23 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw16 + network.topology.nvidia.com/spine: sw23 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" - type: dgxa100.80g count: 1 labels: node-id: n12 - network.topology.kubernetes.io/block: sw16 - network.topology.kubernetes.io/spine: sw23 - network.topology.kubernetes.io/datacenter: sw31 + network.topology.nvidia.com/block: sw16 + network.topology.nvidia.com/spine: sw23 + network.topology.nvidia.com/datacenter: sw31 nvidia.com/gpu.count: "8" timeout: 5m - id: update