Skip to content

Commit eba26f6

Browse files
committed
feat: refactor device affinity and support strict device affinity requirement
1 parent b337b63 commit eba26f6

File tree

9 files changed

+1599
-1594
lines changed

9 files changed

+1599
-1594
lines changed

cmd/katalyst-agent/app/options/qrm/gpu_plugin.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ type GPUOptions struct {
3030
GPUMemoryAllocatablePerGPU string
3131
SkipGPUStateCorruption bool
3232
RDMADeviceNames []string
33+
RequiredDeviceAffinity bool
3334

3435
GPUStrategyOptions *gpustrategy.GPUStrategyOptions
3536
}
@@ -55,6 +56,8 @@ func (o *GPUOptions) AddFlags(fss *cliflag.NamedFlagSets) {
5556
fs.BoolVar(&o.SkipGPUStateCorruption, "skip-gpu-state-corruption",
5657
o.SkipGPUStateCorruption, "skip gpu state corruption, and it will be used after updating state properties")
5758
fs.StringSliceVar(&o.RDMADeviceNames, "rdma-resource-names", o.RDMADeviceNames, "The name of the RDMA resource")
59+
fs.BoolVar(&o.RequiredDeviceAffinity, "gpu-required-device-affinity", o.RequiredDeviceAffinity,
60+
"required device affinity, and when true it will cause pods to admit fail if unable to meet device affinity")
5861
o.GPUStrategyOptions.AddFlags(fss)
5962
}
6063

@@ -71,5 +74,6 @@ func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error {
7174
if err := o.GPUStrategyOptions.ApplyTo(conf.GPUStrategyConfig); err != nil {
7275
return err
7376
}
77+
conf.RequiredDeviceAffinity = o.RequiredDeviceAffinity
7478
return nil
7579
}

pkg/agent/qrm-plugins/gpu/baseplugin/base.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -246,7 +246,7 @@ func (p *BasePlugin) UpdateAllocatableAssociatedDevices(
246246
deviceTopology.Devices[device.ID] = machine.DeviceInfo{
247247
Health: device.Health,
248248
NumaNodes: numaNode,
249-
DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs),
249+
DeviceAffinity: make(map[machine.Dimension]machine.DeviceIDs),
250250
}
251251
}
252252

0 commit comments

Comments
 (0)