Skip to content

Commit a2b27c9

Browse files
committed
fix: determine if the default resource state should be init at start
1 parent 38c81a8 commit a2b27c9

File tree

28 files changed

+440
-99
lines changed

28 files changed

+440
-99
lines changed

pkg/agent/qrm-plugins/gpu/baseplugin/base.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -291,15 +291,19 @@ func (p *BasePlugin) GenerateMachineStateFromPodEntries(
291291
// For example, we may have multiple device names for the same device type, e.g. "nvidia.com/gpu" and "hw.com/npu",
292292
// so we map them to the same device type, which allows us to allocate them interchangeably.
293293
func (p *BasePlugin) RegisterDeviceNames(deviceNames []string, deviceType string) {
294-
for _, deviceeName := range deviceNames {
295-
p.deviceNameToTypeMap[deviceeName] = deviceType
294+
for _, deviceName := range deviceNames {
295+
p.deviceNameToTypeMap[deviceName] = deviceType
296296
if _, ok := p.deviceTypeToNames[deviceType]; !ok {
297297
p.deviceTypeToNames[deviceType] = sets.NewString()
298298
}
299-
p.deviceTypeToNames[deviceType].Insert(deviceeName)
299+
p.deviceTypeToNames[deviceType].Insert(deviceName)
300300
}
301301
}
302302

303+
func (p *BasePlugin) GetDeviceNameToTypeMap() map[string]string {
304+
return p.deviceNameToTypeMap
305+
}
306+
303307
// ResolveResourceName takes in a resourceName and tries to find a mapping of resource type from deviceNameToTypeMap.
304308
// If no mapping is found, resourceName is returned if fallback is true. If fallback is false, an empty string is returned.
305309
func (p *BasePlugin) ResolveResourceName(resourceName string, fallback bool) string {

pkg/agent/qrm-plugins/gpu/baseplugin/reporter/reporter.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -206,9 +206,9 @@ func (p *gpuReporterPlugin) GetReportContent(ctx context.Context, _ *v1alpha1.Em
206206

207207
func (p *gpuReporterPlugin) buildReportResponse() (*v1alpha1.GetReportContentResponse, error) {
208208
// The reporter picks the latest topology from all configured GPU devices to report to CNR.
209-
topologiesMap, err := p.deviceTopologyRegistry.GetDeviceTopologies(p.gpuDeviceNames)
210-
if err != nil {
211-
return nil, err
209+
topologiesMap, ok := p.deviceTopologyRegistry.GetDeviceTopologies(p.gpuDeviceNames)
210+
if !ok {
211+
return nil, fmt.Errorf("failed to get any device topology")
212212
}
213213
latestDeviceTopology := machine.PickLatestDeviceTopology(topologiesMap)
214214

@@ -273,7 +273,7 @@ func (p *gpuReporterPlugin) getTopologyZoneReportField(topologiesMap map[string]
273273
return nil, fmt.Errorf("no zone resources found for device topology")
274274
}
275275

276-
zoneAllocations := p.getZoneAllocations(machineState)
276+
zoneAllocations := p.getZoneAllocations(topologiesMap, machineState)
277277

278278
generatedTopologyZones := topologyZoneGenerator.GenerateTopologyZoneStatus(zoneAllocations, zoneResources,
279279
zoneAttributes, nil, nil, nil)
@@ -424,7 +424,7 @@ func (p *gpuReporterPlugin) getZoneResources(topologiesMap map[string]*machine.D
424424
}
425425

426426
// getZoneAllocations returns the map of gpu zone nodes to their pod allocations
427-
func (p *gpuReporterPlugin) getZoneAllocations(machineState state.AllocationResourcesMap) map[util.ZoneNode]util.ZoneAllocations {
427+
func (p *gpuReporterPlugin) getZoneAllocations(topologiesMap map[string]*machine.DeviceTopology, machineState state.AllocationResourcesMap) map[util.ZoneNode]util.ZoneAllocations {
428428
// First construct map of device id to allocations
429429
idToAllocations := make(map[string]util.ZoneAllocations)
430430

@@ -444,6 +444,11 @@ func (p *gpuReporterPlugin) getZoneAllocations(machineState state.AllocationReso
444444

445445
// Override the resource name if there is a specified device name
446446
if allocInfo.DeviceName != "" {
447+
// Skip reporting if it is not a GPU device
448+
if _, ok := topologiesMap[allocInfo.DeviceName]; !ok {
449+
continue
450+
}
451+
447452
resourceName = v1.ResourceName(allocInfo.DeviceName)
448453
}
449454

pkg/agent/qrm-plugins/gpu/baseplugin/reporter/reporter_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2216,6 +2216,61 @@ func TestGpuReporterPlugin_GetReportContent(t *testing.T) {
22162216
},
22172217
},
22182218
},
2219+
{
2220+
name: "Allocations are skipped when allocation DeviceName is not a GPU device",
2221+
deviceTopology: &machine.DeviceTopology{
2222+
PriorityDimensions: []string{"numa"},
2223+
Devices: map[string]machine.DeviceInfo{
2224+
"gpu-0": {
2225+
Health: pluginapi.Healthy,
2226+
NumaNodes: []int{0},
2227+
DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{{PriorityLevel: 0, Dimension: machine.Dimension{Name: "numa", Value: "0"}}: {}},
2228+
},
2229+
},
2230+
},
2231+
machineTopology: []cadvisorapi.Node{{Id: 0, Cores: []cadvisorapi.Core{{SocketID: 0, Id: 0, Threads: []int{0, 4}}}}},
2232+
machineState: state.AllocationResourcesMap{
2233+
v1.ResourceName("test-gpu-resource"): state.AllocationMap{
2234+
"gpu-0": {
2235+
Allocatable: 1,
2236+
PodEntries: state.PodEntries{
2237+
"pod-uid-0": state.ContainerEntries{
2238+
"c0": &state.AllocationInfo{
2239+
AllocationMeta: commonstate.AllocationMeta{PodUid: "pod-uid-0", PodNamespace: "default", PodName: "p0", ContainerName: "c0"},
2240+
DeviceName: "not-a-gpu-device",
2241+
AllocatedAllocation: state.Allocation{Quantity: 1, NUMANodes: []int{0}},
2242+
},
2243+
},
2244+
},
2245+
},
2246+
},
2247+
},
2248+
expectedSpec: []*nodev1alpha1.Property{{PropertyName: propertyNameGPUTopology, PropertyValues: []string{"numa"}}},
2249+
expectedStatus: []*nodev1alpha1.TopologyZone{
2250+
{
2251+
Type: nodev1alpha1.TopologyTypeSocket,
2252+
Name: "0",
2253+
Children: []*nodev1alpha1.TopologyZone{
2254+
{
2255+
Type: nodev1alpha1.TopologyTypeNuma,
2256+
Name: "0",
2257+
Children: []*nodev1alpha1.TopologyZone{
2258+
{
2259+
Type: nodev1alpha1.TopologyTypeGPU,
2260+
Name: "gpu-0",
2261+
Attributes: []nodev1alpha1.Attribute{{Name: "numa", Value: "0"}},
2262+
Resources: nodev1alpha1.Resources{
2263+
Allocatable: &v1.ResourceList{"test-gpu": resource.MustParse("1")},
2264+
Capacity: &v1.ResourceList{"test-gpu": resource.MustParse("1")},
2265+
},
2266+
Allocations: []*nodev1alpha1.Allocation{},
2267+
},
2268+
},
2269+
},
2270+
},
2271+
},
2272+
},
2273+
},
22192274
{
22202275
name: "Merge resources across multiple device names for same device type",
22212276
gpuDeviceNames: []string{"test-gpu-a", "test-gpu-b"},

pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ func NewGPUDevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDe
5555
// GPUDeviceType is the key used for GPU state management in the QRM framework,
5656
// while GPUDeviceNames are the actual resource names used to fetch the GPU device topologies.
5757
base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.GPUDeviceType,
58-
state.NewGenericDefaultResourceStateGenerator(base.Conf.GPUDeviceNames, base.DeviceTopologyRegistry, 1))
58+
state.NewGenericDefaultResourceStateGenerator(base.Conf.GPUDeviceNames, base.DeviceTopologyRegistry, 1, true))
5959
base.RegisterDeviceNames(base.Conf.GPUDeviceNames, gpuconsts.GPUDeviceType)
6060

6161
return &GPUDevicePlugin{
@@ -149,6 +149,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(
149149
qosLevel,
150150
deviceReq.DeviceName,
151151
"",
152+
p.GetDeviceNameToTypeMap(),
152153
)
153154
if err != nil {
154155
return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err)

pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomD
4949
// RDMADeviceType is the key used for RDMA state management in the QRM framework,
5050
// while RDMADeviceNames are the actual resource names used to fetch the RDMA device topologies
5151
base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.RDMADeviceType,
52-
state.NewGenericDefaultResourceStateGenerator(base.Conf.RDMADeviceNames, base.DeviceTopologyRegistry))
53-
base.RegisterDeviceNameToType(base.Conf.RDMADeviceNames, gpuconsts.RDMADeviceType)
52+
state.NewGenericDefaultResourceStateGenerator(base.Conf.RDMADeviceNames, base.DeviceTopologyRegistry, 1, false))
53+
base.RegisterDeviceNames(base.Conf.RDMADeviceNames, gpuconsts.RDMADeviceType)
5454

5555
return &RDMADevicePlugin{
5656
BasePlugin: base,
@@ -139,6 +139,7 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice(
139139
qosLevel,
140140
rdmaDeviceName,
141141
accompanyResourceName,
142+
p.GetDeviceNameToTypeMap(),
142143
)
143144
if err != nil {
144145
return nil, fmt.Errorf("RDMA allocation using strategy failed: %v", err)
@@ -170,6 +171,7 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice(
170171
Quantity: float64(len(allocatedRdmaDevices)),
171172
NUMANodes: hintNodes.ToSliceInt(),
172173
},
174+
DeviceName: deviceReq.DeviceName,
173175
}
174176

175177
allocationInfo.TopologyAwareAllocations = topologyAwareAllocations

pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
3333
gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
3434
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
35+
gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util"
3536
"github.com/kubewharf/katalyst-core/pkg/config"
3637
"github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/statedirectory"
3738
"github.com/kubewharf/katalyst-core/pkg/metaserver"
@@ -100,6 +101,9 @@ func makeTestBasePlugin(t *testing.T) *baseplugin.BasePlugin {
100101
conf.StateDirectoryConfiguration = &statedirectory.StateDirectoryConfiguration{
101102
StateFileDirectory: tmpDir,
102103
}
104+
// Ensure both GPU and RDMA device names are configured so that
105+
// gpuutil.ResolveResourceName can resolve logical types in tests
106+
conf.GPUDeviceNames = []string{"test-gpu"}
103107
conf.RDMADeviceNames = []string{"test-rdma"}
104108

105109
basePlugin, err := baseplugin.NewBasePlugin(agentCtx, conf, metrics.DummyMetrics{})
@@ -110,13 +114,14 @@ func makeTestBasePlugin(t *testing.T) *baseplugin.BasePlugin {
110114

111115
basePlugin.SetState(stateImpl)
112116

113-
// Register gpu device type and gpu device topology provider as it is an accompany resource for rdma
114-
basePlugin.RegisterDeviceNameToType([]string{"test-gpu"}, gpuconsts.GPUDeviceType)
115117
gpuTopologyProvider := machine.NewDeviceTopologyProvider()
116118
basePlugin.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceType, gpuTopologyProvider)
117119
// Also register a provider for rdma device type to align with plugin lookup by type
118120
basePlugin.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, machine.NewDeviceTopologyProvider())
119121

122+
// Register GPU device names for resource name resolution in tests.
123+
basePlugin.RegisterDeviceNames(conf.GPUDeviceNames, gpuconsts.GPUDeviceType)
124+
120125
return basePlugin
121126
}
122127

@@ -466,7 +471,7 @@ func TestRDMADevicePlugin_AllocateAssociatedDevices(t *testing.T) {
466471
}
467472

468473
if tt.accompanyResourceAllocationInfo != nil && tt.accompanyResourceName != "" {
469-
accompanyResourceType := basePlugin.ResolveResourceName(tt.accompanyResourceName, false)
474+
accompanyResourceType := gpuutil.ResolveResourceName(basePlugin.GetDeviceNameToTypeMap(), tt.accompanyResourceName, false)
470475
assert.NotEmpty(t, accompanyResourceType)
471476
basePlugin.GetState().SetAllocationInfo(v1.ResourceName(accompanyResourceType), tt.podUID, tt.containerName, tt.accompanyResourceAllocationInfo, false)
472477
}
@@ -480,7 +485,7 @@ func TestRDMADevicePlugin_AllocateAssociatedDevices(t *testing.T) {
480485
}
481486

482487
if tt.accompanyResourceName != "" && tt.accompanyDeviceTopology != nil {
483-
accompanyResourceType := basePlugin.ResolveResourceName(tt.accompanyResourceName, false)
488+
accompanyResourceType := gpuutil.ResolveResourceName(basePlugin.GetDeviceNameToTypeMap(), tt.accompanyResourceName, false)
484489
assert.NotEmpty(t, accompanyResourceType)
485490
err := basePlugin.DeviceTopologyRegistry.SetDeviceTopology(accompanyResourceType, tt.accompanyDeviceTopology)
486491
assert.NoError(t, err)

pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ func NewGPUMemPlugin(base *baseplugin.BasePlugin) resourceplugin.ResourcePlugin
5252
// string(consts.ResourceGPUMemory) is the key used for state management in the QRM framework,
5353
// while GPUDeviceNames are the actual resource names used to fetch the device topologies.
5454
base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(string(consts.ResourceGPUMemory),
55-
state.NewGenericDefaultResourceStateGenerator(base.Conf.GPUDeviceNames, base.DeviceTopologyRegistry, float64(base.Conf.GPUMemoryAllocatablePerGPU.Value())))
55+
state.NewGenericDefaultResourceStateGenerator(base.Conf.GPUDeviceNames, base.DeviceTopologyRegistry,
56+
float64(base.Conf.GPUMemoryAllocatablePerGPU.Value()), true))
5657
return &GPUMemPlugin{
5758
BasePlugin: base,
5859
}
@@ -535,6 +536,7 @@ func (p *GPUMemPlugin) Allocate(
535536
qosLevel,
536537
deviceReq.DeviceName,
537538
"",
539+
p.GetDeviceNameToTypeMap(),
538540
)
539541
if err != nil {
540542
return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err)

pkg/agent/qrm-plugins/gpu/state/generator_stub.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ func NewDefaultResourceStateGeneratorStub() DefaultResourceStateGenerator {
2424
return &defaultResourceStateGeneratorStub{}
2525
}
2626

27+
func (d *defaultResourceStateGeneratorStub) MustInitDefaultResourceState() bool {
28+
return true
29+
}
30+
2731
func (d *defaultResourceStateGeneratorStub) GenerateDefaultResourceState() (AllocationMap, error) {
2832
return map[string]*AllocationState{
2933
"gpu-0": {

pkg/agent/qrm-plugins/gpu/state/interface.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ type writer interface {
4949
// DefaultResourceStateGenerator interface is used to generate default resource state for each resource
5050
type DefaultResourceStateGenerator interface {
5151
GenerateDefaultResourceState() (AllocationMap, error)
52+
53+
// MustInitDefaultResourceState indicates whether the default resource state of a resource should be initialized at startup.
54+
MustInitDefaultResourceState() bool
5255
}
5356

5457
// ReadonlyState interface only provides methods for tracking pod assignments

pkg/agent/qrm-plugins/gpu/state/state.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ type AllocationInfo struct {
3232

3333
AllocatedAllocation Allocation `json:"allocated_allocation"`
3434
TopologyAwareAllocations map[string]Allocation `json:"topology_aware_allocations"`
35-
DeviceName string `json:"device_name"`
35+
// DeviceName will be empty if it is a resource allocation (e.g. gpu memory), but it will be non-empty if
36+
// it is a device allocation (e.g. gpu, rdma)
37+
DeviceName string `json:"device_name"`
3638
}
3739

3840
type Allocation struct {

0 commit comments

Comments
 (0)