Skip to content

Commit 38c81a8

Browse files
committed
fix: refactor code to fit new topology provider format
1 parent 9cd3e0d commit 38c81a8

17 files changed

Lines changed: 77 additions & 63 deletions

File tree

pkg/agent/qrm-plugins/gpu/baseplugin/reporter/reporter.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ func (p *gpuReporterPlugin) getResourcePropertyReportField(latestDeviceTopology
311311
// getGPUResourceProperty returns the different dimensions to differentiate affinity priority of gpu devices.
312312
func (p *gpuReporterPlugin) getGPUResourceProperty(deviceTopology *machine.DeviceTopology) []*nodev1alpha1.Property {
313313
if deviceTopology == nil || len(deviceTopology.PriorityDimensions) == 0 {
314-
return []*nodev1alpha1.Property{}
314+
return nil
315315
}
316316

317317
return []*nodev1alpha1.Property{

pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ func NewGPUDevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDe
5252
base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(deviceName, gpuTopologyProvider)
5353
}
5454

55-
// GPUDeviceType is the key used for state management in the QRM framework,
56-
// while GPUDeviceNames are the actual resource names used to fetch the device topologies.
55+
// GPUDeviceType is the key used for GPU state management in the QRM framework,
56+
// while GPUDeviceNames are the actual resource names used to fetch the GPU device topologies.
5757
base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.GPUDeviceType,
5858
state.NewGenericDefaultResourceStateGenerator(base.Conf.GPUDeviceNames, base.DeviceTopologyRegistry, 1))
5959
base.RegisterDeviceNames(base.Conf.GPUDeviceNames, gpuconsts.GPUDeviceType)
@@ -137,13 +137,6 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(
137137
"podName", resReq.PodName,
138138
"containerName", resReq.ContainerName)
139139

140-
// Get GPU topology using the specific device resource name
141-
gpuTopology, err := p.DeviceTopologyRegistry.GetDeviceTopology(deviceReq.DeviceName)
142-
if err != nil {
143-
general.Warningf("failed to get gpu topology: %v", err)
144-
return nil, fmt.Errorf("failed to get gpu topology: %w", err)
145-
}
146-
147140
// Use the strategy framework to allocate GPU devices
148141
result, err := manager.AllocateDevicesUsingStrategy(
149142
resReq,
@@ -154,7 +147,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(
154147
p.MetaServer,
155148
p.GetState().GetMachineState(),
156149
qosLevel,
157-
"",
150+
deviceReq.DeviceName,
158151
"",
159152
)
160153
if err != nil {

pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,15 @@ type RDMADevicePlugin struct {
4141
}
4242

4343
func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin {
44-
rdmaTopologyProvider := machine.NewDeviceTopologyProvider(base.Conf.RDMADeviceNames)
45-
base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, rdmaTopologyProvider)
44+
for _, deviceName := range base.Conf.RDMADeviceNames {
45+
rdmaTopologyProvider := machine.NewDeviceTopologyProvider()
46+
base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(deviceName, rdmaTopologyProvider)
47+
}
48+
49+
// RDMADeviceType is the key used for RDMA state management in the QRM framework,
50+
// while RDMADeviceNames are the actual resource names used to fetch the RDMA device topologies.
4651
base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.RDMADeviceType,
47-
state.NewGenericDefaultResourceStateGenerator(gpuconsts.RDMADeviceType, base.DeviceTopologyRegistry))
52+
state.NewGenericDefaultResourceStateGenerator(base.Conf.RDMADeviceNames, base.DeviceTopologyRegistry))
4853
base.RegisterDeviceNameToType(base.Conf.RDMADeviceNames, gpuconsts.RDMADeviceType)
4954

5055
return &RDMADevicePlugin{
@@ -62,7 +67,7 @@ func (p *RDMADevicePlugin) DeviceNames() []string {
6267
}
6368

6469
func (p *RDMADevicePlugin) UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
65-
return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.RDMADeviceType)
70+
return p.BasePlugin.UpdateAllocatableAssociatedDevices(request)
6671
}
6772

6873
func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) {
@@ -110,7 +115,8 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice(
110115
}, nil
111116
}
112117

113-
rdmaTopology, _, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType)
118+
rdmaDeviceName := deviceReq.DeviceName
119+
rdmaTopology, err := p.DeviceTopologyRegistry.GetDeviceTopology(rdmaDeviceName)
114120
if err != nil {
115121
return nil, fmt.Errorf("failed to get gpu device topology: %v", err)
116122
}
@@ -121,8 +127,6 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice(
121127
return nil, err
122128
}
123129

124-
accompanyResourceName = p.ResolveResourceName(accompanyResourceName, false)
125-
126130
// Use strategy framework to allocate RDMA devices
127131
result, err := manager.AllocateDevicesUsingStrategy(
128132
resReq,
@@ -133,7 +137,7 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice(
133137
p.MetaServer,
134138
p.GetState().GetMachineState(),
135139
qosLevel,
136-
gpuconsts.RDMADeviceType,
140+
rdmaDeviceName,
137141
accompanyResourceName,
138142
)
139143
if err != nil {

pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,10 @@ func makeTestBasePlugin(t *testing.T) *baseplugin.BasePlugin {
112112

113113
// Register the GPU device type and a GPU device topology provider, since GPU is an accompanying resource for RDMA
114114
basePlugin.RegisterDeviceNameToType([]string{"test-gpu"}, gpuconsts.GPUDeviceType)
115-
gpuTopologyProvider := machine.NewDeviceTopologyProvider([]string{"test-gpu"})
115+
gpuTopologyProvider := machine.NewDeviceTopologyProvider()
116116
basePlugin.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceType, gpuTopologyProvider)
117+
// Also register a provider for rdma device type to align with plugin lookup by type
118+
basePlugin.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, machine.NewDeviceTopologyProvider())
117119

118120
return basePlugin
119121
}
@@ -157,9 +159,8 @@ func TestRDMADevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) {
157159

158160
// Verify device topology is updated
159161
gpuDevicePlugin := devicePlugin.(*RDMADevicePlugin)
160-
deviceTopology, numaTopologyReady, err := gpuDevicePlugin.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType)
162+
deviceTopology, err := gpuDevicePlugin.DeviceTopologyRegistry.GetDeviceTopology(basePlugin.Conf.RDMADeviceNames[0])
161163
assert.NoError(t, err)
162-
assert.True(t, numaTopologyReady)
163164
assert.NotNil(t, deviceTopology)
164165

165166
expectedDeviceTopology := &machine.DeviceTopology{
@@ -175,7 +176,8 @@ func TestRDMADevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) {
175176
},
176177
}
177178

178-
assert.Equal(t, expectedDeviceTopology, deviceTopology)
179+
// Compare device contents; ignore dynamic UpdateTime field
180+
assert.Equal(t, expectedDeviceTopology.Devices, deviceTopology.Devices)
179181
}
180182

181183
func TestRDMADevicePlugin_AllocateAssociatedDevices(t *testing.T) {
@@ -470,8 +472,11 @@ func TestRDMADevicePlugin_AllocateAssociatedDevices(t *testing.T) {
470472
}
471473

472474
if tt.deviceTopology != nil {
475+
// Set topology under both type and specific device name to satisfy plugin + generator lookups
473476
err := basePlugin.DeviceTopologyRegistry.SetDeviceTopology(gpuconsts.RDMADeviceType, tt.deviceTopology)
474477
assert.NoError(t, err)
478+
err = basePlugin.DeviceTopologyRegistry.SetDeviceTopology(basePlugin.Conf.RDMADeviceNames[0], tt.deviceTopology)
479+
assert.NoError(t, err)
475480
}
476481

477482
if tt.accompanyResourceName != "" && tt.accompanyDeviceTopology != nil {

pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -533,7 +533,7 @@ func (p *GPUMemPlugin) Allocate(
533533
p.MetaServer,
534534
p.GetState().GetMachineState(),
535535
qosLevel,
536-
"",
536+
deviceReq.DeviceName,
537537
"",
538538
)
539539
if err != nil {

pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,19 @@ const (
4545
testResourcePluginName = "resource-plugin-stub"
4646
testCustomDevicePluginName = "custom-device-plugin-stub"
4747
testCustomDevicePluginName2 = "custom-device-plugin-stub-2"
48+
// RDMA device name used for tests so that RDMA default state generation succeeds
49+
testRDMADeviceName = "rdma-stub"
4850
)
4951

5052
func generateTestConfiguration(t *testing.T) *config.Configuration {
5153
conf := config.NewConfiguration()
5254
tmpDir := t.TempDir()
5355
conf.QRMPluginSocketDirs = []string{tmpDir}
5456
conf.CheckpointManagerDir = tmpDir
55-
conf.GPUDeviceNames = []string{testResourcePluginName} // Add default device name for tests
57+
// Add default device names for tests
58+
conf.GPUDeviceNames = []string{testResourcePluginName}
59+
// Ensure RDMA generator has a device name to reference during InitState
60+
conf.RDMADeviceNames = []string{testRDMADeviceName}
5661

5762
return conf
5863
}
@@ -158,6 +163,15 @@ func makeTestStaticPolicy(t *testing.T) *StaticPolicy {
158163
err = staticPolicy.DeviceTopologyRegistry.SetDeviceTopology(testResourcePluginName, testDeviceTopology)
159164
assert.NoError(t, err)
160165

166+
// Also set a minimal RDMA device topology to satisfy RDMA default state generation during InitState
167+
rdmaDeviceTopology := &machine.DeviceTopology{
168+
Devices: map[string]machine.DeviceInfo{
169+
"rdma-1": {},
170+
},
171+
}
172+
err = staticPolicy.DeviceTopologyRegistry.SetDeviceTopology(testRDMADeviceName, rdmaDeviceTopology)
173+
assert.NoError(t, err)
174+
161175
return staticPolicy
162176
}
163177

pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import (
2121

2222
pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
2323

24-
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
2524
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
2625
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
2726
"github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
@@ -50,10 +49,6 @@ func AllocateDevicesUsingStrategy(
5049
}, err
5150
}
5251

53-
if resourceName == "" {
54-
resourceName = consts.GPUDeviceType
55-
}
56-
5752
// Create allocation context
5853
ctx := &allocate.AllocationContext{
5954
ResourceReq: resourceReq,

pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/accompanyresource/bind.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@ import (
2121
"math"
2222
"sort"
2323

24-
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies"
25-
"github.com/kubewharf/katalyst-core/pkg/util/machine"
2624
v1 "k8s.io/api/core/v1"
2725
"k8s.io/apimachinery/pkg/util/sets"
2826

2927
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
28+
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies"
3029
"github.com/kubewharf/katalyst-core/pkg/util/general"
30+
"github.com/kubewharf/katalyst-core/pkg/util/machine"
3131
)
3232

3333
// Bind tries to allocate devices by maximizing affinity with the accompany resource devices, making sure that it is

pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/accompanyresource/bind_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -574,8 +574,8 @@ func buildSimpleAffinityRegistry(rdmaToGPU map[string][]string) *machine.DeviceT
574574
reg := machine.NewDeviceTopologyRegistry()
575575

576576
// Register topology providers for both devices
577-
reg.RegisterDeviceTopologyProvider("rdma", machine.NewDeviceTopologyProvider([]string{"rdma"}))
578-
reg.RegisterDeviceTopologyProvider("gpu", machine.NewDeviceTopologyProvider([]string{"gpu"}))
577+
reg.RegisterDeviceTopologyProvider("rdma", machine.NewDeviceTopologyProvider())
578+
reg.RegisterDeviceTopologyProvider("gpu", machine.NewDeviceTopologyProvider())
579579

580580
// Construct RDMA topology
581581
rdmaTopo := &machine.DeviceTopology{Devices: map[string]machine.DeviceInfo{}}
@@ -614,8 +614,8 @@ func buildSimpleAffinityRegistry(rdmaToGPU map[string][]string) *machine.DeviceT
614614
func buildMultiPriorityAffinityRegistry(cfg map[string]map[int][]string) *machine.DeviceTopologyRegistry {
615615
reg := machine.NewDeviceTopologyRegistry()
616616

617-
reg.RegisterDeviceTopologyProvider("rdma", machine.NewDeviceTopologyProvider([]string{"rdma"}))
618-
reg.RegisterDeviceTopologyProvider("gpu", machine.NewDeviceTopologyProvider([]string{"gpu"}))
617+
reg.RegisterDeviceTopologyProvider("rdma", machine.NewDeviceTopologyProvider())
618+
reg.RegisterDeviceTopologyProvider("gpu", machine.NewDeviceTopologyProvider())
619619

620620
rdmaTopo := &machine.DeviceTopology{Devices: map[string]machine.DeviceInfo{}}
621621
for rdmaID, prios := range cfg {
@@ -652,8 +652,8 @@ func buildMultiPriorityAffinityRegistry(cfg map[string]map[int][]string) *machin
652652
func buildNoAffinityRegistry() *machine.DeviceTopologyRegistry {
653653
reg := machine.NewDeviceTopologyRegistry()
654654

655-
reg.RegisterDeviceTopologyProvider("rdma", machine.NewDeviceTopologyProvider([]string{"rdma"}))
656-
reg.RegisterDeviceTopologyProvider("gpu", machine.NewDeviceTopologyProvider([]string{"gpu"}))
655+
reg.RegisterDeviceTopologyProvider("rdma", machine.NewDeviceTopologyProvider())
656+
reg.RegisterDeviceTopologyProvider("gpu", machine.NewDeviceTopologyProvider())
657657

658658
// RDMA devices use dimension "rdma_link"
659659
rdmaTopo := &machine.DeviceTopology{Devices: map[string]machine.DeviceInfo{}}

pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ func TestCanonicalStrategy_Bind(t *testing.T) {
135135
// Prepare topology registry if provided
136136
if tt.topology != nil {
137137
reg := machine.NewDeviceTopologyRegistry()
138-
reg.RegisterDeviceTopologyProvider(consts.GPUDeviceType, machine.NewDeviceTopologyProvider([]string{"gpu"}))
138+
reg.RegisterDeviceTopologyProvider(consts.GPUDeviceType, machine.NewDeviceTopologyProvider())
139139
_ = reg.SetDeviceTopology(consts.GPUDeviceType, tt.topology)
140140
tt.ctx.DeviceTopologyRegistry = reg
141141
}

0 commit comments

Comments
 (0)