Skip to content

Commit de87f4c

Browse files
committed
refactor(cpu): simplify topology allocation annotations logic
remove isReclaimedOrSharedQoS parameter and improve handling of different QoS levels use aggregated pod request for shared/reclaimed cores with NUMA binding add support for decimal CPU quantities in topology annotations
1 parent 3cdd6d2 commit de87f4c

File tree

5 files changed

+217
-53
lines changed

5 files changed

+217
-53
lines changed

pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,9 +1003,7 @@ func (p *DynamicPolicy) Allocate(ctx context.Context,
10031003
// Add topologyAllocationAnnotations for numa binding containers
10041004
var topologyAllocationAnnotations map[string]string
10051005
if allocationInfo.CheckNUMABinding() {
1006-
isReclaimedOrSharedQoS := allocationInfo.CheckReclaimed() || allocationInfo.CheckShared()
1007-
topologyAllocationAnnotations, err = cpuutil.GetCPUTopologyAllocationsAnnotations(allocationInfo, p.topologyAllocationAnnotationKey,
1008-
req, isReclaimedOrSharedQoS)
1006+
topologyAllocationAnnotations, err = cpuutil.GetCPUTopologyAllocationsAnnotations(allocationInfo, p.topologyAllocationAnnotationKey, req)
10091007
if err != nil {
10101008
return nil, fmt.Errorf("GetCPUTopologyAllocationsAnnotations failed with error: %v", err)
10111009
}

pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,7 @@ func (p *DynamicPolicy) reclaimedCoresAllocationHandler(ctx context.Context,
327327
var topologyAllocationAnnotations map[string]string
328328
if allocationInfo.CheckReclaimedActualNUMABinding() {
329329
var err error
330-
topologyAllocationAnnotations, err = cpuutil.GetCPUTopologyAllocationsAnnotations(allocationInfo, p.conf.TopologyAllocationAnnotationKey,
331-
req, true)
330+
topologyAllocationAnnotations, err = cpuutil.GetCPUTopologyAllocationsAnnotations(allocationInfo, p.conf.TopologyAllocationAnnotationKey, req)
332331
if err != nil {
333332
return nil, fmt.Errorf("GetCPUTopologyAllocationsAnnotations failed with error: %v", err)
334333
}
@@ -506,11 +505,11 @@ func (p *DynamicPolicy) dedicatedCoresWithNUMABindingAllocationHandler(ctx conte
506505
return nil, fmt.Errorf("adjustAllocationEntries failed with error: %v", err)
507506
}
508507

509-
topologyAllocationAnnotations, err := cpuutil.GetCPUTopologyAllocationsAnnotations(allocationInfo, p.conf.TopologyAllocationAnnotationKey,
510-
req, false)
508+
topologyAllocationAnnotations, err := cpuutil.GetCPUTopologyAllocationsAnnotations(allocationInfo, p.conf.TopologyAllocationAnnotationKey, req)
511509
if err != nil {
512510
return nil, fmt.Errorf("GetCPUTopologyAllocationsAnnotations failed with error: %v", err)
513511
}
512+
514513
resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU),
515514
util.OCIPropertyNameCPUSetCPUs, false, true, req, topologyAllocationAnnotations)
516515
if err != nil {
@@ -608,8 +607,7 @@ func (p *DynamicPolicy) sharedCoresWithNUMABindingAllocationHandler(ctx context.
608607

609608
// there is no need to call SetPodEntries and SetMachineState,
610609
// since they are already done in doAndCheckPutAllocationInfo of allocateSharedNumaBindingCPUs
611-
topologyAllocationAnnotations, err := cpuutil.GetCPUTopologyAllocationsAnnotations(allocationInfo, p.conf.TopologyAllocationAnnotationKey,
612-
req, true)
610+
topologyAllocationAnnotations, err := cpuutil.GetCPUTopologyAllocationsAnnotations(allocationInfo, p.conf.TopologyAllocationAnnotationKey, req)
613611
if err != nil {
614612
return nil, fmt.Errorf("GetCPUTopologyAllocationsAnnotations failed with error: %v", err)
615613
}

pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -872,7 +872,7 @@ func TestAllocate(t *testing.T) {
872872
},
873873
},
874874
Annotations: map[string]string{
875-
coreconsts.QRMPodAnnotationTopologyAllocationKey: `{"Numa":{"0":{"allocated":{"cpu":"0"},"attributes":{"CpusetCpus":"1"}}}}`,
875+
coreconsts.QRMPodAnnotationTopologyAllocationKey: `{"Numa":{"0":{"allocated":{"cpu":"300m"},"attributes":{"CpusetCpus":"1"}}}}`,
876876
},
877877
},
878878
},
@@ -1132,7 +1132,7 @@ func TestAllocate(t *testing.T) {
11321132
},
11331133
},
11341134
Annotations: map[string]string{
1135-
coreconsts.QRMPodAnnotationTopologyAllocationKey: `{"Numa":{"0":{"allocated":{"cpu":"0"},"attributes":{"CpusetCpus":"1"}}}}`,
1135+
coreconsts.QRMPodAnnotationTopologyAllocationKey: `{"Numa":{"0":{"allocated":{"cpu":"300m"},"attributes":{"CpusetCpus":"1"}}}}`,
11361136
},
11371137
},
11381138
},

pkg/agent/qrm-plugins/cpu/util/util.go

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -207,61 +207,85 @@ func PackAllocationResponse(allocationInfo *state.AllocationInfo, resourceName,
207207
}, nil
208208
}
209209

210-
// GetCPUTopologyAllocationsAnnotations gets the cpu topology allocation in the form of annotations.
210+
// GetCPUTopologyAllocationsAnnotations returns the topology-aware CPU allocation annotations for a given container.
211+
// It handles different QoS levels:
212+
// - For DedicatedCores: uses the actual assigned CPUSet size on each NUMA node.
213+
// - For SharedCores/ReclaimedCores with NUMA binding: uses the pod's aggregated request quantity.
214+
// This function only processes main containers and returns nil for sidecars or invalid allocation info.
211215
func GetCPUTopologyAllocationsAnnotations(allocationInfo *state.AllocationInfo,
212-
topologyAllocationAnnotationKey string, req *pluginapi.ResourceRequest, isReclaimedOrSharedQoS bool,
216+
topologyAllocationAnnotationKey string, req *pluginapi.ResourceRequest,
213217
) (map[string]string, error) {
214-
if allocationInfo == nil {
218+
// Skip processing if allocation info is invalid or it's not the main container.
219+
if allocationInfo == nil || !allocationInfo.CheckMainContainer() {
215220
return nil, nil
216221
}
217222

218223
assignments := allocationInfo.TopologyAwareAssignments
224+
// Determine if this is a shared or reclaimed QoS level with NUMA binding enabled.
225+
// This affects whether we use the request quantity or the physical assignment size.
226+
isSharedOrReclaimedNUMABinding := allocationInfo.CheckSharedNUMABinding() || allocationInfo.CheckReclaimedNUMABinding()
219227

220-
// Validate shared/reclaimed case
221-
var reqQty int64
222-
if isReclaimedOrSharedQoS {
228+
// For shared or reclaimed cores with NUMA binding, they must be restricted to a single NUMA node.
229+
var reqQty float64
230+
if isSharedOrReclaimedNUMABinding {
223231
if len(assignments) != 1 {
224-
return nil, fmt.Errorf("allocations should not be in more than 1 numa for shared or reclaimed cores")
232+
return nil, fmt.Errorf("shared/reclaimed cores with NUMA binding should be pinned to exactly 1 NUMA node, got %d", len(assignments))
225233
}
226234

227-
_, reqFloat64, err := util.GetQuantityFromResourceReq(req)
235+
// Use the aggregated pod request quantity for shared/reclaimed cores to represent the logical allocation.
236+
_, reqFloat64, err := util.GetPodAggregatedRequestResource(req)
228237
if err != nil {
229-
return nil, fmt.Errorf("getReqQuantityFromResourceReq failed: %w", err)
238+
return nil, fmt.Errorf("failed to get aggregated request resource: %v", err)
230239
}
231-
reqQty = int64(reqFloat64)
240+
reqQty = reqFloat64
232241
}
233242

234-
// Build topology allocation
243+
// Initialize the internal TopologyAllocation structure.
235244
topologyAllocation := v1alpha1.TopologyAllocation{
236245
v1alpha1.TopologyTypeNuma: make(map[string]v1alpha1.ZoneAllocation),
237246
}
238247

239248
for numaNode, assignment := range assignments {
240-
var cpuQty int64
241-
if isReclaimedOrSharedQoS {
249+
var cpuQty float64
250+
var assignmentToReport machine.CPUSet
251+
if isSharedOrReclaimedNUMABinding {
242252
cpuQty = reqQty
253+
// For shared/reclaimed cores, we don't report the CPUSet attribute in the annotation.
254+
assignmentToReport = machine.NewCPUSet()
243255
} else {
244-
cpuQty = int64(assignment.Size())
256+
// For dedicated cores, the quantity is the physical count of CPUs assigned on this NUMA node.
257+
cpuQty = float64(assignment.Size())
258+
assignmentToReport = assignment
245259
}
246260

247-
topologyAllocation[v1alpha1.TopologyTypeNuma][strconv.Itoa(numaNode)] = buildZoneAllocation(cpuQty, assignment)
261+
topologyAllocation[v1alpha1.TopologyTypeNuma][strconv.Itoa(numaNode)] = buildZoneAllocation(cpuQty, assignmentToReport)
248262
}
249263

264+
// Generate the final resource allocation annotations from the topology structure.
250265
return util.MakeTopologyAllocationResourceAllocationAnnotations(
251266
topologyAllocation,
252267
topologyAllocationAnnotationKey,
253268
), nil
254269
}
255270

256-
func buildZoneAllocation(cpuQty int64, assignment machine.CPUSet) v1alpha1.ZoneAllocation {
257-
return v1alpha1.ZoneAllocation{
271+
// buildZoneAllocation creates a ZoneAllocation for a specific NUMA node.
272+
// It uses NewMilliQuantity to accurately represent decimal CPU quantities (up to 0.001 core precision).
273+
func buildZoneAllocation(cpuQty float64, assignment machine.CPUSet) v1alpha1.ZoneAllocation {
274+
za := v1alpha1.ZoneAllocation{
258275
Allocated: map[v1.ResourceName]resource.Quantity{
259-
v1.ResourceCPU: *resource.NewQuantity(cpuQty, resource.DecimalSI),
276+
// Convert cores to milli-cores and round to the nearest integer.
277+
v1.ResourceCPU: *resource.NewMilliQuantity(int64(math.Round(cpuQty*1000)), resource.DecimalSI),
260278
},
261-
Attributes: map[string]string{
279+
}
280+
281+
// Only report the CPUSet attribute if the assignment is non-empty.
282+
if !assignment.IsEmpty() {
283+
za.Attributes = map[string]string{
262284
util.OCIPropertyNameCPUSetCPUs: assignment.String(),
263-
},
285+
}
264286
}
287+
288+
return za
265289
}
266290

267291
// AdvisorDegradation checks if the advisor is in a degraded state.

0 commit comments

Comments
 (0)