diff --git a/cmd/katalyst-agent/app/options/qrm/memory_plugin.go b/cmd/katalyst-agent/app/options/qrm/memory_plugin.go index bf5cd7963f..20dffa0565 100644 --- a/cmd/katalyst-agent/app/options/qrm/memory_plugin.go +++ b/cmd/katalyst-agent/app/options/qrm/memory_plugin.go @@ -42,6 +42,7 @@ type MemoryOptions struct { EnableNonBindingShareCoresMemoryResourceCheck bool EnableNUMAAllocationReactor bool NUMABindResultResourceAllocationAnnotationKey string + ExtraMemoryResources []string SockMemOptions LogCacheOptions @@ -157,6 +158,7 @@ func NewMemoryOptions() *MemoryOptions { EnabledQoS: []string{apiconsts.PodAnnotationQoSLevelSharedCores}, MonGroupEnabledClosIDs: []string{}, }, + ExtraMemoryResources: []string{}, } } @@ -235,6 +237,8 @@ func (o *MemoryOptions) AddFlags(fss *cliflag.NamedFlagSets) { o.MonGroupEnabledClosIDs, "enabled-closid mon-groups") fs.Float64Var(&o.MonGroupMaxCountRatio, "resctrl-mon-groups-max-count-ratio", o.MonGroupMaxCountRatio, "ratio of mon_groups max count") + fs.StringSliceVar(&o.ExtraMemoryResources, "extra-memory-resources", o.ExtraMemoryResources, + "extra memory resources such as hugepages-*") } func (o *MemoryOptions) ApplyTo(conf *qrmconfig.MemoryQRMPluginConfig) error { @@ -273,6 +277,7 @@ func (o *MemoryOptions) ApplyTo(conf *qrmconfig.MemoryQRMPluginConfig) error { conf.EnabledQoS = o.EnabledQoS conf.MonGroupEnabledClosIDs = o.MonGroupEnabledClosIDs conf.MonGroupMaxCountRatio = o.MonGroupMaxCountRatio + conf.ExtraMemoryResources = o.ExtraMemoryResources for _, reservation := range o.ReservedNumaMemory { conf.ReservedNumaMemory[reservation.NumaNode] = reservation.Limits diff --git a/go.mod b/go.mod index e96fb39627..319e879fe3 100644 --- a/go.mod +++ b/go.mod @@ -197,7 +197,7 @@ replace ( k8s.io/kube-proxy => k8s.io/kube-proxy v0.24.6 k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.24.6 k8s.io/kubectl => k8s.io/kubectl v0.24.6 - k8s.io/kubelet => github.com/kubewharf/kubelet v1.24.6-kubewharf-pre.2 + k8s.io/kubelet => 
github.com/luomingmeng/kubelet v0.0.0-20260306101749-66566cd8838b k8s.io/kubernetes => k8s.io/kubernetes v1.24.6 k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.24.6 k8s.io/metrics => k8s.io/metrics v0.24.6 diff --git a/go.sum b/go.sum index 748426d8ac..e5439ca1f6 100644 --- a/go.sum +++ b/go.sum @@ -576,8 +576,6 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kubewharf/katalyst-api v0.5.11-0.20260324091059-cae1d07d9882 h1:4KYYk/mAJAOIYDW5V+43wnjnP8p3bwHXAkAcw/AbzuQ= github.com/kubewharf/katalyst-api v0.5.11-0.20260324091059-cae1d07d9882/go.mod h1:BZMVGVl3EP0eCn5xsDgV41/gjYkoh43abIYxrB10e3k= -github.com/kubewharf/kubelet v1.24.6-kubewharf-pre.2 h1:2KLMzgntDypiFJRX4fSQJCD+a6zIgHuhcAzd/7nAGmU= -github.com/kubewharf/kubelet v1.24.6-kubewharf-pre.2/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8= github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/libopenstorage/openstorage v1.0.0/go.mod h1:Sp1sIObHjat1BeXhfMqLZ14wnOzEhNx2YQedreMcUyc= @@ -587,6 +585,8 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA= +github.com/luomingmeng/kubelet v0.0.0-20260306101749-66566cd8838b h1:4fQ2SJiAbt+RMD/RCN/8iN8LevcHnLxXaFY5z2cuQVI= +github.com/luomingmeng/kubelet v0.0.0-20260306101749-66566cd8838b/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/magiconair/properties 
v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= diff --git a/pkg/agent/qrm-plugins/commonstate/state.go b/pkg/agent/qrm-plugins/commonstate/state.go index 8bb71a6704..7007956340 100644 --- a/pkg/agent/qrm-plugins/commonstate/state.go +++ b/pkg/agent/qrm-plugins/commonstate/state.go @@ -145,8 +145,8 @@ func (am *AllocationMeta) GetSpecifiedNUMABindingNUMAID() (int, error) { return GetSpecifiedNUMABindingNUMAID(am.Annotations) } -// SetSpecifiedNUMABindingNUMAID set the numa id for AllocationInfo -func (am *AllocationMeta) SetSpecifiedNUMABindingNUMAID(numaID uint64) { +// SetSpecifiedNUMABindingNUMAID set the numa ids for AllocationInfo +func (am *AllocationMeta) SetSpecifiedNUMABindingNUMAID(numaIDs []uint64) { if am == nil { return } @@ -155,7 +155,12 @@ func (am *AllocationMeta) SetSpecifiedNUMABindingNUMAID(numaID uint64) { am.Annotations = make(map[string]string) } - am.Annotations[cpuconsts.CPUStateAnnotationKeyNUMAHint] = machine.NewCPUSet(int(numaID)).String() + intIDs := make([]int, len(numaIDs)) + for i, id := range numaIDs { + intIDs[i] = int(id) + } + + am.Annotations[cpuconsts.CPUStateAnnotationKeyNUMAHint] = machine.NewCPUSet(intIDs...).String() } // GetSpecifiedNUMABindingPoolName get numa_binding pool name @@ -316,3 +321,14 @@ func (am *AllocationMeta) CheckDedicatedPool() bool { } return am.OwnerPoolName == PoolNameDedicated } + +// CheckDistributeEvenlyAcrossNuma returns true if the AllocationInfo is for pod with distribute evenly across numa +// annotation enabled. 
+func (am *AllocationMeta) CheckDistributeEvenlyAcrossNuma() bool { + if am == nil { + return false + } + + return am.Annotations[consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma] == + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable +} diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go index 23a60ed01f..6a03b18839 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go @@ -284,7 +284,7 @@ func (p *DynamicPolicy) reclaimedCoresAllocationHandler(ctx context.Context, // set reclaimed numa_binding NUMA ID to allocationInfo if req.Hint != nil && len(req.Hint.Nodes) == 1 && (reclaimActualBindingNUMAs.Contains(int(req.Hint.Nodes[0])) || !nonReclaimActualBindingNUMAs.Equals(machine.NewCPUSet(int(req.Hint.Nodes[0])))) { - allocationInfo.SetSpecifiedNUMABindingNUMAID(req.Hint.Nodes[0]) + allocationInfo.SetSpecifiedNUMABindingNUMAID(req.Hint.Nodes) } } @@ -470,7 +470,7 @@ func (p *DynamicPolicy) dedicatedCoresWithNUMABindingAllocationHandler(ctx conte return nil, fmt.Errorf("numa binding without numa exclusive allocation result numa node size is %d, "+ "not equal to 1", len(req.Hint.Nodes)) } - allocationInfo.SetSpecifiedNUMABindingNUMAID(req.Hint.Nodes[0]) + allocationInfo.SetSpecifiedNUMABindingNUMAID(req.Hint.Nodes) } // update pod entries directly. 
@@ -741,7 +741,7 @@ func (p *DynamicPolicy) allocateSharedNumaBindingCPUs(req *pluginapi.ResourceReq InitTimestamp: time.Now().Format(util.QRMTimeFormat), RequestQuantity: reqFloat64, } - allocationInfo.SetSpecifiedNUMABindingNUMAID(hint.Nodes[0]) + allocationInfo.SetSpecifiedNUMABindingNUMAID(hint.Nodes) if util.PodInplaceUpdateResizing(req) { originAllocationInfo := p.state.GetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName) diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers_test.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers_test.go index dacf55d56e..09924516f6 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers_test.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers_test.go @@ -277,7 +277,7 @@ func TestAllocateSharedNumaBindingCPUs(t *testing.T) { 0: machine.NewCPUSet(0, 1), }, } - originAllocationInfo.SetSpecifiedNUMABindingNUMAID(0) + originAllocationInfo.SetSpecifiedNUMABindingNUMAID([]uint64{0}) policy.state.SetAllocationInfo(podUID, containerName, originAllocationInfo, false) diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go index 78996fd447..8a94a0d0f7 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go @@ -161,7 +161,9 @@ func (p *DynamicPolicy) dedicatedCoresWithNUMABindingHintHandler(_ context.Conte (*commonstate.AllocationMeta).CheckDedicatedNUMABindingNUMAExclusive)) var extraErr error - hints, extraErr = util.GetHintsFromExtraStateFile(req.PodName, string(v1.ResourceCPU), p.extraStateFileAbsPath, availableNUMAs) + hints, extraErr = util.GetHintsFromExtraStateFile(req.PodName, p.extraStateFileAbsPath, availableNUMAs, []v1.ResourceName{ + v1.ResourceCPU, + }) if extraErr != nil { general.Infof("pod: %s/%s, container: %s 
GetHintsFromExtraStateFile failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, extraErr) @@ -303,10 +305,10 @@ func (p *DynamicPolicy) calculateHints( maskCount := mask.Count() if maskCount < minNUMAsCountNeeded { return - } else if numaBinding && !numaExclusive && numaNumber <= 1 && maskCount > 1 { + } else if numaBinding && !numaExclusive && maskCount > 1 && numaNumber <= 1 { // because it's hard to control memory allocation accurately, // we only support numa_binding but not exclusive container with request smaller than 1 NUMA - // pods with distribute evenly across numa annotation can occupy more than 1 NUMA + // pods with numa number more than 1 can occupy more than 1 NUMA return } @@ -371,7 +373,6 @@ func (p *DynamicPolicy) calculateHints( if numaNumber != 0 { minAffinitySize = numaNumber } - // Update hint to be preferred if they have minimum number of NUMA nodes for _, hint := range availableNumaHints { if len(hint.Nodes) == minAffinitySize { diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_test.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_test.go index 2d700c2bea..4446413c43 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_test.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_test.go @@ -4906,6 +4906,269 @@ func TestGetTopologyHints(t *testing.T) { }, cpuTopology: cpuTopology, }, + { + name: "req with numa number makes sure that the hints only belong to those numa nodes", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 2, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: 
consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + consts.PodAnnotationCPUEnhancementKey: `{"katalyst.kubewharf.io/numa_number": "2"}`, + }, + }, + expectedResp: &pluginapi.ResourceHintsResponse{ + PodName: testName, + PodNamespace: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceCPU): { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + { + Nodes: []uint64{2, 3}, + Preferred: true, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationCPUEnhancementNumaNumber: "2", + }, + }, + cpuTopology: cpuTopology, + }, + { + name: "req with numa ID for one numa node only", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 1, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + 
consts.PodAnnotationCPUEnhancementKey: `{"katalyst.kubewharf.io/numa_ids": "1"}`, + }, + }, + expectedResp: &pluginapi.ResourceHintsResponse{ + PodName: testName, + PodNamespace: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceCPU): { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{1}, + Preferred: true, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationCPUEnhancementNumaIDs: "1", + }, + }, + cpuTopology: cpuTopology, + }, + { + name: "req with numa ID for multiple numa nodes", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 4, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + consts.PodAnnotationCPUEnhancementKey: `{"katalyst.kubewharf.io/numa_ids": "1-3"}`, + }, + }, + expectedResp: &pluginapi.ResourceHintsResponse{ + PodName: testName, + PodNamespace: testName, + ContainerName: testName, + ContainerType: 
pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceCPU): { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{1, 2, 3}, + Preferred: true, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationCPUEnhancementNumaIDs: "1-3", + }, + }, + cpuTopology: cpuTopology, + }, + { + name: "numa IDs will override numa number", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 4, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + consts.PodAnnotationCPUEnhancementKey: `{"katalyst.kubewharf.io/numa_number": "2", "katalyst.kubewharf.io/numa_ids": "0-2"}`, + }, + }, + expectedResp: &pluginapi.ResourceHintsResponse{ + PodName: testName, + PodNamespace: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceCPU): { + Hints: 
[]*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1, 2}, + Preferred: true, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationCPUEnhancementNumaIDs: "0-2", + consts.PodAnnotationCPUEnhancementNumaNumber: "2", + }, + }, + cpuTopology: cpuTopology, + }, + { + name: "custom numa number and numa ids annotation are supported", + numaNumberAnnotationKey: "custom_numa_number_annotation", + numaIDsAnnotationKey: "custom_numa_ids_annotation", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 4, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + consts.PodAnnotationCPUEnhancementKey: `{"custom_numa_number_annotation": "2", "custom_numa_ids_annotation": "0-2"}`, + }, + }, + expectedResp: &pluginapi.ResourceHintsResponse{ + PodName: testName, + PodNamespace: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceCPU), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceCPU): { + Hints: 
[]*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1, 2}, + Preferred: true, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + "custom_numa_number_annotation": "2", + "custom_numa_ids_annotation": "0-2", + }, + }, + cpuTopology: cpuTopology, + }, } for _, tc := range testCases { diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 7253dcc67d..d5574d2b6a 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -71,7 +71,7 @@ func (p *GPUMemPlugin) GetTopologyHints(ctx context.Context, req *pluginapi.Reso return nil, err } - _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) + _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), nil) if err != nil { return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } @@ -458,7 +458,7 @@ func (p *GPUMemPlugin) Allocate( return nil, err } - _, gpuMemory, err := util.GetQuantityFromResourceRequests(resourceReq.ResourceRequests, p.ResourceName(), false) + _, gpuMemory, err := util.GetQuantityFromResourceRequests(resourceReq.ResourceRequests, p.ResourceName(), nil) if err != nil { return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go 
b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go index 56c65fcd71..5076aebc86 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go @@ -34,7 +34,7 @@ func (s *GPUMemoryStrategy) Filter(ctx *allocate.AllocationContext, allAvailable return nil, fmt.Errorf("GPU topology is nil") } - _, gpuMemory, err := util.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), false) + _, gpuMemory, err := util.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), nil) if err != nil { general.Warningf("getReqQuantityFromResourceReq failed with error: %v, use default available devices", err) return allAvailableDevices, nil diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go index e4cfaef234..6e2291aa63 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go @@ -34,7 +34,7 @@ func (s *GPUMemoryStrategy) Sort(ctx *allocate.AllocationContext, filteredDevice return nil, fmt.Errorf("GPU topology is nil") } - _, gpuMemory, err := qrmutil.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), false) + _, gpuMemory, err := qrmutil.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), nil) if err != nil { general.Warningf("getReqQuantityFromResourceReq failed with error: %v, use default filtered devices", err) return filteredDevices, nil diff --git a/pkg/agent/qrm-plugins/gpu/util/util.go b/pkg/agent/qrm-plugins/gpu/util/util.go index c8c1331593..d31c3f5a0b 100644 --- a/pkg/agent/qrm-plugins/gpu/util/util.go +++ b/pkg/agent/qrm-plugins/gpu/util/util.go 
@@ -79,7 +79,7 @@ func GetGPUCount(req *pluginapi.ResourceRequest, deviceNames []string) (float64, gpuNames := sets.NewString() for _, resourceName := range deviceNames { - _, request, err := qrmutil.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false) + _, request, err := qrmutil.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, nil) if err != nil && !errors.IsNotFound(err) { return 0, nil, err } diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go index 0840cf9125..6a4f7ccaf2 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go @@ -166,12 +166,14 @@ type DynamicPolicy struct { numaAllocationReactor reactor.AllocationReactor numaBindResultResourceAllocationAnnotationKey string + + extraResourceNames []string } func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration, _ interface{}, agentName string, ) (bool, agent.Component, error) { - reservedMemory, err := getReservedMemory(conf, agentCtx.MetaServer, agentCtx.MachineInfo) + resourcesReservedMemory, err := getResourcesReservedMemory(conf, agentCtx.MetaServer, agentCtx.MachineInfo, conf.ExtraMemoryResources) if err != nil { return false, agent.ComponentStub{}, fmt.Errorf("getReservedMemoryFromOptions failed with error: %v", err) } @@ -180,11 +182,11 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration Key: util.QRMPluginPolicyTagName, Val: memconsts.MemoryResourcePluginPolicyNameDynamic, }) - resourcesReservedMemory := map[v1.ResourceName]map[int]uint64{ - v1.ResourceMemory: reservedMemory, - } + stateImpl, err := state.NewCheckpointState(conf.StateDirectoryConfiguration, memoryPluginStateFileName, - memconsts.MemoryResourcePluginPolicyNameDynamic, agentCtx.CPUTopology, agentCtx.MachineInfo, resourcesReservedMemory, conf.SkipMemoryStateCorruption, wrappedEmitter) + 
memconsts.MemoryResourcePluginPolicyNameDynamic, agentCtx.CPUTopology, agentCtx.MachineInfo, agentCtx.MemoryTopology, resourcesReservedMemory, conf.SkipMemoryStateCorruption, + wrappedEmitter, conf.ExtraMemoryResources, + ) if err != nil { return false, agent.ComponentStub{}, fmt.Errorf("NewCheckpointState failed with error: %v", err) } @@ -238,6 +240,7 @@ func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration resctrlHinter: newResctrlHinter(&conf.ResctrlConfig, wrappedEmitter), enableNonBindingShareCoresMemoryResourceCheck: conf.EnableNonBindingShareCoresMemoryResourceCheck, numaBindResultResourceAllocationAnnotationKey: conf.NUMABindResultResourceAllocationAnnotationKey, + extraResourceNames: conf.ExtraMemoryResources, } policyImplement.allocationHandlers = map[string]util.AllocationHandler{ @@ -594,7 +597,7 @@ func (p *DynamicPolicy) GetTopologyHints(ctx context.Context, return nil, err } - reqInt, _, err := util.GetQuantityFromResourceReq(req) + resourceReqInt, _, err := util.GetQuantityMapFromResourceReq(req) if err != nil { return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } @@ -607,7 +610,7 @@ func (p *DynamicPolicy) GetTopologyHints(ctx context.Context, "podRole", req.PodRole, "containerType", req.ContainerType, "qosLevel", qosLevel, - "memoryReq(bytes)", reqInt, + "memoryReq map(bytes)", resourceReqInt, "isDebugPod", isDebugPod) if req.ContainerType == pluginapi.ContainerType_INIT || isDebugPod { @@ -727,54 +730,59 @@ func (p *DynamicPolicy) GetResourcesAllocation(_ context.Context, defer p.RUnlock() podResources := make(map[string]*pluginapi.ContainerResources) - podEntries := p.state.GetPodResourceEntries()[v1.ResourceMemory] + podResourceEntries := p.state.GetPodResourceEntries() needUpdateMachineState := false - for podUID, containerEntries := range podEntries { - if podResources[podUID] == nil { - podResources[podUID] = &pluginapi.ContainerResources{} - } - mainContainerAllocationInfo, _ := 
podEntries.GetMainContainerAllocation(podUID) - for containerName, allocationInfo := range containerEntries { - if allocationInfo == nil { - continue + for resourceName, podEntries := range podResourceEntries { + for podUID, containerEntries := range podEntries { + if podResources[podUID] == nil { + podResources[podUID] = &pluginapi.ContainerResources{} } - if allocationInfo.CheckSideCar() && mainContainerAllocationInfo != nil { - if applySidecarAllocationInfoFromMainContainer(allocationInfo, mainContainerAllocationInfo) { - general.Infof("pod: %s/%s sidecar container: %s update its allocation", - allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName) - p.state.SetAllocationInfo(v1.ResourceMemory, podUID, containerName, allocationInfo, true) - needUpdateMachineState = true + mainContainerAllocationInfo, _ := podEntries.GetMainContainerAllocation(podUID) + for containerName, allocationInfo := range containerEntries { + if allocationInfo == nil { + continue } - } - if podResources[podUID].ContainerResources == nil { - podResources[podUID].ContainerResources = make(map[string]*pluginapi.ResourceAllocation) - } + if allocationInfo.CheckSideCar() && mainContainerAllocationInfo != nil { + if applySidecarAllocationInfoFromMainContainer(allocationInfo, mainContainerAllocationInfo) { + general.Infof("pod: %s/%s sidecar container: %s update its allocation", + allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName) + p.state.SetAllocationInfo(resourceName, podUID, containerName, allocationInfo, true) + needUpdateMachineState = true + } + } - resourceAllocation, err := allocationInfo.GetResourceAllocation() - if err != nil { - errMsg := "allocationInfo.GetResourceAllocation failed" - general.ErrorS(err, errMsg, - "podNamespace", allocationInfo.PodNamespace, - "podName", allocationInfo.PodName, - "containerName", allocationInfo.ContainerName) - return nil, fmt.Errorf(errMsg) - } + if podResources[podUID].ContainerResources 
== nil { + podResources[podUID].ContainerResources = make(map[string]*pluginapi.ResourceAllocation) + } - if p.resctrlHinter != nil { - p.resctrlHinter.HintResourceAllocation(allocationInfo.AllocationMeta, resourceAllocation) - } + resourceAllocation, err := podResourceEntries.GetResourceAllocation(podUID, containerName) + if err != nil { + errMsg := "allocationInfo.GetResourceAllocation failed" + general.ErrorS(err, errMsg, + "podNamespace", allocationInfo.PodNamespace, + "podName", allocationInfo.PodName, + "containerName", allocationInfo.ContainerName, + "resourceName", resourceName) + return nil, fmt.Errorf(errMsg) + } + + if p.resctrlHinter != nil { + p.resctrlHinter.HintResourceAllocation(allocationInfo.AllocationMeta, resourceAllocation) + } - podResources[podUID].ContainerResources[containerName] = resourceAllocation + podResources[podUID].ContainerResources[containerName] = resourceAllocation + } } } if needUpdateMachineState { general.Infof("GetResourcesAllocation update machine state") - podResourceEntries := p.state.GetPodResourceEntries() - resourcesState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + podResourceEntries = p.state.GetPodResourceEntries() + resourcesState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Infof("GetResourcesAllocation GenerateMachineStateFromPodEntries failed with error: %v", err) return nil, fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) @@ -798,42 +806,45 @@ func (p *DynamicPolicy) GetTopologyAwareResources(_ context.Context, p.RLock() defer p.RUnlock() - allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName) - if allocationInfo == nil { + resourceAllocationInfo 
:= p.state.GetResourceAllocationInfo(req.PodUid, req.ContainerName) + if resourceAllocationInfo == nil { return nil, fmt.Errorf("pod: %s, container: %s is not show up in memory plugin state", req.PodUid, req.ContainerName) } - topologyAwareQuantityList := util.GetTopologyAwareQuantityFromAssignmentsSize(allocationInfo.TopologyAwareAllocations) - resp := &pluginapi.GetTopologyAwareResourcesResponse{ - PodUid: allocationInfo.PodUid, - PodName: allocationInfo.PodName, - PodNamespace: allocationInfo.PodNamespace, - ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ - ContainerName: allocationInfo.ContainerName, - }, - } + var resp *pluginapi.GetTopologyAwareResourcesResponse - if allocationInfo.CheckSideCar() { - resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{ - string(v1.ResourceMemory): { + for resourceName, allocationInfo := range resourceAllocationInfo { + topologyAwareQuantityList := util.GetTopologyAwareQuantityFromAssignmentsSize(allocationInfo.TopologyAwareAllocations) + if resp == nil { + resp = &pluginapi.GetTopologyAwareResourcesResponse{ + PodUid: allocationInfo.PodUid, + PodName: allocationInfo.PodName, + PodNamespace: allocationInfo.PodNamespace, + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: allocationInfo.ContainerName, + AllocatedResources: make(map[string]*pluginapi.TopologyAwareResource), + }, + } + } + + if allocationInfo.CheckSideCar() { + resp.ContainerTopologyAwareResources.AllocatedResources[string(resourceName)] = &pluginapi.TopologyAwareResource{ IsNodeResource: false, IsScalarResource: true, AggregatedQuantity: 0, OriginalAggregatedQuantity: 0, TopologyAwareQuantityList: nil, OriginalTopologyAwareQuantityList: nil, - }, - } - } else { - resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{ - string(v1.ResourceMemory): { + } + } else { + 
resp.ContainerTopologyAwareResources.AllocatedResources[string(resourceName)] = &pluginapi.TopologyAwareResource{ IsNodeResource: false, IsScalarResource: true, AggregatedQuantity: float64(allocationInfo.AggregatedQuantity), OriginalAggregatedQuantity: float64(allocationInfo.AggregatedQuantity), TopologyAwareQuantityList: topologyAwareQuantityList, OriginalTopologyAwareQuantityList: topologyAwareQuantityList, - }, + } } } @@ -847,42 +858,46 @@ func (p *DynamicPolicy) GetTopologyAwareAllocatableResources(context.Context, p.RLock() defer p.RUnlock() - machineState := p.state.GetMachineState()[v1.ResourceMemory] + allocatableResources := make(map[string]*pluginapi.AllocatableTopologyAwareResource) + resourceMachineState := p.state.GetMachineState() numaNodes := p.topology.CPUDetails.NUMANodes().ToSliceInt() - topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) - topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) - - var aggregatedAllocatableQuantity, aggregatedCapacityQuantity uint64 = 0, 0 - for _, numaNode := range numaNodes { - numaNodeState := machineState[numaNode] - if numaNodeState == nil { - return nil, fmt.Errorf("nil numaNodeState for NUMA: %d", numaNode) + + for resourceName, machineState := range resourceMachineState { + topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) + topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) + + var aggregatedAllocatableQuantity, aggregatedCapacityQuantity uint64 = 0, 0 + for _, numaNode := range numaNodes { + numaNodeState := machineState[numaNode] + if numaNodeState == nil { + return nil, fmt.Errorf("nil numaNodeState for NUMA: %d", numaNode) + } + + topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: float64(numaNodeState.Allocatable), + Node: 
uint64(numaNode), + }) + topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: float64(numaNodeState.TotalMemSize), + Node: uint64(numaNode), + }) + aggregatedAllocatableQuantity += numaNodeState.Allocatable + aggregatedCapacityQuantity += numaNodeState.TotalMemSize } - topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: float64(numaNodeState.Allocatable), - Node: uint64(numaNode), - }) - topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: float64(numaNodeState.TotalMemSize), - Node: uint64(numaNode), - }) - aggregatedAllocatableQuantity += numaNodeState.Allocatable - aggregatedCapacityQuantity += numaNodeState.TotalMemSize + allocatableResources[string(resourceName)] = &pluginapi.AllocatableTopologyAwareResource{ + IsNodeResource: false, + IsScalarResource: true, + AggregatedAllocatableQuantity: float64(aggregatedAllocatableQuantity), + TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList, + AggregatedCapacityQuantity: float64(aggregatedCapacityQuantity), + TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, + } } return &pluginapi.GetTopologyAwareAllocatableResourcesResponse{ - AllocatableResources: map[string]*pluginapi.AllocatableTopologyAwareResource{ - string(v1.ResourceMemory): { - IsNodeResource: false, - IsScalarResource: true, - AggregatedAllocatableQuantity: float64(aggregatedAllocatableQuantity), - TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList, - AggregatedCapacityQuantity: float64(aggregatedCapacityQuantity), - TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, - }, - }, + AllocatableResources: allocatableResources, }, nil } @@ -894,6 +909,7 @@ func (p *DynamicPolicy) GetResourcePluginOptions(context.Context, PreStartRequired: false, 
WithTopologyAlignment: true, NeedReconcile: true, + ExtraResources: p.extraResourceNames, }, nil } @@ -940,7 +956,7 @@ func (p *DynamicPolicy) Allocate(ctx context.Context, } }() - reqInt, _, err := util.GetQuantityFromResourceReq(req) + resourceReqInt, _, err := util.GetQuantityMapFromResourceReq(req) if err != nil { return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } @@ -952,7 +968,7 @@ func (p *DynamicPolicy) Allocate(ctx context.Context, "podType", req.PodType, "podRole", req.PodRole, "qosLevel", qosLevel, - "memoryReq(bytes)", reqInt, + "memoryReq map(bytes)", resourceReqInt, "hint", req.Hint) if req.ContainerType == pluginapi.ContainerType_INIT { @@ -999,6 +1015,7 @@ func (p *DynamicPolicy) Allocate(ctx context.Context, p.Lock() defer func() { // calls sys-advisor to inform the latest container + // currently, sys-advisor only supports v1.ResourceMemory, and hugepages is not supported if p.enableMemoryAdvisor && respErr == nil && req.ContainerType != pluginapi.ContainerType_INIT { _, err := p.advisorClient.AddContainer(ctx, &advisorsvc.ContainerMetadata{ PodUid: req.PodUid, @@ -1010,7 +1027,7 @@ func (p *DynamicPolicy) Allocate(ctx context.Context, Labels: maputil.CopySS(req.Labels), Annotations: maputil.CopySS(req.Annotations), QosLevel: qosLevel, - RequestQuantity: uint64(reqInt), + RequestQuantity: uint64(resourceReqInt[v1.ResourceMemory]), }) if err != nil { resp = nil @@ -1053,38 +1070,39 @@ func (p *DynamicPolicy) Allocate(ctx context.Context, return }() - allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName) - if allocationInfo != nil && allocationInfo.AggregatedQuantity >= uint64(reqInt) && !util.PodInplaceUpdateResizing(req) { - general.InfoS("already allocated and meet requirement", + resourceAllocationInfo := p.state.GetResourceAllocationInfo(req.PodUid, req.ContainerName) + // The length of all current allocation for every resource should be the same as the length of 
requested resources. + if len(resourceAllocationInfo) > 0 && len(resourceAllocationInfo) != len(resourceReqInt) { + general.ErrorS(fmt.Errorf("number of existing allocated resources: %d does not match number of resource requests: %d", + len(resourceAllocationInfo), len(resourceReqInt)), + "allocation error", "podNamespace", req.PodNamespace, "podName", req.PodName, - "containerName", req.ContainerName, - "memoryReq(bytes)", reqInt, - "currentResult(bytes)", allocationInfo.AggregatedQuantity) - return &pluginapi.ResourceAllocationResponse{ - PodUid: req.PodUid, - PodNamespace: req.PodNamespace, - PodName: req.PodName, - ContainerName: req.ContainerName, - ContainerType: req.ContainerType, - ContainerIndex: req.ContainerIndex, - PodRole: req.PodRole, - PodType: req.PodType, - ResourceName: string(v1.ResourceMemory), - AllocationResult: &pluginapi.ResourceAllocation{ - ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ - string(v1.ResourceMemory): { - OciPropertyName: util.OCIPropertyNameCPUSetMems, - IsNodeResource: false, - IsScalarResource: true, - AllocatedQuantity: float64(allocationInfo.AggregatedQuantity), - AllocationResult: allocationInfo.NumaAllocationResult.String(), - }, - }, - }, - Labels: general.DeepCopyMap(req.Labels), - Annotations: general.DeepCopyMap(req.Annotations), - }, nil + "containerName", req.ContainerName) + return nil, fmt.Errorf("number of existing allocated resources: %d does not match number of resource requests: %d", + len(resourceAllocationInfo), len(resourceReqInt)) + } + + for resName, allocationInfo := range resourceAllocationInfo { + reqInt, ok := resourceReqInt[resName] + if !ok { + general.ErrorS(fmt.Errorf("unable to find request quantity for resource that is already allocated"), + "allocation error", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "resourceName", resName) + return nil, fmt.Errorf("unable to find request quantity for resource that is already 
allocated") + } + + if allocationInfo != nil && allocationInfo.AggregatedQuantity >= uint64(reqInt) && !util.PodInplaceUpdateResizing(req) { + general.InfoS("already allocated and meet requirement", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "memoryReq(bytes)", resourceReqInt, + "currentResult(bytes)", allocationInfo.AggregatedQuantity) + } } if p.allocationHandlers[qosLevel] == nil { @@ -1115,7 +1133,8 @@ func (p *DynamicPolicy) removePod(podUID string, persistCheckpoint bool) error { delete(podEntries, podUID) } - resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) @@ -1145,7 +1164,8 @@ func (p *DynamicPolicy) removeContainer(podUID, containerName string, persistChe return nil } - resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Errorf("pod: %s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", podUID, containerName, err) return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) @@ -1278,7 +1298,7 @@ func (p *DynamicPolicy) 
hasLastLevelEnhancementKey(lastLevelEnhancementKey strin func (p *DynamicPolicy) checkNonBindingShareCoresMemoryResource(req *pluginapi.ResourceRequest) (bool, error) { reqInt, _, err := util.GetPodAggregatedRequestResource(req) if err != nil { - return false, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err) + return false, fmt.Errorf("GetQuantityMapFromResourceReq failed with error: %v", err) } shareCoresAllocated := uint64(reqInt) diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler.go index 021f017e44..18d8668e5f 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_advisor_handler.go @@ -379,7 +379,8 @@ func (p *DynamicPolicy) handleAdvisorResp(advisorResp *advisorsvc.ListAndWatchRe } } - resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) } diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_allocation_handlers.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_allocation_handlers.go index 5a372798e1..7730d16b6f 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_allocation_handlers.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_allocation_handlers.go @@ -108,150 +108,154 @@ func (p *DynamicPolicy) numaBindingAllocationHandler(ctx context.Context, return p.numaBindingAllocationSidecarHandler(ctx, req, qosLevel, persistCheckpoint) } - // use the pod aggregated request to 
instead of main container. - podAggregatedRequest, _, err := util.GetPodAggregatedRequestResource(req) + podAggregatedResourceRequests, _, err := util.GetPodAggregatedRequestResourceMap(req) if err != nil { - return nil, fmt.Errorf("GetPodAggregatedRequestResource failed with error: %v", err) + return nil, fmt.Errorf("GetPodAggregatedRequestResourceMao failed with error: %v", err) } + // resourceAllocationInfo stores the final allocation info for each resource name. + resourceAllocationInfo := make(map[v1.ResourceName]*state.AllocationInfo) machineState := p.state.GetMachineState() - memoryState := machineState[v1.ResourceMemory] - podResourceEntries := p.state.GetPodResourceEntries() - podEntries := podResourceEntries[v1.ResourceMemory] - allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName) - if allocationInfo != nil { - if allocationInfo.AggregatedQuantity >= uint64(podAggregatedRequest) && !util.PodInplaceUpdateResizing(req) { - general.InfoS("already allocated and meet requirement", + for resourceName, requestQuantity := range podAggregatedResourceRequests { + memoryState := machineState[resourceName] + + podEntries := podResourceEntries[resourceName] + + allocationInfo := p.state.GetAllocationInfo(resourceName, req.PodUid, req.ContainerName) + if allocationInfo != nil { + if allocationInfo.AggregatedQuantity >= uint64(requestQuantity) && !util.PodInplaceUpdateResizing(req) { + general.InfoS("already allocated and meet requirement", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "memoryReq(bytes)", requestQuantity, + "currentResult(bytes)", allocationInfo.AggregatedQuantity) + + resourceAllocationInfo[resourceName] = allocationInfo + continue + } + general.InfoS("not meet requirement, clear record and re-allocate", "podNamespace", req.PodNamespace, "podName", req.PodName, "containerName", req.ContainerName, - "memoryReq(bytes)", podAggregatedRequest, + 
"memoryReq(bytes)", requestQuantity, "currentResult(bytes)", allocationInfo.AggregatedQuantity) - resp, packErr := packAllocationResponse(allocationInfo, req, nil) - if packErr != nil { - general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, packErr) - return nil, fmt.Errorf("packAllocationResponse failed with error: %v", packErr) + if !allocationInfo.CheckNUMABinding() { + general.Errorf("pod: %s/%s, container: %s, resource %s, request to memory inplace update resize allocation, but origin allocation info is not numa_binding, reject it", + req.PodNamespace, req.PodName, req.ContainerName, resourceName) + return nil, fmt.Errorf("cannot change from non-numa_binding to numa_binding during inplace update for resource: %s", resourceName) } - return resp, nil - } - general.InfoS("not meet requirement, clear record and re-allocate", - "podNamespace", req.PodNamespace, - "podName", req.PodName, - "containerName", req.ContainerName, - "memoryReq(bytes)", podAggregatedRequest, - "currentResult(bytes)", allocationInfo.AggregatedQuantity) - if !allocationInfo.CheckNUMABinding() { - general.Errorf("pod: %s/%s, container: %s request to memory inplace update resize allocation, but origin allocation info is not numa_binding, reject it", - req.PodNamespace, req.PodName, req.ContainerName) - return nil, fmt.Errorf("cannot change from non-numa_binding to numa_binding during inplace update") - } + // remove the main container of this pod (the main container involve the whole pod requests), and the + // sidecar container request in state is zero. + containerEntries := podEntries[req.PodUid] + delete(containerEntries, req.ContainerName) - // remove the main container of this pod (the main container involve the whole pod requests), and the - // sidecar container request in state is zero. 
- containerEntries := podEntries[req.PodUid] - delete(containerEntries, req.ContainerName) + var stateErr error + memoryState, stateErr = state.GenerateMemoryStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podEntries, p.state.GetReservedMemory(), resourceName) + if stateErr != nil { + general.ErrorS(stateErr, "generateMemoryMachineStateByPodEntries failed", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "memoryReq(bytes)", requestQuantity, + "currentResult(bytes)", allocationInfo.AggregatedQuantity) + return nil, fmt.Errorf("generateMemoryMachineStateByPodEntries failed with error: %v", stateErr) + } + } else if util.PodInplaceUpdateResizing(req) { + general.Errorf("pod %s/%s, container: %s, resource: %s, request to memory inplace update resize, but no origin allocation info", + req.PodNamespace, req.PodName, req.ContainerName, resourceName) + return nil, fmt.Errorf("no origin allocation info for resource %s", resourceName) + } - var stateErr error - memoryState, stateErr = state.GenerateMemoryStateFromPodEntries(p.state.GetMachineInfo(), podEntries, p.state.GetReservedMemory()) - if stateErr != nil { - general.ErrorS(stateErr, "generateMemoryMachineStateByPodEntries failed", + // call calculateMemoryAllocation to update memoryState in-place, + // and we can use this adjusted state to pack allocation results + err = p.calculateMemoryAllocation(req, memoryState, qosLevel, requestQuantity) + if err != nil { + general.ErrorS(err, "unable to allocate Memory", "podNamespace", req.PodNamespace, "podName", req.PodName, "containerName", req.ContainerName, - "memoryReq(bytes)", podAggregatedRequest, - "currentResult(bytes)", allocationInfo.AggregatedQuantity) - return nil, fmt.Errorf("generateMemoryMachineStateByPodEntries failed with error: %v", stateErr) + "memoryReq", requestQuantity) + return nil, err } - } else if util.PodInplaceUpdateResizing(req) { - general.Errorf("pod %s/%s, 
container: %s request to memory inplace update resize, but no origin allocation info", - req.PodNamespace, req.PodName, req.ContainerName) - return nil, fmt.Errorf("no origin allocation info") - } - // call calculateMemoryAllocation to update memoryState in-place, - // and we can use this adjusted state to pack allocation results - err = p.calculateMemoryAllocation(req, memoryState, qosLevel, podAggregatedRequest) - if err != nil { - general.ErrorS(err, "unable to allocate Memory", + topologyAwareAllocations := make(map[int]uint64) + result := machine.NewCPUSet() + var aggregatedQuantity uint64 = 0 + for numaNode, numaNodeState := range memoryState { + if numaNodeState.PodEntries[req.PodUid][req.ContainerName] != nil { + result = result.Union(machine.NewCPUSet(numaNode)) + aggregatedQuantity += numaNodeState.PodEntries[req.PodUid][req.ContainerName].AggregatedQuantity + topologyAwareAllocations[numaNode] = numaNodeState.PodEntries[req.PodUid][req.ContainerName].AggregatedQuantity + } + } + + general.InfoS("allocate memory successfully", "podNamespace", req.PodNamespace, "podName", req.PodName, "containerName", req.ContainerName, - "memoryReq", podAggregatedRequest) - return nil, err - } + "reqMemoryQuantity", requestQuantity, + "numaAllocationResult", result.String()) - topologyAwareAllocations := make(map[int]uint64) - result := machine.NewCPUSet() - var aggregatedQuantity uint64 = 0 - for numaNode, numaNodeState := range memoryState { - if numaNodeState.PodEntries[req.PodUid][req.ContainerName] != nil { - result = result.Union(machine.NewCPUSet(numaNode)) - aggregatedQuantity += numaNodeState.PodEntries[req.PodUid][req.ContainerName].AggregatedQuantity - topologyAwareAllocations[numaNode] = numaNodeState.PodEntries[req.PodUid][req.ContainerName].AggregatedQuantity + allocationInfo = &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(req, qosLevel), + AggregatedQuantity: aggregatedQuantity, + NumaAllocationResult: result.Clone(), 
+ TopologyAwareAllocations: topologyAwareAllocations, } - } - general.InfoS("allocate memory successfully", - "podNamespace", req.PodNamespace, - "podName", req.PodName, - "containerName", req.ContainerName, - "reqMemoryQuantity", podAggregatedRequest, - "numaAllocationResult", result.String()) + if !qosutil.AnnotationsIndicateNUMAExclusive(req.Annotations) { + // shared cores with numa binding and non distribute evenly across numa pods cannot occupy multiple NUMA nodes + if (qosLevel == apiconsts.PodAnnotationQoSLevelSharedCores || !qosutil.AnnotationsIndicateDistributeEvenlyAcrossNuma(req.Annotations)) && len(req.Hint.Nodes) != 1 { + return nil, fmt.Errorf("numa binding without numa exclusive allocation result numa node size is %d, "+ + "not equal to 1", len(req.Hint.Nodes)) + } + allocationInfo.SetSpecifiedNUMABindingNUMAID(req.Hint.Nodes) + } - allocationInfo = &state.AllocationInfo{ - AllocationMeta: state.GenerateMemoryContainerAllocationMeta(req, qosLevel), - AggregatedQuantity: aggregatedQuantity, - NumaAllocationResult: result.Clone(), - TopologyAwareAllocations: topologyAwareAllocations, + // Set the final allocationInfo for the resource name + resourceAllocationInfo[resourceName] = allocationInfo } - if !qosutil.AnnotationsIndicateNUMAExclusive(req.Annotations) { - if len(req.Hint.Nodes) != 1 { - return nil, fmt.Errorf("numa binding without numa exclusive allocation result numa node size is %d, "+ - "not equal to 1", len(req.Hint.Nodes)) + for resName, allocationInfo := range resourceAllocationInfo { + p.state.SetAllocationInfo(resName, req.PodUid, req.ContainerName, allocationInfo, persistCheckpoint) + + // only v1.ResourceMemory can be adjusted + if resName == v1.ResourceMemory { + err = p.adjustAllocationEntries(persistCheckpoint) + if err != nil { + return nil, fmt.Errorf("adjustAllocationEntries failed with error: %v", err) + } } - allocationInfo.SetSpecifiedNUMABindingNUMAID(req.Hint.Nodes[0]) - } - p.state.SetAllocationInfo(v1.ResourceMemory, 
req.PodUid, req.ContainerName, allocationInfo, persistCheckpoint) + // update the numa allocation result for numa binding pod + err = p.updateSpecifiedNUMAAllocation(ctx, allocationInfo) + if err != nil { + general.Errorf("pod: %s/%s, container: %s updateSpecifiedNUMAAllocation failed with error: %v", + req.PodNamespace, req.PodName, req.ContainerName, err) + return nil, fmt.Errorf("updateSpecifiedNUMAAllocation failed with error: %v", err) + } + } podResourceEntries = p.state.GetPodResourceEntries() - machineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + machineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { - general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", + general.Errorf("pod: %s/%s, container: %s, GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) return nil, fmt.Errorf("calculate memoryState by updated pod entries failed with error: %v", err) } p.state.SetMachineState(machineState, persistCheckpoint) - err = p.adjustAllocationEntries(persistCheckpoint) - if err != nil { - return nil, fmt.Errorf("adjustAllocationEntries failed with error: %v", err) - } - - resp, err := packAllocationResponse(allocationInfo, req, nil) - if err != nil { - general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, err) - return nil, fmt.Errorf("packAllocationResponse failed with error: %v", err) - } - - // update the numa allocation result for numa binding pod - err = p.updateSpecifiedNUMAAllocation(ctx, allocationInfo) - if err != nil { - general.Errorf("pod: %s/%s, container: %s 
updateSpecifiedNUMAAllocation failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, err) - return nil, fmt.Errorf("updateSpecifiedNUMAAllocation failed with error: %v", err) - } - - return resp, nil + return packAllocationResponse(resourceAllocationInfo, req, nil) } +// reclaimedCoresBestEffortNUMABindingAllocationHandler allocates reclaimed cores with numa binding pods in best-effort manner. +// Note that this only supports v1.ResourceMemory, hugepages is not supported. func (p *DynamicPolicy) reclaimedCoresBestEffortNUMABindingAllocationHandler(ctx context.Context, req *pluginapi.ResourceRequest, persistCheckpoint bool, ) (*pluginapi.ResourceAllocationResponse, error) { @@ -302,7 +306,9 @@ func (p *DynamicPolicy) reclaimedCoresBestEffortNUMABindingAllocationHandler(ctx "containerName", req.ContainerName, "memoryReq(bytes)", allocationInfo.AggregatedQuantity, "currentResult(bytes)", allocationInfo.AggregatedQuantity) - return packAllocationResponse(allocationInfo, req, nil) + return packAllocationResponse(map[v1.ResourceName]*state.AllocationInfo{ + v1.ResourceMemory: allocationInfo, + }, req, nil) } general.InfoS("not meet requirement, clear record and re-allocate", @@ -318,7 +324,7 @@ func (p *DynamicPolicy) reclaimedCoresBestEffortNUMABindingAllocationHandler(ctx delete(containerEntries, req.ContainerName) var stateErr error - memoryState, stateErr = state.GenerateMemoryStateFromPodEntries(p.state.GetMachineInfo(), podEntries, p.state.GetReservedMemory()) + memoryState, stateErr = state.GenerateMemoryStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podEntries, p.state.GetReservedMemory(), v1.ResourceMemory) if stateErr != nil { general.ErrorS(stateErr, "generateMemoryMachineStateByPodEntries failed", "podNamespace", req.PodNamespace, @@ -342,7 +348,7 @@ func (p *DynamicPolicy) reclaimedCoresBestEffortNUMABindingAllocationHandler(ctx if req.Hint != nil && len(req.Hint.Nodes) == 1 && 
(reclaimActualBindingNUMAs.Contains(int(req.Hint.Nodes[0])) || !nonReclaimActualBindingNUMAs.Equals(machine.NewCPUSet(int(req.Hint.Nodes[0])))) { - allocationInfo.SetSpecifiedNUMABindingNUMAID(req.Hint.Nodes[0]) + allocationInfo.SetSpecifiedNUMABindingNUMAID(req.Hint.Nodes) numaAllocationResult = machine.NewCPUSet(int(req.Hint.Nodes[0])) } else { numaAllocationResult = nonReclaimActualBindingNUMAs @@ -356,7 +362,8 @@ func (p *DynamicPolicy) reclaimedCoresBestEffortNUMABindingAllocationHandler(ctx p.state.SetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName, allocationInfo, persistCheckpoint) - machineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetPodResourceEntries(), p.state.GetMachineState(), p.state.GetReservedMemory()) + machineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), p.state.GetPodResourceEntries(), + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -370,7 +377,9 @@ func (p *DynamicPolicy) reclaimedCoresBestEffortNUMABindingAllocationHandler(ctx } } - resp, err := packAllocationResponse(allocationInfo, req, p.getReclaimedResourceAllocationAnnotations(allocationInfo)) + resp, err := packAllocationResponse(map[v1.ResourceName]*state.AllocationInfo{ + v1.ResourceMemory: allocationInfo, + }, req, p.getReclaimedResourceAllocationAnnotations(allocationInfo)) if err != nil { general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -409,36 +418,43 @@ func (p *DynamicPolicy) dedicatedCoresWithoutNUMABindingAllocationHandler(_ cont func (p *DynamicPolicy) numaBindingAllocationSidecarHandler(_ context.Context, req *pluginapi.ResourceRequest, qosLevel string, 
persistCheckpoint bool, ) (*pluginapi.ResourceAllocationResponse, error) { + resourceAllocationInfo := make(map[v1.ResourceName]*state.AllocationInfo) podResourceEntries := p.state.GetPodResourceEntries() - podEntries := podResourceEntries[v1.ResourceMemory] - if podEntries[req.PodUid] == nil { - general.Infof("there is no pod entry, pod: %s/%s, sidecar: %s, waiting next reconcile", - req.PodNamespace, req.PodName, req.ContainerName) - return &pluginapi.ResourceAllocationResponse{}, nil - } + for resourceName := range req.ResourceRequests { + podEntries := podResourceEntries[v1.ResourceName(resourceName)] + if podEntries[req.PodUid] == nil { + general.Infof("there is no pod entry, pod: %s/%s, sidecar: %s, waiting next reconcile", + req.PodNamespace, req.PodName, req.ContainerName) + return &pluginapi.ResourceAllocationResponse{}, nil + } - // todo: consider sidecar without reconcile in vpa - mainContainerAllocationInfo, ok := podEntries.GetMainContainerAllocation(req.PodUid) - if !ok { - general.Infof("main container is not found for pod: %s/%s, sidecar: %s, waiting next reconcile", - req.PodNamespace, req.PodName, req.ContainerName) - return &pluginapi.ResourceAllocationResponse{}, nil - } + // todo: consider sidecar without reconcile in vpa + mainContainerAllocationInfo, ok := podEntries.GetMainContainerAllocation(req.PodUid) + if !ok { + general.Infof("main container is not found for pod: %s/%s, sidecar: %s, waiting next reconcile", + req.PodNamespace, req.PodName, req.ContainerName) + return &pluginapi.ResourceAllocationResponse{}, nil + } + + allocationInfo := &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(req, qosLevel), + AggregatedQuantity: 0, // not count sidecar quantity + TopologyAwareAllocations: nil, // not count sidecar quantity + } - allocationInfo := &state.AllocationInfo{ - AllocationMeta: state.GenerateMemoryContainerAllocationMeta(req, qosLevel), - AggregatedQuantity: 0, // not count sidecar quantity - 
TopologyAwareAllocations: nil, // not count sidecar quantity + applySidecarAllocationInfoFromMainContainer(allocationInfo, mainContainerAllocationInfo) + + resourceAllocationInfo[v1.ResourceName(resourceName)] = allocationInfo } - applySidecarAllocationInfoFromMainContainer(allocationInfo, mainContainerAllocationInfo) + for resourceName, allocationInfo := range resourceAllocationInfo { + p.state.SetAllocationInfo(resourceName, req.PodUid, req.ContainerName, allocationInfo, persistCheckpoint) + } - // update pod entries directly. if one of subsequent steps is failed, - // we will delete current allocationInfo from podEntries in defer function of allocation function. - p.state.SetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName, allocationInfo, persistCheckpoint) podResourceEntries = p.state.GetPodResourceEntries() - resourcesState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Infof("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -446,7 +462,7 @@ func (p *DynamicPolicy) numaBindingAllocationSidecarHandler(_ context.Context, } p.state.SetMachineState(resourcesState, persistCheckpoint) - resp, err := packAllocationResponse(allocationInfo, req, nil) + resp, err := packAllocationResponse(resourceAllocationInfo, req, nil) if err != nil { general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -465,38 +481,47 @@ func (p *DynamicPolicy) allocateNUMAsWithoutNUMABindingPods(_ context.Context, } machineState := 
p.state.GetMachineState() - resourceState := machineState[v1.ResourceMemory] - numaWithoutNUMABindingPods := resourceState.GetNUMANodesWithoutSharedOrDedicatedNUMABindingPods() + resourceAllocationInfo := make(map[v1.ResourceName]*state.AllocationInfo) - allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName) - if allocationInfo != nil { - general.Infof("pod: %s/%s, container: %s change cpuset.mems from: %s to %s", - req.PodNamespace, req.PodName, req.ContainerName, allocationInfo.NumaAllocationResult.String(), numaWithoutNUMABindingPods.String()) - } + for resourceName := range req.ResourceRequests { + resourceState := machineState[v1.ResourceName(resourceName)] + numaWithoutNUMABindingPods := resourceState.GetNUMANodesWithoutSharedOrDedicatedNUMABindingPods() - // use real container request size here - reqInt, _, err := util.GetQuantityFromResourceReq(req) - if err != nil { - return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err) + allocationInfo := p.state.GetAllocationInfo(v1.ResourceName(resourceName), req.PodUid, req.ContainerName) + if allocationInfo != nil { + general.Infof("pod: %s/%s, container: %s change cpuset.mems from: %s to %s", + req.PodNamespace, req.PodName, req.ContainerName, allocationInfo.NumaAllocationResult.String(), numaWithoutNUMABindingPods.String()) + } + + reqInt, _, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, req.Annotations) + if err != nil { + return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err) + } + + allocationInfo = &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(req, qosLevel), + NumaAllocationResult: numaWithoutNUMABindingPods.Clone(), + AggregatedQuantity: uint64(reqInt), + } + + resourceAllocationInfo[v1.ResourceName(resourceName)] = allocationInfo } - allocationInfo = &state.AllocationInfo{ - AllocationMeta: state.GenerateMemoryContainerAllocationMeta(req, 
qosLevel), - NumaAllocationResult: numaWithoutNUMABindingPods.Clone(), - AggregatedQuantity: uint64(reqInt), + for resourceName, allocationInfo := range resourceAllocationInfo { + p.state.SetAllocationInfo(resourceName, req.PodUid, req.ContainerName, allocationInfo, persistCheckpoint) } - p.state.SetAllocationInfo(v1.ResourceMemory, allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo, persistCheckpoint) podResourceEntries := p.state.GetPodResourceEntries() - machineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, machineState, p.state.GetReservedMemory()) + machineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, machineState, p.state.GetReservedMemory(), + p.extraResourceNames) if err != nil { general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) return nil, fmt.Errorf("calculate resourceState by updated pod entries failed with error: %v", err) } - resp, err := packAllocationResponse(allocationInfo, req, nil) + resp, err := packAllocationResponse(resourceAllocationInfo, req, nil) if err != nil { general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -509,6 +534,7 @@ func (p *DynamicPolicy) allocateNUMAsWithoutNUMABindingPods(_ context.Context, // allocateTargetNUMAs returns target numa nodes as allocation results, // and it will store the allocation in states. +// Note that allocateTargetNUMAs only allocates v1.ResourceMemory. 
func (p *DynamicPolicy) allocateTargetNUMAs(req *pluginapi.ResourceRequest, qosLevel string, targetNUMAs machine.CPUSet, persistCheckpoint bool, ) (*pluginapi.ResourceAllocationResponse, error) { @@ -530,14 +556,17 @@ func (p *DynamicPolicy) allocateTargetNUMAs(req *pluginapi.ResourceRequest, p.state.SetAllocationInfo(v1.ResourceMemory, allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo, persistCheckpoint) podResourceEntries := p.state.GetPodResourceEntries() - machineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + machineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) return nil, fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) } - resp, err := packAllocationResponse(allocationInfo, req, nil) + resp, err := packAllocationResponse(map[v1.ResourceName]*state.AllocationInfo{ + v1.ResourceMemory: allocationInfo, + }, req, nil) if err != nil { general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -572,7 +601,8 @@ func (p *DynamicPolicy) adjustAllocationEntries(persistCheckpoint bool) error { p.adjustAllocationEntriesForSystemCores(numaSetChangedContainers, podEntries, machineState) p.adjustAllocationEntriesForReclaimedCores(numaSetChangedContainers, podEntries, machineState) - resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesMachineState, err := 
state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) } @@ -597,14 +627,21 @@ func (p *DynamicPolicy) adjustAllocationEntries(persistCheckpoint bool) error { // it will update the passed by machineState in-place; so the function will be // called `calculateXXX` rather than `allocateXXX` func (p *DynamicPolicy) calculateMemoryAllocation(req *pluginapi.ResourceRequest, machineState state.NUMANodeMap, qosLevel string, podAggregatedRequest int) error { + numaBinding := qosutil.AnnotationsIndicateNUMABinding(req.Annotations) + numaExclusive := qosutil.AnnotationsIndicateNUMAExclusive(req.Annotations) + distributeEvenlyAcrossNUMANodes := qosutil.AnnotationsIndicateDistributeEvenlyAcrossNuma(req.Annotations) + if req.Hint == nil { return fmt.Errorf("hint is nil") } else if len(req.Hint.Nodes) == 0 { return fmt.Errorf("hint is empty") - } else if qosutil.AnnotationsIndicateNUMABinding(req.Annotations) && - !qosutil.AnnotationsIndicateNUMAExclusive(req.Annotations) && + } else if numaBinding && !numaExclusive && !distributeEvenlyAcrossNUMANodes && len(req.Hint.Nodes) > 1 { return fmt.Errorf("NUMA not exclusive binding container has request larger than 1 NUMA") + } else if qosLevel == apiconsts.PodAnnotationQoSLevelSharedCores && numaBinding && distributeEvenlyAcrossNUMANodes { + return fmt.Errorf("shared cores with numa binding and distribute evenly across numa is not supported at the same time") + } else if numaExclusive && distributeEvenlyAcrossNUMANodes { + return fmt.Errorf("NUMA exclusive and distribute evenly across numa is not supported at the same time") } hintNumaNodes := machine.NewCPUSet(util.HintToIntArray(req.Hint)...) 
@@ -624,7 +661,8 @@ func (p *DynamicPolicy) calculateMemoryAllocation(req *pluginapi.ResourceRequest return fmt.Errorf("calculateExclusiveMemory failed with error: %v", err) } } else { - leftQuantity, err = calculateMemoryInNumaNodes(req, machineState, hintNumaNodes.ToSliceInt(), uint64(podAggregatedRequest), qosLevel) + leftQuantity, err = calculateMemoryInNumaNodes(req, machineState, hintNumaNodes.ToSliceInt(), + uint64(podAggregatedRequest), qosLevel, distributeEvenlyAcrossNUMANodes) if err != nil { return fmt.Errorf("calculateMemoryInNumaNodes failed with error: %v", err) } @@ -695,8 +733,20 @@ func calculateExclusiveMemory(req *pluginapi.ResourceRequest, // the given container, and returns the remaining un-satisfied quantity. func calculateMemoryInNumaNodes(req *pluginapi.ResourceRequest, machineState state.NUMANodeMap, numaNodes []int, - reqQuantity uint64, qosLevel string, + reqQuantity uint64, qosLevel string, distributeEvenlyAcrossNuma bool, ) (leftQuantity uint64, err error) { + var requestPerNuma uint64 + if distributeEvenlyAcrossNuma { + if len(numaNodes) == 0 { + return reqQuantity, fmt.Errorf("NUMA nodes is empty and need to distribute evenly across numa nodes") + } + + if int(reqQuantity)%len(numaNodes) != 0 { + return reqQuantity, fmt.Errorf("request quantity %d is not divisible by numa nodes number %d", reqQuantity, len(numaNodes)) + } + + requestPerNuma = reqQuantity / uint64(len(numaNodes)) + } for _, numaNode := range numaNodes { var curNumaNodeAllocated uint64 @@ -705,13 +755,27 @@ func calculateMemoryInNumaNodes(req *pluginapi.ResourceRequest, return reqQuantity, fmt.Errorf("NUMA: %d has nil state", numaNode) } - if reqQuantity < numaNodeState.Free { - curNumaNodeAllocated = reqQuantity - reqQuantity = 0 + if distributeEvenlyAcrossNuma { + // allocate exactly requestPerNuma from each NUMA node + if numaNodeState.Free < requestPerNuma { + return reqQuantity, fmt.Errorf( + "NUMA %d does not have enough free memory to distribute evenly across 
numa: need %d, have %d", + numaNode, requestPerNuma, numaNodeState.Free, + ) + } + + curNumaNodeAllocated = requestPerNuma + reqQuantity -= requestPerNuma } else { - curNumaNodeAllocated = numaNodeState.Free - reqQuantity -= numaNodeState.Free + if reqQuantity < numaNodeState.Free { + curNumaNodeAllocated = reqQuantity + reqQuantity = 0 + } else { + curNumaNodeAllocated = numaNodeState.Free + reqQuantity -= numaNodeState.Free + } } + numaNodeState.Free -= curNumaNodeAllocated numaNodeState.Allocated += curNumaNodeAllocated @@ -736,15 +800,17 @@ func calculateMemoryInNumaNodes(req *pluginapi.ResourceRequest, return reqQuantity, nil } -// packAllocationResponse fills pluginapi.ResourceAllocationResponse with information from AllocationInfo and pluginapi.ResourceRequest -func packAllocationResponse(allocationInfo *state.AllocationInfo, req *pluginapi.ResourceRequest, resourceAllocationAnnotations map[string]string) (*pluginapi.ResourceAllocationResponse, error) { - if allocationInfo == nil { - return nil, fmt.Errorf("packAllocationResponse got nil allocationInfo") +// packAllocationResponse fills pluginapi.ResourceAllocationResponse with information from map of resource to AllocationInfo and pluginapi.ResourceRequest +func packAllocationResponse(resourceAllocationInfo map[v1.ResourceName]*state.AllocationInfo, req *pluginapi.ResourceRequest, + resourceAllocationAnnotations map[string]string, +) (*pluginapi.ResourceAllocationResponse, error) { + if resourceAllocationInfo == nil { + return nil, fmt.Errorf("packAllocationResponse got nil resourceAllocationInfo") } else if req == nil { return nil, fmt.Errorf("packAllocationResponse got nil request") } - return &pluginapi.ResourceAllocationResponse{ + resp := &pluginapi.ResourceAllocationResponse{ PodUid: req.PodUid, PodNamespace: req.PodNamespace, PodName: req.PodName, @@ -755,25 +821,33 @@ func packAllocationResponse(allocationInfo *state.AllocationInfo, req *pluginapi PodType: req.PodType, ResourceName: 
string(v1.ResourceMemory), AllocationResult: &pluginapi.ResourceAllocation{ - ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ - string(v1.ResourceMemory): { - OciPropertyName: util.OCIPropertyNameCPUSetMems, - IsNodeResource: false, - IsScalarResource: true, - Annotations: resourceAllocationAnnotations, - AllocatedQuantity: float64(allocationInfo.AggregatedQuantity), - AllocationResult: allocationInfo.NumaAllocationResult.String(), - ResourceHints: &pluginapi.ListOfTopologyHints{ - Hints: []*pluginapi.TopologyHint{ - req.Hint, - }, - }, - }, - }, + ResourceAllocation: make(map[string]*pluginapi.ResourceAllocationInfo), }, Labels: general.DeepCopyMap(req.Labels), Annotations: general.DeepCopyMap(req.Annotations), - }, nil + } + + for resourceName, allocationInfo := range resourceAllocationInfo { + if allocationInfo == nil { + continue + } + + resp.AllocationResult.ResourceAllocation[string(resourceName)] = &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + Annotations: resourceAllocationAnnotations, + AllocatedQuantity: float64(allocationInfo.AggregatedQuantity), + AllocationResult: allocationInfo.NumaAllocationResult.String(), + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + req.Hint, + }, + }, + } + } + + return resp, nil } func (p *DynamicPolicy) adjustAllocationEntriesForSharedCores(numaSetChangedContainers map[string]map[string]*state.AllocationInfo, diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_allocation_handlers_test.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_allocation_handlers_test.go index da6cc55a60..dd495390b6 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_allocation_handlers_test.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_allocation_handlers_test.go @@ -18,6 +18,7 @@ package dynamicpolicy import ( "context" + "fmt" "io/ioutil" "os" "testing" @@ -122,7 
+123,7 @@ func TestSharedCoresAllocationHandler(t *testing.T) { as.Nil(err) defer os.RemoveAll(tmpDir) - policy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) + policy, err := getTestDynamicPolicyWithExtraResourcesWithInitialization(cpuTopology, machineInfo, tmpDir) as.Nil(err) as.NotNil(policy) @@ -416,11 +417,17 @@ func TestNumaBindingAllocationHandler(t *testing.T) { Topology: []info.Node{ { Memory: 100 * 1024 * 1024 * 1024, + HugePages: []info.HugePagesInfo{ + { + PageSize: 2 * 1024, + NumPages: 1024, + }, + }, }, }, } - policy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) + policy, err := getTestDynamicPolicyWithExtraResourcesWithInitialization(cpuTopology, machineInfo, tmpDir) as.Nil(err) // Pre-populate state for some tests @@ -516,6 +523,29 @@ func TestNumaBindingAllocationHandler(t *testing.T) { qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, expectErr: true, // Should fail because no origin allocation info (simulated by not allocating first) }, + { + name: "allocate memory and hugepages resources", + req: &pluginapi.ResourceRequest{ + PodUid: "pod-new-hugepages", + PodNamespace: "default", + PodName: "pod-new-hugepages", + ContainerName: "container-1", + ContainerType: pluginapi.ContainerType_MAIN, + Annotations: map[string]string{ + apiconsts.PodAnnotationMemoryEnhancementNumaBinding: apiconsts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1024 * 1024, + "hugepages-2Mi": 2 * 1024, + }, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + }, + }, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + expectErr: false, + }, } // For the "inplace update resize: non-binding to binding" test, we need to first allocate it WITHOUT binding, then try to update WITH binding. 
@@ -819,3 +849,477 @@ func TestCalculateMemoryAllocation(t *testing.T) { }) } } + +func TestCalculateMemoryInNumaNodes(t *testing.T) { + t.Parallel() + + type args struct { + req *pluginapi.ResourceRequest + machineState state.NUMANodeMap + numaNodes []int + reqQuantity uint64 + qosLevel string + distributeEvenlyAcrossNuma bool + } + tests := []struct { + name string + args args + want uint64 + wantErr bool + wantMachine state.NUMANodeMap + }{ + { + name: "distributeEvenlyAcrossNuma=true, empty numaNodes", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + machineState: state.NUMANodeMap{}, + numaNodes: []int{}, + reqQuantity: 100, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: true, + }, + want: 100, + wantErr: true, + wantMachine: state.NUMANodeMap{}, + }, + { + name: "distributeEvenlyAcrossNuma=true, reqQuantity not divisible", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{Free: 200, Allocated: 0}, + 1: &state.NUMANodeState{Free: 200, Allocated: 0}, + }, + numaNodes: []int{0, 1}, + reqQuantity: 101, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: true, + }, + want: 101, + wantErr: true, + wantMachine: state.NUMANodeMap{0: {Free: 200, Allocated: 0}, 1: {Free: 200, Allocated: 0}}, + }, + { + name: "distributeEvenlyAcrossNuma=true, numaNodeState nil", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + machineState: state.NUMANodeMap{}, + numaNodes: []int{0}, + reqQuantity: 100, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: true, + }, + want: 100, + wantErr: true, + wantMachine: state.NUMANodeMap{}, + }, + { + name: "distributeEvenlyAcrossNuma=true, insufficient free memory", + args: args{ + req: &pluginapi.ResourceRequest{ 
+ PodUid: "pod-1", + ContainerName: "container-1", + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{Free: 40, Allocated: 0}, + 1: &state.NUMANodeState{Free: 200, Allocated: 0}, + }, + numaNodes: []int{0, 1}, + reqQuantity: 100, // 50 per numa + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: true, + }, + want: 100, + wantErr: true, + wantMachine: state.NUMANodeMap{0: {Free: 40, Allocated: 0}, 1: {Free: 200, Allocated: 0}}, + }, + { + name: "distributeEvenlyAcrossNuma=true, success", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{Free: 200, Allocated: 0}, + 1: &state.NUMANodeState{Free: 200, Allocated: 0}, + }, + numaNodes: []int{0, 1}, + reqQuantity: 100, // 50 per numa + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: true, + }, + want: 0, + wantErr: false, + wantMachine: state.NUMANodeMap{ + 0: { + Free: 150, + Allocated: 50, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, apiconsts.PodAnnotationQoSLevelSharedCores), + AggregatedQuantity: 50, + NumaAllocationResult: machine.NewCPUSet(0), + TopologyAwareAllocations: map[int]uint64{ + 0: 50, + }, + }, + }, + }, + }, + 1: { + Free: 150, + Allocated: 50, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AllocationMeta: 
state.GenerateMemoryContainerAllocationMeta(&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, apiconsts.PodAnnotationQoSLevelSharedCores), + AggregatedQuantity: 50, + NumaAllocationResult: machine.NewCPUSet(1), + TopologyAwareAllocations: map[int]uint64{ + 1: 50, + }, + }, + }, + }, + }, + }, + }, + { + name: "distributeEvenlyAcrossNuma=false, numaNodeState nil", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + machineState: state.NUMANodeMap{}, + numaNodes: []int{0}, + reqQuantity: 100, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: false, + }, + want: 100, + wantErr: true, + wantMachine: state.NUMANodeMap{}, + }, + { + name: "distributeEvenlyAcrossNuma=false, reqQuantity fully satisfied by first NUMA", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{Free: 200, Allocated: 0}, + 1: &state.NUMANodeState{Free: 200, Allocated: 0}, + }, + numaNodes: []int{0, 1}, + reqQuantity: 100, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: false, + }, + want: 0, + wantErr: false, + wantMachine: state.NUMANodeMap{ + 0: { + Free: 100, + Allocated: 100, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: 
string(v1.ResourceMemory), + }, apiconsts.PodAnnotationQoSLevelSharedCores), + AggregatedQuantity: 100, + NumaAllocationResult: machine.NewCPUSet(0), + TopologyAwareAllocations: map[int]uint64{ + 0: 100, + }, + }, + }, + }, + }, + 1: {Free: 200, Allocated: 0}, + }, + }, + { + name: "distributeEvenlyAcrossNuma=false, reqQuantity partially satisfied", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{Free: 100, Allocated: 0}, + 1: &state.NUMANodeState{Free: 150, Allocated: 0}, + }, + numaNodes: []int{0, 1}, + reqQuantity: 300, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: false, + }, + want: 50, // 300 - 100 - 150 = 50 + wantErr: false, + wantMachine: state.NUMANodeMap{ + 0: { + Free: 0, + Allocated: 100, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, apiconsts.PodAnnotationQoSLevelSharedCores), + AggregatedQuantity: 100, + NumaAllocationResult: machine.NewCPUSet(0), + TopologyAwareAllocations: map[int]uint64{ + 0: 100, + }, + }, + }, + }, + }, + 1: { + Free: 0, + Allocated: 150, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: 
string(v1.ResourceMemory), + }, apiconsts.PodAnnotationQoSLevelSharedCores), + AggregatedQuantity: 150, + NumaAllocationResult: machine.NewCPUSet(1), + TopologyAwareAllocations: map[int]uint64{ + 1: 150, + }, + }, + }, + }, + }, + }, + }, + { + name: "distributeEvenlyAcrossNuma=false, reqQuantity exactly satisfied", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{Free: 100, Allocated: 0}, + 1: &state.NUMANodeState{Free: 150, Allocated: 0}, + }, + numaNodes: []int{0, 1}, + reqQuantity: 250, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: false, + }, + want: 0, + wantErr: false, + wantMachine: state.NUMANodeMap{ + 0: { + Free: 0, + Allocated: 100, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, apiconsts.PodAnnotationQoSLevelSharedCores), + AggregatedQuantity: 100, + NumaAllocationResult: machine.NewCPUSet(0), + TopologyAwareAllocations: map[int]uint64{ + 0: 100, + }, + }, + }, + }, + }, + 1: { + Free: 0, + Allocated: 150, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, 
apiconsts.PodAnnotationQoSLevelSharedCores), + AggregatedQuantity: 150, + NumaAllocationResult: machine.NewCPUSet(1), + TopologyAwareAllocations: map[int]uint64{ + 1: 150, + }, + }, + }, + }, + }, + }, + }, + { + name: "distributeEvenlyAcrossNuma=false, existing pod entries", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-2", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, + machineState: state.NUMANodeMap{ + 0: { + Free: 100, + Allocated: 100, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AggregatedQuantity: 100, + }, + }, + }, + }, + }, + numaNodes: []int{0}, + reqQuantity: 50, + qosLevel: apiconsts.PodAnnotationQoSLevelSharedCores, + distributeEvenlyAcrossNuma: false, + }, + want: 0, + wantErr: false, + wantMachine: state.NUMANodeMap{ + 0: { + Free: 50, + Allocated: 150, + PodEntries: state.PodEntries{ + "pod-1": state.ContainerEntries{ + "container-1": &state.AllocationInfo{ + AggregatedQuantity: 100, + }, + "container-2": &state.AllocationInfo{ + AllocationMeta: state.GenerateMemoryContainerAllocationMeta(&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-2", + PodNamespace: "default", + PodName: "test-pod", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + }, apiconsts.PodAnnotationQoSLevelSharedCores), + AggregatedQuantity: 50, + NumaAllocationResult: machine.NewCPUSet(0), + TopologyAwareAllocations: map[int]uint64{ + 0: 50, + }, + }, + }, + }, + }, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got, err := calculateMemoryInNumaNodes(tt.args.req, tt.args.machineState, tt.args.numaNodes, tt.args.reqQuantity, tt.args.qosLevel, tt.args.distributeEvenlyAcrossNuma) + if (err != nil) != tt.wantErr { + 
t.Errorf("calculateMemoryInNumaNodes() error = %v, wantErr %v", err, tt.wantErr) + return + } + assert.Equal(t, tt.want, got) + + // Deep compare machineState, ignoring NumaAllocationResult and TopologyAwareAllocations in AllocationInfo + // because machine.NewCPUSet(numaNode) creates a new object each time, which will fail deep equality. + // Instead, we compare the string representation of NumaAllocationResult and the content of TopologyAwareAllocations. + if !tt.wantErr { + for numaID, wantNumaState := range tt.wantMachine { + gotNumaState := tt.args.machineState[numaID] + assert.NotNil(t, gotNumaState, fmt.Sprintf("NUMA %d state is nil in actual machine state", numaID)) + assert.Equal(t, wantNumaState.Free, gotNumaState.Free, fmt.Sprintf("NUMA %d Free mismatch", numaID)) + assert.Equal(t, wantNumaState.Allocated, gotNumaState.Allocated, fmt.Sprintf("NUMA %d Allocated mismatch", numaID)) + + for podUID, wantContainerEntries := range wantNumaState.PodEntries { + gotContainerEntries, ok := gotNumaState.PodEntries[podUID] + assert.True(t, ok, fmt.Sprintf("Pod %s not found in NUMA %d", podUID, numaID)) + + for containerName, wantAllocInfo := range wantContainerEntries { + gotAllocInfo, ok := gotContainerEntries[containerName] + assert.True(t, ok, fmt.Sprintf("Container %s not found for Pod %s in NUMA %d", containerName, podUID, numaID)) + + assert.Equal(t, wantAllocInfo.AggregatedQuantity, gotAllocInfo.AggregatedQuantity, fmt.Sprintf("AggregatedQuantity mismatch for %s/%s in NUMA %d", podUID, containerName, numaID)) + assert.Equal(t, wantAllocInfo.NumaAllocationResult.String(), gotAllocInfo.NumaAllocationResult.String(), fmt.Sprintf("NumaAllocationResult mismatch for %s/%s in NUMA %d", podUID, containerName, numaID)) + assert.Equal(t, wantAllocInfo.TopologyAwareAllocations, gotAllocInfo.TopologyAwareAllocations, fmt.Sprintf("TopologyAwareAllocations mismatch for %s/%s in NUMA %d", podUID, containerName, numaID)) + assert.Equal(t, wantAllocInfo.AllocationMeta, 
gotAllocInfo.AllocationMeta, fmt.Sprintf("AllocationMeta mismatch for %s/%s in NUMA %d", podUID, containerName, numaID)) + } + } + } + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_async_handler.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_async_handler.go index dff75d8617..c2693a0ce1 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_async_handler.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_async_handler.go @@ -161,7 +161,8 @@ func (p *DynamicPolicy) setExtraControlKnobByConfigs(_ *coreconfig.Configuration } var resourcesMachineState state.NUMANodeResourcesMap - resourcesMachineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesMachineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) return @@ -492,7 +493,8 @@ func (p *DynamicPolicy) clearResidualState(_ *coreconfig.Configuration, } } - resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) return @@ -817,7 +819,8 @@ func (p *DynamicPolicy) syncOOMPriority(conf *coreconfig.Configuration, } var resourcesMachineState state.NUMANodeResourcesMap - resourcesMachineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), 
podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesMachineState, err = state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) return diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers.go index 15b7cc2fc5..3332d6773c 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers.go @@ -24,6 +24,7 @@ import ( v1 "k8s.io/api/core/v1" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + "k8s.io/kubernetes/pkg/apis/core/v1/helper" apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" @@ -138,22 +139,35 @@ func (p *DynamicPolicy) numaBindingHintHandler(_ context.Context, }) } - podAggregatedRequest, _, err := util.GetPodAggregatedRequestResource(req) + requestedResources, _, err := util.GetPodAggregatedRequestResourceMap(req) if err != nil { - return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) + return nil, fmt.Errorf("get pod aggregated request map failed with error %v", err) } resourcesMachineState := p.state.GetMachineState() var hints map[string]*pluginapi.ListOfTopologyHints - allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName) - if allocationInfo != nil { - if allocationInfo.NumaAllocationResult.Size() != 1 { - general.Errorf("pod: %s/%s, container: %s is share cores with numa binding, but its numa set length is %d", - req.PodNamespace, req.PodName, req.ContainerName, allocationInfo.NumaAllocationResult.Size()) - return nil, fmt.Errorf("invalid numa set size") + 
resourceAllocationInfo := p.state.GetResourceAllocationInfo(req.PodUid, req.ContainerName) + if len(resourceAllocationInfo) != 0 { + // Check if there is scale up for hugepages + for resName, allocationInfo := range resourceAllocationInfo { + if allocationInfo == nil { + continue + } + + if _, ok := requestedResources[resName]; !ok || !helper.IsHugePageResourceName(resName) { + continue + } + + // Scale up for hugepages is not supported + if allocationInfo.AggregatedQuantity < uint64(requestedResources[resName]) { + general.Errorf("memory's already allocated with smaller quantity(%d) than requested(%d) for pod(%s_%s(%s))", + allocationInfo.AggregatedQuantity, requestedResources[resName], allocationInfo.PodName, allocationInfo.PodNamespace, allocationInfo.PodUid) + return nil, fmt.Errorf("resource %v already allocated with smaller quantity(%v < %v)", resName, allocationInfo.AggregatedQuantity, + requestedResources[resName]) + } + } - hints = regenerateHints(allocationInfo, util.PodInplaceUpdateResizing(req)) + hints = regenerateHints(resourceAllocationInfo, util.PodInplaceUpdateResizing(req), req, requestedResources) // clear the current container and regenerate machine state in follow cases: // 1. regenerateHints failed. 
@@ -168,38 +182,22 @@ func (p *DynamicPolicy) numaBindingHintHandler(_ context.Context, return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) } } - - if allocationInfo.NumaAllocationResult.Size() != 1 { - general.Errorf("pod: %s/%s, container: %s is snb, but its numa size is %d", - req.PodNamespace, req.PodName, req.ContainerName, allocationInfo.NumaAllocationResult.Size()) - return nil, fmt.Errorf("invalid hints for inplace update pod") + } else { + // if hints exists in extra state-file, prefer to use them + totalAvailableNUMAs := p.topology.CPUDetails.NUMANodes() + for resource := range requestedResources { + availableNUMA := resourcesMachineState[resource].GetNUMANodesWithoutSharedOrDedicatedNUMABindingPods() + totalAvailableNUMAs = totalAvailableNUMAs.Intersection(availableNUMA) } - machineMemoryState := resourcesMachineState[v1.ResourceMemory] - nodeID := allocationInfo.NumaAllocationResult.ToSliceInt()[0] - nodeMemoryState := machineMemoryState[nodeID] - - // the main container aggregated quantity involve all container requests of the pod in memory admit. 
- originPodAggregatedRequest := allocationInfo.AggregatedQuantity - general.Infof("pod: %s/%s, main container: %s request to memory inplace update resize (%d->%d)", - req.PodNamespace, req.PodName, req.ContainerName, originPodAggregatedRequest, podAggregatedRequest) - - if uint64(podAggregatedRequest) > nodeMemoryState.Free && uint64(podAggregatedRequest) > originPodAggregatedRequest { // scaling up and no left resource to scale out - general.Infof("pod: %s/%s, container: %s request to memory inplace update resize (%d->%d, diff: %d), but no enough memory(%d)", - req.PodNamespace, req.PodName, req.ContainerName, originPodAggregatedRequest, podAggregatedRequest, uint64(podAggregatedRequest)-originPodAggregatedRequest, nodeMemoryState.Free) - return nil, fmt.Errorf("memory inplace update resize scale out failed with no enough resource") + requestedResourcesList := make([]v1.ResourceName, 0, len(requestedResources)) + for resName := range requestedResources { + requestedResourcesList = append(requestedResourcesList, resName) } - general.Infof("pod: %s/%s, container: %s request inplace update resize, there is enough resource for it in current NUMA", - req.PodNamespace, req.PodName, req.ContainerName) - hints = regenerateHints(allocationInfo, false) - } else { - // if hints exists in extra state-file, prefer to use them - availableNUMAs := resourcesMachineState[v1.ResourceMemory].GetNUMANodesWithoutSharedOrDedicatedNUMABindingPods() - var extraErr error - hints, extraErr = util.GetHintsFromExtraStateFile(req.PodName, string(v1.ResourceMemory), - p.extraStateFileAbsPath, availableNUMAs) + hints, extraErr = util.GetHintsFromExtraStateFile(req.PodName, p.extraStateFileAbsPath, + totalAvailableNUMAs, requestedResourcesList) if extraErr != nil { general.Infof("pod: %s/%s, container: %s GetHintsFromExtraStateFile failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, extraErr) @@ -210,7 +208,7 @@ func (p *DynamicPolicy) numaBindingHintHandler(_ 
context.Context, // otherwise, calculate hint for container without allocated memory var calculateErr error // calculate hint for container without allocated memory - hints, calculateErr = p.calculateHints(uint64(podAggregatedRequest), resourcesMachineState, req) + hints, calculateErr = p.calculateHints(resourcesMachineState, req, requestedResources) if calculateErr != nil { general.Errorf("failed to calculate hints for pod: %s/%s, container: %s, error: %v", req.PodNamespace, req.PodName, req.ContainerName, calculateErr) @@ -249,7 +247,7 @@ func (p *DynamicPolicy) reclaimedCoresWithNUMABindingHintHandler(_ context.Conte allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName) if allocationInfo != nil { - hints = regenerateHints(allocationInfo, util.PodInplaceUpdateResizing(req)) + hints = regenerateSingleResourceHints(allocationInfo, util.PodInplaceUpdateResizing(req), v1.ResourceMemory) if hints == nil { if uint64(podAggregatedRequest) > allocationInfo.AggregatedQuantity { resourcesMachineState, err = p.clearContainerAndRegenerateMachineState(req) @@ -278,7 +276,7 @@ func (p *DynamicPolicy) reclaimedCoresWithNUMABindingHintHandler(_ context.Conte general.Infof("pod: %s/%s, container: %s request memory inplace update resize, there is enough resource for it in current NUMA", req.PodNamespace, req.PodName, req.ContainerName) - hints = regenerateHints(allocationInfo, false) + hints = regenerateSingleResourceHints(allocationInfo, false, v1.ResourceMemory) } } @@ -312,7 +310,8 @@ func (p *DynamicPolicy) clearContainerAndRegenerateMachineState(req *pluginapi.R } var err error - resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetMachineState(), p.state.GetReservedMemory()) + resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), p.state.GetMemoryTopology(), podResourceEntries, + p.state.GetMachineState(), 
p.state.GetReservedMemory(), p.extraResourceNames) if err != nil { general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -330,18 +329,19 @@ func (p *DynamicPolicy) dedicatedCoresWithoutNUMABindingHintHandler(_ context.Co // calculateHints is a helper function to calculate the topology hints // with the given container requests. -func (p *DynamicPolicy) calculateHints(reqInt uint64, +func (p *DynamicPolicy) calculateHints( resourcesMachineState state.NUMANodeResourcesMap, req *pluginapi.ResourceRequest, + requestedResources map[v1.ResourceName]int, ) (map[string]*pluginapi.ListOfTopologyHints, error) { - machineState := resourcesMachineState[v1.ResourceMemory] + mainMachineState := resourcesMachineState[v1.ResourceMemory] - if len(machineState) == 0 { - return nil, fmt.Errorf("calculateHints with empty machineState") + if len(mainMachineState) == 0 { + return nil, fmt.Errorf("calculateHints with empty memory machine state") } - numaNodes := make([]int, 0, len(machineState)) - for numaNode := range machineState { + numaNodes := make([]int, 0, len(mainMachineState)) + for numaNode := range mainMachineState { numaNodes = append(numaNodes, numaNode) } @@ -354,50 +354,85 @@ func (p *DynamicPolicy) calculateHints(reqInt uint64, sort.Ints(numaNodes) } - bytesPerNUMA, err := machineState.BytesPerNUMA() - if err != nil { - return nil, fmt.Errorf("getBytesPerNUMAFromMachineState failed with error: %v", err) - } - - minNUMAsCountNeeded, _, err := util.GetNUMANodesCountToFitMemoryReq(reqInt, bytesPerNUMA, len(machineState)) - if err != nil { - return nil, fmt.Errorf("GetNUMANodesCountToFitMemoryReq failed with error: %v", err) - } reqAnnotations := req.Annotations + sharedCores := qosutil.AnnotationsIndicateSharedCores(reqAnnotations) numaBinding := qosutil.AnnotationsIndicateNUMABinding(reqAnnotations) numaExclusive := 
qosutil.AnnotationsIndicateNUMAExclusive(reqAnnotations) - - // because it's hard to control memory allocation accurately, - // we only support numa_binding but not exclusive container with request smaller than 1 NUMA - if numaBinding && !numaExclusive && minNUMAsCountNeeded > 1 { - return nil, fmt.Errorf("NUMA not exclusive binding container has request larger than 1 NUMA") - } + distributeEvenlyAcrossNuma := qosutil.AnnotationsIndicateDistributeEvenlyAcrossNuma(reqAnnotations) numaPerSocket, err := p.topology.NUMAsPerSocket() if err != nil { return nil, fmt.Errorf("NUMAsPerSocket failed with error: %v", err) } - numaToFreeMemoryBytes := make(map[int]uint64, len(numaNodes)) + // minAffinitySize is the smallest number of NUMA nodes needed for a hint. + // Initialize minAffinitySize to include all NUMA cells. + minAffinitySize := len(numaNodes) - for _, nodeID := range numaNodes { - if machineState[nodeID] == nil { - general.Warningf("NUMA: %d has nil state", nodeID) - numaToFreeMemoryBytes[nodeID] = 0 - continue + // minNUMAsCountNeeded is the minimum number of NUMA nodes needed to fulfill the resource requests given the requested quantity + // and the number of bytes in one NUMA node. + // Initialize minNUMAsCountNeeded to include all NUMA cells. 
+ minNUMAsCountNeeded := len(numaNodes) + + freeMemoryByResourceAndNUMA := make(map[v1.ResourceName]map[int]uint64, len(requestedResources)) + + for resourceName, requestedSize := range requestedResources { + // calculate all the free memory for each resource and each numa node + machineState := resourcesMachineState[resourceName] + if len(machineState) == 0 { + return nil, fmt.Errorf("calculateHints with empty mainMachineState for resource %s", resourceName) } - if numaExclusive && machineState[nodeID].Allocated > 0 { - numaToFreeMemoryBytes[nodeID] = 0 - general.Warningf("numa_exclusive container skip NUMA: %d allocated: %d", - nodeID, machineState[nodeID].Allocated) - } else { - numaToFreeMemoryBytes[nodeID] = machineState[nodeID].Free + bytesPerNUMA, err := machineState.BytesPerNUMA() + if err != nil { + return nil, fmt.Errorf("BytesPerNUMA from machine state failed with error: %v", err) + } + + currMinNUMAsCount, _, err := util.GetNUMANodesCountToFitMemoryReq(uint64(requestedSize), bytesPerNUMA, len(machineState)) + if err != nil { + return nil, fmt.Errorf("GetNUMANodesCountToFitMemoryReq failed with error: %v", err) + } + + if currMinNUMAsCount < minNUMAsCountNeeded { + minNUMAsCountNeeded = currMinNUMAsCount + } + + // because it's hard to control memory allocation accurately, + // we only support numa_binding but not exclusive container with request smaller than 1 NUMA + // pods with distribute evenly across numa annotation can occupy more than 1 NUMA + if numaBinding && !numaExclusive && !distributeEvenlyAcrossNuma && minNUMAsCountNeeded > 1 { + return nil, fmt.Errorf("NUMA not exclusive binding container with no distribute_evenly_across_numa" + + " has request larger than 1 NUMA") + } + + if numaExclusive && distributeEvenlyAcrossNuma { + return nil, fmt.Errorf("NUMA exclusive and distribute_evenly_across_numa is not supported at the same time") + } + + if sharedCores && numaBinding && distributeEvenlyAcrossNuma { + return nil, fmt.Errorf("shared cores 
with numa binding and distribute_evenly_across_numa is not supported at the same time") + } + + freeMemoryByResourceAndNUMA[resourceName] = make(map[int]uint64, len(numaNodes)) + for _, nodeID := range numaNodes { + if machineState[nodeID] == nil { + general.Warningf("NUMA: %d has nil state for resource %s", nodeID, resourceName) + freeMemoryByResourceAndNUMA[resourceName][nodeID] = 0 + continue + } + + if numaExclusive && machineState[nodeID].Allocated > 0 { + freeMemoryByResourceAndNUMA[resourceName][nodeID] = 0 + general.Warningf("numa_exclusive container skip NUMA: %d allocated: %d for resource: %s", + nodeID, machineState[nodeID].Allocated, resourceName) + } else { + freeMemoryByResourceAndNUMA[resourceName][nodeID] = machineState[nodeID].Free + } } } - general.Infof("calculate hints with req: %d, numaToFreeMemoryBytes: %+v", - reqInt, numaToFreeMemoryBytes) + general.Infof("calculate hints with requested resources: %+v, freeMemoryByResourceAndNUMA: %+v", + requestedResources, freeMemoryByResourceAndNUMA) numaBound := len(numaNodes) if numaBound > machine.LargeNUMAsPoint { @@ -405,64 +440,137 @@ func (p *DynamicPolicy) calculateHints(reqInt uint64, numaBound = minNUMAsCountNeeded + 1 } - var availableNumaHints []*pluginapi.TopologyHint + availableNumaHints := make(map[string]*pluginapi.ListOfTopologyHints) machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) { maskCount := mask.Count() if maskCount < minNUMAsCountNeeded { return - } else if numaBinding && !numaExclusive && maskCount > 1 { + } else if numaBinding && !numaExclusive && !distributeEvenlyAcrossNuma && maskCount > 1 { // because it's hard to control memory allocation accurately, // we only support numa_binding but not exclusive container with request smaller than 1 NUMA + // pods with distribute evenly across numa annotation can occupy more than 1 NUMA return } maskBits := mask.GetBits() numaCountNeeded := mask.Count() - var freeBytesInMask uint64 = 0 + totalFreeSize := 
map[v1.ResourceName]uint64{} + totalAllocatableSize := map[v1.ResourceName]uint64{} + for _, nodeID := range maskBits { - freeBytesInMask += numaToFreeMemoryBytes[nodeID] + for resourceName := range requestedResources { + machineState := resourcesMachineState[resourceName] + + if _, ok := totalFreeSize[resourceName]; !ok { + totalFreeSize[resourceName] = 0 + } + totalFreeSize[resourceName] += machineState[nodeID].Free + + if _, ok := totalAllocatableSize[resourceName]; !ok { + totalAllocatableSize[resourceName] = 0 + } + totalAllocatableSize[resourceName] += machineState[nodeID].Allocatable + } } - if freeBytesInMask < reqInt { - return + for resourceName, requestedSize := range requestedResources { + // verify that for all memory types the node mask has enough allocatable resources + if totalAllocatableSize[resourceName] < uint64(requestedSize) { + return + } } - crossSockets, err := machine.CheckNUMACrossSockets(maskBits, p.topology) - if err != nil { - return - } else if numaCountNeeded <= numaPerSocket && crossSockets { - return + // set the minimum amount of NUMA nodes that can satisfy the container resources requests + if mask.Count() < minAffinitySize { + minAffinitySize = mask.Count() } - availableNumaHints = append(availableNumaHints, &pluginapi.TopologyHint{ - Nodes: machine.MaskToUInt64Array(mask), - Preferred: len(maskBits) == minNUMAsCountNeeded, - }) + // Start generating hints for each memory resource type + for resourceName, requestedSize := range requestedResources { + var freeBytesInMask uint64 = 0 + for _, nodeID := range maskBits { + freeBytesInMask += freeMemoryByResourceAndNUMA[resourceName][nodeID] + } + + if freeBytesInMask < uint64(requestedSize) { + return + } + + crossSockets, err := machine.CheckNUMACrossSockets(maskBits, p.topology) + if err != nil { + return + } else if numaCountNeeded <= numaPerSocket && crossSockets { + return + } + + // check if the mask can be evenly allocated + if distributeEvenlyAcrossNuma && maskCount > 1 { + if 
requestedSize%maskCount != 0 { + return + } + + requestedSizePerNode := requestedSize / maskCount + machineState := resourcesMachineState[resourceName] + + // Check if each node has enough free memory + for _, nodeID := range maskBits { + if machineState[nodeID] == nil { + return + } + + if machineState[nodeID].Free < uint64(requestedSizePerNode) { + return + } + } + } + + if _, ok := availableNumaHints[string(resourceName)]; !ok { + availableNumaHints[string(resourceName)] = &pluginapi.ListOfTopologyHints{ + Hints: make([]*pluginapi.TopologyHint, 0), + } + } + + // Append to the slice and assign the result back to the map + availableNumaHints[string(resourceName)].Hints = append(availableNumaHints[string(resourceName)].Hints, &pluginapi.TopologyHint{ + Nodes: machine.MaskToUInt64Array(mask), + Preferred: false, + }) + } }) - // todo support numa_binding without numa_exclusive in the future - if numaBinding && numaExclusive { - err = p.preferAvailableNumaHintsByPreOccupation(req, machineState, availableNumaHints) - if err != nil { - return nil, fmt.Errorf("preferAvailableNumaHintsByPreOccupation failed with error: %v", err) + for resourceName := range requestedResources { + // update hints preferred according to whether the minimal amount of NUMA nodes are used. + hints := availableNumaHints[string(resourceName)].Hints + for _, hint := range hints { + hint.Preferred = p.isHintPreferred(hint.Nodes, minAffinitySize) } - } - // NOTE: because grpc is inability to distinguish between an empty array and nil, - // we return an error instead of an empty array. - // we should resolve this issue if we need manage multi resource in one plugin. 
- if len(availableNumaHints) == 0 { - general.Warningf("calculateHints got no available memory hints for pod: %s/%s, container: %s", - req.PodNamespace, req.PodName, req.ContainerName) - return nil, errNoAvailableMemoryHints + machineState := resourcesMachineState[resourceName] + + // todo support numa_binding without numa_exclusive in the future + if numaBinding && numaExclusive { + err = p.preferAvailableNumaHintsByPreOccupation(req, machineState, hints) + if err != nil { + return nil, fmt.Errorf("preferAvailableNumaHintsByPreOccupation failed with error: %v", err) + } + } + + // NOTE: because grpc is inability to distinguish between an empty array and nil, + // we return an error instead of an empty array. + // we should resolve this issue if we need manage multi resource in one plugin. + if len(hints) == 0 { + general.Warningf("calculateHints got no available memory hints for resource: %s, pod: %s/%s, container: %s", + resourceName, req.PodNamespace, req.PodName, req.ContainerName) + return nil, errNoAvailableMemoryHints + } } - return map[string]*pluginapi.ListOfTopologyHints{ - string(v1.ResourceMemory): { - Hints: availableNumaHints, - }, - }, nil + return availableNumaHints, nil +} + +func (p *DynamicPolicy) isHintPreferred(maskBits []uint64, minAffinitySize int) bool { + return len(maskBits) == minAffinitySize } // calculateHints is a helper function to calculate the topology hints @@ -653,10 +761,50 @@ func (p *DynamicPolicy) filterNUMANodesByNonBindingReclaimedRequestedQuantity(no return filteredNUMANodes } -// regenerateHints regenerates hints for container that'd already been allocated memory, +// regenerateHints regenerates hints for all resource requests for a container that'd already been allocated memory, // and regenerateHints will assemble hints based on already-existed AllocationInfo, // without any calculation logics at all -func regenerateHints(allocationInfo *state.AllocationInfo, regenerate bool) map[string]*pluginapi.ListOfTopologyHints { 
+func regenerateHints(allAllocationInfo map[v1.ResourceName]*state.AllocationInfo, regenerate bool, + req *pluginapi.ResourceRequest, requestedResources map[v1.ResourceName]int, +) map[string]*pluginapi.ListOfTopologyHints { + hints := map[string]*pluginapi.ListOfTopologyHints{} + + if regenerate { + general.ErrorS(nil, "need to regenerate hints", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "podUID", req.PodUid, "containerName", req.ContainerName) + return nil + } + + if len(allAllocationInfo) != len(requestedResources) { + general.Errorf("number of requested resources by the container differs from state resources, podName: %v, containerName: %v", + req.PodName, req.ContainerName) + return nil + } + + for resourceName, allocInfo := range allAllocationInfo { + if allocInfo == nil { + continue + } + + singleResourceHints := regenerateSingleResourceHints(allocInfo, false, resourceName) + if singleResourceHints == nil { + continue + } + + hints[string(resourceName)] = singleResourceHints[string(resourceName)] + } + + return hints +} + +// regenerateSingleResourceHints regenerates hints for a single resource for a container that has already been allocated memory, +// and regenerateSingleResourceHints will assemble hints based on already-existed AllocationInfo, +// without any calculation logics at all +func regenerateSingleResourceHints(allocationInfo *state.AllocationInfo, regenerate bool, + resourceName v1.ResourceName, +) map[string]*pluginapi.ListOfTopologyHints { hints := map[string]*pluginapi.ListOfTopologyHints{} if regenerate { @@ -673,8 +821,10 @@ func regenerateHints(allocationInfo *state.AllocationInfo, regenerate bool) map[ "podNamespace", allocationInfo.PodNamespace, "podName", allocationInfo.PodName, "containerName", allocationInfo.ContainerName, - "hint", allocatedNumaNodes) - hints[string(v1.ResourceMemory)] = &pluginapi.ListOfTopologyHints{ + "hint", allocatedNumaNodes, + "resourceName", resourceName) + + hints[string(resourceName)] 
= &pluginapi.ListOfTopologyHints{ Hints: []*pluginapi.TopologyHint{ { Nodes: allocatedNumaNodes, @@ -682,6 +832,7 @@ func regenerateHints(allocationInfo *state.AllocationInfo, regenerate bool) map[ }, }, } + return hints } diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers_test.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers_test.go new file mode 100644 index 0000000000..8c1f857483 --- /dev/null +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_hint_handlers_test.go @@ -0,0 +1,630 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package dynamicpolicy + +import ( + "context" + "os" + "testing" + + info "github.com/google/cadvisor/info/v1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + v1 "k8s.io/api/core/v1" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestDynamicPolicy_numaBindingHintHandler(t *testing.T) { + t.Parallel() + + type args struct { + req *pluginapi.ResourceRequest + } + tests := []struct { + name string + args args + wantErr bool + want *pluginapi.ResourceHintsResponse + }{ + { + name: "test for sidecar container", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_SIDECAR, + }, + }, + wantErr: false, + want: &pluginapi.ResourceHintsResponse{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_SIDECAR, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceMemory): nil, + }, + }, + }, + { + name: "test for dedicated cores with numa binding without numa exclusive main container", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1024 * 1024 * 1024, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + }, + }, + wantErr: false, + want: &pluginapi.ResourceHintsResponse{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: 
pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceMemory): { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + { + Nodes: []uint64{1}, + Preferred: true, + }, + { + Nodes: []uint64{2}, + Preferred: true, + }, + { + Nodes: []uint64{3}, + Preferred: true, + }, + }, + }, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + }, + }, + { + name: "test for dedicated cores with numa binding with distribute evenly across numa", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 160 * 1024 * 1024 * 1024, // 160Gi + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + }, + }, + wantErr: false, + want: &pluginapi.ResourceHintsResponse{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceMemory): { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + { + Nodes: []uint64{2, 3}, + Preferred: true, + }, + { + Nodes: []uint64{0, 1, 2, 3}, + Preferred: false, + }, + }, + }, + }, + Annotations: map[string]string{ + 
consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + }, + }, + { + name: "shared cores with numa binding and distribute evenly across numa will return error", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 160 * 1024 * 1024 * 1024, // 160Gi + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + }, + }, + wantErr: true, + want: nil, + }, + { + name: "test for hugepages-2Mi dedicated cores with numa binding without numa exclusive main container", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + "hugepages-2Mi": 2 * 1024 * 1024 * 1024, // 2Gi + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + }, + }, + wantErr: false, + want: &pluginapi.ResourceHintsResponse{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: 
string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + "hugepages-2Mi": { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + { + Nodes: []uint64{1}, + Preferred: true, + }, + { + Nodes: []uint64{2}, + Preferred: true, + }, + { + Nodes: []uint64{3}, + Preferred: true, + }, + }, + }, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + }, + }, + { + name: "test for hugepages-2Mi dedicated cores with numa binding with numa exclusive main container", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + "hugepages-2Mi": 4 * 1024 * 1024 * 1024, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + }, + }, + }, + wantErr: false, + want: &pluginapi.ResourceHintsResponse{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + "hugepages-2Mi": { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + { + Nodes: []uint64{2, 3}, + Preferred: true, + }, + { + Nodes: []uint64{0, 1, 2}, + Preferred: false, + }, + { + Nodes: []uint64{0, 1, 3}, + Preferred: false, + }, + { + Nodes: []uint64{0, 2, 3}, + Preferred: false, + }, + { + Nodes: []uint64{1, 2, 3}, + 
Preferred: false, + }, + { + Nodes: []uint64{0, 1, 2, 3}, + Preferred: false, + }, + }, + }, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + }, + }, + }, + { + name: "test for hugepages-2Mi dedicated cores without numa exclusive with distribute evenly across numa", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + "hugepages-2Mi": 4 * 1024 * 1024 * 1024, // 4Gi can be split into 2 or 4 numa nodes + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + }, + }, + wantErr: false, + want: &pluginapi.ResourceHintsResponse{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + "hugepages-2Mi": { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + { + Nodes: []uint64{2, 3}, + Preferred: true, + }, + { + Nodes: []uint64{0, 1, 2, 3}, + Preferred: false, + }, + }, + }, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: 
consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + }, + }, + { + name: "not enough memory for hugepages-2Mi returns error", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + "hugepages-2Mi": 1000 * 1024 * 1024 * 1024, // 1000Gi + }, + }, + }, + wantErr: true, + }, + { + name: "test for hugepages-1Gi dedicated cores without numa exclusive with distribute evenly across numa", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + "hugepages-1Gi": 16 * 1024 * 1024 * 1024, // 16Gi can fit onto 2 or 4 NUMA nodes + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + }, + }, + wantErr: false, + want: &pluginapi.ResourceHintsResponse{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + "hugepages-1Gi": { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + { + Nodes: []uint64{2, 3}, + Preferred: true, + }, + { + Nodes: []uint64{0, 1, 2, 3}, + Preferred: false, + }, + }, + }, + }, + Annotations: map[string]string{ + 
consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + }, + }, + { + name: "distribute evenly across numa and numa exclusive not supported", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1024 * 1024 * 1024, + "hugepages-2Mi": 2 * 1024 * 1024 * 1024, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + }, + }, + wantErr: true, + }, + { + name: "get topology hints for both memory and hugepages-2Mi", + args: args{ + req: &pluginapi.ResourceRequest{ + PodUid: "pod1_uid", + PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1024 * 1024 * 1024, + "hugepages-2Mi": 2 * 1024 * 1024 * 1024, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + }, + }, + wantErr: false, + want: &pluginapi.ResourceHintsResponse{ + PodUid: "pod1_uid", + 
PodName: "pod1", + ContainerName: "container1", + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceMemory): { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + { + Nodes: []uint64{1}, + Preferred: true, + }, + { + Nodes: []uint64{2}, + Preferred: true, + }, + { + Nodes: []uint64{3}, + Preferred: true, + }, + }, + }, + "hugepages-2Mi": { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + { + Nodes: []uint64{1}, + Preferred: true, + }, + { + Nodes: []uint64{2}, + Preferred: true, + }, + { + Nodes: []uint64{3}, + Preferred: true, + }, + }, + }, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + }, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + tmpDir, err := os.MkdirTemp("", "checkpoint-TestNumaBindingHintHandler") + require.NoError(t, err) + defer os.RemoveAll(tmpDir) + + cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) + assert.NoError(t, err) + machineInfo := &info.MachineInfo{ + Topology: []info.Node{ + { + Id: 0, + Memory: 100 * 1024 * 1024 * 1024, // 100 GB + HugePages: []info.HugePagesInfo{ + { + PageSize: 2 * 1024, // 2Mi + NumPages: 1024, + }, + { + PageSize: 1 * 1024 * 1024, // 1Gi + NumPages: 8, + }, + }, + }, + { + Id: 1, + Memory: 100 * 1024 * 1024 * 1024, + HugePages: []info.HugePagesInfo{ + { + PageSize: 2 * 1024, // 2Mi + NumPages: 1024, + }, + { + PageSize: 1 * 1024 * 1024, // 1Gi + NumPages: 8, + }, + }, + }, + { + Id: 2, + Memory: 100 * 1024 * 1024 * 1024, + HugePages: []info.HugePagesInfo{ + { + PageSize: 2 * 1024, // 2Mi + NumPages: 1024, + }, + { + PageSize: 1 * 1024 * 1024, // 1Gi + NumPages: 8, + }, + }, 
+ }, + { + Id: 3, + Memory: 100 * 1024 * 1024 * 1024, + HugePages: []info.HugePagesInfo{ + { + PageSize: 2 * 1024, // 2Mi + NumPages: 1024, + }, + { + PageSize: 1 * 1024 * 1024, // 1Gi + NumPages: 8, + }, + }, + }, + }, + } + + policy, err := getTestDynamicPolicyWithExtraResourcesWithInitialization(cpuTopology, machineInfo, tmpDir) + assert.NoError(t, err) + + got, err := policy.numaBindingHintHandler(context.Background(), tt.args.req) + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.want, got) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_test.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_test.go index a0cd7c136a..00d75a4534 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_test.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy_test.go @@ -111,16 +111,103 @@ var fakeConf = &config.Configuration{ }, } +var fakeConfWithExtraResources = &config.Configuration{ + AgentConfiguration: &configagent.AgentConfiguration{ + GenericAgentConfiguration: &configagent.GenericAgentConfiguration{ + GenericQRMPluginConfiguration: &qrmconfig.GenericQRMPluginConfiguration{ + UseKubeletReservedConfig: false, + }, + }, + StaticAgentConfiguration: &configagent.StaticAgentConfiguration{ + QRMPluginsConfiguration: &qrmconfig.QRMPluginsConfiguration{ + MemoryQRMPluginConfig: &qrmconfig.MemoryQRMPluginConfig{ + ReservedMemoryGB: 4, + ExtraMemoryResources: []string{"hugepages-2Mi", "hugepages-1Gi"}, + }, + }, + }, + }, +} + func getTestDynamicPolicyWithInitialization( topology *machine.CPUTopology, machineInfo *info.MachineInfo, stateFileDirectory string, ) (*DynamicPolicy, error) { - reservedMemory, err := getReservedMemory(fakeConf, &metaserver.MetaServer{}, machineInfo) + resourcesReservedMemory, err := getResourcesReservedMemory(fakeConf, &metaserver.MetaServer{}, machineInfo, + []string{string(v1.ResourceMemory)}) if err != nil { return nil, err } - resourcesReservedMemory := 
map[v1.ResourceName]map[int]uint64{ - v1.ResourceMemory: reservedMemory, + qosConfig := generic.NewQoSConfiguration() + qosConfig.SetExpandQoSLevelSelector(consts.PodAnnotationQoSLevelSharedCores, map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }) + qosConfig.SetExpandQoSLevelSelector(consts.PodAnnotationQoSLevelDedicatedCores, map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }) + qosConfig.SetExpandQoSLevelSelector(consts.PodAnnotationQoSLevelReclaimedCores, map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelReclaimedCores, + }) + + stateDirectoryConfig := &statedirectory.StateDirectoryConfiguration{ + StateFileDirectory: stateFileDirectory, + } + stateImpl, err := state.NewCheckpointState(stateDirectoryConfig, memoryPluginStateFileName, + memconsts.MemoryResourcePluginPolicyNameDynamic, topology, machineInfo, nil, resourcesReservedMemory, false, + metrics.DummyMetrics{}, fakeConf.ExtraMemoryResources) + if err != nil { + return nil, fmt.Errorf("NewCheckpointState failed with error: %v", err) + } + + policyImplement := &DynamicPolicy{ + topology: topology, + dynamicConf: dynamic.NewDynamicAgentConfiguration(), + featureGateManager: featuregatenegotiation.NewFeatureGateManager(config.NewConfiguration()), + qosConfig: qosConfig, + state: stateImpl, + emitter: metrics.DummyMetrics{}, + migratingMemory: make(map[string]map[string]bool), + stopCh: make(chan struct{}), + podDebugAnnoKeys: []string{podDebugAnnoKey}, + enableReclaimNUMABinding: true, + enableNonBindingShareCoresMemoryResourceCheck: true, + numaBindResultResourceAllocationAnnotationKey: coreconsts.QRMResourceAnnotationKeyNUMABindResult, + extraResourceNames: fakeConf.ExtraMemoryResources, + } + + policyImplement.allocationHandlers = map[string]util.AllocationHandler{ + consts.PodAnnotationQoSLevelSharedCores: policyImplement.sharedCoresAllocationHandler, + 
consts.PodAnnotationQoSLevelDedicatedCores: policyImplement.dedicatedCoresAllocationHandler, + consts.PodAnnotationQoSLevelReclaimedCores: policyImplement.reclaimedCoresAllocationHandler, + consts.PodAnnotationQoSLevelSystemCores: policyImplement.systemCoresAllocationHandler, + } + + policyImplement.hintHandlers = map[string]util.HintHandler{ + consts.PodAnnotationQoSLevelSharedCores: policyImplement.sharedCoresHintHandler, + consts.PodAnnotationQoSLevelDedicatedCores: policyImplement.dedicatedCoresHintHandler, + consts.PodAnnotationQoSLevelReclaimedCores: policyImplement.reclaimedCoresHintHandler, + consts.PodAnnotationQoSLevelSystemCores: policyImplement.systemCoresHintHandler, + } + + policyImplement.asyncWorkers = asyncworker.NewAsyncWorkers(memoryPluginAsyncWorkersName, policyImplement.emitter) + + policyImplement.defaultAsyncLimitedWorkers = asyncworker.NewAsyncLimitedWorkers(memoryPluginAsyncWorkersName, defaultAsyncWorkLimit, policyImplement.emitter) + policyImplement.asyncLimitedWorkersMap = map[string]*asyncworker.AsyncLimitedWorkers{ + memoryPluginAsyncWorkTopicMovePage: asyncworker.NewAsyncLimitedWorkers(memoryPluginAsyncWorkTopicMovePage, movePagesWorkLimit, policyImplement.emitter), + } + + policyImplement.numaAllocationReactor = reactor.DummyAllocationReactor{} + + return policyImplement, nil +} + +func getTestDynamicPolicyWithExtraResourcesWithInitialization( + topology *machine.CPUTopology, machineInfo *info.MachineInfo, stateFileDirectory string, +) (*DynamicPolicy, error) { + resourcesReservedMemory, err := getResourcesReservedMemory(fakeConfWithExtraResources, &metaserver.MetaServer{}, machineInfo, + []string{string(v1.ResourceMemory)}) + if err != nil { + return nil, err } qosConfig := generic.NewQoSConfiguration() @@ -138,7 +225,8 @@ func getTestDynamicPolicyWithInitialization( StateFileDirectory: stateFileDirectory, } stateImpl, err := state.NewCheckpointState(stateDirectoryConfig, memoryPluginStateFileName, - 
memconsts.MemoryResourcePluginPolicyNameDynamic, topology, machineInfo, resourcesReservedMemory, false, metrics.DummyMetrics{}) + memconsts.MemoryResourcePluginPolicyNameDynamic, topology, machineInfo, nil, resourcesReservedMemory, false, + metrics.DummyMetrics{}, fakeConfWithExtraResources.ExtraMemoryResources) if err != nil { return nil, fmt.Errorf("NewCheckpointState failed with error: %v", err) } @@ -156,6 +244,7 @@ func getTestDynamicPolicyWithInitialization( enableReclaimNUMABinding: true, enableNonBindingShareCoresMemoryResourceCheck: true, numaBindResultResourceAllocationAnnotationKey: coreconsts.QRMResourceAnnotationKeyNUMABindResult, + extraResourceNames: fakeConfWithExtraResources.ExtraMemoryResources, } policyImplement.allocationHandlers = map[string]util.AllocationHandler{ @@ -471,6 +560,7 @@ func TestAllocate(t *testing.T) { req *pluginapi.ResourceRequest expectedResp *pluginapi.ResourceAllocationResponse enhancementDefaultValues map[string]string + expectedAllocationInfos map[v1.ResourceName]*state.AllocationInfo }{ { name: "req for init container", @@ -940,6 +1030,66 @@ func TestAllocate(t *testing.T) { }, }, }, + { + name: "req for shared cores main container for memory and hugepages", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 2147483648, + "hugepages-2Mi": 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + }, + expectedResp: &pluginapi.ResourceAllocationResponse{ + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, 
+ ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + string(v1.ResourceMemory): { + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0, 1, 2, 3).String(), + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{nil}, + }, + }, + "hugepages-2Mi": { + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0, 1, 2, 3).String(), + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{nil}, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + }, + }, { name: "req for reclaim_cores with actual numa_binding main container", req: &pluginapi.ResourceRequest{ @@ -1055,125 +1205,8 @@ func TestAllocate(t *testing.T) { }, }, }, - } - - for _, tc := range testCases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - - as := require.New(t) - tmpDir, err := ioutil.TempDir("", "checkpoint-TestAllocate") - as.Nil(err) - - dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) - as.Nil(err) - - if tc.enhancementDefaultValues != nil { - dynamicPolicy.qosConfig.QoSEnhancementDefaultValues = tc.enhancementDefaultValues - } - - dynamicPolicy.enableMemoryAdvisor = true - dynamicPolicy.advisorClient = advisorsvc.NewStubAdvisorServiceClient() - - resp, err := dynamicPolicy.Allocate(context.Background(), tc.req) - as.Nil(err) - - tc.expectedResp.PodUid = tc.req.PodUid - as.Equalf(tc.expectedResp, resp, "failed in test case: %s", 
tc.name) - - os.RemoveAll(tmpDir) - }) - } -} - -func TestAllocateForPod(t *testing.T) { - t.Parallel() - - as := require.New(t) - cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) - as.Nil(err) - - machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) - as.Nil(err) - - testName := "test" - - tmpDir, err := ioutil.TempDir("", "checkpoint-TestAllocateForPod") - as.Nil(err) - - dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) - as.Nil(err) - - req := &pluginapi.PodResourceRequest{ - PodUid: string(uuid.NewUUID()), - PodNamespace: testName, - PodName: testName, - ResourceName: string(v1.ResourceMemory), - ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 1073741824, - }, - } - - _, err = dynamicPolicy.AllocateForPod(context.Background(), req) - as.NotNil(err) - os.RemoveAll(tmpDir) -} - -func TestGetPodTopologyHints(t *testing.T) { - t.Parallel() - - as := require.New(t) - cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) - as.Nil(err) - - machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) - as.Nil(err) - - testName := "test" - - tmpDir, err := ioutil.TempDir("", "checkpoint-TestGetPodTopologyHints") - as.Nil(err) - - dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) - as.Nil(err) - - req := &pluginapi.PodResourceRequest{ - PodUid: string(uuid.NewUUID()), - PodNamespace: testName, - PodName: testName, - ResourceName: string(v1.ResourceMemory), - ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 1073741824, - }, - } - - _, err = dynamicPolicy.GetPodTopologyHints(context.Background(), req) - as.NotNil(err) - os.RemoveAll(tmpDir) -} - -func TestGetTopologyHints(t *testing.T) { - t.Parallel() - - as := require.New(t) - cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) - as.Nil(err) - - machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) - as.Nil(err) - - testName := "test" - - testCases 
:= []struct { - name string - req *pluginapi.ResourceRequest - expectedResp *pluginapi.ResourceHintsResponse - enhancementDefaultValues map[string]string - numaHeadroom map[int]int64 - }{ { - name: "req for container of debug pod", + name: "req for memory and hugepages for shared cores with numa binding main container", req: &pluginapi.ResourceRequest{ PodUid: string(uuid.NewUUID()), PodNamespace: testName, @@ -1182,8 +1215,369 @@ func TestGetTopologyHints(t *testing.T) { ContainerType: pluginapi.ContainerType_MAIN, ContainerIndex: 0, ResourceName: string(v1.ResourceMemory), - ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 1073741824, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 2147483648, + "hugepages-2Mi": 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + }, + expectedResp: &pluginapi.ResourceAllocationResponse{ + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + string(v1.ResourceMemory): { + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0).String(), + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + }, + }, + }, + "hugepages-2Mi": { + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + 
IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0).String(), + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + }, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + }, + }, + { + name: "req for memory and hugepages for dedicated core with numa binding without numa exclusive with distribute evenly across numa", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 2147483648, + "hugepages-2Mi": 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "false", "distribute_evenly_across_numa": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + expectedResp: &pluginapi.ResourceAllocationResponse{ + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + string(v1.ResourceMemory): { + OciPropertyName: 
util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0, 1).String(), + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + }, + }, + }, + "hugepages-2Mi": { + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0, 1).String(), + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + }, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: "false", + }, + }, + expectedAllocationInfos: map[v1.ResourceName]*state.AllocationInfo{ + v1.ResourceMemory: { + AggregatedQuantity: 2147483648, + NumaAllocationResult: machine.NewCPUSet(0, 1), + TopologyAwareAllocations: map[int]uint64{ + 0: 1073741824, // should be distributed evenly across 2 numa nodes + 1: 1073741824, + }, + }, + "hugepages-2Mi": { + AggregatedQuantity: 2147483648, + NumaAllocationResult: machine.NewCPUSet(0, 1), + TopologyAwareAllocations: map[int]uint64{ + 0: 1073741824, // should be distributed evenly across 2 numa nodes + 1: 1073741824, + }, + }, + }, + }, + { + name: "test for dedicated cores with numa binding without numa exclusive for only hugepages", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, 
+ PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + "hugepages-2Mi": 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + expectedResp: &pluginapi.ResourceAllocationResponse{ + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + "hugepages-2Mi": { + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0).String(), + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + }, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + }, + }, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + as := require.New(t) + tmpDir, err := ioutil.TempDir("", "checkpoint-TestAllocate") + as.Nil(err) + + dynamicPolicy, err := getTestDynamicPolicyWithExtraResourcesWithInitialization(cpuTopology, 
machineInfo, tmpDir) + as.Nil(err) + + if tc.enhancementDefaultValues != nil { + dynamicPolicy.qosConfig.QoSEnhancementDefaultValues = tc.enhancementDefaultValues + } + + dynamicPolicy.enableMemoryAdvisor = true + dynamicPolicy.advisorClient = advisorsvc.NewStubAdvisorServiceClient() + + resp, err := dynamicPolicy.Allocate(context.Background(), tc.req) + as.Nil(err) + + tc.expectedResp.PodUid = tc.req.PodUid + as.Equalf(tc.expectedResp, resp, "failed in test case: %s", tc.name) + + if tc.expectedAllocationInfos != nil { + for resourceName, expectedAllocationInfo := range tc.expectedAllocationInfos { + actualAllocationInfo := dynamicPolicy.state.GetAllocationInfo(resourceName, tc.req.PodUid, tc.req.ContainerName) + as.NotNilf(actualAllocationInfo, "failed in test case: %s", tc.name) + + as.Equalf(expectedAllocationInfo.AggregatedQuantity, actualAllocationInfo.AggregatedQuantity, "failed in test case: %s", tc.name) + as.Equalf(expectedAllocationInfo.NumaAllocationResult, actualAllocationInfo.NumaAllocationResult, "failed in test case: %s", tc.name) + as.Equalf(expectedAllocationInfo.TopologyAwareAllocations, actualAllocationInfo.TopologyAwareAllocations, "failed in test case: %s", tc.name) + + } + } + + os.RemoveAll(tmpDir) + }) + } +} + +func TestAllocateForPod(t *testing.T) { + t.Parallel() + + as := require.New(t) + cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) + as.Nil(err) + + machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) + as.Nil(err) + + testName := "test" + + tmpDir, err := ioutil.TempDir("", "checkpoint-TestAllocateForPod") + as.Nil(err) + + dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) + as.Nil(err) + + req := &pluginapi.PodResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1073741824, + }, + } + + _, err = 
dynamicPolicy.AllocateForPod(context.Background(), req) + as.NotNil(err) + os.RemoveAll(tmpDir) +} + +func TestGetPodTopologyHints(t *testing.T) { + t.Parallel() + + as := require.New(t) + cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) + as.Nil(err) + + machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) + as.Nil(err) + + testName := "test" + + tmpDir, err := ioutil.TempDir("", "checkpoint-TestGetPodTopologyHints") + as.Nil(err) + + dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) + as.Nil(err) + + req := &pluginapi.PodResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1073741824, + }, + } + + _, err = dynamicPolicy.GetPodTopologyHints(context.Background(), req) + as.NotNil(err) + os.RemoveAll(tmpDir) +} + +func TestGetTopologyHints(t *testing.T) { + t.Parallel() + + as := require.New(t) + cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) + as.Nil(err) + + machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) + as.Nil(err) + + testName := "test" + + testCases := []struct { + name string + req *pluginapi.ResourceRequest + expectedResp *pluginapi.ResourceHintsResponse + enhancementDefaultValues map[string]string + numaHeadroom map[int]int64 + }{ + { + name: "req for container of debug pod", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1073741824, }, Annotations: map[string]string{ podDebugAnnoKey: "", @@ -1273,12 +1667,122 @@ func TestGetTopologyHints(t *testing.T) { consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelReclaimedCores, }, 
Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelReclaimedCores, + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelReclaimedCores, + }, + }, + }, + { + name: "req for system_cores main container", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1073741824, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + }, + }, + expectedResp: &pluginapi.ResourceHintsResponse{ + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceMemory): nil, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + }, + }, + }, + { + name: "req for dedicated_cores with numa_binding & numa_exclusive main container", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 10737418240, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": 
"true", "numa_exclusive": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + expectedResp: &pluginapi.ResourceHintsResponse{ + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ + string(v1.ResourceMemory): { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + Preferred: true, + }, + { + Nodes: []uint64{2, 3}, + Preferred: true, + }, + { + Nodes: []uint64{0, 1, 2}, + Preferred: false, + }, + { + Nodes: []uint64{0, 1, 3}, + Preferred: false, + }, + { + Nodes: []uint64{0, 2, 3}, + Preferred: false, + }, + { + Nodes: []uint64{1, 2, 3}, + Preferred: false, + }, + { + Nodes: []uint64{0, 1, 2, 3}, + Preferred: false, + }, + }, + }, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, }, }, }, { - name: "req for system_cores main container", + name: "req for dedicated_cores with numa_binding & not numa_exclusive main container", req: &pluginapi.ResourceRequest{ PodUid: string(uuid.NewUUID()), PodNamespace: testName, @@ -1290,11 +1794,12 @@ func TestGetTopologyHints(t *testing.T) { ResourceRequests: map[string]float64{ string(v1.ResourceMemory): 1073741824, }, - Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, - }, Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + 
consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "false"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, }, }, expectedResp: &pluginapi.ResourceHintsResponse{ @@ -1305,18 +1810,39 @@ func TestGetTopologyHints(t *testing.T) { ContainerIndex: 0, ResourceName: string(v1.ResourceMemory), ResourceHints: map[string]*pluginapi.ListOfTopologyHints{ - string(v1.ResourceMemory): nil, + string(v1.ResourceMemory): { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + { + Nodes: []uint64{1}, + Preferred: true, + }, + { + Nodes: []uint64{2}, + Preferred: true, + }, + { + Nodes: []uint64{3}, + Preferred: true, + }, + }, + }, }, Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, }, Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: "false", }, }, }, { - name: "req for dedicated_cores with numa_binding & numa_exclusive main container", + name: "req for dedicated_cores with numa_binding & default numa_exclusive true main container", req: &pluginapi.ResourceRequest{ PodUid: string(uuid.NewUUID()), PodNamespace: testName, @@ -1330,7 +1856,7 @@ func TestGetTopologyHints(t *testing.T) { }, Annotations: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, - consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + consts.PodAnnotationMemoryEnhancementKey: 
`{"numa_binding": "true"}`, }, Labels: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, @@ -1386,9 +1912,12 @@ func TestGetTopologyHints(t *testing.T) { consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, }, }, + enhancementDefaultValues: map[string]string{ + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + }, }, { - name: "req for dedicated_cores with numa_binding & not numa_exclusive main container", + name: "req for dedicated_cores with numa_binding & without numa_exclusive main container", req: &pluginapi.ResourceRequest{ PodUid: string(uuid.NewUUID()), PodNamespace: testName, @@ -1402,7 +1931,7 @@ func TestGetTopologyHints(t *testing.T) { }, Annotations: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, - consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "false"}`, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true"}`, }, Labels: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, @@ -1441,14 +1970,13 @@ func TestGetTopologyHints(t *testing.T) { consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, }, Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, - consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, - consts.PodAnnotationMemoryEnhancementNumaExclusive: "false", + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, }, }, }, { - name: "req for dedicated_cores with numa_binding & default numa_exclusive true main container", + name: "req for hugepages resource and memory for dedicated 
cores with numa_binding & without numa exclusive main container", req: &pluginapi.ResourceRequest{ PodUid: string(uuid.NewUUID()), PodNamespace: testName, @@ -1458,7 +1986,8 @@ func TestGetTopologyHints(t *testing.T) { ContainerIndex: 0, ResourceName: string(v1.ResourceMemory), ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 10737418240, + string(v1.ResourceMemory): 1073741824, + "hugepages-2Mi": 1073741824, }, Annotations: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, @@ -1479,32 +2008,40 @@ func TestGetTopologyHints(t *testing.T) { string(v1.ResourceMemory): { Hints: []*pluginapi.TopologyHint{ { - Nodes: []uint64{0, 1}, + Nodes: []uint64{0}, Preferred: true, }, { - Nodes: []uint64{2, 3}, + Nodes: []uint64{1}, Preferred: true, }, { - Nodes: []uint64{0, 1, 2}, - Preferred: false, + Nodes: []uint64{2}, + Preferred: true, }, { - Nodes: []uint64{0, 1, 3}, - Preferred: false, + Nodes: []uint64{3}, + Preferred: true, }, + }, + }, + "hugepages-2Mi": { + Hints: []*pluginapi.TopologyHint{ { - Nodes: []uint64{0, 2, 3}, - Preferred: false, + Nodes: []uint64{0}, + Preferred: true, }, { - Nodes: []uint64{1, 2, 3}, - Preferred: false, + Nodes: []uint64{1}, + Preferred: true, }, { - Nodes: []uint64{0, 1, 2, 3}, - Preferred: false, + Nodes: []uint64{2}, + Preferred: true, + }, + { + Nodes: []uint64{3}, + Preferred: true, }, }, }, @@ -1513,17 +2050,13 @@ func TestGetTopologyHints(t *testing.T) { consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, }, Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, - consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, - consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + 
consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, }, }, - enhancementDefaultValues: map[string]string{ - consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, - }, }, { - name: "req for dedicated_cores with numa_binding & without numa_exclusive main container", + name: "req for shared_cores with numa_binding main container", req: &pluginapi.ResourceRequest{ PodUid: string(uuid.NewUUID()), PodNamespace: testName, @@ -1536,11 +2069,11 @@ func TestGetTopologyHints(t *testing.T) { string(v1.ResourceMemory): 1073741824, }, Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true"}`, }, Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, }, }, expectedResp: &pluginapi.ResourceHintsResponse{ @@ -1573,16 +2106,16 @@ func TestGetTopologyHints(t *testing.T) { }, }, Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, }, Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, }, }, }, { - name: "req for shared_cores with numa_binding main container", + name: "req for hugepages and memory for shared cores with numa binding main container", req: &pluginapi.ResourceRequest{ PodUid: string(uuid.NewUUID()), PodNamespace: testName, @@ -1593,6 +2126,7 @@ func TestGetTopologyHints(t 
*testing.T) { ResourceName: string(v1.ResourceMemory), ResourceRequests: map[string]float64{ string(v1.ResourceMemory): 1073741824, + "hugepages-2Mi": 1073741824, }, Annotations: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, @@ -1630,13 +2164,33 @@ func TestGetTopologyHints(t *testing.T) { }, }, }, + "hugepages-2Mi": { + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0}, + Preferred: true, + }, + { + Nodes: []uint64{1}, + Preferred: true, + }, + { + Nodes: []uint64{2}, + Preferred: true, + }, + { + Nodes: []uint64{3}, + Preferred: true, + }, + }, + }, }, Labels: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, }, Annotations: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, - consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, }, }, }, @@ -1715,7 +2269,7 @@ func TestGetTopologyHints(t *testing.T) { tmpDir, err := ioutil.TempDir("", "checkpoint-TestGetTopologyHints") as.Nil(err) - dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) + dynamicPolicy, err := getTestDynamicPolicyWithExtraResourcesWithInitialization(cpuTopology, machineInfo, tmpDir) as.Nil(err) if tc.enhancementDefaultValues != nil { @@ -1752,7 +2306,7 @@ func TestGetTopologyAwareAllocatableResources(t *testing.T) { machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) as.Nil(err) - dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) + dynamicPolicy, err := getTestDynamicPolicyWithExtraResourcesWithInitialization(cpuTopology, machineInfo, tmpDir) as.Nil(err) resp, err := dynamicPolicy.GetTopologyAwareAllocatableResources(context.Background(), &pluginapi.GetTopologyAwareAllocatableResourcesRequest{}) @@ -1778,6 
+2332,42 @@ func TestGetTopologyAwareAllocatableResources(t *testing.T) { AggregatedAllocatableQuantity: 30064771072, AggregatedCapacityQuantity: 34359738368, }, + "hugepages-2Mi": { + IsNodeResource: false, + IsScalarResource: true, + TopologyAwareCapacityQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 2147483648, Node: 0}, + {ResourceValue: 2147483648, Node: 1}, + {ResourceValue: 2147483648, Node: 2}, + {ResourceValue: 2147483648, Node: 3}, + }, + TopologyAwareAllocatableQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 2147483648, Node: 0}, + {ResourceValue: 2147483648, Node: 1}, + {ResourceValue: 2147483648, Node: 2}, + {ResourceValue: 2147483648, Node: 3}, + }, + AggregatedCapacityQuantity: 8589934592, + AggregatedAllocatableQuantity: 8589934592, + }, + "hugepages-1Gi": { + IsNodeResource: false, + IsScalarResource: true, + TopologyAwareCapacityQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 8589934592, Node: 0}, + {ResourceValue: 8589934592, Node: 1}, + {ResourceValue: 8589934592, Node: 2}, + {ResourceValue: 8589934592, Node: 3}, + }, + TopologyAwareAllocatableQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 8589934592, Node: 0}, + {ResourceValue: 8589934592, Node: 1}, + {ResourceValue: 8589934592, Node: 2}, + {ResourceValue: 8589934592, Node: 3}, + }, + AggregatedCapacityQuantity: 34359738368, + AggregatedAllocatableQuantity: 34359738368, + }, }, }, resp) } @@ -1893,7 +2483,116 @@ func TestGetTopologyAwareResources(t *testing.T) { }, }, { - description: "req for dedicated_cores with numa_binding main container", + description: "req for dedicated_cores with numa_binding main container", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: 
[]uint64{0, 1}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 10737418240, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + expectedResp: &pluginapi.GetTopologyAwareResourcesResponse{ + PodNamespace: testName, + PodName: testName, + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: testName, + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + string(v1.ResourceMemory): { + IsNodeResource: false, + IsScalarResource: true, + AggregatedQuantity: 15032385536, + OriginalAggregatedQuantity: 15032385536, + TopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 7516192768, Node: 0}, + {ResourceValue: 7516192768, Node: 1}, + }, + OriginalTopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 7516192768, Node: 0}, + {ResourceValue: 7516192768, Node: 1}, + }, + }, + }, + }, + }, + }, + { + description: "req for dedicated_cores with numa_binding main container for memory and hugepages", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1073741824, + "hugepages-2Mi": 1073741824, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true"}`, + }, + Labels: 
map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + expectedResp: &pluginapi.GetTopologyAwareResourcesResponse{ + PodNamespace: testName, + PodName: testName, + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: testName, + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + string(v1.ResourceMemory): { + IsNodeResource: false, + IsScalarResource: true, + AggregatedQuantity: 1073741824, + OriginalAggregatedQuantity: 1073741824, + TopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 1073741824, Node: 0}, + }, + OriginalTopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 1073741824, Node: 0}, + }, + }, + "hugepages-2Mi": { + IsNodeResource: false, + IsScalarResource: true, + AggregatedQuantity: 1073741824, + OriginalAggregatedQuantity: 1073741824, + TopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 1073741824, Node: 0}, + }, + OriginalTopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 1073741824, Node: 0}, + }, + }, + }, + }, + }, + }, + { + description: "req for dedicated_cores with numa_binding main container with distribute evenly across numa for memory and hugepages", req: &pluginapi.ResourceRequest{ PodUid: string(uuid.NewUUID()), PodNamespace: testName, @@ -1907,11 +2606,12 @@ func TestGetTopologyAwareResources(t *testing.T) { Preferred: true, }, ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 10737418240, + string(v1.ResourceMemory): 1073741824, + "hugepages-2Mi": 1073741824, }, Annotations: map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, - consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "distribute_evenly_across_numa": "true"}`, }, Labels: 
map[string]string{ consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, @@ -1926,15 +2626,29 @@ func TestGetTopologyAwareResources(t *testing.T) { string(v1.ResourceMemory): { IsNodeResource: false, IsScalarResource: true, - AggregatedQuantity: 15032385536, - OriginalAggregatedQuantity: 15032385536, + AggregatedQuantity: 1073741824, + OriginalAggregatedQuantity: 1073741824, TopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ - {ResourceValue: 7516192768, Node: 0}, - {ResourceValue: 7516192768, Node: 1}, + {ResourceValue: 536870912, Node: 0}, + {ResourceValue: 536870912, Node: 1}, }, OriginalTopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ - {ResourceValue: 7516192768, Node: 0}, - {ResourceValue: 7516192768, Node: 1}, + {ResourceValue: 536870912, Node: 0}, + {ResourceValue: 536870912, Node: 1}, + }, + }, + "hugepages-2Mi": { + IsNodeResource: false, + IsScalarResource: true, + AggregatedQuantity: 1073741824, + OriginalAggregatedQuantity: 1073741824, + TopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 536870912, Node: 0}, + {ResourceValue: 536870912, Node: 1}, + }, + OriginalTopologyAwareQuantityList: []*pluginapi.TopologyAwareQuantity{ + {ResourceValue: 536870912, Node: 0}, + {ResourceValue: 536870912, Node: 1}, }, }, }, @@ -1944,31 +2658,36 @@ func TestGetTopologyAwareResources(t *testing.T) { } for _, tc := range testCases { - tmpDir, err := ioutil.TempDir("", "checkpoint-TestGetTopologyAwareResources") - as.Nil(err) - - dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) - as.Nil(err) + tc := tc + t.Run(tc.description, func(t *testing.T) { + t.Parallel() - _, err = dynamicPolicy.Allocate(context.Background(), tc.req) - as.Nil(err) + tmpDir, err := ioutil.TempDir("", "checkpoint-TestGetTopologyAwareResources") + as.Nil(err) - resp, err := dynamicPolicy.GetTopologyAwareResources(context.Background(), &pluginapi.GetTopologyAwareResourcesRequest{ 
- PodUid: tc.req.PodUid, - ContainerName: testName, - }) + dynamicPolicy, err := getTestDynamicPolicyWithExtraResourcesWithInitialization(cpuTopology, machineInfo, tmpDir) + as.Nil(err) - if tc.err != nil { - as.NotNil(err) - continue - } else { + _, err = dynamicPolicy.Allocate(context.Background(), tc.req) as.Nil(err) - tc.expectedResp.PodUid = tc.req.PodUid - } - as.Equalf(tc.expectedResp, resp, "failed in test case: %s", tc.description) + resp, err := dynamicPolicy.GetTopologyAwareResources(context.Background(), &pluginapi.GetTopologyAwareResourcesRequest{ + PodUid: tc.req.PodUid, + ContainerName: testName, + }) + + if tc.err != nil { + as.NotNil(err) + return + } else { + as.Nil(err) + tc.expectedResp.PodUid = tc.req.PodUid + } - os.Remove(tmpDir) + as.Equalf(tc.expectedResp, resp, "failed in test case: %s", tc.description) + + os.Remove(tmpDir) + }) } } @@ -1976,10 +2695,7 @@ func TestGetResourcesAllocation(t *testing.T) { t.Parallel() as := require.New(t) - - tmpDir, err := ioutil.TempDir("", "checkpoint-TestGetResourcesAllocation") - as.Nil(err) - defer os.RemoveAll(tmpDir) + testName := "test" cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) as.Nil(err) @@ -1987,215 +2703,308 @@ func TestGetResourcesAllocation(t *testing.T) { machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) as.Nil(err) - dynamicPolicy, err := getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) - as.Nil(err) - - testName := "test" + type testCase struct { + name string + useExtraResources bool + buildRequest func() []*pluginapi.ResourceRequest + expectedMemory *pluginapi.ResourceAllocationInfo + expectedHugepages *pluginapi.ResourceAllocationInfo + checkHugepages bool + } - // test for shared_cores - req := &pluginapi.ResourceRequest{ - PodUid: string(uuid.NewUUID()), - PodNamespace: testName, - PodName: testName, - ContainerName: testName, - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - ResourceName: 
string(v1.ResourceMemory), - ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 1073741824, + tests := []testCase{ + { + name: "shared_cores", + useExtraResources: true, + buildRequest: func() []*pluginapi.ResourceRequest { + return []*pluginapi.ResourceRequest{ + { + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1073741824, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + }, + } + }, + expectedMemory: &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 1073741824, + AllocationResult: machine.NewCPUSet(0, 1, 2, 3).String(), + }, }, - Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + { + name: "reclaimed_cores", + useExtraResources: true, + buildRequest: func() []*pluginapi.ResourceRequest { + return []*pluginapi.ResourceRequest{ + { + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 1073741824, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelReclaimedCores, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelReclaimedCores, + }, + }, + } + }, + expectedMemory: &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: 
true, + AllocatedQuantity: 1073741824, + AllocationResult: machine.NewCPUSet(0, 1, 2, 3).String(), + }, }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + { + name: "dedicated_cores_numa_binding", + useExtraResources: false, + buildRequest: func() []*pluginapi.ResourceRequest { + return []*pluginapi.ResourceRequest{ + { + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + } + }, + expectedMemory: &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 7516192768, + AllocationResult: machine.NewCPUSet(0).String(), + }, }, - } - - _, err = dynamicPolicy.Allocate(context.Background(), req) - as.Nil(err) - - resp1, err := dynamicPolicy.GetResourcesAllocation(context.Background(), &pluginapi.GetResourcesAllocationRequest{}) - as.Nil(err) - - as.NotNil(resp1.PodResources[req.PodUid]) - as.NotNil(resp1.PodResources[req.PodUid].ContainerResources[testName]) - as.NotNil(resp1.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) - as.Equal(&pluginapi.ResourceAllocationInfo{ - OciPropertyName: util.OCIPropertyNameCPUSetMems, - IsNodeResource: false, - IsScalarResource: true, - AllocatedQuantity: 1073741824, - AllocationResult: 
machine.NewCPUSet(0, 1, 2, 3).String(), - }, resp1.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) - - // test for reclaimed_cores - req = &pluginapi.ResourceRequest{ - PodUid: string(uuid.NewUUID()), - PodNamespace: testName, - PodName: testName, - ContainerName: testName, - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - ResourceName: string(v1.ResourceMemory), - ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 1073741824, + { + name: "system_cores_cpuset_reserve", + useExtraResources: false, + buildRequest: func() []*pluginapi.ResourceRequest { + return []*pluginapi.ResourceRequest{ + { + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + consts.PodAnnotationCPUEnhancementKey: `{"cpuset_pool": "reserve"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + }, + }, + } + }, + expectedMemory: &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 0, + AllocationResult: machine.NewCPUSet(0, 1, 2, 3).String(), + }, }, - Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelReclaimedCores, + { + name: "system_cores_cpuset_reserve_with_numa_binding", + useExtraResources: false, + buildRequest: func() []*pluginapi.ResourceRequest { + return []*pluginapi.ResourceRequest{ + { + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: 
testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + { + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + consts.PodAnnotationCPUEnhancementKey: `{"cpuset_pool": "reserve"}`, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, + }, + }, + } + }, + expectedMemory: &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 0, + AllocationResult: machine.NewCPUSet(1, 2, 3).String(), + }, }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelReclaimedCores, + { + name: "dedicated_cores_memory_and_hugepages", + useExtraResources: true, + buildRequest: func() []*pluginapi.ResourceRequest { + return []*pluginapi.ResourceRequest{ + { + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + 
ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: string(v1.ResourceMemory), + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + ResourceRequests: map[string]float64{ + string(v1.ResourceMemory): 2147483648, + "hugepages-2Mi": 2147483648, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true"}`, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + } + }, + expectedMemory: &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0).String(), + }, + checkHugepages: true, + expectedHugepages: &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: 2147483648, + AllocationResult: machine.NewCPUSet(0).String(), + }, }, } - _, err = dynamicPolicy.Allocate(context.Background(), req) - as.Nil(err) - - resp2, err := dynamicPolicy.GetResourcesAllocation(context.Background(), &pluginapi.GetResourcesAllocationRequest{}) - as.Nil(err) + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() - as.NotNil(resp2.PodResources[req.PodUid]) - as.NotNil(resp2.PodResources[req.PodUid].ContainerResources[testName]) - as.NotNil(resp2.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) - as.Equal(&pluginapi.ResourceAllocationInfo{ - OciPropertyName: util.OCIPropertyNameCPUSetMems, - IsNodeResource: false, - IsScalarResource: true, - AllocatedQuantity: 1073741824, - AllocationResult: machine.NewCPUSet(0, 1, 2, 3).String(), - }, 
resp2.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) + tmpDir, err := ioutil.TempDir("", "checkpoint-"+tc.name) + as.Nil(err) + defer os.RemoveAll(tmpDir) - os.RemoveAll(tmpDir) - dynamicPolicy, err = getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) - as.Nil(err) + var policy *DynamicPolicy + if tc.useExtraResources { + policy, err = getTestDynamicPolicyWithExtraResourcesWithInitialization(cpuTopology, machineInfo, tmpDir) + } else { + policy, err = getTestDynamicPolicyWithInitialization(cpuTopology, machineInfo, tmpDir) + } + as.Nil(err) - // test for dedicated_cores with numa_binding - req = &pluginapi.ResourceRequest{ - PodUid: string(uuid.NewUUID()), - PodNamespace: testName, - PodName: testName, - ContainerName: testName, - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - ResourceName: string(v1.ResourceMemory), - Hint: &pluginapi.TopologyHint{ - Nodes: []uint64{0}, - Preferred: true, - }, - ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 2147483648, - }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, - consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true", "numa_exclusive": "true"}`, - }, - Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, - }, - } + reqs := tc.buildRequest() - _, err = dynamicPolicy.Allocate(context.Background(), req) - as.Nil(err) + // Execute all requests sequentially + for _, req := range reqs { + _, err = policy.Allocate(context.Background(), req) + as.Nil(err) + } - resp3, err := dynamicPolicy.GetResourcesAllocation(context.Background(), &pluginapi.GetResourcesAllocationRequest{}) - as.Nil(err) + lastReq := reqs[len(reqs)-1] - as.NotNil(resp3.PodResources[req.PodUid]) - as.NotNil(resp3.PodResources[req.PodUid].ContainerResources[testName]) - 
as.NotNil(resp3.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) - as.Equal(&pluginapi.ResourceAllocationInfo{ - OciPropertyName: util.OCIPropertyNameCPUSetMems, - IsNodeResource: false, - IsScalarResource: true, - AllocatedQuantity: 7516192768, - AllocationResult: machine.NewCPUSet(0).String(), - }, resp3.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) - - // test for system_cores with cpuset_pool reserve - req = &pluginapi.ResourceRequest{ - PodUid: string(uuid.NewUUID()), - PodNamespace: testName, - PodName: testName, - ContainerName: testName, - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - ResourceName: string(v1.ResourceMemory), - Hint: &pluginapi.TopologyHint{ - Nodes: []uint64{0}, - Preferred: true, - }, - ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 2147483648, - }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, - consts.PodAnnotationCPUEnhancementKey: `{"cpuset_pool": "reserve"}`, - }, - Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, - }, - } + resp, err := policy.GetResourcesAllocation(context.Background(), &pluginapi.GetResourcesAllocationRequest{}) + as.Nil(err) - _, err = dynamicPolicy.Allocate(context.Background(), req) - as.Nil(err) + memAlloc := resp.PodResources[lastReq.PodUid]. + ContainerResources[testName]. 
+ ResourceAllocation[string(v1.ResourceMemory)] - resp4, err := dynamicPolicy.GetResourcesAllocation(context.Background(), &pluginapi.GetResourcesAllocationRequest{}) - as.Nil(err) + as.NotNil(memAlloc) + as.Equal(tc.expectedMemory, memAlloc) - as.NotNil(resp4.PodResources[req.PodUid]) - as.NotNil(resp4.PodResources[req.PodUid].ContainerResources[testName]) - as.NotNil(resp4.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) - as.Equal(&pluginapi.ResourceAllocationInfo{ - OciPropertyName: util.OCIPropertyNameCPUSetMems, - IsNodeResource: false, - IsScalarResource: true, - AllocatedQuantity: 0, - AllocationResult: machine.NewCPUSet(0, 1, 2, 3).String(), - }, resp4.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) - - // test for system_cores with cpuset_pool reserve and with numa binding - req = &pluginapi.ResourceRequest{ - PodUid: string(uuid.NewUUID()), - PodNamespace: testName, - PodName: testName, - ContainerName: testName, - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - ResourceName: string(v1.ResourceMemory), - Hint: &pluginapi.TopologyHint{ - Nodes: []uint64{0}, - Preferred: true, - }, - ResourceRequests: map[string]float64{ - string(v1.ResourceMemory): 2147483648, - }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, - consts.PodAnnotationCPUEnhancementKey: `{"cpuset_pool": "reserve"}`, - consts.PodAnnotationMemoryEnhancementKey: `{"numa_binding": "true"}`, - }, - Labels: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSystemCores, - }, + if tc.checkHugepages { + hpAlloc := resp.PodResources[lastReq.PodUid]. + ContainerResources[testName]. 
+ ResourceAllocation["hugepages-2Mi"] + as.NotNil(hpAlloc) + as.Equal(tc.expectedHugepages, hpAlloc) + } + }) } - - _, err = dynamicPolicy.Allocate(context.Background(), req) - as.Nil(err) - - resp5, err := dynamicPolicy.GetResourcesAllocation(context.Background(), &pluginapi.GetResourcesAllocationRequest{}) - as.Nil(err) - - as.NotNil(resp5.PodResources[req.PodUid]) - as.NotNil(resp5.PodResources[req.PodUid].ContainerResources[testName]) - as.NotNil(resp5.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) - as.Equal(&pluginapi.ResourceAllocationInfo{ - OciPropertyName: util.OCIPropertyNameCPUSetMems, - IsNodeResource: false, - IsScalarResource: true, - AllocatedQuantity: 0, - AllocationResult: machine.NewCPUSet(1, 2, 3).String(), - }, resp5.PodResources[req.PodUid].ContainerResources[testName].ResourceAllocation[string(v1.ResourceMemory)]) } func TestGenerateResourcesMachineStateFromPodEntries(t *testing.T) { @@ -2206,7 +3015,7 @@ func TestGenerateResourcesMachineStateFromPodEntries(t *testing.T) { machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) as.Nil(err) - reservedMemory, err := getReservedMemory(fakeConf, &metaserver.MetaServer{}, machineInfo) + reserved, err := getResourcesReservedMemory(fakeConf, &metaserver.MetaServer{}, machineInfo, []string{string(v1.ResourceMemory)}) as.Nil(err) podUID := string(uuid.NewUUID()) @@ -2236,11 +3045,7 @@ func TestGenerateResourcesMachineStateFromPodEntries(t *testing.T) { v1.ResourceMemory: podEntries, } - reserved := map[v1.ResourceName]map[int]uint64{ - v1.ResourceMemory: reservedMemory, - } - - resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, podResourceEntries, nil, reserved) + resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, nil, podResourceEntries, nil, reserved, nil) as.Nil(err) as.NotNil(resourcesMachineState[v1.ResourceMemory][0]) @@ -2262,13 +3067,10 @@ func TestHandleAdvisorResp(t 
*testing.T) { machineInfo, err := machine.GenerateDummyMachineInfo(4, 32) as.Nil(err) - reservedMemory, err := getReservedMemory(fakeConf, &metaserver.MetaServer{}, machineInfo) + resourcesReservedMemory, err := getResourcesReservedMemory(fakeConf, &metaserver.MetaServer{}, machineInfo, + []string{string(v1.ResourceMemory)}) as.Nil(err) - resourcesReservedMemory := map[v1.ResourceName]map[int]uint64{ - v1.ResourceMemory: reservedMemory, - } - pod1UID := string(uuid.NewUUID()) pod2UID := string(uuid.NewUUID()) pod3UID := string(uuid.NewUUID()) @@ -2793,7 +3595,8 @@ func TestHandleAdvisorResp(t *testing.T) { memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyMemoryNUMAHeadroom, memoryadvisor.ControlKnobHandlerWithChecker(dynamicPolicy.handleAdvisorMemoryNUMAHeadroom)) - machineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, tc.podResourceEntries, nil, resourcesReservedMemory) + machineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, nil, tc.podResourceEntries, + nil, resourcesReservedMemory, dynamicPolicy.extraResourceNames) as.Nil(err) if tc.podResourceEntries != nil { @@ -3164,14 +3967,11 @@ func TestSetExtraControlKnobByConfigs(t *testing.T) { v1.ResourceMemory: podEntries, } - reservedMemory, err := getReservedMemory(fakeConf, &metaserver.MetaServer{}, machineInfo) + reserved, err := getResourcesReservedMemory(fakeConf, &metaserver.MetaServer{}, machineInfo, []string{string(v1.ResourceMemory)}) as.Nil(err) - reserved := map[v1.ResourceName]map[int]uint64{ - v1.ResourceMemory: reservedMemory, - } - - resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, podResourceEntries, dynamicPolicy.state.GetMachineState(), reserved) + resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, nil, podResourceEntries, + dynamicPolicy.state.GetMachineState(), reserved, dynamicPolicy.extraResourceNames) as.Nil(err) 
dynamicPolicy.state.SetPodResourceEntries(podResourceEntries, true) @@ -4109,13 +4909,12 @@ func TestDynamicPolicy_adjustAllocationEntries(t *testing.T) { dynamicPolicy.metaServer = tt.fields.metaServer dynamicPolicy.asyncWorkers = asyncworker.NewAsyncWorkers(memoryPluginAsyncWorkersName, dynamicPolicy.emitter) dynamicPolicy.state.SetPodResourceEntries(podResourceEntries, true) - reservedMemory, err := getReservedMemory(fakeConf, dynamicPolicy.metaServer, machineInfo) + resourcesReservedMemory, err := getResourcesReservedMemory(fakeConf, dynamicPolicy.metaServer, machineInfo, + []string{string(v1.ResourceMemory)}) assert.NoError(t, err) - resourcesReservedMemory := map[v1.ResourceName]map[int]uint64{ - v1.ResourceMemory: reservedMemory, - } - machineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, podResourceEntries, dynamicPolicy.state.GetMachineState(), resourcesReservedMemory) + machineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, nil, podResourceEntries, + dynamicPolicy.state.GetMachineState(), resourcesReservedMemory, dynamicPolicy.extraResourceNames) assert.NoError(t, err) dynamicPolicy.state.SetMachineState(machineState, true) @@ -4864,7 +5663,8 @@ func Test_adjustAllocationEntries(t *testing.T) { dynamicPolicy.state.SetAllocationInfo(v1.ResourceMemory, "test-pod-4-uid", "test-container-1", pod4Container1Allocation, true) podResourceEntries := dynamicPolicy.state.GetPodResourceEntries() - machineState, err := state.GenerateMachineStateFromPodEntries(dynamicPolicy.state.GetMachineInfo(), podResourceEntries, dynamicPolicy.state.GetMachineState(), dynamicPolicy.state.GetReservedMemory()) + machineState, err := state.GenerateMachineStateFromPodEntries(dynamicPolicy.state.GetMachineInfo(), nil, podResourceEntries, + dynamicPolicy.state.GetMachineState(), dynamicPolicy.state.GetReservedMemory(), dynamicPolicy.extraResourceNames) as.NoError(err) dynamicPolicy.state.SetMachineState(machineState, true) diff --git 
a/pkg/agent/qrm-plugins/memory/dynamicpolicy/reactor/numa_allocation_reactor.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/reactor/numa_allocation_reactor.go index b43b79b8cc..6dca80029a 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/reactor/numa_allocation_reactor.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/reactor/numa_allocation_reactor.go @@ -53,8 +53,9 @@ func (p numaPodAllocationWrapper) UpdateAllocation(pod *v1.Pod) error { } func (p numaPodAllocationWrapper) getNUMABindResult() (string, error) { - if p.CheckDedicatedNUMABindingNUMAExclusive() { - // numa binding is exclusive, we can directly use numa allocation result as numa bind result + if p.CheckDedicatedNUMABindingNUMAExclusive() || p.CheckDistributeEvenlyAcrossNuma() { + // numa binding is exclusive or distribute evenly across numa annotation enabled, + // we can directly use numa allocation result as numa bind result // which is more than one numa numaList := p.AllocationInfo.NumaAllocationResult.ToSliceInt() if len(numaList) == 0 { diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/reactor/numa_allocation_reactor_test.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/reactor/numa_allocation_reactor_test.go index dfed116c54..7dfce76531 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/reactor/numa_allocation_reactor_test.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/reactor/numa_allocation_reactor_test.go @@ -290,6 +290,236 @@ func Test_podNUMAAllocationReactor_UpdateAllocation(t *testing.T) { }, }, }, + { + name: "distribute_evenly_across_numa_pod", + fields: fields{ + podFetcher: &pod.PodFetcherStub{ + PodList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: "test-1-uid", + }, + }, + }, + }, + client: fake.NewSimpleClientset( + &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: "test-1-uid", + }, + }, + ), + }, + args: args{ + allocation: &state.AllocationInfo{ + AllocationMeta: 
commonstate.AllocationMeta{ + PodUid: "test-1-uid", + PodNamespace: "test", + PodName: "test-1", + ContainerName: "container-1", + ContainerType: pluginapi.ContainerType_MAIN.String(), + ContainerIndex: 0, + QoSLevel: consts.PodAnnotationQoSLevelSharedCores, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNuma: consts.PodAnnotationCPUEnhancementDistributeEvenlyAcrossNumaEnable, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + }, + AggregatedQuantity: 7516192768, + NumaAllocationResult: machine.NewCPUSet(0, 1), + TopologyAwareAllocations: map[int]uint64{ + 0: 3758096384, + 1: 3758096384, + }, + }, + }, + wantPod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: types.UID("test-1-uid"), + Annotations: map[string]string{ + consts.PodAnnotationNUMABindResultKey: "0,1", + }, + }, + }, + }, + { + name: "exclusive_enabled_but_empty_allocation_result", + fields: fields{ + podFetcher: &pod.PodFetcherStub{ + PodList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: "test-1-uid", + }, + }, + }, + }, + client: fake.NewSimpleClientset( + &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: "test-1-uid", + }, + }, + ), + }, + args: args{ + allocation: &state.AllocationInfo{ + AllocationMeta: commonstate.AllocationMeta{ + PodUid: "test-1-uid", + PodNamespace: "test", + PodName: "test-1", + ContainerName: "container-1", + ContainerType: pluginapi.ContainerType_MAIN.String(), + ContainerIndex: 0, + QoSLevel: consts.PodAnnotationQoSLevelDedicatedCores, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + 
consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelDedicatedCores, + }, + }, + AggregatedQuantity: 7516192768, + NumaAllocationResult: machine.NewCPUSet(), + }, + }, + wantPod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: types.UID("test-1-uid"), + }, + }, + wantErr: true, + }, + { + name: "invalid_numa_hint_multiple_values", + fields: fields{ + podFetcher: &pod.PodFetcherStub{ + PodList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: "test-1-uid", + }, + }, + }, + }, + client: fake.NewSimpleClientset( + &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: "test-1-uid", + }, + }, + ), + }, + args: args{ + allocation: &state.AllocationInfo{ + AllocationMeta: commonstate.AllocationMeta{ + PodUid: "test-1-uid", + PodNamespace: "test", + PodName: "test-1", + ContainerName: "container-1", + ContainerType: pluginapi.ContainerType_MAIN.String(), + ContainerIndex: 0, + QoSLevel: consts.PodAnnotationQoSLevelSharedCores, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + cpuconsts.CPUStateAnnotationKeyNUMAHint: "0,1", + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + }, + AggregatedQuantity: 7516192768, + NumaAllocationResult: machine.NewCPUSet(0), + }, + }, + wantPod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: types.UID("test-1-uid"), + }, + }, + wantErr: true, + }, + { + name: "invalid_numa_hint_parse_error", + 
fields: fields{ + podFetcher: &pod.PodFetcherStub{ + PodList: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: "test-1-uid", + }, + }, + }, + }, + client: fake.NewSimpleClientset( + &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: "test-1-uid", + }, + }, + ), + }, + args: args{ + allocation: &state.AllocationInfo{ + AllocationMeta: commonstate.AllocationMeta{ + PodUid: "test-1-uid", + PodNamespace: "test", + PodName: "test-1", + ContainerName: "container-1", + ContainerType: pluginapi.ContainerType_MAIN.String(), + ContainerIndex: 0, + QoSLevel: consts.PodAnnotationQoSLevelSharedCores, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + cpuconsts.CPUStateAnnotationKeyNUMAHint: "abc", + }, + Labels: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + }, + AggregatedQuantity: 7516192768, + NumaAllocationResult: machine.NewCPUSet(0), + }, + }, + wantPod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-1", + Namespace: "test", + UID: types.UID("test-1-uid"), + }, + }, + wantErr: true, + }, } for _, tt := range tests { tt := tt diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state.go index dde37ff3e7..959ca8e1cb 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state.go @@ -203,6 +203,50 @@ func (pre PodResourceEntries) Clone() PodResourceEntries { return clone } +// GetResourceAllocation gets the ResourceAllocation of every resource of a certain pod UID and container name. 
+func (pre PodResourceEntries) GetResourceAllocation(podUID, containerName string) (*pluginapi.ResourceAllocation, error) { + if pre == nil { + return nil, fmt.Errorf("GetResourceAllocation of nil PodResourceEntries") + } + + resourceAllocation := make(map[string]*pluginapi.ResourceAllocationInfo) + + for resourceName, podEntries := range pre { + allocationInfo := podEntries[podUID][containerName] + if allocationInfo == nil { + continue + } + + resourceAllocation[string(resourceName)] = &pluginapi.ResourceAllocationInfo{ + OciPropertyName: util.OCIPropertyNameCPUSetMems, + IsNodeResource: false, + IsScalarResource: true, + AllocatedQuantity: float64(allocationInfo.AggregatedQuantity), + AllocationResult: allocationInfo.NumaAllocationResult.String(), + } + + // deal with accompanying resources + for name, entry := range allocationInfo.ExtraControlKnobInfo { + if entry.OciPropertyName == "" { + continue + } + + if resourceAllocation[name] != nil { + return nil, fmt.Errorf("name: %s meets conflict", name) + } + + resourceAllocation[name] = &pluginapi.ResourceAllocationInfo{ + OciPropertyName: entry.OciPropertyName, + AllocationResult: entry.ControlKnobValue, + } + } + } + + return &pluginapi.ResourceAllocation{ + ResourceAllocation: resourceAllocation, + }, nil +} + func (ns *NUMANodeState) String() string { if ns == nil { return "" @@ -487,6 +531,9 @@ type reader interface { GetNUMAHeadroom() map[int]int64 GetPodResourceEntries() PodResourceEntries GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *AllocationInfo + // GetResourceAllocationInfo gets the allocationInfo of all resources of a specific container. + // Returns nil if there is no such container in state. 
+ GetResourceAllocationInfo(podUID, containerName string) map[v1.ResourceName]*AllocationInfo } // writer is used to store information into local states, @@ -507,6 +554,8 @@ type ReadonlyState interface { reader GetMachineInfo() *info.MachineInfo + // GetMemoryTopology returns the memory topology info (including NormalMemoryDetails etc.) + GetMemoryTopology() *machine.MemoryTopology GetReservedMemory() map[v1.ResourceName]map[int]uint64 } diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_checkpoint.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_checkpoint.go index e595f08ad4..3fc5d6b879 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_checkpoint.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_checkpoint.go @@ -58,18 +58,21 @@ type stateCheckpoint struct { skipStateCorruption bool emitter metrics.MetricEmitter machineInfo *info.MachineInfo - reservedMemory map[v1.ResourceName]map[int]uint64 + // memoryTopology contains detailed memory capacities (e.g. 
NormalMemoryDetails excluding hugepages) + memoryTopology *machine.MemoryTopology + reservedMemory map[v1.ResourceName]map[int]uint64 + extraResourceNames []string } func NewCheckpointState( stateDirectoryConfig *statedirectory.StateDirectoryConfiguration, checkpointName, policyName string, - topology *machine.CPUTopology, machineInfo *info.MachineInfo, + topology *machine.CPUTopology, machineInfo *info.MachineInfo, memoryTopology *machine.MemoryTopology, reservedMemory map[v1.ResourceName]map[int]uint64, skipStateCorruption bool, - emitter metrics.MetricEmitter, + emitter metrics.MetricEmitter, extraResourceNames []string, ) (State, error) { currentStateDir, otherStateDir := stateDirectoryConfig.GetCurrentAndPreviousStateFileDirectory() - defaultCache, err := NewMemoryPluginState(topology, machineInfo, reservedMemory) + defaultCache, err := NewMemoryPluginState(topology, machineInfo, memoryTopology, reservedMemory, extraResourceNames) if err != nil { return nil, fmt.Errorf("NewMemoryPluginState failed with error: %v", err) } @@ -81,7 +84,9 @@ func NewCheckpointState( skipStateCorruption: skipStateCorruption, emitter: emitter, machineInfo: machineInfo, + memoryTopology: memoryTopology, reservedMemory: reservedMemory, + extraResourceNames: extraResourceNames, } cm, err := customcheckpointmanager.NewCustomCheckpointManager(currentStateDir, otherStateDir, checkpointName, @@ -106,7 +111,8 @@ func (sc *stateCheckpoint) RestoreState(cp checkpointmanager.Checkpoint) (bool, return false, fmt.Errorf("[memory_plugin] configured policy %q differs from state checkpoint policy %q", sc.policyName, checkpoint.PolicyName) } - generatedResourcesMachineState, err := GenerateMachineStateFromPodEntries(sc.machineInfo, checkpoint.PodResourceEntries, checkpoint.MachineState, sc.reservedMemory) + generatedResourcesMachineState, err := GenerateMachineStateFromPodEntries(sc.machineInfo, sc.memoryTopology, checkpoint.PodResourceEntries, + checkpoint.MachineState, sc.reservedMemory, 
sc.extraResourceNames) if err != nil { return false, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) } @@ -177,6 +183,13 @@ func (sc *stateCheckpoint) GetMachineInfo() *info.MachineInfo { return sc.cache.GetMachineInfo() } +func (sc *stateCheckpoint) GetMemoryTopology() *machine.MemoryTopology { + sc.RLock() + defer sc.RUnlock() + + return sc.cache.GetMemoryTopology() +} + func (sc *stateCheckpoint) GetMachineState() NUMANodeResourcesMap { sc.RLock() defer sc.RUnlock() @@ -200,6 +213,13 @@ func (sc *stateCheckpoint) GetAllocationInfo( return sc.cache.GetAllocationInfo(resourceName, podUID, containerName) } +func (sc *stateCheckpoint) GetResourceAllocationInfo(podUID, containerName string) map[v1.ResourceName]*AllocationInfo { + sc.RLock() + defer sc.RUnlock() + + return sc.cache.GetResourceAllocationInfo(podUID, containerName) +} + func (sc *stateCheckpoint) GetPodResourceEntries() PodResourceEntries { sc.RLock() defer sc.RUnlock() diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_mem.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_mem.go index a6a34e6e00..9387f711b9 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_mem.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_mem.go @@ -36,14 +36,20 @@ type memoryPluginState struct { socketTopology map[int]string machineInfo *info.MachineInfo + // memoryTopology contains detailed memory capacities (e.g. 
NormalMemoryDetails excluding hugepages) + memoryTopology *machine.MemoryTopology reservedMemory map[v1.ResourceName]map[int]uint64 machineState NUMANodeResourcesMap numaHeadroom map[int]int64 podResourceEntries PodResourceEntries + + extraResourceNames []string } -func NewMemoryPluginState(topology *machine.CPUTopology, machineInfo *info.MachineInfo, reservedMemory map[v1.ResourceName]map[int]uint64) (*memoryPluginState, error) { +func NewMemoryPluginState(topology *machine.CPUTopology, machineInfo *info.MachineInfo, + memoryTopology *machine.MemoryTopology, reservedMemory map[v1.ResourceName]map[int]uint64, extraResourceNames []string, +) (*memoryPluginState, error) { klog.InfoS("[memory_plugin] initializing new memory plugin in-memory state store") socketTopology := make(map[int]string) @@ -51,7 +57,7 @@ func NewMemoryPluginState(topology *machine.CPUTopology, machineInfo *info.Machi socketTopology[socketID] = topology.CPUDetails.NUMANodesInSockets(socketID).String() } - defaultMachineState, err := GenerateMachineState(machineInfo, reservedMemory) + defaultMachineState, err := GenerateMachineState(machineInfo, memoryTopology, reservedMemory, extraResourceNames) if err != nil { return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err) } @@ -62,7 +68,9 @@ func NewMemoryPluginState(topology *machine.CPUTopology, machineInfo *info.Machi numaHeadroom: make(map[int]int64), socketTopology: socketTopology, machineInfo: machineInfo.Clone(), + memoryTopology: memoryTopology, reservedMemory: reservedMemory, + extraResourceNames: extraResourceNames, }, nil } @@ -103,6 +111,13 @@ func (s *memoryPluginState) GetMachineInfo() *info.MachineInfo { return s.machineInfo.Clone() } +func (s *memoryPluginState) GetMemoryTopology() *machine.MemoryTopology { + s.RLock() + defer s.RUnlock() + + return s.memoryTopology +} + func (s *memoryPluginState) GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *AllocationInfo { s.RLock() defer 
s.RUnlock() @@ -113,6 +128,24 @@ func (s *memoryPluginState) GetAllocationInfo(resourceName v1.ResourceName, podU return nil } +func (s *memoryPluginState) GetResourceAllocationInfo(podUID, containerName string) map[v1.ResourceName]*AllocationInfo { + s.RLock() + defer s.RUnlock() + + var allAllocationInfos map[v1.ResourceName]*AllocationInfo + for resourceName, res := range s.podResourceEntries { + if allocInfo, ok := res[podUID][containerName]; ok { + // Lazy initialization of map only when there is allocation info for a container + if allAllocationInfos == nil { + allAllocationInfos = make(map[v1.ResourceName]*AllocationInfo) + } + allAllocationInfos[resourceName] = allocInfo.Clone() + } + } + + return allAllocationInfos +} + func (s *memoryPluginState) GetPodResourceEntries() PodResourceEntries { s.RLock() defer s.RUnlock() @@ -193,8 +226,9 @@ func (s *memoryPluginState) ClearState() { s.Lock() defer s.Unlock() - s.machineState, _ = GenerateMachineState(s.machineInfo, s.reservedMemory) + s.machineState, _ = GenerateMachineState(s.machineInfo, s.memoryTopology, s.reservedMemory, s.extraResourceNames) s.podResourceEntries = make(PodResourceEntries) + s.numaHeadroom = make(map[int]int64) s.socketTopology = make(map[int]string) klog.V(2).InfoS("[memory_plugin] cleared state") diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_test.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_test.go index e0c5bccfde..98d0c6eb7a 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_test.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/state_test.go @@ -140,7 +140,7 @@ func TestNewMemoryPluginCheckpoint(t *testing.T) { }, } oldCheckpoint.PodResourceEntries = podResourceEntries - machineState, err := GenerateMachineStateFromPodEntries(machineInfo, podResourceEntries, nil, reservedMemory) + machineState, err := GenerateMachineStateFromPodEntries(machineInfo, nil, podResourceEntries, nil, reservedMemory, nil) assert.NoError(t, err) 
oldCheckpoint.MachineState = machineState err = oldCheckpointManager.CreateCheckpoint(checkpointName, oldCheckpoint) @@ -153,8 +153,8 @@ func TestNewMemoryPluginCheckpoint(t *testing.T) { EnableInMemoryState: true, } - state, err := NewCheckpointState(stateDirectoryConfig, checkpointName, policyName, cpuTopology, machineInfo, - reservedMemory, false, metrics.DummyMetrics{}) + state, err := NewCheckpointState(stateDirectoryConfig, checkpointName, policyName, cpuTopology, machineInfo, nil, + reservedMemory, false, metrics.DummyMetrics{}, nil) if tt.corruptFile { assert.Error(t, err) diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/util.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/util.go index 7bab660c00..fedda197f5 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/util.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/state/util.go @@ -18,12 +18,15 @@ package state import ( "fmt" + "strings" "time" info "github.com/google/cadvisor/info/v1" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/klog/v2" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + "k8s.io/kubernetes/pkg/apis/core/v1/helper" "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" @@ -48,21 +51,24 @@ func GenerateMemoryContainerAllocationMeta(req *pluginapi.ResourceRequest, qosLe } // GenerateMachineState returns NUMANodeResourcesMap based on -// machine info and reserved resources -func GenerateMachineState(machineInfo *info.MachineInfo, reserved map[v1.ResourceName]map[int]uint64) (NUMANodeResourcesMap, error) { +// machine info, memory topology (to get precise capacities like NormalMemoryDetails), +// and reserved resources. 
+func GenerateMachineState(machineInfo *info.MachineInfo, memoryTopology *machine.MemoryTopology, reserved map[v1.ResourceName]map[int]uint64, + extraResourceNames []string, +) (NUMANodeResourcesMap, error) { if machineInfo == nil { return nil, fmt.Errorf("GenerateMachineState got nil machineInfo") } - // todo: currently only support memory, we will support huge page later. + resources := append(extraResourceNames, string(v1.ResourceMemory)) defaultResourcesMachineState := make(NUMANodeResourcesMap) - for _, resourceName := range []v1.ResourceName{v1.ResourceMemory} { - machineState, err := GenerateResourceState(machineInfo, reserved, resourceName) + for _, resourceName := range resources { + machineState, err := GenerateResourceState(machineInfo, memoryTopology, reserved, v1.ResourceName(resourceName)) if err != nil { return nil, fmt.Errorf("GenerateResourceState for resource: %s failed with error: %v", resourceName, err) } - defaultResourcesMachineState[resourceName] = machineState + defaultResourcesMachineState[v1.ResourceName(resourceName)] = machineState } return defaultResourcesMachineState, nil } @@ -91,14 +97,21 @@ func GetReclaimedNUMAHeadroom(numaHeadroom map[int]int64, numaSet machine.CPUSet } // GenerateResourceState returns NUMANodeMap for given resource based on -// machine info and reserved resources -func GenerateResourceState(machineInfo *info.MachineInfo, reserved map[v1.ResourceName]map[int]uint64, resourceName v1.ResourceName) (NUMANodeMap, error) { +// machine info, memory topology (to extract normal memory capacities), and reserved resources +func GenerateResourceState(machineInfo *info.MachineInfo, memoryTopology *machine.MemoryTopology, reserved map[v1.ResourceName]map[int]uint64, resourceName v1.ResourceName) (NUMANodeMap, error) { defaultMachineState := make(NUMANodeMap) - switch resourceName { - case v1.ResourceMemory: + switch { + case resourceName == v1.ResourceMemory: for _, node := range machineInfo.Topology { - totalMemSizeQuantity 
:= node.Memory + var totalMemSizeQuantity uint64 + // Use NormalMemoryDetails to exclude hugepages when calculating allocatable memory + if memoryTopology != nil { + totalMemSizeQuantity = memoryTopology.NormalMemoryDetails[node.Id] + } else { + // Fallback for tests or environments where memory topology isn't fully initialized + totalMemSizeQuantity = node.Memory + } numaReservedMemQuantity := reserved[resourceName][node.Id] if totalMemSizeQuantity < numaReservedMemQuantity { @@ -117,6 +130,8 @@ func GenerateResourceState(machineInfo *info.MachineInfo, reserved map[v1.Resour PodEntries: make(PodEntries), } } + case strings.HasPrefix(string(resourceName), v1.ResourceHugePagesPrefix): + return generateHugePagesResourceState(machineInfo, reserved, resourceName) default: return nil, fmt.Errorf("unsupported resource name: %s", resourceName) } @@ -124,11 +139,44 @@ func GenerateResourceState(machineInfo *info.MachineInfo, reserved map[v1.Resour return defaultMachineState, nil } +// generateHugePagesResourceState returns NUMANodeMap for a particular hugepage resource based on machine info and reserved resources. 
+func generateHugePagesResourceState(machineInfo *info.MachineInfo, reserved map[v1.ResourceName]map[int]uint64, + resourceName v1.ResourceName, +) (NUMANodeMap, error) { + hugepageResourceMachineState := make(NUMANodeMap) + for _, node := range machineInfo.Topology { + nodeState := &NUMANodeState{} + for _, hugepage := range node.HugePages { + hugepageQuantity := resource.NewQuantity(int64(hugepage.PageSize)*1024, resource.BinarySI) + hugepageName := helper.HugePageResourceName(*hugepageQuantity) + if hugepageName != resourceName { + continue + } + + systemReserved := reserved[resourceName][node.Id] + totalHugepagesSize := hugepage.PageSize * hugepage.NumPages * 1024 + if totalHugepagesSize < systemReserved { + return nil, fmt.Errorf("invalid reserved %v: %d in NUMA: %d with total memory size: %d", resourceName, + systemReserved, node.Id, totalHugepagesSize) + } + + allocatable := totalHugepagesSize - systemReserved + nodeState.TotalMemSize = totalHugepagesSize + nodeState.SystemReserved = systemReserved + nodeState.Allocatable = allocatable + nodeState.Free = allocatable + } + hugepageResourceMachineState[node.Id] = nodeState + } + + return hugepageResourceMachineState, nil +} + // GenerateMachineStateFromPodEntries returns NUMANodeResourcesMap based on -// machine info and reserved resources (along with existed pod entries) -func GenerateMachineStateFromPodEntries(machineInfo *info.MachineInfo, +// machine info, memory topology (for exact capacity logic), and reserved resources (along with existed pod entries) +func GenerateMachineStateFromPodEntries(machineInfo *info.MachineInfo, memoryTopology *machine.MemoryTopology, podResourceEntries PodResourceEntries, originResourcesMachineState NUMANodeResourcesMap, - reserved map[v1.ResourceName]map[int]uint64, + reserved map[v1.ResourceName]map[int]uint64, extraResourceNames []string, ) (NUMANodeResourcesMap, error) { if machineInfo == nil { return nil, fmt.Errorf("GenerateMachineStateFromPodEntries got nil 
machineInfo") @@ -138,16 +186,16 @@ func GenerateMachineStateFromPodEntries(machineInfo *info.MachineInfo, originResourcesMachineState = make(NUMANodeResourcesMap) } - // todo: currently only support memory, we will support huge page later. + resources := append(extraResourceNames, string(v1.ResourceMemory)) currentResourcesMachineState := make(NUMANodeResourcesMap) - for _, resourceName := range []v1.ResourceName{v1.ResourceMemory} { - machineState, err := GenerateResourceStateFromPodEntries(machineInfo, podResourceEntries[resourceName], - originResourcesMachineState[resourceName], reserved, resourceName) + for _, resourceName := range resources { + machineState, err := GenerateResourceStateFromPodEntries(machineInfo, memoryTopology, podResourceEntries[v1.ResourceName(resourceName)], + originResourcesMachineState[v1.ResourceName(resourceName)], reserved, v1.ResourceName(resourceName)) if err != nil { return nil, fmt.Errorf("GenerateResourceState for resource: %s failed with error: %v", resourceName, err) } - currentResourcesMachineState[resourceName] = machineState + currentResourcesMachineState[v1.ResourceName(resourceName)] = machineState } return currentResourcesMachineState, nil } @@ -214,30 +262,32 @@ func updateMachineStatePreOccPodEntries(currentMachineState, originMachineState } // GenerateResourceStateFromPodEntries returns NUMANodeMap for given resource based on -// machine info and reserved resources along with existed pod entries -func GenerateResourceStateFromPodEntries(machineInfo *info.MachineInfo, +// machine info, memory topology, and reserved resources along with existed pod entries +func GenerateResourceStateFromPodEntries(machineInfo *info.MachineInfo, memoryTopology *machine.MemoryTopology, podEntries PodEntries, originMachineState NUMANodeMap, reserved map[v1.ResourceName]map[int]uint64, resourceName v1.ResourceName, ) (NUMANodeMap, error) { - switch resourceName { - case v1.ResourceMemory: - currentMachineState, err := 
GenerateMemoryStateFromPodEntries(machineInfo, podEntries, reserved) + switch { + case resourceName == v1.ResourceMemory: + currentMachineState, err := GenerateMemoryStateFromPodEntries(machineInfo, memoryTopology, podEntries, reserved, resourceName) if err != nil { return nil, err } updateMachineStatePreOccPodEntries(currentMachineState, originMachineState) return currentMachineState, nil + case strings.HasPrefix(string(resourceName), v1.ResourceHugePagesPrefix): + return GenerateMemoryStateFromPodEntries(machineInfo, memoryTopology, podEntries, reserved, resourceName) default: return nil, fmt.Errorf("unsupported resource name: %s", resourceName) } } // GenerateMemoryStateFromPodEntries returns NUMANodeMap for memory based on -// machine info and reserved resources along with existed pod entries -func GenerateMemoryStateFromPodEntries(machineInfo *info.MachineInfo, - podEntries PodEntries, reserved map[v1.ResourceName]map[int]uint64, +// machine info, memory topology, and reserved resources along with existed pod entries +func GenerateMemoryStateFromPodEntries(machineInfo *info.MachineInfo, memoryTopology *machine.MemoryTopology, + podEntries PodEntries, reserved map[v1.ResourceName]map[int]uint64, resourceName v1.ResourceName, ) (NUMANodeMap, error) { - machineState, err := GenerateResourceState(machineInfo, reserved, v1.ResourceMemory) + machineState, err := GenerateResourceState(machineInfo, memoryTopology, reserved, resourceName) if err != nil { return nil, fmt.Errorf("GenerateResourceState failed with error: %v", err) } diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/util.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/util.go index 7713bd3d76..a7a2b02605 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/util.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/util.go @@ -20,6 +20,8 @@ import ( "context" "fmt" "math" + "sort" + "strings" info "github.com/google/cadvisor/info/v1" v1 "k8s.io/api/core/v1" @@ -48,10 +50,12 @@ func 
GetFullyDropCacheBytes(container *v1.Container) int64 { return fullyDropCacheBytes } -// GetReservedMemory is used to spread total reserved memories into per-numa level. +// getResourcesReservedMemory is used to spread total reserved memories for all memory resources into per-numa level. // this reserve resource calculation logic should be kept in qrm, if advisor wants // to get this info, it should depend on the returned checkpoint (through cpu-server) -func getReservedMemory(conf *config.Configuration, metaServer *metaserver.MetaServer, machineInfo *info.MachineInfo) (map[int]uint64, error) { +func getResourcesReservedMemory(conf *config.Configuration, metaServer *metaserver.MetaServer, machineInfo *info.MachineInfo, + resourceNames []string, +) (map[v1.ResourceName]map[int]uint64, error) { if conf == nil { return nil, fmt.Errorf("nil conf") } else if metaServer == nil { @@ -60,6 +64,40 @@ func getReservedMemory(conf *config.Configuration, metaServer *metaserver.MetaSe return nil, fmt.Errorf("nil machineInfo") } + resourceNames = append(resourceNames, string(v1.ResourceMemory)) + + resourcesReservedMemory := make(map[v1.ResourceName]map[int]uint64) + for _, resourceName := range resourceNames { + // ignore duplicated resource name + if _, ok := resourcesReservedMemory[v1.ResourceName(resourceName)]; ok { + continue + } + + var reservedMemory map[int]uint64 + var err error + switch { + case v1.ResourceName(resourceName) == v1.ResourceMemory: + reservedMemory, err = getReservedMemory(conf, metaServer, machineInfo) + if err != nil { + return nil, err + } + case strings.HasPrefix(resourceName, v1.ResourceHugePagesPrefix): + reservedMemory, err = getReservedHugePagesMemory(conf, metaServer, machineInfo, v1.ResourceName(resourceName)) + if err != nil { + return nil, err + } + default: + return nil, fmt.Errorf("unknown memory resource name: %s", resourceName) + } + + resourcesReservedMemory[v1.ResourceName(resourceName)] = reservedMemory + } + + return 
resourcesReservedMemory, nil +} + +// getReservedMemory gets reserved v1.ResourceMemory per numa level. +func getReservedMemory(conf *config.Configuration, metaServer *metaserver.MetaServer, machineInfo *info.MachineInfo) (map[int]uint64, error) { numasCount := len(machineInfo.Topology) var reservedMemoryGB float64 @@ -96,6 +134,71 @@ func getReservedMemory(conf *config.Configuration, metaServer *metaserver.MetaSe return reservedMemory, nil } +// getReservedHugePagesMemory gets the reserved memory for the hugepages resource name per numa level. +func getReservedHugePagesMemory(conf *config.Configuration, metaServer *metaserver.MetaServer, machineInfo *info.MachineInfo, + resourceName v1.ResourceName, +) (map[int]uint64, error) { + numaNodes := make([]int, 0, len(machineInfo.Topology)) + for _, node := range machineInfo.Topology { + numaNodes = append(numaNodes, node.Id) + } + + if len(numaNodes) == 0 { + return nil, fmt.Errorf("[memory_plugin] machine topology numa node number is zero") + } + + nodeNumber := int64(len(numaNodes)) + sort.Ints(numaNodes) + + reservedMemory := make(map[int]uint64) + var reservedBytes int64 + if conf.UseKubeletReservedConfig { + klConfig, err := metaServer.GetKubeletConfig(context.TODO()) + if err != nil { + return nil, fmt.Errorf("failed to get kubelet config: %v", err) + } + + reservedMemoryInfo, err := utilkubeconfig.GetReservedMemoryInfo(klConfig) + if err == nil && len(reservedMemoryInfo) != 0 { + for _, numaNode := range numaNodes { + if reservedMem, ok := reservedMemoryInfo[int32(numaNode)]; ok { + quantity := reservedMem[resourceName] + reservedMemory[numaNode] = uint64(quantity.Value()) + } + } + general.Infof("get numa reserved %v:%v bytes from kubelet config", resourceName, reservedMemory) + return reservedMemory, nil + } + + reservedQuantity, found, err := utilkubeconfig.GetReservedQuantity(klConfig, string(resourceName)) + if err != nil { + return nil, fmt.Errorf("GetReservedQuantity failed with error: %v", err) + } + 
reservedBytes = reservedQuantity.Value() + general.Infof("get reserved %v:%d bytes from kubelet config, found: %v", resourceName, reservedBytes, found) + } else { + if len(conf.ReservedNumaMemory) > 0 { + for _, numaNode := range numaNodes { + if reservedMem, ok := conf.ReservedNumaMemory[int32(numaNode)]; ok { + quantity := reservedMem[resourceName] + reservedMemory[numaNode] = uint64(quantity.Value()) + } + } + general.Infof("get numa reserved %v:%v bytes from ReservedNumaMemory configuration", resourceName, reservedMemory) + return reservedMemory, nil + } + } + + integerPart := uint64(reservedBytes / nodeNumber) + remainder := uint64(reservedBytes % nodeNumber) + for _, node := range numaNodes { + reservedMemory[node] = integerPart + } + + reservedMemory[numaNodes[0]] = reservedMemory[numaNodes[0]] + remainder + return reservedMemory, nil +} + func applySidecarAllocationInfoFromMainContainer(sidecarAllocationInfo, mainAllocationInfo *state.AllocationInfo) bool { changed := false if !sidecarAllocationInfo.NumaAllocationResult.Equals(mainAllocationInfo.NumaAllocationResult) { diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/vpa_test.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/vpa_test.go index a9ab369bfc..60d669ede1 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/vpa_test.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/vpa_test.go @@ -1564,8 +1564,8 @@ func TestRNBMemoryVPA(t *testing.T) { if tc.PodEntries != nil { podResourceEntries := map[v1.ResourceName]state.PodEntries{v1.ResourceMemory: tc.PodEntries} - machineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, podResourceEntries, nil, - dynamicPolicy.state.GetReservedMemory()) + machineState, err := state.GenerateMachineStateFromPodEntries(machineInfo, nil, podResourceEntries, nil, + dynamicPolicy.state.GetReservedMemory(), nil) as.Nil(err) dynamicPolicy.state.SetMachineState(machineState, true) diff --git a/pkg/agent/qrm-plugins/util/util.go 
b/pkg/agent/qrm-plugins/util/util.go index 412e418583..3e80601f4c 100644 --- a/pkg/agent/qrm-plugins/util/util.go +++ b/pkg/agent/qrm-plugins/util/util.go @@ -26,6 +26,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/klog/v2" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" @@ -45,15 +46,17 @@ func GetQuantityFromResourceReq(req *pluginapi.ResourceRequest) (int, float64, e return 0, 0, fmt.Errorf("invalid req.ResourceRequests length: %d", len(req.ResourceRequests)) } - return GetQuantityFromResourceRequests(req.ResourceRequests, req.ResourceName, IsQuantityFromQRMDeclaration(req.Annotations)) + return GetQuantityFromResourceRequests(req.ResourceRequests, req.ResourceName, req.Annotations) } -func GetQuantityFromResourceRequests(resourceRequests map[string]float64, resourceName string, isQuantityFromQRMDeclaration bool) (int, float64, error) { +func GetQuantityFromResourceRequests(resourceRequests map[string]float64, resourceName string, reqAnnotations map[string]string) (int, float64, error) { quantity, ok := resourceRequests[resourceName] if !ok { return 0, 0, errors.NewNotFound(schema.GroupResource{}, resourceName) } + isQuantityFromQRMDeclaration := IsQuantityFromQRMDeclaration(reqAnnotations) + switch resourceName { case string(apiconsts.ReclaimedResourceMilliCPU): return general.Max(int(math.Ceil(quantity/1000.0)), 0), quantity / 1000.0, nil @@ -73,6 +76,29 @@ func IsQuantityFromQRMDeclaration(podAnnotations map[string]string) bool { return podAnnotations[PodAnnotationQuantityFromQRMDeclarationKey] == PodAnnotationQuantityFromQRMDeclarationTrue } +// GetQuantityMapFromResourceReq parses all resource quantities into maps from resource name to value; +// since pods with reclaimed_cores and non-reclaimed_cores have different +// representations, we need to adapt to both cases.
+func GetQuantityMapFromResourceReq(req *pluginapi.ResourceRequest) (map[v1.ResourceName]int, map[v1.ResourceName]float64, error) { + intQuantity := make(map[v1.ResourceName]int) + floatQuantity := make(map[v1.ResourceName]float64) + + resourceRequests := req.ResourceRequests + + for key := range resourceRequests { + resName := v1.ResourceName(key) + resInt, resFloat, err := GetQuantityFromResourceRequests(resourceRequests, key, req.Annotations) + if err != nil { + return nil, nil, fmt.Errorf("error getting quantity from resource requests for resource %s: %v", key, err) + } + + intQuantity[resName] = resInt + floatQuantity[resName] = resFloat + } + + return intQuantity, floatQuantity, nil +} + // IsDebugPod returns true if the pod annotations show up any configurable debug key func IsDebugPod(podAnnotations map[string]string, podDebugAnnoKeys []string) bool { for _, debugKey := range podDebugAnnoKeys { @@ -276,9 +302,10 @@ func GetNUMANodesCountToFitMemoryReq(memoryReq, bytesPerNUMA uint64, numaCount i }, } */ -func GetHintsFromExtraStateFile(podName, resourceName, extraHintsStateFileAbsPath string, - availableNUMAs machine.CPUSet, +func GetHintsFromExtraStateFile(podName, extraHintsStateFileAbsPath string, + availableNUMAs machine.CPUSet, requestedResources []v1.ResourceName, ) (map[string]*pluginapi.ListOfTopologyHints, error) { + hints := make(map[string]*pluginapi.ListOfTopologyHints) if extraHintsStateFileAbsPath == "" { return nil, nil } @@ -319,21 +346,23 @@ func GetHintsFromExtraStateFile(podName, resourceName, extraHintsStateFileAbsPat } allocatedNumaNodes := numaSet.ToSliceUInt64() - klog.InfoS("[GetHintsFromExtraStateFile] get hints from extra state file", - "podName", podName, - "resourceName", resourceName, - "hint", allocatedNumaNodes) - hints := map[string]*pluginapi.ListOfTopologyHints{ - resourceName: { + for _, resourceName := range requestedResources { + klog.InfoS("[GetHintsFromExtraStateFile] get hints from extra state file", + "podName", 
podName, + "resourceName", resourceName, + "hint", allocatedNumaNodes) + + hints[string(resourceName)] = &pluginapi.ListOfTopologyHints{ Hints: []*pluginapi.TopologyHint{ { Nodes: allocatedNumaNodes, Preferred: true, }, }, - }, + } } + return hints, nil } @@ -353,6 +382,60 @@ func PodInplaceUpdateResizing(req *pluginapi.ResourceRequest) bool { return req.Annotations != nil && req.Annotations[apiconsts.PodAnnotationInplaceUpdateResizingKey] == "true" } +// GetPodAggregatedRequestResourceMap returns both integer and float64 quantities for all resources in the pod request. +// If the pod has aggregated resource annotations, those values are used; otherwise, it falls back to the original +// request quantities. Returns an error if any calculation fails. +func GetPodAggregatedRequestResourceMap(req *pluginapi.ResourceRequest) (map[v1.ResourceName]int, map[v1.ResourceName]float64, error) { + annotations := req.Annotations + if annotations == nil { + return GetQuantityMapFromResourceReq(req) + } + + value, ok := annotations[apiconsts.PodAnnotationAggregatedRequestsKey] + if !ok { + return GetQuantityMapFromResourceReq(req) + } + + var resourceList v1.ResourceList + if err := json.Unmarshal([]byte(value), &resourceList); err != nil { + return GetQuantityMapFromResourceReq(req) + } + + intQuantities := make(map[v1.ResourceName]int) + floatQuantities := make(map[v1.ResourceName]float64) + resourceRequests := req.ResourceRequests + + for key := range resourceRequests { + resName := v1.ResourceName(key) + + if _, ok = resourceList[resName]; !ok { + // for resources that do not appear in the aggregated resources map, simply calculate quantity from request + intQuantity, floatQuantity, err := GetQuantityFromResourceRequests(resourceRequests, key, req.Annotations) + if err != nil { + return nil, nil, fmt.Errorf("get resource quantity for resource %s failed with error: %v", resName, err) + } + + intQuantities[resName] = intQuantity + floatQuantities[resName] = floatQuantity + } 
else { + // otherwise, calculate the aggregated quantity of the resource + intQuantity, floatQuantity, err := calculateAggregatedResource(resName, resourceList) + if err != nil { + return nil, nil, fmt.Errorf("calculate aggregated resource quantity for resource %s failed with error: %v", + resName, err) + } + + intQuantities[resName] = intQuantity + floatQuantities[resName] = floatQuantity + } + } + + return intQuantities, floatQuantities, nil +} + +// GetPodAggregatedRequestResource returns both integer and float64 quantities for the main resource in the pod request. +// If the pod has aggregated resource annotations, those values are used; otherwise, it falls back to the original +// request quantities. Returns an error if any calculation fails. func GetPodAggregatedRequestResource(req *pluginapi.ResourceRequest) (int, float64, error) { annotations := req.Annotations if annotations == nil { @@ -367,15 +450,17 @@ func GetPodAggregatedRequestResource(req *pluginapi.ResourceRequest) (int, float return GetQuantityFromResourceReq(req) } - switch req.ResourceName { - case string(v1.ResourceCPU): + return calculateAggregatedResource(v1.ResourceName(req.ResourceName), resourceList) +} + +func calculateAggregatedResource(resourceName v1.ResourceName, resourceList v1.ResourceList) (int, float64, error) { + switch resourceName { + case v1.ResourceCPU: podAggregatedReqFloat64 := float64(resourceList.Cpu().MilliValue()) / 1000 return int(math.Ceil(podAggregatedReqFloat64)), podAggregatedReqFloat64, nil - case string(v1.ResourceMemory): - podAggregatedReqFloat64 := float64(resourceList.Memory().MilliValue()) / 1000 - return int(math.Ceil(podAggregatedReqFloat64)), podAggregatedReqFloat64, nil default: - return 0, 0, fmt.Errorf("not support resource name: %s", req.ResourceName) + podAggregatedReqFloat64 := float64(resourceList.Name(resourceName, resource.BinarySI).Value()) + return int(podAggregatedReqFloat64), podAggregatedReqFloat64, nil } } diff --git 
a/pkg/agent/qrm-plugins/util/util_test.go b/pkg/agent/qrm-plugins/util/util_test.go index fe8f85a0f5..60516ef1d4 100644 --- a/pkg/agent/qrm-plugins/util/util_test.go +++ b/pkg/agent/qrm-plugins/util/util_test.go @@ -17,12 +17,15 @@ limitations under the License. package util import ( + "encoding/json" + "reflect" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/runtime/schema" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" @@ -557,3 +560,182 @@ func TestCeilEdgeCases(t *testing.T) { }) } } + +func TestGetPodAggregatedRequestResourceMap(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + resourceRequest *pluginapi.ResourceRequest + expectedInt map[v1.ResourceName]int + expectedFloat map[v1.ResourceName]float64 + expectedErr bool + }{ + { + name: "no annotations", + resourceRequest: &pluginapi.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 1, + string(v1.ResourceMemory): 2 * 1024 * 1024 * 1024, + }, + Annotations: nil, + }, + expectedInt: map[v1.ResourceName]int{ + v1.ResourceCPU: 1, + v1.ResourceMemory: 2 * 1024 * 1024 * 1024, // 2Gi + }, + expectedFloat: map[v1.ResourceName]float64{ + v1.ResourceCPU: 1, + v1.ResourceMemory: 2 * 1024 * 1024 * 1024.0, + }, + expectedErr: false, + }, + { + name: "annotations without aggregated key", + resourceRequest: &pluginapi.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 1, + string(v1.ResourceMemory): 1 * 1024 * 1024 * 1024, + }, + Annotations: map[string]string{ + "some-other-annotation": "value", + }, + }, + expectedInt: map[v1.ResourceName]int{ + v1.ResourceCPU: 1, + v1.ResourceMemory: 1 * 1024 * 1024 * 1024, // 1Gi + }, + expectedFloat: map[v1.ResourceName]float64{ + v1.ResourceCPU: 1.0, + v1.ResourceMemory: 1 * 1024 * 1024 * 1024.0, + }, + expectedErr: false, + 
}, + { + name: "invalid aggregated json", + resourceRequest: &pluginapi.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 2, + }, + Annotations: map[string]string{ + consts.PodAnnotationAggregatedRequestsKey: "{invalid json", + }, + }, + expectedInt: map[v1.ResourceName]int{ + v1.ResourceCPU: 2, + }, + expectedFloat: map[v1.ResourceName]float64{ + v1.ResourceCPU: 2.0, + }, + expectedErr: false, // Should fall back to GetQuantityMapFromResourceReq + }, + { + name: "valid aggregated json - mixed resources", + resourceRequest: &pluginapi.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 1, // Will be aggregated + string(v1.ResourceMemory): 2 * 1024 * 1024 * 1024, // Will be from original request + "example.com/gpu": 1, // Will be aggregated + }, + Annotations: map[string]string{ + consts.PodAnnotationAggregatedRequestsKey: func() string { + rl := v1.ResourceList{ + v1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + "example.com/gpu": *resource.NewQuantity(2, resource.DecimalSI), + } + b, _ := json.Marshal(rl) + return string(b) + }(), + }, + }, + expectedInt: map[v1.ResourceName]int{ + v1.ResourceCPU: 2, + v1.ResourceMemory: 2 * 1024 * 1024 * 1024, + "example.com/gpu": 2, + }, + expectedFloat: map[v1.ResourceName]float64{ + v1.ResourceCPU: 2.0, + v1.ResourceMemory: 2 * 1024 * 1024 * 1024.0, + "example.com/gpu": 2.0, + }, + expectedErr: false, + }, + { + name: "valid aggregated json - all resources aggregated", + resourceRequest: &pluginapi.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 1, + string(v1.ResourceMemory): 2 * 1024 * 1024 * 1024, + }, + Annotations: map[string]string{ + consts.PodAnnotationAggregatedRequestsKey: func() string { + rl := v1.ResourceList{ + v1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(3*1024*1024*1024, resource.DecimalSI), + } + b, _ := json.Marshal(rl) + return string(b) 
+ }(), + }, + }, + expectedInt: map[v1.ResourceName]int{ + v1.ResourceCPU: 2, + v1.ResourceMemory: 3 * 1024 * 1024 * 1024, + }, + expectedFloat: map[v1.ResourceName]float64{ + v1.ResourceCPU: 2.0, + v1.ResourceMemory: 3 * 1024 * 1024 * 1024.0, + }, + expectedErr: false, + }, + { + name: "valid aggregated json - no resources aggregated (empty aggregated list)", + resourceRequest: &pluginapi.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(v1.ResourceCPU): 1, + string(v1.ResourceMemory): 1 * 1024 * 1024 * 1024, + }, + Annotations: map[string]string{ + consts.PodAnnotationAggregatedRequestsKey: func() string { + rl := v1.ResourceList{} // Empty aggregated list + b, _ := json.Marshal(rl) + return string(b) + }(), + }, + }, + expectedInt: map[v1.ResourceName]int{ + v1.ResourceCPU: 1, + v1.ResourceMemory: 1 * 1024 * 1024 * 1024, + }, + expectedFloat: map[v1.ResourceName]float64{ + v1.ResourceCPU: 1.0, + v1.ResourceMemory: 1 * 1024 * 1024 * 1024.0, + }, + expectedErr: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + // Note: this exercises GetPodAggregatedRequestResourceMap end-to-end: it falls + // back to GetQuantityMapFromResourceReq when the aggregated-requests annotation + // is absent or invalid, and otherwise computes aggregated quantities from the + // annotation's resource list for resources present in it.
+ gotInt, gotFloat, err := GetPodAggregatedRequestResourceMap(tt.resourceRequest) + + if (err != nil) != tt.expectedErr { + t.Errorf("GetPodAggregatedRequestResourceMap() error = %v, expectedErr %v", err, tt.expectedErr) + return + } + if !reflect.DeepEqual(gotInt, tt.expectedInt) { + t.Errorf("GetPodAggregatedRequestResourceMap() gotInt = %v, want %v", gotInt, tt.expectedInt) + } + if !reflect.DeepEqual(gotFloat, tt.expectedFloat) { + t.Errorf("GetPodAggregatedRequestResourceMap() gotFloat = %v, want %v", gotFloat, tt.expectedFloat) + } + }) + } +} diff --git a/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime.go b/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime.go index 9fb8b2027b..944eac5595 100644 --- a/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime.go +++ b/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime.go @@ -261,7 +261,8 @@ func (ra *RealtimeOvercommitmentAdvisor) syncAllocatableCPU(reserved resource.Qu } func (ra *RealtimeOvercommitmentAdvisor) syncAllocatableMemory(reserved resource.Quantity) { - capacity := resource.NewQuantity(int64(ra.metaServer.MemoryCapacity), resource.BinarySI) + // Use NormalMemoryCapacity which excludes static hugepages for accurate allocatable memory calculation + capacity := resource.NewQuantity(int64(ra.metaServer.NormalMemoryCapacity), resource.BinarySI) capacity.Sub(reserved) diff --git a/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime_test.go b/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime_test.go index 85edcef21f..7739e520d1 100644 --- a/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime_test.go +++ b/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime_test.go @@ -44,6 +44,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric" "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" "github.com/kubewharf/katalyst-core/pkg/metrics" + 
"github.com/kubewharf/katalyst-core/pkg/util/machine" metric2 "github.com/kubewharf/katalyst-core/pkg/util/metric" "github.com/kubewharf/katalyst-core/pkg/util/native" ) @@ -442,6 +443,9 @@ func generateTestMetaServer(t *testing.T, conf *config.Configuration, podList [] NumCores: 16, MemoryCapacity: 32 * 1024 * 1024 * 1024, } + meta.KatalystMachineInfo.MemoryTopology = &machine.MemoryTopology{ + NormalMemoryCapacity: 32 * 1024 * 1024 * 1024, + } return meta } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go index 7b6854407a..5208258a5d 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go @@ -194,8 +194,9 @@ func (ra *memoryResourceAdvisor) update() (*types.InternalMemoryCalculationResul for _, headroomPolicy := range ra.headroomPolices { // capacity and reserved can both be adjusted dynamically during running process headroomPolicy.SetEssentials(types.ResourceEssentials{ - EnableReclaim: ra.conf.GetDynamicConfiguration().EnableReclaim, - ResourceUpperBound: float64(ra.metaServer.MemoryCapacity), + EnableReclaim: ra.conf.GetDynamicConfiguration().EnableReclaim, + // Use NormalMemoryCapacity which excludes static hugepages for accurate upper bound calculation + ResourceUpperBound: float64(ra.metaServer.MemoryTopology.NormalMemoryCapacity), ReservedForAllocate: reservedForAllocate.AsApproximateFloat64(), }) diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go index 0463ac0697..8ee5a824b0 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go @@ -123,6 +123,7 @@ func newTestMemoryAdvisor(t *testing.T, pods []*v1.Pod, checkpointDir, stateFile require.NoError(t, err) memoryTopology, err 
:= machine.GenerateDummyMemoryTopology(4, 500<<30) require.NoError(t, err) + memoryTopology.NormalMemoryCapacity = 1000 << 30 extraTopology, err := machine.GenerateDummyExtraTopology(4) require.NoError(t, err) diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical.go index 5abd5a8a1e..043d2895e8 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical.go @@ -95,7 +95,8 @@ func (p *PolicyCanonical) estimateNonReclaimedQoSMemoryRequirement() (float64, e if ci.ContainerType == v1alpha1.ContainerType_MAIN { bindingNumas := machine.GetCPUAssignmentNUMAs(ci.TopologyAwareAssignments) for _, numaID := range bindingNumas.ToSliceInt() { - memoryCap, ok := p.metaServer.MemoryDetails[numaID] + // Use NormalMemoryDetails which excludes static hugepages for accurate per-NUMA capacity + memoryCap, ok := p.metaServer.NormalMemoryDetails[numaID] if !ok { errList = append(errList, fmt.Errorf("get memory capacity of numa %v failed", numaID)) return true diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/memory_provisioner.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/memory_provisioner.go index 49ae8e89a0..80496df43e 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/memory_provisioner.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/memory_provisioner.go @@ -100,8 +100,9 @@ func (m *memoryProvisioner) Reconcile(status *types.MemoryPressureStatus) (err e ReservedResourceForAllocate[v1.ResourceMemory] m.policy.SetEssentials( types.ResourceEssentials{ - EnableReclaim: m.conf.GetDynamicConfiguration().EnableReclaim, - ResourceUpperBound: float64(m.metaServer.MemoryCapacity), + EnableReclaim: 
m.conf.GetDynamicConfiguration().EnableReclaim, + // Use NormalMemoryCapacity which excludes static hugepages for accurate upper bound calculation + ResourceUpperBound: float64(m.metaServer.MemoryTopology.NormalMemoryCapacity), ReservedForAllocate: reservedForAllocate.AsApproximateFloat64(), }) err = m.policy.Update() diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/policy/policy_canonical.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/policy/policy_canonical.go index d38e0ad33a..fd9bac3c4e 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/policy/policy_canonical.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/policy/policy_canonical.go @@ -77,7 +77,8 @@ func (p *PolicyCanonical) Update() error { } memoryProvisions[numaID] += uint64(memFreeNuma.Value) - memoryTotals[numaID] += uint64(memTotalNuma.Value) + // static huge pages should be excluded from total memory + memoryTotals[numaID] += uint64(memTotalNuma.Value - float64(p.metaServer.StaticHugePagesDetails[numaID])) availNUMATotal += memTotalNuma.Value general.InfoS("numa memory free", "numaID", numaID, diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/policy/policy_canonical_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/policy/policy_canonical_test.go index ccda066168..0a323b6455 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/policy/policy_canonical_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/plugin/provisioner/policy/policy_canonical_test.go @@ -122,10 +122,11 @@ func TestPolicyCanonical(t *testing.T) { now := time.Now() type fields struct { - podList []*v1.Pod - containers []*types.ContainerInfo - essentials types.ResourceEssentials - setFakeMetric func(store *metric.FakeMetricsFetcher) + podList []*v1.Pod + containers []*types.ContainerInfo + 
essentials types.ResourceEssentials + setFakeMetric func(store *metric.FakeMetricsFetcher) + staticHugePages machine.MemoryDetails } tests := []struct { @@ -397,6 +398,34 @@ func TestPolicyCanonical(t *testing.T) { 1: 0, }, }, + { + name: "normal: with large static huge pages causing clamp", + fields: fields{ + podList: []*v1.Pod{}, + containers: []*types.ContainerInfo{}, + essentials: types.ResourceEssentials{ + EnableReclaim: true, + ResourceUpperBound: 100 << 30, + ReservedForAllocate: 0, + }, + setFakeMetric: func(store *metric.FakeMetricsFetcher) { + store.SetNodeMetric(pkgconsts.MetricMemScaleFactorSystem, utilmetric.MetricData{Value: 0, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemTotalNuma, utilmetric.MetricData{Value: 100 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemTotalNuma, utilmetric.MetricData{Value: 100 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 80 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 80 << 30, Time: &now}) + }, + staticHugePages: machine.MemoryDetails{ + 0: 60 << 30, + 1: 0, + }, + }, + wantErr: false, + want: machine.MemoryDetails{ + 0: 40 << 30, + 1: 80 << 30, + }, + }, } for _, tt := range tests { @@ -425,6 +454,7 @@ func TestPolicyCanonical(t *testing.T) { } metaServer := generateTestMetaServer(t, tt.fields.podList, metricsFetcher) + metaServer.StaticHugePagesDetails = tt.fields.staticHugePages p := NewPolicyCanonical(conf, nil, metaCache, metaServer, metrics.DummyMetrics{}) diff --git a/pkg/config/agent/qrm/memory_plugin.go b/pkg/config/agent/qrm/memory_plugin.go index 25d936fa4b..3812f98e32 100644 --- a/pkg/config/agent/qrm/memory_plugin.go +++ b/pkg/config/agent/qrm/memory_plugin.go @@ -50,6 +50,8 @@ type MemoryQRMPluginConfig struct { // NUMABindResultResourceAllocationAnnotationKey: the annotation key for numa bind result resource allocation // it will be used to set cgroup path for 
numa bind result resource allocation NUMABindResultResourceAllocationAnnotationKey string + // ExtraMemoryResources: the slice of extra memory resources such as hugepages-* + ExtraMemoryResources []string // SockMemQRMPluginConfig: the configuration for sockmem limitation in cgroup and host level SockMemQRMPluginConfig // LogCacheQRMPluginConfig: the configuration for logcache evicting diff --git a/pkg/util/machine/topology.go b/pkg/util/machine/topology.go index cfe568bb78..0ad52e58df 100644 --- a/pkg/util/machine/topology.go +++ b/pkg/util/machine/topology.go @@ -122,6 +122,14 @@ func (d MemoryDetails) FillNUMANodesWithZero(allNUMAs CPUSet) MemoryDetails { type MemoryTopology struct { MemoryDetails MemoryDetails PageSize int + // NormalMemoryCapacity is the total memory capacity in bytes, excluding static hugepages + NormalMemoryCapacity uint64 + // NormalMemoryDetails is the memory capacity details by NUMA node, excluding static hugepages + NormalMemoryDetails MemoryDetails + // StaticHugePagesCapacity is the total static hugepages capacity in bytes + StaticHugePagesCapacity uint64 + // StaticHugePagesDetails is the static hugepages capacity details by NUMA node + StaticHugePagesDetails MemoryDetails } // AlignToPageSize returns the page numbers from mem numbers. 
@@ -190,6 +198,16 @@ func GenerateDummyMachineInfo(numaNum int, memoryCapacityGB int) (*info.MachineI machineInfo.Topology = append(machineInfo.Topology, info.Node{ Id: i, Memory: uint64(perNumaCapacityQuantity.Value()), + HugePages: []info.HugePagesInfo{ + { + PageSize: 2 * 1024, // 2Mi + NumPages: 1024, + }, + { + PageSize: 1 * 1024 * 1024, // 1Gi + NumPages: 8, + }, + }, }) } @@ -249,9 +267,18 @@ func GenerateDummyCPUTopology(cpuNum, socketNum, numaNum int) (*CPUTopology, err } func GenerateDummyMemoryTopology(numaNum int, memoryCapacity uint64) (*MemoryTopology, error) { - memoryTopology := &MemoryTopology{map[int]uint64{}, 4096} + memoryTopology := &MemoryTopology{ + MemoryDetails: map[int]uint64{}, + PageSize: 4096, + NormalMemoryDetails: map[int]uint64{}, + NormalMemoryCapacity: memoryCapacity, + StaticHugePagesDetails: map[int]uint64{}, + StaticHugePagesCapacity: 0, + } for i := 0; i < numaNum; i++ { memoryTopology.MemoryDetails[i] = memoryCapacity / uint64(numaNum) + memoryTopology.NormalMemoryDetails[i] = memoryCapacity / uint64(numaNum) + memoryTopology.StaticHugePagesDetails[i] = 0 } return memoryTopology, nil } @@ -472,6 +499,44 @@ func (d CPUDetails) CPUsInCores(ids ...int) CPUSet { return b } +// DiscoverMemoryTopology returns MemoryTopology based on cadvisor node info +func DiscoverMemoryTopology(machineInfo *info.MachineInfo) (*MemoryTopology, error) { + if machineInfo == nil { + return nil, fmt.Errorf("machineInfo is nil") + } + + memoryTopology := MemoryTopology{ + MemoryDetails: map[int]uint64{}, + PageSize: unix.Getpagesize(), + NormalMemoryDetails: map[int]uint64{}, + StaticHugePagesDetails: map[int]uint64{}, + StaticHugePagesCapacity: 0, + } + + for _, node := range machineInfo.Topology { + memoryTopology.MemoryDetails[node.Id] = node.Memory + + staticHugePagesInBytes := uint64(0) + for _, page := range node.HugePages { + staticHugePagesInBytes += page.NumPages * page.PageSize * 1024 + } + + memoryTopology.StaticHugePagesDetails[node.Id] = 
staticHugePagesInBytes + memoryTopology.StaticHugePagesCapacity += staticHugePagesInBytes + + normalMemory := node.Memory + if normalMemory > staticHugePagesInBytes { + normalMemory -= staticHugePagesInBytes + } else { + normalMemory = 0 + } + memoryTopology.NormalMemoryDetails[node.Id] = normalMemory + memoryTopology.NormalMemoryCapacity += normalMemory + } + + return &memoryTopology, nil +} + // Discover returns CPUTopology based on cadvisor node info func Discover(machineInfo *info.MachineInfo) (*CPUTopology, *MemoryTopology, error) { if machineInfo.NumCores == 0 { @@ -482,14 +547,12 @@ func Discover(machineInfo *info.MachineInfo) (*CPUTopology, *MemoryTopology, err numaNodeIDToSocketID := make(map[int]int, len(machineInfo.Topology)) numPhysicalCores := 0 - memoryTopology := MemoryTopology{ - MemoryDetails: map[int]uint64{}, - PageSize: unix.Getpagesize(), + memoryTopology, err := DiscoverMemoryTopology(machineInfo) + if err != nil { + return nil, nil, err } for _, node := range machineInfo.Topology { - memoryTopology.MemoryDetails[node.Id] = node.Memory - numPhysicalCores += len(node.Cores) for _, core := range node.Cores { l3CacheID := getUniqueL3CacheID(core) @@ -532,7 +595,7 @@ func Discover(machineInfo *info.MachineInfo) (*CPUTopology, *MemoryTopology, err NUMAToCPUs: numaToCPUs, CPUDetails: cpuDetails, CPUInfo: cpuInfo, - }, &memoryTopology, nil + }, memoryTopology, nil } // getUniqueL3CacheID returns the unique L3 cache ID for the given core. 
diff --git a/pkg/util/machine/topology_test.go b/pkg/util/machine/topology_test.go index 41da5ef468..c47b4f2632 100644 --- a/pkg/util/machine/topology_test.go +++ b/pkg/util/machine/topology_test.go @@ -21,6 +21,7 @@ import ( "sync" "testing" + info "github.com/google/cadvisor/info/v1" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/util/sets" @@ -76,6 +77,128 @@ func TestMemoryDetailsEqual(t *testing.T) { } } +func TestDiscoverMemoryTopology(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + machineInfo *info.MachineInfo + wantMemoryTopology *MemoryTopology + wantErr bool + }{ + { + name: "Nil MachineInfo", + machineInfo: nil, + wantErr: true, + }, + { + name: "Single NUMA Node, No HugePages", + machineInfo: &info.MachineInfo{ + Topology: []info.Node{ + { + Id: 0, + Memory: 1024 * 1024 * 1024, // 1GB + }, + }, + }, + wantMemoryTopology: &MemoryTopology{ + MemoryDetails: map[int]uint64{0: 1024 * 1024 * 1024}, + NormalMemoryDetails: map[int]uint64{0: 1024 * 1024 * 1024}, + NormalMemoryCapacity: 1024 * 1024 * 1024, + StaticHugePagesDetails: map[int]uint64{0: 0}, + StaticHugePagesCapacity: 0, + }, + }, + { + name: "Single NUMA Node, With HugePages", + machineInfo: &info.MachineInfo{ + Topology: []info.Node{ + { + Id: 0, + Memory: 2 * 1024 * 1024 * 1024, // 2GB + HugePages: []info.HugePagesInfo{ + { + PageSize: 1024 * 1024, // 1GB + NumPages: 1, + }, + }, + }, + }, + }, + wantMemoryTopology: &MemoryTopology{ + MemoryDetails: map[int]uint64{0: 2 * 1024 * 1024 * 1024}, + NormalMemoryDetails: map[int]uint64{0: 1 * 1024 * 1024 * 1024}, + NormalMemoryCapacity: 1 * 1024 * 1024 * 1024, + StaticHugePagesDetails: map[int]uint64{0: 1 * 1024 * 1024 * 1024}, + StaticHugePagesCapacity: 1 * 1024 * 1024 * 1024, + }, + }, + { + name: "Multiple NUMA Nodes, Mixed HugePages", + machineInfo: &info.MachineInfo{ + Topology: []info.Node{ + { + Id: 0, + Memory: 4 * 1024 * 1024 * 1024, // 4GB + HugePages: []info.HugePagesInfo{ + { + PageSize: 1 * 1024 * 1024, 
// 1GB + NumPages: 2, + }, + }, + }, + { + Id: 1, + Memory: 4 * 1024 * 1024 * 1024, // 4GB + HugePages: []info.HugePagesInfo{ + { + PageSize: 2 * 1024, // 2MB + NumPages: 512, // 1GB + }, + }, + }, + }, + }, + wantMemoryTopology: &MemoryTopology{ + MemoryDetails: map[int]uint64{ + 0: 4 * 1024 * 1024 * 1024, + 1: 4 * 1024 * 1024 * 1024, + }, + NormalMemoryDetails: map[int]uint64{ + 0: 2 * 1024 * 1024 * 1024, + 1: 3 * 1024 * 1024 * 1024, + }, + NormalMemoryCapacity: 5 * 1024 * 1024 * 1024, + StaticHugePagesDetails: map[int]uint64{ + 0: 2 * 1024 * 1024 * 1024, + 1: 1 * 1024 * 1024 * 1024, + }, + StaticHugePagesCapacity: 3 * 1024 * 1024 * 1024, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + got, err := DiscoverMemoryTopology(tt.machineInfo) + if (err != nil) != tt.wantErr { + t.Errorf("DiscoverMemoryTopology() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr { + // Ignore PageSize for comparison as it depends on system + got.PageSize = 0 + tt.wantMemoryTopology.PageSize = 0 + + assert.Equal(t, tt.wantMemoryTopology, got) + } + }) + } +} + func TestMemoryDetailsClone(t *testing.T) { t.Parallel() diff --git a/pkg/util/qos/mem_enhancement.go b/pkg/util/qos/mem_enhancement.go index 434e660ce6..793f0424c0 100644 --- a/pkg/util/qos/mem_enhancement.go +++ b/pkg/util/qos/mem_enhancement.go @@ -66,6 +66,11 @@ func AnnotationsIndicateNUMAExclusive(annotations map[string]string) bool { apiconsts.PodAnnotationMemoryEnhancementNumaExclusiveEnable } +func AnnotationsIndicateSharedCores(annotations map[string]string) bool { + return annotations[apiconsts.PodAnnotationQoSLevelKey] == + apiconsts.PodAnnotationQoSLevelSharedCores +} + // GetRSSOverUseEvictThreshold parse the user specified threshold and checks if it's valid func GetRSSOverUseEvictThreshold(qosConf *generic.QoSConfiguration, pod *v1.Pod) (threshold *float64, invalid bool) { memoryEnhancement := ParseMemoryEnhancement(qosConf, pod)