Skip to content

Commit 920e970

Browse files
authored
Merge pull request #555 from caohe/validate-pod-resources-0.3
cherry pick of #541: feat(reporter): validate specific resources in pod resources response
2 parents f6152b8 + 41dbbfd commit 920e970

File tree

6 files changed

+38
-13
lines changed

6 files changed

+38
-13
lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,9 @@ jobs:
8686
fi
8787
- name: Upload coverage to Codecov
8888
if: ${{ (env.NEED_TO_CHECK == 'true') || (github.event_name != 'pull_request') }}
89-
uses: codecov/codecov-action@v3
89+
uses: codecov/codecov-action@v4
90+
env:
91+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
9092
with:
9193
flags: unittest
9294
file: coverage.txt

cmd/katalyst-agent/app/options/reporter/kubelet_plugin.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package reporter
1818

1919
import (
20+
v1 "k8s.io/api/core/v1"
2021
cliflag "k8s.io/component-base/cli/flag"
2122
pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
2223

@@ -28,6 +29,7 @@ type KubeletPluginOptions struct {
2829
KubeletResourcePluginPaths []string
2930
EnableReportTopologyPolicy bool
3031
ResourceNameToZoneTypeMap map[string]string
32+
NeedValidationResources []string
3133
}
3234

3335
func NewKubeletPluginOptions() *KubeletPluginOptions {
@@ -40,6 +42,10 @@ func NewKubeletPluginOptions() *KubeletPluginOptions {
4042
},
4143
EnableReportTopologyPolicy: false,
4244
ResourceNameToZoneTypeMap: make(map[string]string),
45+
NeedValidationResources: []string{
46+
string(v1.ResourceCPU),
47+
string(v1.ResourceMemory),
48+
},
4349
}
4450
}
4551

@@ -54,13 +60,16 @@ func (o *KubeletPluginOptions) AddFlags(fss *cliflag.NamedFlagSets) {
5460
"whether to report topology policy")
5561
fs.StringToStringVar(&o.ResourceNameToZoneTypeMap, "resource-name-to-zone-type-map", o.ResourceNameToZoneTypeMap,
5662
"a map that stores the mapping relationship between resource names to zone types in KCNR (e.g. nvidia.com/gpu=GPU,...)")
63+
fs.StringSliceVar(&o.NeedValidationResources, "need-validation-resources", o.NeedValidationResources,
64+
"resources need to be validated")
5765
}
5866

5967
func (o *KubeletPluginOptions) ApplyTo(c *reporter.KubeletPluginConfiguration) error {
6068
c.PodResourcesServerEndpoints = o.PodResourcesServerEndpoints
6169
c.KubeletResourcePluginPaths = o.KubeletResourcePluginPaths
6270
c.EnableReportTopologyPolicy = o.EnableReportTopologyPolicy
6371
c.ResourceNameToZoneTypeMap = o.ResourceNameToZoneTypeMap
72+
c.NeedValidationResources = o.NeedValidationResources
6473

6574
return nil
6675
}

pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ func NewKubeletReporterPlugin(emitter metrics.MetricEmitter, metaServer *metaser
9090

9191
topologyStatusAdapter, err := topology.NewPodResourcesServerTopologyAdapter(metaServer,
9292
conf.PodResourcesServerEndpoints, conf.KubeletResourcePluginPaths, conf.ResourceNameToZoneTypeMap,
93-
nil, p.getNumaInfo, nil, podresources.GetV1Client)
93+
nil, p.getNumaInfo, nil, podresources.GetV1Client, conf.NeedValidationResources)
9494
if err != nil {
9595
return nil, err
9696
}

pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,16 @@ type topologyAdapterImpl struct {
8686

8787
// resourceNameToZoneTypeMap is a map that stores the mapping relationship between resource names to zone types for device zones
8888
resourceNameToZoneTypeMap map[string]string
89+
90+
// needValidationResources is the list of resource names that need to be validated
91+
needValidationResources []string
8992
}
9093

9194
// NewPodResourcesServerTopologyAdapter creates a topology adapter which uses pod resources server
9295
func NewPodResourcesServerTopologyAdapter(metaServer *metaserver.MetaServer, endpoints []string,
9396
kubeletResourcePluginPaths []string, resourceNameToZoneTypeMap map[string]string, skipDeviceNames sets.String,
94-
numaInfoGetter NumaInfoGetter, podResourcesFilter PodResourcesFilter, getClientFunc podresources.GetClientFunc) (Adapter, error) {
97+
numaInfoGetter NumaInfoGetter, podResourcesFilter PodResourcesFilter, getClientFunc podresources.GetClientFunc,
98+
needValidationResources []string) (Adapter, error) {
9599
numaInfo, err := numaInfoGetter()
96100
if err != nil {
97101
return nil, fmt.Errorf("failed to get numa info: %s", err)
@@ -116,6 +120,7 @@ func NewPodResourcesServerTopologyAdapter(metaServer *metaserver.MetaServer, end
116120
getClientFunc: getClientFunc,
117121
podResourcesFilter: podResourcesFilter,
118122
resourceNameToZoneTypeMap: resourceNameToZoneTypeMap,
123+
needValidationResources: needValidationResources,
119124
}, nil
120125
}
121126

@@ -149,7 +154,7 @@ func (p *topologyAdapterImpl) GetTopologyZones(parentCtx context.Context) ([]*no
149154
}
150155

151156
// validate the pod resources server response to make sure the reported topology status is correct
152-
if err = validatePodResourcesServerResponse(allocatableResources, listPodResourcesResponse); err != nil {
157+
if err = p.validatePodResourcesServerResponse(allocatableResources, listPodResourcesResponse); err != nil {
153158
return nil, errors.Wrap(err, "validate pod Resources server response failed")
154159
}
155160

@@ -266,14 +271,21 @@ func (p *topologyAdapterImpl) Run(ctx context.Context, handler func()) error {
266271

267272
// validatePodResourcesServerResponse validates the pod resources server response; if the resource is empty,
268273
// the kubelet or qrm plugin may be restarting
269-
func validatePodResourcesServerResponse(allocatableResourcesResponse *podresv1.AllocatableResourcesResponse,
270-
listPodResourcesResponse *podresv1.ListPodResourcesResponse) error {
271-
if allocatableResourcesResponse == nil {
272-
return fmt.Errorf("allocatable Resources response is nil")
273-
}
274+
func (p *topologyAdapterImpl) validatePodResourcesServerResponse(allocatableResourcesResponse *podresv1.
275+
AllocatableResourcesResponse, listPodResourcesResponse *podresv1.ListPodResourcesResponse) error {
276+
if len(p.needValidationResources) > 0 {
277+
if allocatableResourcesResponse == nil {
278+
return fmt.Errorf("allocatable resources response is nil")
279+
}
274280

275-
if len(allocatableResourcesResponse.Resources) == 0 {
276-
return fmt.Errorf("allocatable topology aware Resources is empty")
281+
allocResSet := sets.NewString()
282+
for _, res := range allocatableResourcesResponse.Resources {
283+
allocResSet.Insert(res.ResourceName)
284+
}
285+
286+
if !allocResSet.HasAll(p.needValidationResources...) {
287+
return fmt.Errorf("allocatable resources response doen't contain all the resources that need to be validated")
288+
}
277289
}
278290

279291
if listPodResourcesResponse == nil {

pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1663,7 +1663,8 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones(t *testing.T) {
16631663
PodFetcher: &pod.PodFetcherStub{PodList: tt.fields.podList},
16641664
},
16651665
},
1666-
numaSocketZoneNodeMap: tt.fields.numaSocketZoneNodeMap,
1666+
numaSocketZoneNodeMap: tt.fields.numaSocketZoneNodeMap,
1667+
needValidationResources: []string{"cpu", "memory"},
16671668
}
16681669
got, err := p.GetTopologyZones(context.TODO())
16691670
if (err != nil) != tt.wantErr {
@@ -1735,7 +1736,7 @@ func Test_podResourcesServerTopologyAdapterImpl_Run(t *testing.T) {
17351736
notifier := make(chan struct{}, 1)
17361737
p, _ := NewPodResourcesServerTopologyAdapter(testMetaServer,
17371738
endpoints, kubeletResourcePluginPath, nil,
1738-
nil, getNumaInfo, nil, podresources.GetV1Client)
1739+
nil, getNumaInfo, nil, podresources.GetV1Client, []string{"cpu", "memory"})
17391740
err = p.Run(ctx, func() {})
17401741
assert.NoError(t, err)
17411742

pkg/config/agent/reporter/kubelet_plugin.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ type KubeletPluginConfiguration struct {
2121
KubeletResourcePluginPaths []string
2222
EnableReportTopologyPolicy bool
2323
ResourceNameToZoneTypeMap map[string]string
24+
NeedValidationResources []string
2425
}
2526

2627
func NewKubeletPluginConfiguration() *KubeletPluginConfiguration {

0 commit comments

Comments
 (0)