Skip to content

Commit 37d6204

Browse files
feat(agent): eviction enhancement
1 parent 4646406 commit 37d6204

File tree

23 files changed

+1208
-45
lines changed

23 files changed

+1208
-45
lines changed

cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
// ReclaimedResourcesEvictionOptions holds the options for reclaimed-resources
// eviction. ApplyTo copies each field into
// eviction.ReclaimedResourcesEvictionConfiguration.
type ReclaimedResourcesEvictionOptions struct {
2828
// EvictionThreshold is the per-resource eviction threshold; the constructor
// defaults it to 5.0 for both reclaimed milli-CPU and reclaimed memory.
EvictionThreshold native.ResourceThreshold
29+
// SoftEvictionThreshold is the per-resource soft-eviction threshold; the
// constructor defaults it to 1.5 for reclaimed milli-CPU and 1.2 for
// reclaimed memory.
SoftEvictionThreshold native.ResourceThreshold
2930
// GracePeriod is applied to the configuration as DeletionGracePeriod;
// defaults to 60 (presumably seconds -- TODO confirm).
GracePeriod int64
3031
// ThresholdMetToleranceDuration defaults to 0; the unit is not visible
// here -- TODO confirm.
ThresholdMetToleranceDuration int64
3132
}
@@ -36,6 +37,10 @@ func NewReclaimedResourcesEvictionOptions() *ReclaimedResourcesEvictionOptions {
3637
consts.ReclaimedResourceMilliCPU: 5.0,
3738
consts.ReclaimedResourceMemory: 5.0,
3839
},
40+
SoftEvictionThreshold: native.ResourceThreshold{
41+
consts.ReclaimedResourceMilliCPU: 1.5,
42+
consts.ReclaimedResourceMemory: 1.2,
43+
},
3944
GracePeriod: 60,
4045
ThresholdMetToleranceDuration: 0,
4146
}
@@ -54,6 +59,7 @@ func (o *ReclaimedResourcesEvictionOptions) AddFlags(fss *cliflag.NamedFlagSets)
5459

5560
func (o *ReclaimedResourcesEvictionOptions) ApplyTo(c *eviction.ReclaimedResourcesEvictionConfiguration) error {
5661
c.EvictionThreshold = o.EvictionThreshold
62+
c.SoftEvictionThreshold = o.SoftEvictionThreshold
5763
c.DeletionGracePeriod = o.GracePeriod
5864
c.ThresholdMetToleranceDuration = o.ThresholdMetToleranceDuration
5965
return nil

cmd/katalyst-agent/app/options/eviction/eviction_base.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ type GenericEvictionOptions struct {
5858

5959
// RecordManager specifies the eviction record manager to use
6060
RecordManager string
61+
62+
// HostPathNotifierRootPath is the root path for the host-path notifier
63+
HostPathNotifierRootPath string
6164
}
6265

6366
// NewGenericEvictionOptions creates a new Options with a default config.
@@ -69,6 +72,7 @@ func NewGenericEvictionOptions() *GenericEvictionOptions {
6972
EvictionSkippedAnnotationKeys: []string{},
7073
EvictionSkippedLabelKeys: []string{},
7174
EvictionBurst: 3,
75+
HostPathNotifierRootPath: "/opt/katalyst",
7276
PodKiller: consts.KillerNameEvictionKiller,
7377
StrictAuthentication: false,
7478
}
@@ -111,6 +115,9 @@ func (o *GenericEvictionOptions) AddFlags(fss *cliflag.NamedFlagSets) {
111115

112116
fs.StringVar(&o.RecordManager, "eviction-record-manager", o.RecordManager,
113117
"the eviction record manager to use")
118+
119+
fs.StringVar(&o.HostPathNotifierRootPath, "pod-notifier-root-path", o.HostPathNotifierRootPath,
120+
"root path of host-path notifier")
114121
}
115122

116123
// ApplyTo fills up config with options
@@ -126,6 +133,7 @@ func (o *GenericEvictionOptions) ApplyTo(c *evictionconfig.GenericEvictionConfig
126133
c.StrictAuthentication = o.StrictAuthentication
127134
c.PodMetricLabels.Insert(o.PodMetricLabels...)
128135
c.RecordManager = o.RecordManager
136+
c.HostPathNotifierRootPath = o.HostPathNotifierRootPath
129137
return nil
130138
}
131139

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,8 @@ require (
175175
)
176176

177177
replace (
178+
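// FIXME: katalyst-api is temporarily replaced by a personal fork; point back to the upstream module before release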
179+
github.com/kubewharf/katalyst-api => github.com/funnydreamwinz/katalyst-api v0.0.0-20251127095537-2554d5fa74a4
178180
k8s.io/api => k8s.io/api v0.24.6
179181
k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6
180182
k8s.io/apimachinery => k8s.io/apimachinery v0.24.6

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,8 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo
280280
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
281281
github.com/fsnotify/fsnotify v1.5.4 h1:jRbGcIw6P2Meqdwuo0H1p6JVLbL5DHKAKlYndzMwVZI=
282282
github.com/fsnotify/fsnotify v1.5.4/go.mod h1:OVB6XrOHzAwXMpEM7uPOzcehqUV2UqJxmVXmkdnm1bU=
283+
github.com/funnydreamwinz/katalyst-api v0.0.0-20251127095537-2554d5fa74a4 h1:XGH5mC652fXdmxcIWPDUTtPSRoRAmAM2+ZVRiztX0Ks=
284+
github.com/funnydreamwinz/katalyst-api v0.0.0-20251127095537-2554d5fa74a4/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k=
283285
github.com/fvbommel/sortorder v1.0.1/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0=
284286
github.com/getkin/kin-openapi v0.76.0/go.mod h1:660oXbgy5JFMKreazJaQTw7o+X00qeSyhcnluiMv+Xg=
285287
github.com/getsentry/raven-go v0.2.0/go.mod h1:KungGk8q33+aIAZUIVWZDr2OfAEBsO49PX4NzFV5kcQ=
@@ -574,8 +576,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
574576
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
575577
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
576578
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
577-
github.com/kubewharf/katalyst-api v0.5.7-0.20251112070927-442dee3bec62 h1:D3FZBfIIiZV7e7aQBmTiMPTrc+mlYx+Lreq8wcpTwK8=
578-
github.com/kubewharf/katalyst-api v0.5.7-0.20251112070927-442dee3bec62/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k=
579579
github.com/kubewharf/kubelet v1.24.6-kubewharf.9 h1:jOTYZt7h/J7I8xQMKMUcJjKf5UFBv37jHWvNp5VRFGc=
580580
github.com/kubewharf/kubelet v1.24.6-kubewharf.9/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c=
581581
github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8=

pkg/agent/evictionmanager/eviction_resp_collector.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,42 @@ func (e *evictionRespCollector) collectMetThreshold(dryRunPlugins []string, plug
169169
}
170170
}
171171

172+
func (e *evictionRespCollector) collectTopSoftEvictionPods(dryRunPlugins []string, pluginName string,
173+
threshold *pluginapi.ThresholdMetResponse, resp *pluginapi.GetTopEvictionPodsResponse,
174+
) {
175+
dryRun := e.isDryRun(dryRunPlugins, pluginName)
176+
177+
targetPods := make([]*v1.Pod, 0, len(resp.TargetPods))
178+
for i, pod := range resp.TargetPods {
179+
if pod == nil {
180+
continue
181+
}
182+
183+
general.Infof("%v plugin %v request to notify topN pod %v/%v, reason: met threshold in scope [%v]",
184+
e.getLogPrefix(dryRun), pluginName, pod.Namespace, pod.Name, threshold.EvictionScope)
185+
if dryRun {
186+
metricsPodToEvict(e.emitter, e.conf.GenericConfiguration.QoSConfiguration, pluginName, pod, dryRun, e.conf.GenericEvictionConfiguration.PodMetricLabels)
187+
} else {
188+
targetPods = append(targetPods, resp.TargetPods[i])
189+
}
190+
}
191+
192+
for _, pod := range targetPods {
193+
reason := fmt.Sprintf("plugin %s met threshold in scope %s, target %v, observed %v",
194+
pluginName, threshold.EvictionScope, threshold.ThresholdValue, threshold.ObservedValue)
195+
196+
e.getSoftEvictPods()[string(pod.UID)] = &rule.RuledEvictPod{
197+
EvictPod: &pluginapi.EvictPod{
198+
Pod: pod.DeepCopy(),
199+
Reason: reason,
200+
ForceEvict: false,
201+
EvictionPluginName: pluginName,
202+
},
203+
Scope: threshold.EvictionScope,
204+
}
205+
}
206+
}
207+
172208
func (e *evictionRespCollector) collectTopEvictionPods(dryRunPlugins []string, pluginName string,
173209
threshold *pluginapi.ThresholdMetResponse, resp *pluginapi.GetTopEvictionPodsResponse,
174210
) {

pkg/agent/evictionmanager/manager.go

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
clocks "k8s.io/utils/clock"
3838

3939
"github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
40+
apiconsts "github.com/kubewharf/katalyst-api/pkg/consts"
4041
"github.com/kubewharf/katalyst-api/pkg/plugins/registration"
4142
pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1"
4243
endpointpkg "github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/endpoint"
@@ -46,6 +47,7 @@ import (
4647
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin/resource"
4748
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin/rootfs"
4849
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/podkiller"
50+
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/podnotifier"
4951
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/record"
5052
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/rule"
5153
"github.com/kubewharf/katalyst-core/pkg/client"
@@ -109,7 +111,8 @@ type EvictionManger struct {
109111
// easy to test the code.
110112
clock clocks.WithTickerAndDelayedExecution
111113

112-
podKiller podkiller.PodKiller
114+
podNotifier podnotifier.PodNotifier
115+
podKiller podkiller.PodKiller
113116

114117
killQueue rule.EvictionQueue
115118
killStrategy rule.EvictionStrategy
@@ -236,6 +239,12 @@ func NewEvictionManager(genericClient *client.GenericClientSet, recorder events.
236239

237240
podKiller := podkiller.NewAsynchronizedPodKiller(killer, metaServer.PodFetcher, genericClient.KubeClient)
238241

242+
notifier, err := podnotifier.NewHostPathPodNotifier(conf, genericClient.KubeClient, metaServer, recorder, emitter)
243+
if err != nil {
244+
return nil, fmt.Errorf("failed to create pod notifier: %v", err)
245+
}
246+
podNotifier := podnotifier.NewSynchronizedPodNotifier(notifier)
247+
239248
cnrTaintReporter, err := control.NewGenericReporterPlugin(cnrTaintReporterPluginName, conf, emitter)
240249
if err != nil {
241250
return nil, fmt.Errorf("failed to initialize cnr taint reporter plugin: %v", err)
@@ -264,6 +273,7 @@ func NewEvictionManager(genericClient *client.GenericClientSet, recorder events.
264273
metaGetter: metaServer,
265274
emitter: emitter,
266275
podKiller: podKiller,
276+
podNotifier: podNotifier,
267277
cnrTaintReporter: cnrTaintReporter,
268278
endpoints: make(map[string]endpointpkg.Endpoint),
269279
conf: conf,
@@ -317,6 +327,7 @@ func (m *EvictionManger) Run(ctx context.Context) {
317327
general.RegisterHeartbeatCheck(reportTaintHealthCheckName, reportTaintToleration,
318328
general.HealthzCheckStateNotReady, reportTaintToleration)
319329
m.podKiller.Start(ctx)
330+
m.podNotifier.Start(ctx)
320331
for _, endpoint := range m.endpoints {
321332
endpoint.Start()
322333
}
@@ -360,6 +371,11 @@ func (m *EvictionManger) sync(ctx context.Context) {
360371
}
361372

362373
errList := make([]error, 0)
374+
notifyErr := m.doNotify(collector.getSoftEvictPods())
375+
if notifyErr != nil {
376+
errList = append(errList, notifyErr)
377+
}
378+
363379
evictErr := m.doEvict(collector.getSoftEvictPods(), collector.getForceEvictPods())
364380
if evictErr != nil {
365381
errList = append(errList, evictErr)
@@ -430,8 +446,8 @@ func (m *EvictionManger) collectEvictionResult(ctx context.Context, pods []*v1.P
430446
records := m.getEvictionRecords(ctx, collector.currentCandidatePods)
431447

432448
for pluginName, threshold := range thresholdsMet {
433-
if threshold.MetType != pluginapi.ThresholdMetType_HARD_MET {
434-
general.Infof(" the type: %s of met threshold from plugin: %s isn't %s", threshold.MetType.String(), pluginName, pluginapi.ThresholdMetType_HARD_MET.String())
449+
if threshold.MetType == pluginapi.ThresholdMetType_NOT_MET {
450+
general.Infof("resp from plugin: %s not met threshold", pluginName)
435451
continue
436452
}
437453

@@ -454,13 +470,19 @@ func (m *EvictionManger) collectEvictionResult(ctx context.Context, pods []*v1.P
454470
}
455471
}
456472

473+
topN := uint64(0)
474+
forceEvict := false
475+
if threshold.MetType == pluginapi.ThresholdMetType_HARD_MET {
476+
topN = 1
477+
forceEvict = true
478+
}
479+
457480
resp, err := m.endpoints[pluginName].GetTopEvictionPods(context.Background(), &pluginapi.GetTopEvictionPodsRequest{
458481
ActivePods: activePods,
459-
TopN: 1,
482+
TopN: topN,
460483
EvictionScope: threshold.EvictionScope,
461484
CandidateEvictionRecords: candidateEvictionRecords,
462485
})
463-
464486
m.endpointLock.RUnlock()
465487
if err != nil {
466488
general.Errorf(" calling GetTopEvictionPods of plugin: %s failed with error: %v", pluginName, err)
@@ -474,12 +496,38 @@ func (m *EvictionManger) collectEvictionResult(ctx context.Context, pods []*v1.P
474496
continue
475497
}
476498

477-
collector.collectTopEvictionPods(dynamicConfig.DryRun, pluginName, threshold, resp)
499+
if forceEvict {
500+
collector.collectTopEvictionPods(dynamicConfig.DryRun, pluginName, threshold, resp)
501+
} else {
502+
collector.collectTopSoftEvictionPods(dynamicConfig.DryRun, pluginName, threshold, resp)
503+
}
504+
478505
}
479506

480507
return collector, errors.NewAggregate(errList)
481508
}
482509

510+
func (m *EvictionManger) doNotify(softEvictPods map[string]*rule.RuledEvictPod) error {
511+
errList := make([]error, 0)
512+
513+
for _, pod := range softEvictPods {
514+
if pod == nil || pod.EvictPod.Pod == nil {
515+
continue
516+
}
517+
518+
if _, ok := pod.EvictPod.Pod.Annotations[apiconsts.PodAnnotationSoftEvictNotificationKey]; !ok {
519+
continue
520+
}
521+
522+
err := m.podNotifier.NotifyPod(pod)
523+
if err != nil {
524+
errList = append(errList, err)
525+
}
526+
}
527+
528+
return errors.NewAggregate(errList)
529+
}
530+
483531
func (m *EvictionManger) doEvict(softEvictPods, forceEvictPods map[string]*rule.RuledEvictPod) error {
484532
softEvictPods = filterOutCandidatePodsWithForcePods(softEvictPods, forceEvictPods)
485533
bestSuitedCandidate := m.getEvictPodFromCandidates(softEvictPods)
@@ -638,6 +686,10 @@ func (m *EvictionManger) getEvictPodFromCandidates(candidateEvictPods map[string
638686
for _, rp := range candidateEvictPods {
639687
// only killing pods that pass candidate validation
640688
if rp != nil && rp.Pod != nil && m.killStrategy.CandidateValidate(rp) {
689+
// do NOT select soft-evict pods that have notification enabled as eviction candidates
690+
if _, ok := rp.Pod.Annotations[apiconsts.PodAnnotationSoftEvictNotificationKey]; ok {
691+
continue
692+
}
641693
rpList = append(rpList, rp)
642694
}
643695
}

pkg/agent/evictionmanager/manager_test.go

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ func makeConf() *config.Configuration {
108108
conf.PodKiller = consts.KillerNameEvictionKiller
109109
conf.GenericConfiguration.AuthConfiguration.AuthType = credential.AuthTypeInsecure
110110
conf.GenericConfiguration.AuthConfiguration.AccessControlType = authorization.AccessControlTypeInsecure
111+
conf.HostPathNotifierRootPath = "/opt/katalyst"
111112

112113
return conf
113114
}
@@ -232,12 +233,46 @@ func (p plugin2) GetEvictPods(_ context.Context, _ *pluginapi.GetEvictPodsReques
232233
return &pluginapi.GetEvictPodsResponse{EvictPods: []*pluginapi.EvictPod{}}, nil
233234
}
234235

236+
type plugin3 struct {
237+
pluginSkeleton
238+
}
239+
240+
func (p plugin3) ThresholdMet(_ context.Context, _ *pluginapi.GetThresholdMetRequest) (*pluginapi.ThresholdMetResponse, error) {
241+
return &pluginapi.ThresholdMetResponse{
242+
MetType: pluginapi.ThresholdMetType_SOFT_MET,
243+
ThresholdValue: 0.8,
244+
ObservedValue: 0.9,
245+
ThresholdOperator: pluginapi.ThresholdOperator_GREATER_THAN,
246+
EvictionScope: "plugin3_scope",
247+
GracePeriodSeconds: -1,
248+
}, nil
249+
}
250+
251+
func (p plugin3) GetTopEvictionPods(_ context.Context, _ *pluginapi.GetTopEvictionPodsRequest) (*pluginapi.GetTopEvictionPodsResponse, error) {
252+
return &pluginapi.GetTopEvictionPodsResponse{TargetPods: []*v1.Pod{
253+
{
254+
ObjectMeta: metav1.ObjectMeta{
255+
Name: "pod-3",
256+
UID: "pod-3",
257+
},
258+
Status: v1.PodStatus{
259+
Phase: v1.PodRunning,
260+
},
261+
},
262+
}}, nil
263+
}
264+
265+
func (p plugin3) GetEvictPods(_ context.Context, _ *pluginapi.GetEvictPodsRequest) (*pluginapi.GetEvictPodsResponse, error) {
266+
return &pluginapi.GetEvictPodsResponse{EvictPods: []*pluginapi.EvictPod{}}, nil
267+
}
268+
235269
func makeEvictionManager(t *testing.T) *EvictionManger {
236270
mgr, err := NewEvictionManager(&client.GenericClientSet{}, nil, makeMetaServer(), metrics.DummyMetrics{}, makeConf())
237271
assert.NoError(t, err)
238272
mgr.endpoints = map[string]endpointpkg.Endpoint{
239273
"plugin1": &plugin1{},
240274
"plugin2": &plugin2{},
275+
"plugin3": &plugin3{},
241276
}
242277

243278
return mgr
@@ -258,6 +293,7 @@ func TestEvictionManger_collectEvictionResult(t *testing.T) {
258293
dryrun: []string{},
259294
wantSoftEvictPods: sets.String{
260295
"pod-1": sets.Empty{},
296+
"pod-3": sets.Empty{},
261297
"pod-5": sets.Empty{},
262298
},
263299
wantForceEvictPods: sets.String{
@@ -269,9 +305,11 @@ func TestEvictionManger_collectEvictionResult(t *testing.T) {
269305
},
270306
},
271307
{
272-
name: "dryrun plugin1",
273-
dryrun: []string{"plugin1"},
274-
wantSoftEvictPods: sets.String{},
308+
name: "dryrun plugin1",
309+
dryrun: []string{"plugin1"},
310+
wantSoftEvictPods: sets.String{
311+
"pod-3": sets.Empty{},
312+
},
275313
wantForceEvictPods: sets.String{
276314
"pod-3": sets.Empty{},
277315
},
@@ -284,6 +322,7 @@ func TestEvictionManger_collectEvictionResult(t *testing.T) {
284322
dryrun: []string{"plugin2"},
285323
wantSoftEvictPods: sets.String{
286324
"pod-1": sets.Empty{},
325+
"pod-3": sets.Empty{},
287326
"pod-5": sets.Empty{},
288327
},
289328
wantForceEvictPods: sets.String{
@@ -292,9 +331,11 @@ func TestEvictionManger_collectEvictionResult(t *testing.T) {
292331
wantConditions: sets.String{},
293332
},
294333
{
295-
name: "dryrun plugin1 & plugin2",
296-
dryrun: []string{"plugin1", "plugin2"},
297-
wantSoftEvictPods: sets.String{},
334+
name: "dryrun plugin1 & plugin2",
335+
dryrun: []string{"plugin1", "plugin2"},
336+
wantSoftEvictPods: sets.String{
337+
"pod-3": sets.Empty{},
338+
},
298339
wantForceEvictPods: sets.String{},
299340
wantConditions: sets.String{},
300341
},

0 commit comments

Comments
 (0)