Skip to content

Commit 40b73fc

Browse files
feat(agent): eviction enhancement
1 parent a1e195e commit 40b73fc

File tree

15 files changed

+611
-18
lines changed

15 files changed

+611
-18
lines changed

cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
type ReclaimedResourcesEvictionOptions struct {
2828
EvictionThreshold native.ResourceThreshold
29+
SoftEvictionThreshold native.ResourceThreshold
2930
GracePeriod int64
3031
ThresholdMetToleranceDuration int64
3132
}
@@ -36,6 +37,10 @@ func NewReclaimedResourcesEvictionOptions() *ReclaimedResourcesEvictionOptions {
3637
consts.ReclaimedResourceMilliCPU: 5.0,
3738
consts.ReclaimedResourceMemory: 5.0,
3839
},
40+
SoftEvictionThreshold: native.ResourceThreshold{
41+
consts.ReclaimedResourceMilliCPU: 1.5,
42+
consts.ReclaimedResourceMemory: 1.5,
43+
},
3944
GracePeriod: 60,
4045
ThresholdMetToleranceDuration: 0,
4146
}
@@ -54,6 +59,7 @@ func (o *ReclaimedResourcesEvictionOptions) AddFlags(fss *cliflag.NamedFlagSets)
5459

5560
func (o *ReclaimedResourcesEvictionOptions) ApplyTo(c *eviction.ReclaimedResourcesEvictionConfiguration) error {
5661
c.EvictionThreshold = o.EvictionThreshold
62+
c.SoftEvictionThreshold = o.SoftEvictionThreshold
5763
c.DeletionGracePeriod = o.GracePeriod
5864
c.ThresholdMetToleranceDuration = o.ThresholdMetToleranceDuration
5965
return nil

cmd/katalyst-agent/app/options/eviction/eviction_base.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ type GenericEvictionOptions struct {
5252

5353
// PodMetricLabels defines the pod labels to be added into metric selector list.
5454
PodMetricLabels []string
55+
56+
// HostPathNotifierRootPath is the root path for the host-path notifier.
57+
HostPathNotifierRootPath string
5558
}
5659

5760
// NewGenericEvictionOptions creates a new Options with a default config.
@@ -63,6 +66,7 @@ func NewGenericEvictionOptions() *GenericEvictionOptions {
6366
EvictionSkippedAnnotationKeys: []string{},
6467
EvictionSkippedLabelKeys: []string{},
6568
EvictionBurst: 3,
69+
HostPathNotifierRootPath: "/opt/katalyst",
6670
PodKiller: consts.KillerNameEvictionKiller,
6771
StrictAuthentication: false,
6872
}
@@ -99,6 +103,9 @@ func (o *GenericEvictionOptions) AddFlags(fss *cliflag.NamedFlagSets) {
99103

100104
fs.StringSliceVar(&o.PodMetricLabels, "eviction-pod-metric-labels", o.PodMetricLabels,
101105
"The pod labels to be added into metric selector list")
106+
107+
fs.StringVar(&o.HostPathNotifierRootPath, "pod-notifier-root-path", o.HostPathNotifierRootPath,
108+
"root path of host-path notifier")
102109
}
103110

104111
// ApplyTo fills up config with options
@@ -112,6 +119,7 @@ func (o *GenericEvictionOptions) ApplyTo(c *evictionconfig.GenericEvictionConfig
112119
c.PodKiller = o.PodKiller
113120
c.StrictAuthentication = o.StrictAuthentication
114121
c.PodMetricLabels.Insert(o.PodMetricLabels...)
122+
c.HostPathNotifierRootPath = o.HostPathNotifierRootPath
115123
return nil
116124
}
117125

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ require (
173173
)
174174

175175
replace (
176+
// FIXME: katalyst-api is temporarily replaced with a fork; point back to the upstream module.
177+
github.com/kubewharf/katalyst-api => github.com/funnydreamwinz/katalyst-api v0.0.0-20250811070245-94cac124cdd7
176178
k8s.io/api => k8s.io/api v0.24.6
177179
k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6
178180
k8s.io/apimachinery => k8s.io/apimachinery v0.24.6

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,8 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo
279279
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
280280
github.com/fsnotify/fsnotify v1.5.4 h1:jRbGcIw6P2Meqdwuo0H1p6JVLbL5DHKAKlYndzMwVZI=
281281
github.com/fsnotify/fsnotify v1.5.4/go.mod h1:OVB6XrOHzAwXMpEM7uPOzcehqUV2UqJxmVXmkdnm1bU=
282+
github.com/funnydreamwinz/katalyst-api v0.0.0-20250811070245-94cac124cdd7 h1:cSYxLZ+taqOLV02NutItx9IhR4wEMBe/KLTTLPr66a4=
283+
github.com/funnydreamwinz/katalyst-api v0.0.0-20250811070245-94cac124cdd7/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k=
282284
github.com/fvbommel/sortorder v1.0.1/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0=
283285
github.com/getkin/kin-openapi v0.76.0/go.mod h1:660oXbgy5JFMKreazJaQTw7o+X00qeSyhcnluiMv+Xg=
284286
github.com/getsentry/raven-go v0.2.0/go.mod h1:KungGk8q33+aIAZUIVWZDr2OfAEBsO49PX4NzFV5kcQ=

pkg/agent/evictionmanager/eviction_resp_collector.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,42 @@ func (e *evictionRespCollector) collectMetThreshold(dryRunPlugins []string, plug
158158
}
159159
}
160160

161+
func (e *evictionRespCollector) collectTopSoftEvictionPods(dryRunPlugins []string, pluginName string,
162+
threshold *pluginapi.ThresholdMetResponse, resp *pluginapi.GetTopEvictionPodsResponse,
163+
) {
164+
dryRun := e.isDryRun(dryRunPlugins, pluginName)
165+
166+
targetPods := make([]*v1.Pod, 0, len(resp.TargetPods))
167+
for i, pod := range resp.TargetPods {
168+
if pod == nil {
169+
continue
170+
}
171+
172+
general.Infof("%v plugin %v request to notify topN pod %v/%v, reason: met threshold in scope [%v]",
173+
e.getLogPrefix(dryRun), pluginName, pod.Namespace, pod.Name, threshold.EvictionScope)
174+
if dryRun {
175+
metricsPodToEvict(e.emitter, e.conf.GenericConfiguration.QoSConfiguration, pluginName, pod, dryRun, e.conf.GenericEvictionConfiguration.PodMetricLabels)
176+
} else {
177+
targetPods = append(targetPods, resp.TargetPods[i])
178+
}
179+
}
180+
181+
for _, pod := range targetPods {
182+
reason := fmt.Sprintf("plugin %s met threshold in scope %s, target %v, observed %v",
183+
pluginName, threshold.EvictionScope, threshold.ThresholdValue, threshold.ObservedValue)
184+
185+
e.getSoftEvictPods()[string(pod.UID)] = &rule.RuledEvictPod{
186+
EvictPod: &pluginapi.EvictPod{
187+
Pod: pod.DeepCopy(),
188+
Reason: reason,
189+
ForceEvict: false,
190+
EvictionPluginName: pluginName,
191+
},
192+
Scope: threshold.EvictionScope,
193+
}
194+
}
195+
}
196+
161197
func (e *evictionRespCollector) collectTopEvictionPods(dryRunPlugins []string, pluginName string,
162198
threshold *pluginapi.ThresholdMetResponse, resp *pluginapi.GetTopEvictionPodsResponse,
163199
) {

pkg/agent/evictionmanager/manager.go

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin/resource"
4646
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin/rootfs"
4747
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/podkiller"
48+
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/podnotifier"
4849
"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/rule"
4950
"github.com/kubewharf/katalyst-core/pkg/client"
5051
"github.com/kubewharf/katalyst-core/pkg/client/control"
@@ -104,7 +105,8 @@ type EvictionManger struct {
104105
// easy to test the code.
105106
clock clocks.WithTickerAndDelayedExecution
106107

107-
podKiller podkiller.PodKiller
108+
podNotifier podnotifier.PodNotifier
109+
podKiller podkiller.PodKiller
108110

109111
killQueue rule.EvictionQueue
110112
killStrategy rule.EvictionStrategy
@@ -171,6 +173,12 @@ func NewEvictionManager(genericClient *client.GenericClientSet, recorder events.
171173

172174
podKiller := podkiller.NewAsynchronizedPodKiller(killer, metaServer.PodFetcher, genericClient.KubeClient)
173175

176+
notifier, err := podnotifier.NewHostPathPodNotifier(conf, genericClient.KubeClient, metaServer, recorder, emitter)
177+
if err != nil {
178+
return nil, fmt.Errorf("failed to create pod notifier: %v", err)
179+
}
180+
podNotifier := podnotifier.NewSynchronizedPodNotifier(notifier)
181+
174182
cnrTaintReporter, err := control.NewGenericReporterPlugin(cnrTaintReporterPluginName, conf, emitter)
175183
if err != nil {
176184
return nil, fmt.Errorf("failed to initialize cnr taint reporter plugin: %v", err)
@@ -183,6 +191,7 @@ func NewEvictionManager(genericClient *client.GenericClientSet, recorder events.
183191
metaGetter: metaServer,
184192
emitter: emitter,
185193
podKiller: podKiller,
194+
podNotifier: podNotifier,
186195
cnrTaintReporter: cnrTaintReporter,
187196
endpoints: make(map[string]endpointpkg.Endpoint),
188197
conf: conf,
@@ -235,6 +244,7 @@ func (m *EvictionManger) Run(ctx context.Context) {
235244
general.RegisterHeartbeatCheck(reportTaintHealthCheckName, reportTaintToleration,
236245
general.HealthzCheckStateNotReady, reportTaintToleration)
237246
m.podKiller.Start(ctx)
247+
m.podNotifier.Start(ctx)
238248
for _, endpoint := range m.endpoints {
239249
endpoint.Start()
240250
}
@@ -277,6 +287,11 @@ func (m *EvictionManger) sync(ctx context.Context) {
277287
}
278288

279289
errList := make([]error, 0)
290+
notifyErr := m.doNotify(collector.getSoftEvictPods())
291+
if notifyErr != nil {
292+
errList = append(errList, notifyErr)
293+
}
294+
280295
evictErr := m.doEvict(collector.getSoftEvictPods(), collector.getForceEvictPods())
281296
if evictErr != nil {
282297
errList = append(errList, evictErr)
@@ -342,8 +357,8 @@ func (m *EvictionManger) collectEvictionResult(pods []*v1.Pod) (*evictionRespCol
342357
m.conditionLock.Unlock()
343358

344359
for pluginName, threshold := range thresholdsMet {
345-
if threshold.MetType != pluginapi.ThresholdMetType_HARD_MET {
346-
general.Infof(" the type: %s of met threshold from plugin: %s isn't %s", threshold.MetType.String(), pluginName, pluginapi.ThresholdMetType_HARD_MET.String())
360+
if threshold.MetType == pluginapi.ThresholdMetType_NOT_MET {
361+
general.Infof("resp from plugin: %s not met threshold", pluginName)
347362
continue
348363
}
349364

@@ -352,12 +367,18 @@ func (m *EvictionManger) collectEvictionResult(pods []*v1.Pod) (*evictionRespCol
352367
general.Errorf(" pluginName points to nil endpoint, can't handle threshold from it")
353368
}
354369

370+
topN := uint64(0)
371+
forceEvict := false
372+
if threshold.MetType == pluginapi.ThresholdMetType_HARD_MET {
373+
topN = 1
374+
forceEvict = true
375+
}
376+
355377
resp, err := m.endpoints[pluginName].GetTopEvictionPods(context.Background(), &pluginapi.GetTopEvictionPodsRequest{
356378
ActivePods: pods,
357-
TopN: 1,
379+
TopN: topN,
358380
EvictionScope: threshold.EvictionScope,
359381
})
360-
361382
m.endpointLock.RUnlock()
362383
if err != nil {
363384
general.Errorf(" calling GetTopEvictionPods of plugin: %s failed with error: %v", pluginName, err)
@@ -371,12 +392,40 @@ func (m *EvictionManger) collectEvictionResult(pods []*v1.Pod) (*evictionRespCol
371392
continue
372393
}
373394

374-
collector.collectTopEvictionPods(dynamicConfig.DryRun, pluginName, threshold, resp)
395+
if forceEvict {
396+
collector.collectTopEvictionPods(dynamicConfig.DryRun, pluginName, threshold, resp)
397+
} else {
398+
collector.collectTopSoftEvictionPods(dynamicConfig.DryRun, pluginName, threshold, resp)
399+
}
400+
375401
}
376402

377403
return collector, errors.NewAggregate(errList)
378404
}
379405

406+
func (m *EvictionManger) doNotify(softEvictPods map[string]*rule.RuledEvictPod) error {
407+
errList := make([]error, 0)
408+
409+
for _, pod := range softEvictPods {
410+
if pod == nil || pod.EvictPod.Pod == nil {
411+
continue
412+
}
413+
414+
// TODO: replace the hard-coded annotation key below with constapi.PodAnnotationSoftEvictNotificationKey:
415+
// if _, ok := pod.EvictPod.Pod.Annotations[constapi.PodAnnotationSoftEvictNotificationKey]; !ok {
416+
if _, ok := pod.EvictPod.Pod.Annotations["katalyst.kubewharf.io/soft_evict_notify"]; !ok {
417+
continue
418+
}
419+
420+
err := m.podNotifier.NotifyPod(pod)
421+
if err != nil {
422+
errList = append(errList, err)
423+
}
424+
}
425+
426+
return errors.NewAggregate(errList)
427+
}
428+
380429
func (m *EvictionManger) doEvict(softEvictPods, forceEvictPods map[string]*rule.RuledEvictPod) error {
381430
softEvictPods = filterOutCandidatePodsWithForcePods(softEvictPods, forceEvictPods)
382431
bestSuitedCandidate := m.getEvictPodFromCandidates(softEvictPods)

pkg/agent/evictionmanager/plugin/resource/reclaimed_resources.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ func NewReclaimedResourcesEvictionPlugin(_ *client.GenericClientSet, _ events.Ev
6262
}
6363
}
6464

65+
reclaimedSoftThresholdGetter := func(resourceName v1.ResourceName) *float64 {
66+
return nil
67+
}
68+
6569
deletionGracePeriodGetter := func() int64 {
6670
return conf.GetDynamicConfiguration().ReclaimedResourcesEvictionConfiguration.DeletionGracePeriod
6771
}
@@ -75,6 +79,8 @@ func NewReclaimedResourcesEvictionPlugin(_ *client.GenericClientSet, _ events.Ev
7579
emitter,
7680
reclaimedResourcesGetter,
7781
reclaimedThresholdGetter,
82+
reclaimedSoftThresholdGetter,
83+
nil,
7884
deletionGracePeriodGetter,
7985
thresholdMetToleranceDurationGetter,
8086
conf.SkipZeroQuantityResourceNames,

0 commit comments

Comments
 (0)