Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion cmd/katalyst-controller/app/controller/overcommit.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
katalyst "github.com/kubewharf/katalyst-core/cmd/base"
"github.com/kubewharf/katalyst-core/pkg/config"
"github.com/kubewharf/katalyst-core/pkg/controller/overcommit/node"
"github.com/kubewharf/katalyst-core/pkg/controller/overcommit/prediction"
)

const (
Expand All @@ -47,7 +48,20 @@ func StartOvercommitController(
klog.Errorf("failed to new nodeOvercommit controller")
return false, err
}

go noc.Run()

if conf.Prediction.EnablePredict {
pc, err := prediction.NewPredictionController(
ctx,
controlCtx,
conf.ControllersConfiguration.OvercommitConfig,
)
if err != nil {
klog.Errorf("failed to new overcommit prediction controller")
return false, err
}

go pc.Run()
}
return true, nil
}
113 changes: 112 additions & 1 deletion cmd/katalyst-controller/app/options/overcommit.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
cliflag "k8s.io/component-base/cli/flag"

"github.com/kubewharf/katalyst-core/pkg/config/controller"
"github.com/kubewharf/katalyst-core/pkg/util/datasource/prometheus"
)

const (
Expand All @@ -32,6 +33,35 @@ const (
// OvercommitOptions holds the configurations for overcommit.
// It embeds both option groups, so their fields (e.g. SyncWorkers,
// EnablePredict) are promoted and accessed directly on OvercommitOptions.
type OvercommitOptions struct {
	// NodeOvercommitOptions holds the nodeOvercommitConfig controller options.
	NodeOvercommitOptions
	// PredictionOptions holds the overcommit prediction options.
	PredictionOptions
}

// PredictionOptions holds the command-line options for node overcommit
// prediction. Each field is bound to a "nodeovercommit-*" flag in AddFlags
// and copied into controller.PredictionConfig by ApplyTo.
type PredictionOptions struct {
	// EnablePredict enables node overcommit prediction.
	EnablePredict bool
	// Predictor selects the workload usage predictor used by the controller.
	Predictor string
	// PredictPeriod is the reconcile period of the workload usage predictor.
	PredictPeriod time.Duration
	// ReconcilePeriod is the reconcile period of the node overcommitment
	// ratio prediction.
	ReconcilePeriod time.Duration

	// MaxTimeSeriesDuration is the max duration of the time series used for
	// workload usage prediction.
	MaxTimeSeriesDuration time.Duration
	// MinTimeSeriesDuration is the min duration of the time series used for
	// workload usage prediction.
	MinTimeSeriesDuration time.Duration

	// TargetReferenceNameKey is the pod label key from which the owner
	// reference workload name is read.
	TargetReferenceNameKey string
	// TargetReferenceTypeKey is the pod label key from which the owner
	// reference workload type is read.
	TargetReferenceTypeKey string
	// CPUScaleFactor is the factor in podUsage = podRequest * scaleFactor,
	// applied to CPU when the pod resource portrait is missing.
	CPUScaleFactor float64
	// MemoryScaleFactor is the factor in podUsage = podRequest * scaleFactor,
	// applied to memory when the pod resource portrait is missing.
	MemoryScaleFactor float64

	// NodeCPUTargetLoad is the max node CPU load used when calculating the
	// node CPU overcommitment ratio; expected to be in (0, 1).
	NodeCPUTargetLoad float64
	// NodeMemoryTargetLoad is the max node memory load used when calculating
	// the node overcommitment ratio; expected to be in (0, 1).
	NodeMemoryTargetLoad float64
	// PodEstimatedCPULoad is the estimated average pod CPU load in the
	// cluster; expected to be in (0, 1).
	PodEstimatedCPULoad float64
	// PodEstimatedMemoryLoad is the estimated average pod memory load in the
	// cluster; expected to be in (0, 1).
	PodEstimatedMemoryLoad float64

	// PromConfig holds the prometheus data source settings, bound to the
	// "nodeovercommit-prometheus-*" flags.
	prometheus.PromConfig
	// NSigmaOptions holds the n-sigma predictor parameters.
	NSigmaOptions
}

// NSigmaOptions holds the command-line options of the n-sigma predictor.
type NSigmaOptions struct {
	// Factor is the stddev factor of the n-sigma predictor.
	Factor int
	// Buckets is the number of buckets the predictor result is divided into;
	// 24 means the result is split into 24 buckets according to hours.
	Buckets int
}

// NodeOvercommitOptions holds the configurations for nodeOvercommitConfig controller.
Expand All @@ -45,7 +75,27 @@ type NodeOvercommitOptions struct {

// NewOvercommitOptions creates a new Options with a default config.
//
// Prediction defaults: prediction disabled, no predictor selected, workload
// prediction every 24h, node ratio reconcile every 1h, time series window
// between 24h and 7 days, CPU/memory scale factors of 1, node target loads of
// 0.6 (CPU) and 0.8 (memory), pod estimated loads of 0.5 (CPU) and 0.8
// (memory), and an n-sigma predictor with factor 3 and 24 buckets.
// The node overcommit options are left at their zero values here.
func NewOvercommitOptions() *OvercommitOptions {
	// This must be the only return: a bare `return &OvercommitOptions{}` placed
	// before it would make every default below unreachable and hand zero
	// values to the flag definitions that use these fields as their defaults.
	return &OvercommitOptions{
		PredictionOptions: PredictionOptions{
			EnablePredict:          false,
			Predictor:              "",
			PredictPeriod:          24 * time.Hour,
			ReconcilePeriod:        1 * time.Hour,
			MaxTimeSeriesDuration:  7 * 24 * time.Hour,
			MinTimeSeriesDuration:  24 * time.Hour,
			CPUScaleFactor:         1,
			MemoryScaleFactor:      1,
			NodeCPUTargetLoad:      0.6,
			NodeMemoryTargetLoad:   0.8,
			PodEstimatedCPULoad:    0.5,
			PodEstimatedMemoryLoad: 0.8,
			NSigmaOptions: NSigmaOptions{
				Factor:  3,
				Buckets: 24,
			},
			PromConfig: prometheus.PromConfig{},
		},
	}
}

// AddFlags adds flags to the specified FlagSet
Expand All @@ -54,11 +104,72 @@ func (o *OvercommitOptions) AddFlags(fss *cliflag.NamedFlagSets) {

fs.IntVar(&o.SyncWorkers, "nodeovercommit-sync-workers", defaultNodeOvercommitSyncWorkers, "num of goroutines to sync nodeovercommitconfig")
fs.DurationVar(&o.ConfigReconcilePeriod, "nodeovercommit-reconcile-period", defaultNodeOvercommitReconcilePeriod, "Period for nodeovercommit controller to sync configs")

fs.BoolVar(&o.EnablePredict, "nodeovercommit-enable-predict", o.EnablePredict, "enable node overcommit prediction")
fs.StringVar(&o.Predictor, "nodeovercommit-predictor", o.Predictor, "workload usage predictor in node overcommit controller")
fs.DurationVar(&o.PredictPeriod, "nodeovercommit-workload-predict-period", o.PredictPeriod, "reconcile period of workload usage predictor in overcommit controller")
fs.DurationVar(&o.ReconcilePeriod, "nodeovercommit-node-predict-period", o.ReconcilePeriod, "reconcile period of node overcommitmentRatio prediction in overcommit controller")
fs.DurationVar(&o.MaxTimeSeriesDuration, "nodeovercommit-max-timeseries-duration", o.MaxTimeSeriesDuration,
"max time duration of time series for workload usage prediction, default 7 days")
fs.DurationVar(&o.MinTimeSeriesDuration, "nodeovercommit-min-timeseries-duration", o.MinTimeSeriesDuration,
"min time duration of time series for workload usage prediction, default 24 hours")
fs.IntVar(&o.Factor, "nodeovercommit-nsigma-factor", o.Factor, "stddev factor of n-sigma predictor, default 3")
fs.IntVar(&o.Buckets, "nodeovercommit-nsigma-buckets", o.Buckets,
"bucket of n-sigma predictor result, 24 means predictor result will be divide into 24 buckets according to hours")
fs.StringVar(&o.TargetReferenceNameKey, "nodeovercommit-target-reference-name-key", o.TargetReferenceNameKey,
"overcommit controller get pod owner reference workload name from pod label by nodeovercommit-target-reference-name-key")
fs.StringVar(&o.TargetReferenceTypeKey, "nodeovercommit-target-reference-type-key", o.TargetReferenceTypeKey,
"overcommit controller get pod owner reference workload type from pod label by nodeovercommit-target-reference-type-key")
fs.Float64Var(&o.CPUScaleFactor, "nodeovercommit-cpu-scaleFactor", o.CPUScaleFactor,
"podUsage = podRequest * scaleFactor when pod resource portrait is missed")
fs.Float64Var(&o.MemoryScaleFactor, "nodeovercommit-memory-scaleFactor", o.MemoryScaleFactor,
"podUsage = podRequest * scaleFactor when pod resource portrait is missed")

fs.Float64Var(&o.NodeCPUTargetLoad, "nodeovercommit-cpu-targetload", o.NodeCPUTargetLoad,
"max node CPU load when calculate node CPU overcommitment ratio, should be greater than 0 and less than 1")
fs.Float64Var(&o.NodeMemoryTargetLoad, "nodeovercommit-memory-targetload", o.NodeMemoryTargetLoad,
"max node memory load when calculate node CPU overcommitment ratio, should be greater than 0 and less than 1")
fs.Float64Var(&o.PodEstimatedCPULoad, "nodeovercommit-cpu-estimatedload", o.PodEstimatedCPULoad,
"estimated avg pod CPU load in the cluster, should be greater than 0 and less than 1")
fs.Float64Var(&o.PodEstimatedMemoryLoad, "nodeovercommit-memory-estimatedload", o.PodEstimatedMemoryLoad,
"estimated avg pod memory load in the cluster, should be greater than 0 and less than 1")

fs.StringVar(&o.Address, "nodeovercommit-prometheus-address", "", "prometheus address")
fs.StringVar(&o.Auth.Type, "nodeovercommit-prometheus-auth-type", "", "prometheus auth type")
fs.StringVar(&o.Auth.Username, "nodeovercommit-prometheus-auth-username", "", "prometheus auth username")
fs.StringVar(&o.Auth.Password, "nodeovercommit-prometheus-auth-password", "", "prometheus auth password")
fs.StringVar(&o.Auth.BearerToken, "nodeovercommit-prometheus-auth-bearertoken", "", "prometheus auth bearertoken")
fs.DurationVar(&o.KeepAlive, "nodeovercommit-prometheus-keepalive", 60*time.Second, "prometheus keep alive")
fs.DurationVar(&o.Timeout, "nodeovercommit-prometheus-timeout", 3*time.Minute, "prometheus timeout")
fs.BoolVar(&o.BRateLimit, "nodeovercommit-prometheus-bratelimit", false, "prometheus bratelimit")
fs.IntVar(&o.MaxPointsLimitPerTimeSeries, "nodeovercommit-prometheus-maxpoints", 11000, "prometheus max points limit per time series")
fs.StringVar(&o.BaseFilter, "nodeovercommit-prometheus-promql-base-filter", "", ""+
"Get basic filters in promql for historical usage data. This filter is added to all promql statements. "+
"Supports filters format of promql, e.g: group=\\\"Katalyst\\\",cluster=\\\"cfeaf782fasdfe\\\"")
fs.BoolVar(&o.InsecureSkipVerify, "nodeovercommit-prometheus-insecureSkipVerify", true, "prometheus insecure skip verify")
fs.DurationVar(&o.TLSHandshakeTimeoutInSecond, "nodeovercommit-prometheus-TLSHandshakeTimeoutInSecond", 10*time.Second, "prometheus TLSHandshake timeout")
}

// ApplyTo copies the overcommit options into the controller config c: the
// node overcommit settings go to c.Node and the prediction settings to
// c.Prediction. c.Prediction.PromConfig is set to point at the PromConfig held
// by o (no copy is made). It always returns nil.
func (o *OvercommitOptions) ApplyTo(c *controller.OvercommitConfig) error {
	// node overcommit config controller
	nodeConf := &c.Node
	nodeConf.SyncWorkers = o.SyncWorkers
	nodeConf.ConfigReconcilePeriod = o.ConfigReconcilePeriod

	predConf := &c.Prediction

	// predictor selection and periods
	predConf.EnablePredict = o.EnablePredict
	predConf.Predictor = o.Predictor
	predConf.PredictPeriod = o.PredictPeriod
	predConf.ReconcilePeriod = o.ReconcilePeriod

	// time series window
	predConf.MaxTimeSeriesDuration = o.MaxTimeSeriesDuration
	predConf.MinTimeSeriesDuration = o.MinTimeSeriesDuration

	// n-sigma predictor parameters
	predConf.Buckets = o.Buckets
	predConf.Factor = o.Factor

	// workload lookup keys and fallback scale factors
	predConf.TargetReferenceNameKey = o.TargetReferenceNameKey
	predConf.TargetReferenceTypeKey = o.TargetReferenceTypeKey
	predConf.CPUScaleFactor = o.CPUScaleFactor
	predConf.MemoryScaleFactor = o.MemoryScaleFactor

	// load targets and estimates
	predConf.NodeCPUTargetLoad = o.NodeCPUTargetLoad
	predConf.NodeMemoryTargetLoad = o.NodeMemoryTargetLoad
	predConf.PodEstimatedCPULoad = o.PodEstimatedCPULoad
	predConf.PodEstimatedMemoryLoad = o.PodEstimatedMemoryLoad

	// prometheus data source
	predConf.PromConfig = &o.PromConfig

	return nil
}

Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ require (
)

replace (
github.com/kubewharf/katalyst-api => github.com/WangZzzhe/katalyst-api v0.0.0-20240708111725-a9666d703f9f
k8s.io/api => k8s.io/api v0.24.6
k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6
k8s.io/apimachinery => k8s.io/apimachinery v0.24.6
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWX
github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI=
github.com/StackExchange/wmi v0.0.0-20180116203802-5d049714c4a6/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg=
github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g=
github.com/WangZzzhe/katalyst-api v0.0.0-20240708111725-a9666d703f9f h1:xNRsHeQYoquhyG43l6PMyaHOksTHxzy1667T2q2WnxY=
github.com/WangZzzhe/katalyst-api v0.0.0-20240708111725-a9666d703f9f/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k=
github.com/afex/hystrix-go v0.0.0-20180502004556-fa1af6a1f4f5/go.mod h1:SkGFH1ia65gfNATL8TAiHDNxPzPdmEL5uirI2Uyuz6c=
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
Expand Down Expand Up @@ -558,8 +560,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kubewharf/katalyst-api v0.5.1-0.20240702044746-be552fd7ea7d h1:6CuK3axf2B63zIkEu5XyxbaC+JArE/3Jo3QHvb+Hn0M=
github.com/kubewharf/katalyst-api v0.5.1-0.20240702044746-be552fd7ea7d/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k=
github.com/kubewharf/kubelet v1.24.6-kubewharf.8 h1:2e89T/nZTgzaVhyRsZuwEdRk8V8kJXs4PRkgfeG4Ai4=
github.com/kubewharf/kubelet v1.24.6-kubewharf.8/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c=
github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8=
Expand Down
41 changes: 39 additions & 2 deletions pkg/config/controller/overcommit.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,20 @@ limitations under the License.

package controller

import "time"
import (
"time"

"github.com/kubewharf/katalyst-core/pkg/util/datasource/prometheus"
)

// OvercommitConfig groups the configuration of the overcommit controllers.
type OvercommitConfig struct {
	// Node holds the node overcommit config controller settings.
	Node NodeOvercommitConfig

	// Prediction holds the overcommit prediction settings.
	Prediction PredictionConfig
}

type NodeOvercommitConfig struct {
// numer of workers to sync overcommit config
// number of workers to sync overcommit config
SyncWorkers int

// time interval of reconcile overcommit config
Expand All @@ -33,5 +39,36 @@ type NodeOvercommitConfig struct {
// NewOvercommitConfig returns an OvercommitConfig with zero-valued node
// settings and a Prediction section whose embedded PromConfig pointer is
// already allocated.
func NewOvercommitConfig() *OvercommitConfig {
	conf := &OvercommitConfig{}
	conf.Node = NodeOvercommitConfig{}
	conf.Prediction.PromConfig = &prometheus.PromConfig{}
	return conf
}

// PredictionConfig holds the overcommit prediction settings.
type PredictionConfig struct {
	// EnablePredict enables node overcommit prediction.
	EnablePredict bool
	// Predictor selects the workload usage predictor.
	Predictor string
	// PredictPeriod is the reconcile period of the workload usage predictor.
	PredictPeriod time.Duration
	// ReconcilePeriod is the reconcile period of the node overcommitment
	// ratio prediction.
	ReconcilePeriod time.Duration

	// MaxTimeSeriesDuration is the max duration of the time series used for
	// workload usage prediction.
	MaxTimeSeriesDuration time.Duration
	// MinTimeSeriesDuration is the min duration of the time series used for
	// workload usage prediction.
	MinTimeSeriesDuration time.Duration

	// TargetReferenceNameKey is the pod label key from which the owner
	// reference workload name is read.
	TargetReferenceNameKey string
	// TargetReferenceTypeKey is the pod label key from which the owner
	// reference workload type is read.
	TargetReferenceTypeKey string
	// CPUScaleFactor is the factor in podUsage = podRequest * scaleFactor,
	// applied to CPU when the pod resource portrait is missing.
	CPUScaleFactor float64
	// MemoryScaleFactor is the factor in podUsage = podRequest * scaleFactor,
	// applied to memory when the pod resource portrait is missing.
	MemoryScaleFactor float64

	// NodeCPUTargetLoad is the max node CPU load used when calculating the
	// node CPU overcommitment ratio; expected to be in (0, 1).
	NodeCPUTargetLoad float64
	// NodeMemoryTargetLoad is the max node memory load used when calculating
	// the node overcommitment ratio; expected to be in (0, 1).
	NodeMemoryTargetLoad float64
	// PodEstimatedCPULoad is the estimated average pod CPU load in the
	// cluster; expected to be in (0, 1).
	PodEstimatedCPULoad float64
	// PodEstimatedMemoryLoad is the estimated average pod memory load in the
	// cluster; expected to be in (0, 1).
	PodEstimatedMemoryLoad float64

	// PromConfig is the prometheus data source configuration. It is an
	// embedded pointer: NewOvercommitConfig allocates it.
	*prometheus.PromConfig
	// NSigmaPredictorConfig holds the n-sigma predictor parameters.
	NSigmaPredictorConfig
}

// NSigmaPredictorConfig holds the parameters of the n-sigma predictor.
type NSigmaPredictorConfig struct {
	// Factor is the stddev factor of the n-sigma predictor.
	Factor int
	// Buckets is the number of buckets the predictor result is divided into;
	// 24 means the result is split into 24 buckets according to hours.
	Buckets int
}
25 changes: 19 additions & 6 deletions pkg/controller/overcommit/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,8 @@ func (nc *NodeOvercommitController) setNodeOvercommitAnnotations(nodeName string

nc.nodeRealtimeOvercommitRatio(nodeAnnotations, node)

cpuAllocatable, cpuCapacity := nc.nodeOvercommitResource(node, validCPUOvercommitRatio(nodeAnnotations), corev1.ResourceCPU, consts.NodeAnnotationOriginalAllocatableCPUKey, consts.NodeAnnotationOriginalCapacityCPUKey)
enableDynamicOvercommit := nc.nodeEnableDynamicOvercommit(node.Name)
cpuAllocatable, cpuCapacity := nc.nodeOvercommitResource(node, validCPUOvercommitRatio(nodeAnnotations, enableDynamicOvercommit), corev1.ResourceCPU, consts.NodeAnnotationOriginalAllocatableCPUKey, consts.NodeAnnotationOriginalCapacityCPUKey)
klog.V(5).Infof("node %s CPU allocatable: %v, CPU capacity: %v with bindcpu", node.Name, cpuAllocatable, cpuCapacity)
if cpuAllocatable == "" {
delete(nodeAnnotations, consts.NodeAnnotationOvercommitAllocatableCPUKey)
Expand All @@ -477,7 +478,7 @@ func (nc *NodeOvercommitController) setNodeOvercommitAnnotations(nodeName string
nodeAnnotations[consts.NodeAnnotationOvercommitCapacityCPUKey] = cpuCapacity
}

memAllocatable, memCapacity := nc.nodeOvercommitResource(node, validMemoryOvercommitRatio(nodeAnnotations), corev1.ResourceMemory, consts.NodeAnnotationOriginalAllocatableMemoryKey, consts.NodeAnnotationOriginalCapacityMemoryKey)
memAllocatable, memCapacity := nc.nodeOvercommitResource(node, validMemoryOvercommitRatio(nodeAnnotations, enableDynamicOvercommit), corev1.ResourceMemory, consts.NodeAnnotationOriginalAllocatableMemoryKey, consts.NodeAnnotationOriginalCapacityMemoryKey)
klog.V(5).Infof("node %s memory allocatable: %v, memory capacity: %v", node.Name, memAllocatable, memCapacity)
if memAllocatable == "" {
delete(nodeAnnotations, consts.NodeAnnotationOvercommitAllocatableMemoryKey)
Expand All @@ -494,6 +495,18 @@ func (nc *NodeOvercommitController) setNodeOvercommitAnnotations(nodeName string
return nil
}

// nodeEnableDynamicOvercommit reports whether the NodeOvercommitConfig matched
// to nodeName has Spec.EnableDynamicOvercommit set. A node that matches no
// config yields false.
func (nc *NodeOvercommitController) nodeEnableDynamicOvercommit(nodeName string) bool {
	// look up the config matched to this node
	if matched := nc.matcher.GetConfig(nodeName); matched != nil {
		return matched.Spec.EnableDynamicOvercommit
	}

	// no matching config: overcommit is not allowed for this node
	return false
}

func emptyOvercommitConfig() *configv1alpha1.NodeOvercommitConfig {
return &configv1alpha1.NodeOvercommitConfig{
Spec: configv1alpha1.NodeOvercommitConfigSpec{
Expand Down Expand Up @@ -623,16 +636,16 @@ func (nc *NodeOvercommitController) getGuaranteedCPU(nodeName string) (int, erro
return guaranteedCPUs, nil
}

func validCPUOvercommitRatio(annotation map[string]string) float64 {
res, err := overcommitutil.OvercommitRatioValidate(annotation, consts.NodeAnnotationCPUOvercommitRatioKey, consts.NodeAnnotationRealtimeCPUOvercommitRatioKey)
// validCPUOvercommitRatio returns the CPU overcommit ratio produced by
// overcommitutil.OvercommitRatioValidate for the given node annotations,
// passing the configured, predicted and realtime CPU ratio annotation keys
// together with enableDynamicOvercommit. A validation error is only logged;
// the value returned by the validator is returned to the caller regardless.
func validCPUOvercommitRatio(annotation map[string]string, enableDynamicOvercommit bool) float64 {
	ratio, validateErr := overcommitutil.OvercommitRatioValidate(
		annotation,
		consts.NodeAnnotationCPUOvercommitRatioKey,
		consts.NodeAnnotationPredictCPUOvercommitRatioKey,
		consts.NodeAnnotationRealtimeCPUOvercommitRatioKey,
		enableDynamicOvercommit,
	)
	if validateErr != nil {
		klog.Error(validateErr)
	}

	return ratio
}

func validMemoryOvercommitRatio(annotation map[string]string) float64 {
res, err := overcommitutil.OvercommitRatioValidate(annotation, consts.NodeAnnotationMemoryOvercommitRatioKey, consts.NodeAnnotationRealtimeMemoryOvercommitRatioKey)
func validMemoryOvercommitRatio(annotation map[string]string, enableDynamicOvercommit bool) float64 {
res, err := overcommitutil.OvercommitRatioValidate(annotation, consts.NodeAnnotationMemoryOvercommitRatioKey, consts.NodeAnnotationPredictMemoryOvercommitRatioKey, consts.NodeAnnotationRealtimeMemoryOvercommitRatioKey, enableDynamicOvercommit)
if err != nil {
klog.Error(err)
}
Expand Down
Loading