
Commit 915ef4e

feat: Introduce object limits
This change allows user-controlled limits on how many objects KSM will list from the API. This helps prevent resource exhaustion on KSM in case the API server holds too many objects. The object limit is set globally and applied per watched resource.
1 parent 3d73ddb commit 915ef4e

File tree

7 files changed, +76 -47 lines

docs/developer/cli-arguments.md

+1

@@ -64,6 +64,7 @@ Flags:
       --namespaces string              Comma-separated list of namespaces to be enabled. Defaults to ""
       --namespaces-denylist string     Comma-separated list of namespaces not to be enabled. If namespaces and namespaces-denylist are both set, only namespaces that are excluded in namespaces-denylist will be used.
       --node string                    Name of the node that contains the kube-state-metrics pod. Most likely it should be passed via the downward API. This is used for daemonset sharding. Only available for resources (pod metrics) that support spec.nodeName fieldSelector. This is experimental.
+      --object-limit int               The total number of objects to list from the API Server.
       --one_output                     If true, only write logs to their native severity level (vs also writing to each lower severity level; no effect when -logtostderr=true)
       --pod string                     Name of the pod that contains the kube-state-metrics container. When set, it is expected that --pod and --pod-namespace are both set. Most likely this should be passed via the downward API. This is used for auto-detecting sharding. If set, this has preference over statically configured sharding. This is experimental, it may be removed without notice.
       --pod-namespace string           Name of the namespace of the pod specified by --pod. When set, it is expected that --pod and --pod-namespace are both set. Most likely this should be passed via the downward API. This is used for auto-detecting sharding. If set, this has preference over statically configured sharding. This is experimental, it may be removed without notice.

internal/store/builder.go

+51 -42
Large diffs are not rendered by default.
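
Since this diff is collapsed, here is a minimal sketch of how the builder plausibly threads the limit through. WithObjectLimit is confirmed by the server.go hunk below and the extra limit argument by the watch.go hunk, but the field names and the startReflector helper here are assumptions, not the actual implementation.

package store

import (
    "context"

    "k8s.io/client-go/tools/cache"

    "k8s.io/kube-state-metrics/v2/pkg/watch"
)

// Hypothetical, trimmed-down view of the builder; the real struct in
// internal/store/builder.go has many more fields, and these names are guesses.
type Builder struct {
    ctx               context.Context
    listWatchMetrics  *watch.ListWatchMetrics
    useAPIServerCache bool
    objectLimit       int64
}

// WithObjectLimit stores the global per-resource object limit on the builder
// (the method name is confirmed by the server.go hunk below; the body is assumed).
func (b *Builder) WithObjectLimit(limit int64) {
    b.objectLimit = limit
}

// startReflector illustrates where the limit plausibly ends up: it is handed to
// the instrumented lister-watcher, so every List issued by the reflector carries it.
func (b *Builder) startReflector(expectedType interface{}, lw cache.ListerWatcher, resource string, s cache.Store) {
    instrumented := watch.NewInstrumentedListerWatcher(lw, b.listWatchMetrics, resource, b.useAPIServerCache, b.objectLimit)
    reflector := cache.NewReflector(instrumented, expectedType, s, 0)
    go reflector.Run(b.ctx.Done())
}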

pkg/app/server.go

+1

@@ -247,6 +247,7 @@ func RunKubeStateMetrics(ctx context.Context, opts *options.Options) error {
 	))
 
 	storeBuilder.WithUsingAPIServerCache(opts.UseAPIServerCache)
+	storeBuilder.WithObjectLimit(opts.ObjectLimit)
 	storeBuilder.WithGenerateStoresFunc(storeBuilder.DefaultGenerateStoresFunc())
 	proc.StartReaper()
pkg/builder/builder_test.go

+1

@@ -67,6 +67,7 @@ func customStore(_ []generator.FamilyGenerator,
 	_ interface{},
 	_ func(kubeClient clientset.Interface, ns string, fieldSelector string) cache.ListerWatcher,
 	_ bool,
+	_ int64,
 ) []cache.Store {
 	stores := make([]cache.Store, 0, 2)
 	stores = append(stores, newFakeStore(fakeMetricLists[0]))

pkg/builder/types/interfaces.go

+2 -2

@@ -57,15 +57,15 @@ type BuilderInterface interface {
 type BuildStoresFunc func(metricFamilies []generator.FamilyGenerator,
 	expectedType interface{},
 	listWatchFunc func(kubeClient clientset.Interface, ns string, fieldSelector string) cache.ListerWatcher,
-	useAPIServerCache bool,
+	useAPIServerCache bool, limit int64,
 ) []cache.Store
 
 // BuildCustomResourceStoresFunc function signature that is used to return a list of custom resource cache.Store
 type BuildCustomResourceStoresFunc func(resourceName string,
 	metricFamilies []generator.FamilyGenerator,
 	expectedType interface{},
 	listWatchFunc func(customResourceClient interface{}, ns string, fieldSelector string) cache.ListerWatcher,
-	useAPIServerCache bool,
+	useAPIServerCache bool, limit int64,
 ) []cache.Store
 
 // AllowDenyLister interface for AllowDeny lister that can allow or exclude metrics by there names
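
Because BuildStoresFunc and BuildCustomResourceStoresFunc gain a trailing limit parameter, any custom store function wired in via WithGenerateStoresFunc has to be updated to the new signature, as the builder_test.go hunk above shows. A hypothetical updated implementation might look like the following; the name and body are illustrative only, and implementations that do not issue their own List calls can simply ignore the value.

package example

import (
    clientset "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/cache"

    generator "k8s.io/kube-state-metrics/v2/pkg/metric_generator"
)

// customBuildStores is a hypothetical BuildStoresFunc matching the new signature:
// the trailing int64 is the object limit introduced by this commit.
func customBuildStores(
    metricFamilies []generator.FamilyGenerator,
    expectedType interface{},
    listWatchFunc func(kubeClient clientset.Interface, ns string, fieldSelector string) cache.ListerWatcher,
    useAPIServerCache bool,
    limit int64,
) []cache.Store {
    // Build one store per watched namespace as before; the limit would be handed
    // to whatever lister-watcher wrapper issues the actual List requests.
    return nil
}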

pkg/options/options.go

+2

@@ -79,6 +79,7 @@ type Options struct {
 	Help                 bool  `yaml:"help"`
 	TrackUnscheduledPods bool  `yaml:"track_unscheduled_pods"`
 	UseAPIServerCache    bool  `yaml:"use_api_server_cache"`
+	ObjectLimit          int64 `yaml:"object_limit"`
 }
 
 // GetConfigFile is the getter for --config value.

@@ -143,6 +144,7 @@ func (o *Options) AddFlags(cmd *cobra.Command) {
 	o.cmd.Flags().BoolVar(&o.TrackUnscheduledPods, "track-unscheduled-pods", false, "This configuration is used in conjunction with node configuration. When this configuration is true, node configuration is empty and the metric of unscheduled pods is fetched from the Kubernetes API Server. This is experimental.")
 	o.cmd.Flags().BoolVarP(&o.Help, "help", "h", false, "Print Help text")
 	o.cmd.Flags().BoolVarP(&o.UseAPIServerCache, "use-apiserver-cache", "", false, "Sets resourceVersion=0 for ListWatch requests, using cached resources from the apiserver instead of an etcd quorum read.")
+	o.cmd.Flags().Int64Var(&o.ObjectLimit, "object-limit", 0, "The total number of objects to list from the API Server.")
 	o.cmd.Flags().Int32Var(&o.Shard, "shard", int32(0), "The instances shard nominal (zero indexed) within the total number of shards. (default 0)")
 	o.cmd.Flags().IntVar(&o.Port, "port", 8080, `Port to expose metrics on.`)
 	o.cmd.Flags().IntVar(&o.TelemetryPort, "telemetry-port", 8081, `Port to expose kube-state-metrics self metrics on.`)

pkg/watch/watch.go

+18 -3

@@ -27,8 +27,9 @@ import (
 
 // ListWatchMetrics stores the pointers of kube_state_metrics_[list|watch]_total metrics.
 type ListWatchMetrics struct {
-	WatchTotal *prometheus.CounterVec
-	ListTotal  *prometheus.CounterVec
+	WatchTotal     *prometheus.CounterVec
+	ListTotal      *prometheus.CounterVec
+	ListLimitTotal *prometheus.GaugeVec
 }
 
 // NewListWatchMetrics takes in a prometheus registry and initializes

@@ -50,6 +51,13 @@ func NewListWatchMetrics(r prometheus.Registerer) *ListWatchMetrics {
 			},
 			[]string{"result", "resource"},
 		),
+		ListLimitTotal: promauto.With(r).NewGaugeVec(
+			prometheus.GaugeOpts{
+				Name: "kube_state_metrics_list_limit",
+				Help: "Number of resource list limit in kube-state-metrics",
+			},
+			[]string{"resource"},
+		),
 	}
 }
 

@@ -60,15 +68,17 @@ type InstrumentedListerWatcher struct {
 	metrics           *ListWatchMetrics
 	resource          string
 	useAPIServerCache bool
+	limit             int64
 }
 
 // NewInstrumentedListerWatcher returns a new InstrumentedListerWatcher.
-func NewInstrumentedListerWatcher(lw cache.ListerWatcher, metrics *ListWatchMetrics, resource string, useAPIServerCache bool) cache.ListerWatcher {
+func NewInstrumentedListerWatcher(lw cache.ListerWatcher, metrics *ListWatchMetrics, resource string, useAPIServerCache bool, limit int64) cache.ListerWatcher {
 	return &InstrumentedListerWatcher{
 		lw:                lw,
 		metrics:           metrics,
 		resource:          resource,
 		useAPIServerCache: useAPIServerCache,
+		limit:             limit,
 	}
 }
 

@@ -80,6 +90,11 @@ func (i *InstrumentedListerWatcher) List(options metav1.ListOptions) (runtime.Object, error) {
 		options.ResourceVersion = "0"
 	}
 
+	if i.limit != 0 {
+		options.Limit = i.limit
+		i.metrics.ListLimitTotal.WithLabelValues(i.resource).Set(float64(i.limit))
+	}
+
 	res, err := i.lw.List(options)
 	if err != nil {
 		i.metrics.ListTotal.WithLabelValues("error", i.resource).Inc()
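
The new field is applied in List: when --object-limit is non-zero, the value is copied onto the request's metav1.ListOptions and exported per resource through the new kube_state_metrics_list_limit gauge. Outside of KSM, an equivalent limited List looks roughly like the following client-go call; the pod resource and the value 500 are illustrative only, not part of this change.

package main

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/rest"
)

func main() {
    // In-cluster configuration; use clientcmd instead when running outside a cluster.
    cfg, err := rest.InClusterConfig()
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(cfg)

    // Mirrors what InstrumentedListerWatcher.List does when --object-limit is set:
    // Limit caps how many items the API server returns in a single page.
    pods, err := client.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), metav1.ListOptions{Limit: 500})
    if err != nil {
        panic(err)
    }
    // If more objects exist than the limit, the response carries a continue token.
    fmt.Printf("listed %d pods, more pages available: %v\n", len(pods.Items), pods.Continue != "")
}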
