diff --git a/apis/config/v1beta1/configuration_types.go b/apis/config/v1beta1/configuration_types.go
index e0e01baf068..fbdbc20d162 100644
--- a/apis/config/v1beta1/configuration_types.go
+++ b/apis/config/v1beta1/configuration_types.go
@@ -177,7 +177,36 @@ type ControllerMetrics struct {
 	// EnableClusterQueueResources, if true the cluster queue resource usage and quotas
 	// metrics will be reported.
 	// +optional
 	EnableClusterQueueResources bool `json:"enableClusterQueueResources,omitempty"`
+
+	// CustomMetricTags configures additional metric labels whose values are
+	// read from the labels or annotations of the reported objects.
+	// +optional
+	CustomMetricTags CustomMetricTags `json:"customMetricTags,omitempty"`
+}
+
+// CustomMetricTags groups the custom metric tags per kind of reported object.
+type CustomMetricTags struct {
+	// ClusterQueue lists the tags to add to ClusterQueue metrics.
+	// +optional
+	ClusterQueue []CustomMetricTag `json:"clusterQueue,omitempty"`
+
+	// LocalQueue lists the tags to add to LocalQueue metrics.
+	// +optional
+	LocalQueue []CustomMetricTag `json:"localQueue,omitempty"`
+}
+
+// CustomMetricTag maps a label or annotation of an object to a metric label.
+type CustomMetricTag struct {
+	// ResourceTag is the key looked up on the object, first among its labels
+	// and then among its annotations. It is also used as the metric label
+	// name unless OverrideMetricTag is set.
+	ResourceTag string `json:"resourceTag,omitempty"`
+
+	// OverrideMetricTag, if set, is used as the metric label name instead of
+	// ResourceTag.
+	// +optional
+	OverrideMetricTag *string `json:"overrideMetricTag,omitempty"`
 }
 
 // ControllerHealth defines the health configs.
diff --git a/cmd/kueue/main.go b/cmd/kueue/main.go
index a65811688fa..029c13cd8c9 100644
--- a/cmd/kueue/main.go
+++ b/cmd/kueue/main.go
@@ -194,7 +194,9 @@ func main() {
 	}
 	options.Metrics = metricsServerOptions
 
-	metrics.Register()
+	metrics.Register(metrics.Configuration{
+		CustomTags: cfg.Metrics.CustomMetricTags,
+	})
 
 	kubeConfig := ctrl.GetConfigOrDie()
 	if kubeConfig.UserAgent == "" {
diff --git a/pkg/controller/core/clusterqueue_controller.go b/pkg/controller/core/clusterqueue_controller.go
index 7170e0c93ea..045dd783580 100644
--- a/pkg/controller/core/clusterqueue_controller.go
+++ b/pkg/controller/core/clusterqueue_controller.go
@@ -357,7 +357,7 @@ func recordResourceMetrics(cq *kueue.ClusterQueue) {
 				nominal := resource.QuantityToFloat(&r.NominalQuota)
 				borrow := resource.QuantityToFloat(r.BorrowingLimit)
 				lend := resource.QuantityToFloat(r.LendingLimit)
-				metrics.ReportClusterQueueQuotas(cq.Spec.CohortName, cq.Name, string(fq.Name), string(r.Name), nominal, borrow, lend)
+				metrics.ReportClusterQueueQuotas(cq.Spec.CohortName, *cq, string(fq.Name), string(r.Name), nominal, borrow, lend)
 			}
 		}
 	}
@@ -366,7 +366,7 @@ func recordResourceMetrics(cq *kueue.ClusterQueue) {
 		fr := &cq.Status.FlavorsReservation[fri]
 		for ri := range fr.Resources {
 			r := &fr.Resources[ri]
-			metrics.ReportClusterQueueResourceReservations(cq.Spec.CohortName, cq.Name, string(fr.Name), string(r.Name), resource.QuantityToFloat(&r.Total))
+			metrics.ReportClusterQueueResourceReservations(cq.Spec.CohortName, *cq, string(fr.Name), string(r.Name), resource.QuantityToFloat(&r.Total))
 		}
 	}
 
@@ -374,7 +374,7 @@ func recordResourceMetrics(cq *kueue.ClusterQueue) {
 		fu := &cq.Status.FlavorsUsage[fui]
 		for ri := range fu.Resources {
 			r := &fu.Resources[ri]
-			metrics.ReportClusterQueueResourceUsage(cq.Spec.CohortName, cq.Name, string(fu.Name), string(r.Name), resource.QuantityToFloat(&r.Total))
+			metrics.ReportClusterQueueResourceUsage(cq.Spec.CohortName, *cq, string(fu.Name), string(r.Name), resource.QuantityToFloat(&r.Total))
 		}
 	}
 }
@@ -567,7 +567,7 @@ func (r *ClusterQueueReconciler) updateCqStatusIfChanged(
 			if weightedShare == math.Inf(1) {
 				weightedShare = math.NaN()
 			}
-			metrics.ReportClusterQueueWeightedShare(cq.Name, string(cq.Spec.CohortName), weightedShare)
+			metrics.ReportClusterQueueWeightedShare(*cq, string(cq.Spec.CohortName), weightedShare)
 		}
 		if cq.Status.FairSharing == nil {
 			cq.Status.FairSharing = &kueue.FairSharingStatus{}
diff --git a/pkg/metrics/custom_tag.go b/pkg/metrics/custom_tag.go
new file mode 100644
index 00000000000..cb97d1c87e4
--- /dev/null
+++ b/pkg/metrics/custom_tag.go
@@ -0,0 +1,131 @@
+package metrics
+
+import (
+	"slices"
+	"sync"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/kueue/apis/config/v1beta1"
+)
+
+// customTagsConf is the custom tags configuration applied to the metrics of
+// this package. It is set by initCustomTagsMetric.
+var customTagsConf = CustomTagsConfiguration{}
+
+// CustomTagsConfiguration holds the custom tags of ClusterQueue and
+// LocalQueue metrics.
+type CustomTagsConfiguration struct {
+	ClusterQueue CustomTagsObjectConfiguration
+	LocalQueue   CustomTagsObjectConfiguration
+}
+
+// CustomTagsObjectConfiguration holds the custom tags of one kind of object.
+type CustomTagsObjectConfiguration struct {
+	// ResourceTags are the keys read from the labels or annotations of the object.
+	ResourceTags []string
+	// MetricTags are the metric label names, in the same order as ResourceTags.
+	MetricTags []string
+}
+
+// getPrometheusTag returns the metric label name of the tag: OverrideMetricTag
+// when it is set, ResourceTag otherwise.
+func getPrometheusTag(tag v1beta1.CustomMetricTag) string {
+	if tag.OverrideMetricTag != nil {
+		return *tag.OverrideMetricTag
+	}
+	return tag.ResourceTag
+}
+
+// Metric describes a metric vector that is created by MetricsGroup.Init.
+type Metric[T any] struct {
+	Name           string
+	Help           string
+	StandardLabels []string
+	Buckets        []float64
+	// globalVariable points to the variable that receives the created vector.
+	globalVariable **T
+	// clusterQueueCustomLabels and localQueueCustomLabels select the custom
+	// tags appended to StandardLabels.
+	clusterQueueCustomLabels bool
+	localQueueCustomLabels   bool
+}
+
+// MetricsGroup is a set of metrics of the same vector type T that are created
+// once by initFunc.
+type MetricsGroup[T any] struct {
+	metrics  []Metric[T]
+	initFunc func(Metric[T], []string) *T
+	once     sync.Once
+}
+
+// Metrics returns the metrics of the group.
+func (m *MetricsGroup[T]) Metrics() []Metric[T] {
+	return m.metrics
+}
+
+// InitFunc returns the function used to create the metrics of the group.
+func (m *MetricsGroup[T]) InitFunc() func(Metric[T], []string) *T {
+	return m.initFunc
+}
+
+// init creates every metric of the group with its standard labels followed by
+// the configured custom tags and stores it in the metric's global variable.
+func (m *MetricsGroup[T]) init() {
+	for i, metric := range m.metrics {
+		// Clone so that appending custom tags never writes to the shared StandardLabels.
+		labels := slices.Clone(metric.StandardLabels)
+		if metric.clusterQueueCustomLabels {
+			labels = append(labels, customTagsConf.ClusterQueue.MetricTags...)
+		}
+		if metric.localQueueCustomLabels {
+			labels = append(labels[:len(labels):len(labels)], customTagsConf.LocalQueue.MetricTags...) // extend the labels built so far; the capped slice forces a copy
+		}
+		*m.metrics[i].globalVariable = m.initFunc(metric, labels)
+	}
+}
+
+func getResourceTagValues(cq metav1.Object, customTags CustomTagsObjectConfiguration) []string {
+	tags := make([]string, 0, len(customTags.ResourceTags))
+	for _, tag := range customTags.ResourceTags {
+		t, ok := cq.GetLabels()[tag] // labels take precedence over annotations
+		if !ok {
+			t = cq.GetAnnotations()[tag]
+		}
+		tags = append(tags, t) // a key found in neither map yields ""
+	}
+	return tags
+}
+
+func getCustomTagsObjectConfiguration(customMetricTags []v1beta1.CustomMetricTag) CustomTagsObjectConfiguration {
+	customTagsObjectConf := CustomTagsObjectConfiguration{
+		ResourceTags: make([]string, 0, len(customMetricTags)),
+		MetricTags:   make([]string, 0, len(customMetricTags)),
+	}
+	for _, t := range customMetricTags {
+		customTagsObjectConf.ResourceTags = append(customTagsObjectConf.ResourceTags, t.ResourceTag)
+		customTagsObjectConf.MetricTags = append(customTagsObjectConf.MetricTags, getPrometheusTag(t))
+	}
+	return customTagsObjectConf
+}
+
+func getConfiguration(customMetricTags *v1beta1.CustomMetricTags) CustomTagsConfiguration {
+	if customMetricTags == nil {
+		return CustomTagsConfiguration{}
+	}
+	return CustomTagsConfiguration{
+		ClusterQueue: getCustomTagsObjectConfiguration(customMetricTags.ClusterQueue),
+		LocalQueue:   getCustomTagsObjectConfiguration(customMetricTags.LocalQueue),
+	}
+}
+
+func (m *MetricsGroup[T]) Init() {
+	m.once.Do(m.init)
+}
+
+func initCustomTagsMetric(customMetricsTagsConfiguration *v1beta1.CustomMetricTags) {
+	customTagsConf = getConfiguration(customMetricsTagsConfiguration)
+
+	customTagsCounterMetrics.Init()
+	customTagsGaugeMetrics.Init()
+	customTagsHistogramMetrics.Init()
+}
diff --git a/pkg/metrics/custom_tag_test.go b/pkg/metrics/custom_tag_test.go
new file mode 100644
index 00000000000..f913c136b41
--- /dev/null
+++ b/pkg/metrics/custom_tag_test.go
@@ -0,0 +1,215 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	configv1beta1 "sigs.k8s.io/kueue/apis/config/v1beta1"
+)
+
+func TestGetPrometheusTag(t *testing.T) {
+	overrideTag := "custom_tag"
+	tests := []struct {
+		name string
+		tag  configv1beta1.CustomMetricTag
+		want string
+	}{
+		{
+			name: "no override, use resource tag",
+			tag: configv1beta1.CustomMetricTag{
+				ResourceTag: "resource_label",
+			},
+			want: "resource_label",
+		},
+		{
+			name: "with override, use override tag",
+			tag: configv1beta1.CustomMetricTag{
+				ResourceTag:       "resource_label",
+				OverrideMetricTag: &overrideTag,
+			},
+			want: "custom_tag",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := getPrometheusTag(tt.tag)
+			if got != tt.want {
+				t.Errorf("getPrometheusTag() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestGetConfiguration(t *testing.T) {
+	overrideTag1 := "custom_cq_tag"
+	overrideTag2 := "custom_lq_tag"
+
+	tests := []struct {
+		name             string
+		customMetricTags configv1beta1.CustomMetricTags
+		want             CustomTagsConfiguration
+	}{
+		{
+			name: "cluster queue tags only",
+			customMetricTags: configv1beta1.CustomMetricTags{
+				ClusterQueue: []configv1beta1.CustomMetricTag{
+					{ResourceTag: "team"},
+					{ResourceTag: "env"},
+				},
+				LocalQueue: []configv1beta1.CustomMetricTag{},
+			},
+			want: CustomTagsConfiguration{
+				ClusterQueue: CustomTagsObjectConfiguration{
+					ResourceTags: []string{"team", "env"},
+					MetricTags:   []string{"team", "env"},
+				},
+				LocalQueue: CustomTagsObjectConfiguration{
+					ResourceTags: []string{},
+					MetricTags:   []string{},
+				},
+			},
+		},
+		{
+			name: "local queue tags only",
+			customMetricTags: configv1beta1.CustomMetricTags{
+				ClusterQueue: []configv1beta1.CustomMetricTag{},
+				LocalQueue: []configv1beta1.CustomMetricTag{
+					{ResourceTag: "project"},
+				},
+			},
+			want: CustomTagsConfiguration{
+				ClusterQueue: CustomTagsObjectConfiguration{
+					ResourceTags: []string{},
+					MetricTags:   []string{},
+				},
+				LocalQueue: CustomTagsObjectConfiguration{
+ ResourceTags: []string{"project"}, + MetricTags: []string{"project"}, + }, + }, + }, + { + name: "both cluster and local queue tags with overrides", + customMetricTags: configv1beta1.CustomMetricTags{ + ClusterQueue: []configv1beta1.CustomMetricTag{ + {ResourceTag: "team", OverrideMetricTag: &overrideTag1}, + {ResourceTag: "env"}, + }, + LocalQueue: []configv1beta1.CustomMetricTag{ + {ResourceTag: "project", OverrideMetricTag: &overrideTag2}, + }, + }, + want: CustomTagsConfiguration{ + ClusterQueue: CustomTagsObjectConfiguration{ + ResourceTags: []string{"team", "env"}, + MetricTags: []string{"custom_cq_tag", "env"}, + }, + LocalQueue: CustomTagsObjectConfiguration{ + ResourceTags: []string{"project"}, + MetricTags: []string{"custom_lq_tag"}, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := getConfiguration(&tt.customMetricTags) + if diff := cmp.Diff(tt.want, got); diff != "" { + t.Errorf("getConfiguration() mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestGetResourceTagValues(t *testing.T) { + tests := []struct { + name string + obj metav1.Object + customTags CustomTagsObjectConfiguration + want []string + }{ + { + name: "empty tags configuration", + obj: &metav1.ObjectMeta{ + Name: "test-queue", + Labels: map[string]string{ + "team": "platform", + }, + }, + customTags: CustomTagsObjectConfiguration{ + ResourceTags: []string{}, + }, + want: []string{}, + }, + { + name: "tags from labels only", + obj: &metav1.ObjectMeta{ + Name: "test-queue", + Labels: map[string]string{ + "team": "platform", + "env": "production", + }, + }, + customTags: CustomTagsObjectConfiguration{ + ResourceTags: []string{"team", "env"}, + }, + want: []string{"platform", "production"}, + }, + { + name: "tags from annotations only", + obj: &metav1.ObjectMeta{ + Name: "test-queue", + Annotations: map[string]string{ + "team": "data", + "cost": "high", + }, + }, + customTags: CustomTagsObjectConfiguration{ + ResourceTags: 
[]string{"team", "cost"}, + }, + want: []string{"data", "high"}, + }, + { + name: "labels take precedence over annotations", + obj: &metav1.ObjectMeta{ + Name: "test-queue", + Labels: map[string]string{ + "team": "platform", + }, + Annotations: map[string]string{ + "team": "data", + }, + }, + customTags: CustomTagsObjectConfiguration{ + ResourceTags: []string{"team"}, + }, + want: []string{"platform"}, + }, + { + name: "missing tags return empty strings", + obj: &metav1.ObjectMeta{ + Name: "test-queue", + Labels: map[string]string{ + "team": "platform", + }, + }, + customTags: CustomTagsObjectConfiguration{ + ResourceTags: []string{"team", "missing_tag"}, + }, + want: []string{"platform", ""}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := getResourceTagValues(tt.obj, tt.customTags) + + if diff := cmp.Diff(tt.want, got); diff != "" { + t.Errorf("getResourceTagValues() mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index c34bbac9d61..3358c50f35a 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -22,6 +22,7 @@ import ( "github.com/prometheus/client_golang/prometheus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/kueue/apis/config/v1beta1" kueue "sigs.k8s.io/kueue/apis/kueue/v1beta2" "sigs.k8s.io/kueue/pkg/constants" @@ -455,14 +456,7 @@ The label 'reason' can have the following values: // +metricsdoc:group=clusterqueue // +metricsdoc:labels=cluster_queue="the name of the ClusterQueue",status="one of `pending`, `active`, or `terminated`" - ClusterQueueByStatus = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: constants.KueueName, - Name: "cluster_queue_status", - Help: `Reports 'cluster_queue' with its 'status' (with possible values 'pending', 'active' or 'terminated'). 
-For a ClusterQueue, the metric only reports a value of 1 for one of the statuses.`, - }, []string{"cluster_queue", "status"}, - ) + ClusterQueueByStatus *prometheus.GaugeVec // +metricsdoc:group=localqueue // +metricsdoc:labels=name="the name of the LocalQueue",namespace="the namespace of the LocalQueue",active="one of `True`, `False`, or `Unknown`" @@ -479,23 +473,11 @@ For a LocalQueue, the metric only reports a value of 1 for one of the statuses.` // +metricsdoc:group=optional_clusterqueue_resources // +metricsdoc:labels=cohort="the name of the Cohort",cluster_queue="the name of the ClusterQueue",flavor="the resource flavor name",resource="the resource name" - ClusterQueueResourceReservations = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: constants.KueueName, - Name: "cluster_queue_resource_reservation", - Help: `Reports the cluster_queue's total resource reservation within all the flavors`, - }, []string{"cohort", "cluster_queue", "flavor", "resource"}, - ) + ClusterQueueResourceReservations *prometheus.GaugeVec // +metricsdoc:group=optional_clusterqueue_resources // +metricsdoc:labels=cohort="the name of the Cohort",cluster_queue="the name of the ClusterQueue",flavor="the resource flavor name",resource="the resource name" - ClusterQueueResourceUsage = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: constants.KueueName, - Name: "cluster_queue_resource_usage", - Help: `Reports the cluster_queue's total resource usage within all the flavors`, - }, []string{"cohort", "cluster_queue", "flavor", "resource"}, - ) + ClusterQueueResourceUsage *prometheus.GaugeVec // +metricsdoc:group=localqueue // +metricsdoc:labels=name="the name of the LocalQueue",namespace="the namespace of the LocalQueue",flavor="the resource flavor name",resource="the resource name" @@ -519,47 +501,19 @@ For a LocalQueue, the metric only reports a value of 1 for one of the statuses.` // +metricsdoc:group=optional_clusterqueue_resources // 
+metricsdoc:labels=cohort="the name of the Cohort",cluster_queue="the name of the ClusterQueue",flavor="the resource flavor name",resource="the resource name" - ClusterQueueResourceNominalQuota = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: constants.KueueName, - Name: "cluster_queue_nominal_quota", - Help: `Reports the cluster_queue's resource nominal quota within all the flavors`, - }, []string{"cohort", "cluster_queue", "flavor", "resource"}, - ) + ClusterQueueResourceNominalQuota *prometheus.GaugeVec // +metricsdoc:group=optional_clusterqueue_resources // +metricsdoc:labels=cohort="the name of the Cohort",cluster_queue="the name of the ClusterQueue",flavor="the resource flavor name",resource="the resource name" - ClusterQueueResourceBorrowingLimit = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: constants.KueueName, - Name: "cluster_queue_borrowing_limit", - Help: `Reports the cluster_queue's resource borrowing limit within all the flavors`, - }, []string{"cohort", "cluster_queue", "flavor", "resource"}, - ) + ClusterQueueResourceBorrowingLimit *prometheus.GaugeVec // +metricsdoc:group=optional_clusterqueue_resources // +metricsdoc:labels=cohort="the name of the Cohort",cluster_queue="the name of the ClusterQueue",flavor="the resource flavor name",resource="the resource name" - ClusterQueueResourceLendingLimit = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: constants.KueueName, - Name: "cluster_queue_lending_limit", - Help: `Reports the cluster_queue's resource lending limit within all the flavors`, - }, []string{"cohort", "cluster_queue", "flavor", "resource"}, - ) + ClusterQueueResourceLendingLimit *prometheus.GaugeVec // +metricsdoc:group=optional_clusterqueue_resources // +metricsdoc:labels=cluster_queue="the name of the ClusterQueue",cohort="the name of the Cohort" - ClusterQueueWeightedShare = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Subsystem: constants.KueueName, - Name: 
"cluster_queue_weighted_share", - Help: `Reports a value that representing the maximum of the ratios of usage above nominal -quota to the lendable resources in the cohort, among all the resources provided by -the ClusterQueue, and divided by the weight. -If zero, it means that the usage of the ClusterQueue is below the nominal quota. -If the ClusterQueue has a weight of zero and is borrowing, this will return NaN.`, - }, []string{"cluster_queue", "cohort"}, - ) + ClusterQueueWeightedShare *prometheus.GaugeVec // +metricsdoc:group=cohort // +metricsdoc:labels=cohort="the name of the Cohort" @@ -576,6 +530,103 @@ If the Cohort has a weight of zero and is borrowing, this will return NaN.`, ) ) +var standardClusterQueueLabels = []string{"cohort", "cluster_queue", "flavor", "resource"} + +var customTagsHistogramMetrics = MetricsGroup[prometheus.HistogramVec]{ + metrics: []Metric[prometheus.HistogramVec]{}, + initFunc: func(metric Metric[prometheus.HistogramVec], labels []string) *prometheus.HistogramVec { + return prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: constants.KueueName, + Name: metric.Name, + Help: metric.Help, + Buckets: metric.Buckets, + }, labels, + ) + }, +} + +var customTagsCounterMetrics = MetricsGroup[prometheus.CounterVec]{ + metrics: []Metric[prometheus.CounterVec]{}, + initFunc: func(metric Metric[prometheus.CounterVec], labels []string) *prometheus.CounterVec { + return prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: metric.Name, + Help: metric.Help, + }, labels, + ) + }, +} + +var customTagsGaugeMetrics = MetricsGroup[prometheus.GaugeVec]{ + metrics: []Metric[prometheus.GaugeVec]{ + { + Name: "cluster_queue_resource_usage", + Help: `Reports the cluster_queue's total resource usage within all the flavors`, + StandardLabels: standardClusterQueueLabels, + globalVariable: &ClusterQueueResourceUsage, + clusterQueueCustomLabels: true, + }, + { + Name: "cluster_queue_status", + Help: 
`Reports 'cluster_queue' with its 'status' (with possible values 'pending', 'active' or 'terminated'). +For a ClusterQueue, the metric only reports a value of 1 for one of the statuses.`, + StandardLabels: []string{"cluster_queue", "status"}, + globalVariable: &ClusterQueueByStatus, + }, + { + Name: "cluster_queue_resource_reservation", + Help: `Reports the cluster_queue's total resource reservation within all the flavors`, + StandardLabels: standardClusterQueueLabels, + globalVariable: &ClusterQueueResourceReservations, + clusterQueueCustomLabels: true, + }, + { + Name: "cluster_queue_nominal_quota", + Help: `Reports the cluster_queue's resource nominal quota within all the flavors`, + StandardLabels: standardClusterQueueLabels, + globalVariable: &ClusterQueueResourceNominalQuota, + clusterQueueCustomLabels: true, + }, + { + Name: "cluster_queue_borrowing_limit", + Help: `Reports the cluster_queue's resource borrowing limit within all the flavors`, + StandardLabels: standardClusterQueueLabels, + globalVariable: &ClusterQueueResourceBorrowingLimit, + clusterQueueCustomLabels: true, + }, + { + Name: "cluster_queue_lending_limit", + Help: `Reports the cluster_queue's resource lending limit within all the flavors`, + StandardLabels: standardClusterQueueLabels, + globalVariable: &ClusterQueueResourceLendingLimit, + clusterQueueCustomLabels: true, + }, + { + Name: "cluster_queue_weighted_share", + Help: `Reports a value that representing the maximum of the ratios of usage above nominal + quota to the lendable resources in the cohort, among all the resources provided by + the ClusterQueue, and divided by the weight. + If zero, it means that the usage of the ClusterQueue is below the nominal quota. 
+If the ClusterQueue has a weight of zero and is borrowing,
+this will return NaN.`,
+			StandardLabels:           []string{"cluster_queue", "cohort"}, // order must match ReportClusterQueueWeightedShare
+			globalVariable:           &ClusterQueueWeightedShare,
+			clusterQueueCustomLabels: true,
+		},
+	},
+	initFunc: func(metric Metric[prometheus.GaugeVec], labels []string) *prometheus.GaugeVec {
+		return prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Subsystem: constants.KueueName,
+				Name:      metric.Name,
+				Help:      metric.Help,
+			}, labels,
+		)
+	},
+}
+
 func init() {
 	versionInfo := version.Get()
 	buildInfo.WithLabelValues(versionInfo.GitVersion, versionInfo.GitCommit, versionInfo.BuildDate, versionInfo.GoVersion, versionInfo.Compiler, versionInfo.Platform).Set(1)
@@ -741,32 +792,40 @@ func ClearLocalQueueCacheMetrics(lq LocalQueueReference) {
 	}
 }
 
-func ReportClusterQueueQuotas(cohort kueue.CohortReference, queue, flavor, resource string, nominal, borrowing, lending float64) {
-	ClusterQueueResourceNominalQuota.WithLabelValues(string(cohort), queue, flavor, resource).Set(nominal)
-	ClusterQueueResourceBorrowingLimit.WithLabelValues(string(cohort), queue, flavor, resource).Set(borrowing)
+func ReportClusterQueueQuotas(cohort kueue.CohortReference, queue kueue.ClusterQueue, flavor, resource string, nominal, borrowing, lending float64) {
+	labels := []string{string(cohort), queue.Name, flavor, resource}
+	labels = append(labels, getResourceTagValues(&queue, customTagsConf.ClusterQueue)...)
+ ClusterQueueResourceNominalQuota.WithLabelValues(labels...).Set(nominal) + ClusterQueueResourceBorrowingLimit.WithLabelValues(labels...).Set(borrowing) if features.Enabled(features.LendingLimit) { - ClusterQueueResourceLendingLimit.WithLabelValues(string(cohort), queue, flavor, resource).Set(lending) + ClusterQueueResourceLendingLimit.WithLabelValues(labels...).Set(lending) } } -func ReportClusterQueueResourceReservations(cohort kueue.CohortReference, queue, flavor, resource string, usage float64) { - ClusterQueueResourceReservations.WithLabelValues(string(cohort), queue, flavor, resource).Set(usage) +func ReportClusterQueueResourceReservations(cohort kueue.CohortReference, queue kueue.ClusterQueue, flavor, resource string, usage float64) { + labels := []string{string(cohort), queue.Name, flavor, resource} + labels = append(labels, getResourceTagValues(&queue, customTagsConf.ClusterQueue)...) + ClusterQueueResourceReservations.WithLabelValues(labels...).Set(usage) } func ReportLocalQueueResourceReservations(lq LocalQueueReference, flavor, resource string, usage float64) { LocalQueueResourceReservations.WithLabelValues(string(lq.Name), lq.Namespace, flavor, resource).Set(usage) } -func ReportClusterQueueResourceUsage(cohort kueue.CohortReference, queue, flavor, resource string, usage float64) { - ClusterQueueResourceUsage.WithLabelValues(string(cohort), queue, flavor, resource).Set(usage) +func ReportClusterQueueResourceUsage(cohort kueue.CohortReference, queue kueue.ClusterQueue, flavor, resource string, usage float64) { + labels := []string{string(cohort), queue.Name, flavor, resource} + labels = append(labels, getResourceTagValues(&queue, customTagsConf.ClusterQueue)...) 
+ ClusterQueueResourceUsage.WithLabelValues(labels...).Set(usage) } func ReportLocalQueueResourceUsage(lq LocalQueueReference, flavor, resource string, usage float64) { LocalQueueResourceUsage.WithLabelValues(string(lq.Name), lq.Namespace, flavor, resource).Set(usage) } -func ReportClusterQueueWeightedShare(cq, cohort string, weightedShare float64) { - ClusterQueueWeightedShare.WithLabelValues(cq, cohort).Set(weightedShare) +func ReportClusterQueueWeightedShare(cq kueue.ClusterQueue, cohort string, weightedShare float64) { + labels := []string{cq.Name, cohort} + labels = append(labels, getResourceTagValues(&cq, customTagsConf.ClusterQueue)...) + ClusterQueueWeightedShare.WithLabelValues(labels...).Set(weightedShare) } func ReportCohortWeightedShare(cohort string, weightedShare float64) { @@ -838,7 +897,12 @@ func ClearClusterQueueResourceReservations(cqName, flavor, resource string) { ClusterQueueResourceReservations.DeletePartialMatch(lbls) } -func Register() { +type Configuration struct { + CustomTags v1beta1.CustomMetricTags +} + +func Register(configuration Configuration) { + initCustomTagsMetric(&configuration.CustomTags) metrics.Registry.MustRegister( buildInfo, AdmissionAttemptsTotal, diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index aa245cfb14e..453d66ff593 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -21,12 +21,17 @@ import ( "github.com/google/go-cmp/cmp" "github.com/prometheus/client_golang/prometheus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kueue "sigs.k8s.io/kueue/apis/kueue/v1beta2" "sigs.k8s.io/kueue/pkg/util/testing/metrics" "sigs.k8s.io/kueue/pkg/version" ) +func init() { + initCustomTagsMetric(nil) +} + func expectFilteredMetricsCount(t *testing.T, vec prometheus.Collector, count int, kvs ...string) { labels := prometheus.Labels{} for i := range len(kvs) / 2 { @@ -48,18 +53,24 @@ func TestGenerateExponentialBuckets(t *testing.T) { } func TestReportAndCleanupClusterQueueMetrics(t 
*testing.T) { - ReportClusterQueueQuotas("cohort", "queue", "flavor", "res", 5, 10, 3) - ReportClusterQueueQuotas("cohort", "queue", "flavor2", "res", 1, 2, 1) + clusterQueue := kueue.ClusterQueue{ + ObjectMeta: metav1.ObjectMeta{ + Name: "queue", + }, + } + + ReportClusterQueueQuotas("cohort", clusterQueue, "flavor", "res", 5, 10, 3) + ReportClusterQueueQuotas("cohort", clusterQueue, "flavor2", "res", 1, 2, 1) expectFilteredMetricsCount(t, ClusterQueueResourceNominalQuota, 2, "cluster_queue", "queue") expectFilteredMetricsCount(t, ClusterQueueResourceBorrowingLimit, 2, "cluster_queue", "queue") expectFilteredMetricsCount(t, ClusterQueueResourceLendingLimit, 2, "cluster_queue", "queue") - ReportClusterQueueResourceReservations("cohort", "queue", "flavor", "res", 7) - ReportClusterQueueResourceReservations("cohort", "queue", "flavor2", "res", 3) + ReportClusterQueueResourceReservations("cohort", clusterQueue, "flavor", "res", 7) + ReportClusterQueueResourceReservations("cohort", clusterQueue, "flavor2", "res", 3) - ReportClusterQueueResourceUsage("cohort", "queue", "flavor", "res", 7) - ReportClusterQueueResourceUsage("cohort", "queue", "flavor2", "res", 3) + ReportClusterQueueResourceUsage("cohort", clusterQueue, "flavor", "res", 7) + ReportClusterQueueResourceUsage("cohort", clusterQueue, "flavor2", "res", 3) expectFilteredMetricsCount(t, ClusterQueueResourceReservations, 2, "cluster_queue", "queue") expectFilteredMetricsCount(t, ClusterQueueResourceUsage, 2, "cluster_queue", "queue") @@ -74,10 +85,16 @@ func TestReportAndCleanupClusterQueueMetrics(t *testing.T) { } func TestReportAndCleanupClusterQueueQuotas(t *testing.T) { - ReportClusterQueueQuotas("cohort", "queue", "flavor", "res", 5, 10, 3) - ReportClusterQueueQuotas("cohort", "queue", "flavor", "res2", 5, 10, 3) - ReportClusterQueueQuotas("cohort", "queue", "flavor2", "res", 1, 2, 1) - ReportClusterQueueQuotas("cohort", "queue", "flavor2", "res2", 1, 2, 1) + clusterQueue := kueue.ClusterQueue{ + ObjectMeta: 
metav1.ObjectMeta{ + Name: "queue", + }, + } + + ReportClusterQueueQuotas("cohort", clusterQueue, "flavor", "res", 5, 10, 3) + ReportClusterQueueQuotas("cohort", clusterQueue, "flavor", "res2", 5, 10, 3) + ReportClusterQueueQuotas("cohort", clusterQueue, "flavor2", "res", 1, 2, 1) + ReportClusterQueueQuotas("cohort", clusterQueue, "flavor2", "res2", 1, 2, 1) expectFilteredMetricsCount(t, ClusterQueueResourceNominalQuota, 4, "cluster_queue", "queue") expectFilteredMetricsCount(t, ClusterQueueResourceBorrowingLimit, 4, "cluster_queue", "queue") @@ -107,10 +124,16 @@ func TestReportAndCleanupClusterQueueQuotas(t *testing.T) { } func TestReportAndCleanupClusterQueueUsage(t *testing.T) { - ReportClusterQueueResourceReservations("cohort", "queue", "flavor", "res", 5) - ReportClusterQueueResourceReservations("cohort", "queue", "flavor", "res2", 5) - ReportClusterQueueResourceReservations("cohort", "queue", "flavor2", "res", 1) - ReportClusterQueueResourceReservations("cohort", "queue", "flavor2", "res2", 1) + clusterQueue := kueue.ClusterQueue{ + ObjectMeta: metav1.ObjectMeta{ + Name: "queue", + }, + } + + ReportClusterQueueResourceReservations("cohort", clusterQueue, "flavor", "res", 5) + ReportClusterQueueResourceReservations("cohort", clusterQueue, "flavor", "res2", 5) + ReportClusterQueueResourceReservations("cohort", clusterQueue, "flavor2", "res", 1) + ReportClusterQueueResourceReservations("cohort", clusterQueue, "flavor2", "res2", 1) expectFilteredMetricsCount(t, ClusterQueueResourceReservations, 4, "cluster_queue", "queue") @@ -126,10 +149,10 @@ func TestReportAndCleanupClusterQueueUsage(t *testing.T) { expectFilteredMetricsCount(t, ClusterQueueResourceReservations, 1, "cluster_queue", "queue") expectFilteredMetricsCount(t, ClusterQueueResourceReservations, 0, "cluster_queue", "queue", "flavor", "flavor", "resource", "res2") - ReportClusterQueueResourceUsage("cohort", "queue", "flavor", "res", 5) - ReportClusterQueueResourceUsage("cohort", "queue", "flavor", 
"res2", 5) - ReportClusterQueueResourceUsage("cohort", "queue", "flavor2", "res", 1) - ReportClusterQueueResourceUsage("cohort", "queue", "flavor2", "res2", 1) + ReportClusterQueueResourceUsage("cohort", clusterQueue, "flavor", "res", 5) + ReportClusterQueueResourceUsage("cohort", clusterQueue, "flavor", "res2", 5) + ReportClusterQueueResourceUsage("cohort", clusterQueue, "flavor2", "res", 1) + ReportClusterQueueResourceUsage("cohort", clusterQueue, "flavor2", "res2", 1) expectFilteredMetricsCount(t, ClusterQueueResourceUsage, 4, "cluster_queue", "queue") diff --git a/test/performance/scheduler/minimalkueue/main.go b/test/performance/scheduler/minimalkueue/main.go index 919750a583e..0a6ffe269f8 100644 --- a/test/performance/scheduler/minimalkueue/main.go +++ b/test/performance/scheduler/minimalkueue/main.go @@ -128,7 +128,7 @@ func mainWithExitCode() int { if *metricsPort > 0 { options.Metrics.BindAddress = fmt.Sprintf(":%d", *metricsPort) - metrics.Register() + metrics.Register(metrics.Configuration{}) } mgr, err := ctrl.NewManager(kubeConfig, options)