From ff328e661b2a4b405070d1e90569276dca9d6bb1 Mon Sep 17 00:00:00 2001 From: Kevin Hannon Date: Fri, 21 Feb 2025 11:41:58 -0500 Subject: [PATCH 1/2] add metrics for workloads count --- cmd/kueue/main.go | 4 + pkg/metrics/metrics.go | 254 ++++++++++++++++++++++++++++++++++++ pkg/metrics/metrics_test.go | 14 ++ 3 files changed, 272 insertions(+) diff --git a/cmd/kueue/main.go b/cmd/kueue/main.go index afd6baf80f..1f3a7f9edf 100644 --- a/cmd/kueue/main.go +++ b/cmd/kueue/main.go @@ -349,6 +349,10 @@ func setupControllers(ctx context.Context, mgr ctrl.Manager, cCache *cache.Cache setupLog.Error(err, "Unable to create controller or webhook", "kubernetesVersion", serverVersionFetcher.GetServerVersion()) os.Exit(1) } + + // Set metrics for workloads enablement + metrics.ReportExternalFrameworksSupported(cfg.Integrations.ExternalFrameworks) + metrics.ReportIntegrationsFrameworksSupported(cfg.Integrations.Frameworks) } // setupProbeEndpoints registers the health endpoints diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 62f01cab44..285960502e 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -31,6 +31,7 @@ import ( type AdmissionResult string type ClusterQueueStatus string +type FrameworkBool string type LocalQueueReference struct { Name string @@ -57,6 +58,11 @@ const ( CQStatusActive ClusterQueueStatus = "active" // CQStatusTerminating means the clusterQueue is in pending deletion. 
CQStatusTerminating ClusterQueueStatus = "terminating" + + // Provide a label to state if the frameworks are + // enabled in the configuration + FrameworkEnabled FrameworkBool = "true" + FrameworkDisabled FrameworkBool = "false" ) var ( @@ -369,6 +375,147 @@ If the ClusterQueue has a weight of zero, this will return 9223372036854775807, the maximum possible share value.`, }, []string{"cluster_queue"}, ) + + // Metrics counting usage of frameworks + + // Counter of jobs that are managed by Kueue + JobWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "job_workload_count", + Help: `The number of batch jobs that Kueue is/has managed. +Enabled will have a value on true or false +And the count will track the number of jobs that Kueue manages.`, + }, []string{"enabled"}, + ) + + JobSetWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "jobset_workload_count", + Help: `The number of jobsets that Kueue is/has managed +Enabled will have a value on true or false +And the count will track the number of jobsets that Kueue manages.`, + }, []string{"enabled"}, + ) + + AppWrapperWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "appwrapper_workload_count", + Help: `The number of appwrappers that Kueue is/has managed +Enabled will have a value on true or false +And the count will track the number of appwrappers that Kueue manages.`, + }, []string{"enabled"}, + ) + + PodWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "pod_workload_count", + Help: `The number of pod-based workloads that Kueue is/has managed. 
+Enabled will have a value on true or false +And the count will track the number of pod-based workloads that Kueue manages.`, + }, []string{"enabled"}, + ) + + StatefulsetWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "statefulset_workload_count", + Help: `The number of statefulsets that Kueue is/has managed. +Enabled will have a value on true or false +And the count will track the number of statefulsets that Kueue manages.`, + }, []string{"enabled"}, + ) + + DeploymentWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "deployment_workload_count", + Help: `The number of deployments that Kueue is/has managed. +Enabled will have a value on true or false +And the count will track the number of deployments that Kueue manages.`, + }, []string{"enabled"}, + ) + + LeaderWorkerSetWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "leaderworkerset_workload_count", + Help: `The number of leaderworkersets that Kueue is/has managed. +Enabled will have a value on true or false +And the count will track the number of leaderworkersets that Kueue manages.`, + }, []string{"enabled"}, + ) + + KubeflowMPIWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "kubeflow_mpi_workload_count", + Help: `The number of mpi jobs that Kueue is/has managed. +Enabled will have a value of true or false`, + }, []string{"enabled"}, + ) + + KubeflowPyTorchJobWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "kubeflow_pytorch_workload_count", + Help: `The number of pytorch jobs that Kueue is/has managed. 
+Enabled will have a value of true or false`, + }, []string{"enabled"}, + ) + + KubeflowTensorFlowJobWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "kubeflow_tensorflow_workload_count", + Help: `The number of tensorflow jobs that Kueue is/has managed. +Enabled will have a value of true or false.`, + }, []string{"enabled"}, + ) + + KubeflowPaddleJobWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "kubeflow_paddle_workload_count", + Help: `The number of paddle jobs that Kueue is/has managed. +Enabled will have a value of true or false.`, + }, []string{"enabled"}, + ) + + KubeflowXGBoostJobWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "kubeflow_xgboost_workload_count", + Help: "The number of XGBoost jobs that Kueue is/has managed", + }, []string{"enabled"}, + ) + + RayJobWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "rayjob_workload_count", + Help: "The number of ray job workloads that Kueue is/has managed", + }, []string{"enabled"}, + ) + + RayClusterWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "raycluster_workload_count", + Help: "The number of ray cluster workloads that Kueue is/has managed", + }, []string{"enabled"}, + ) + + ExternalFrameworksWorkloadCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "externalframeworks_workload_count", + Help: "The number of external frameworks that Kueue is/has managed", + }, []string{"enabled"}, + ) ) func generateExponentialBuckets(count int) []float64 { @@ -431,6 +578,113 @@ func ReportPreemption(preemptingCqName, preemptingReason, targetCqName string) { ReportEvictedWorkloads(targetCqName, kueue.WorkloadEvictedByPreemption) } +func 
ReportIntegrationsFrameworksSupported(frameworks []string) { + if len(frameworks) == 0 { + JobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + JobSetWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + RayJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + RayClusterWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + PodWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + LeaderWorkerSetWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + StatefulsetWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + AppWrapperWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + KubeflowMPIWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + KubeflowPyTorchJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + KubeflowPaddleJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + KubeflowTensorFlowJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + KubeflowXGBoostJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) + } + for _, val := range frameworks { + switch val { + case "batch/job": + JobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "jobset.x-k8s.io/jobset": + JobSetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "ray.io/rayjob": + RayJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "kubeflow.org/mpijob": + KubeflowMPIWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "kubeflow.org/pytorchjob": + KubeflowPyTorchJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "kubeflow.org/tfjob": + KubeflowTensorFlowJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "kubeflow.org/xgboostjob": + KubeflowXGBoostJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "workload.codeflare.dev/appwrapper": + 
AppWrapperWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "pod": + PodWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "statefulset": + StatefulsetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "deployment": + DeploymentWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + case "leaderworkerset.x-k8s.io/leaderworkerset": + LeaderWorkerSetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) + } + + } +} + +func ReportExternalFrameworksSupported(externalFrameworks []string) { + var externalFramework FrameworkBool = FrameworkDisabled + if len(externalFrameworks) > 0 { + externalFramework = FrameworkEnabled + } + ExternalFrameworksWorkloadCounter.WithLabelValues(string(externalFramework)).Add(0) +} + +func CountExternalFramework() { + ExternalFrameworksWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountJobWorkloads() { + JobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountJobSetWorkloads() { + JobSetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountRayJobWorkloads() { + RayJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountRayClusterWorkloads() { + RayClusterWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountMPIJobWorkloads() { + KubeflowMPIWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountPyTorchJobWorkloads() { + KubeflowPyTorchJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountPaddleJobWorkloads() { + KubeflowPaddleJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountXGBoostJobWorkloads() { + KubeflowXGBoostJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountPodWorkloads() { + PodWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountStatefulSetWorkloads() { + 
StatefulsetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountDeploymentWorkloads() { + DeploymentWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + +func CountLeaderWorkerSetWorkloads() { + LeaderWorkerSetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +} + func LQRefFromWorkload(wl *kueue.Workload) LocalQueueReference { return LocalQueueReference{ Name: wl.Spec.QueueName, diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 65b1547efe..0904731a5d 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -173,3 +173,17 @@ func TestReportAndCleanupClusterQueuePreemptedNumber(t *testing.T) { expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 0, "preempting_cluster_queue", "cluster_queue1") expectFilteredMetricsCount(t, EvictedWorkloadsTotal, 0, "cluster_queue", "cluster_queue1") } + +func TestReportExternalFrameworkMetricsEnabled(t *testing.T) { + ReportExternalFrameworksSupported([]string{"test"}) + expectFilteredMetricsCount(t, ExternalFrameworksWorkloadCounter, 1, "enabled", string(FrameworkEnabled)) +} + +func TestReportExternalFrameworkMetricsDisabled(t *testing.T) { + ReportExternalFrameworksSupported([]string{}) + expectFilteredMetricsCount(t, ExternalFrameworksWorkloadCounter, 1, "enabled", string(FrameworkDisabled)) +} + +func TestReportIntegrationsFrameworksSupported(t *testing.T) { + ReportIntegrationsFrameworksSupported([]string{}) +} From 4639bf51342c6458a1353e9c1bb0de39fdf0dc46 Mon Sep 17 00:00:00 2001 From: Kevin Hannon Date: Fri, 21 Feb 2025 17:03:06 -0500 Subject: [PATCH 2/2] use a metric for internal frameworks and one for external frameworks --- pkg/metrics/metrics.go | 244 +++--------------------------------- pkg/metrics/metrics_test.go | 7 +- 2 files changed, 21 insertions(+), 230 deletions(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 285960502e..99fb3ff5d6 100644 --- a/pkg/metrics/metrics.go +++ 
b/pkg/metrics/metrics.go @@ -58,11 +58,6 @@ const ( CQStatusActive ClusterQueueStatus = "active" // CQStatusTerminating means the clusterQueue is in pending deletion. CQStatusTerminating ClusterQueueStatus = "terminating" - - // Provide a label to state if the frameworks are - // enabled in the configuration - FrameworkEnabled FrameworkBool = "true" - FrameworkDisabled FrameworkBool = "false" ) var ( @@ -379,142 +374,22 @@ the maximum possible share value.`, // Metrics counting usage of frameworks // Counter of jobs that are managed by Kueue - JobWorkloadCounter = prometheus.NewCounterVec( + WorkloadIntegrationsCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Subsystem: constants.KueueName, - Name: "job_workload_count", + Name: "workload_integrations", Help: `The number of batch jobs that Kueue is/has managed. -Enabled will have a value on true or false -And the count will track the number of jobs that Kueue manages.`, - }, []string{"enabled"}, - ) - - JobSetWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "jobset_workload_count", - Help: `The number of jobsets that Kueue is/has managed -Enabled will have a value on true or false -And the count will track the number of jobsets that Kueue manages.`, - }, []string{"enabled"}, - ) - - AppWrapperWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "appwrapper_workload_count", - Help: `The number of appwrappers that Kueue is/has managed -Enabled will have a value on true or false -And the count will track the number of appwrappers that Kueue manages.`, - }, []string{"enabled"}, - ) - - PodWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "pod_workload_count", - Help: `The number of pod-based workloads that Kueue is/has managed. 
-Enabled will have a value on true or false -And the count will track the number of pod-based workloads that Kueue manages.`, - }, []string{"enabled"}, - ) - - StatefulsetWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "statefulset_workload_count", - Help: `The number of statefulsets that Kueue is/has managed. -Enabled will have a value on true or false -And the count will track the number of statefulsets that Kueue manages.`, - }, []string{"enabled"}, - ) - - DeploymentWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "deployment_workload_count", - Help: `The number of deployments that Kueue is/has managed. -Enabled will have a value on true or false -And the count will track the number of deployments that Kueue manages.`, - }, []string{"enabled"}, - ) - - LeaderWorkerSetWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "leaderworkerset_workload_count", - Help: `The number of leaderworkersets that Kueue is/has managed. -Enabled will have a value on true or false -And the count will track the number of leaderworkersets that Kueue manages.`, - }, []string{"enabled"}, - ) - - KubeflowMPIWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "kubeflow_mpi_workload_count", - Help: `The number of mpi jobs that Kueue is/has managed. -Enabled will have a value of true or false`, - }, []string{"enabled"}, - ) - - KubeflowPyTorchJobWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "kubeflow_pytorch_workload_count", - Help: `The number of pytorch jobs that Kueue is/has managed. 
-Enabled will have a value of true or false`, - }, []string{"enabled"}, - ) - - KubeflowTensorFlowJobWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "kubeflow_tensorflow_workload_count", - Help: `The number of tensorflow jobs that Kueue is/has managed. -Enabled will have a value of true or false.`, - }, []string{"enabled"}, - ) - - KubeflowPaddleJobWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "kubeflow_paddle_workload_count", - Help: `The number of paddle jobs that Kueue is/has managed. -Enabled will have a value of true or false.`, - }, []string{"enabled"}, - ) - - KubeflowXGBoostJobWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "kubeflow_xgboost_workload_count", - Help: "The number of XGBoost jobs that Kueue is/has managed", - }, []string{"enabled"}, - ) - - RayJobWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "rayjob_workload_count", - Help: "The number of ray job workloads that Kueue is/has managed", - }, []string{"enabled"}, - ) - - RayClusterWorkloadCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: constants.KueueName, - Name: "raycluster_workload_count", - Help: "The number of ray cluster workloads that Kueue is/has managed", - }, []string{"enabled"}, +Workload will specify what kind of workload`, + }, []string{"workload"}, ) ExternalFrameworksWorkloadCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Subsystem: constants.KueueName, - Name: "externalframeworks_workload_count", - Help: "The number of external frameworks that Kueue is/has managed", - }, []string{"enabled"}, + Name: "workload_external_integrations", + Help: `The number of external frameworks that Kueue is/has managed. 
+Workload will specify what kind of workload`, + }, []string{"workload"}, ) ) @@ -579,110 +454,23 @@ func ReportPreemption(preemptingCqName, preemptingReason, targetCqName string) { } func ReportIntegrationsFrameworksSupported(frameworks []string) { - if len(frameworks) == 0 { - JobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - JobSetWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - RayJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - RayClusterWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - PodWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - LeaderWorkerSetWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - StatefulsetWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - AppWrapperWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - KubeflowMPIWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - KubeflowPyTorchJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - KubeflowPaddleJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - KubeflowTensorFlowJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - KubeflowXGBoostJobWorkloadCounter.WithLabelValues(string(FrameworkDisabled)).Add(0) - } for _, val := range frameworks { - switch val { - case "batch/job": - JobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "jobset.x-k8s.io/jobset": - JobSetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "ray.io/rayjob": - RayJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "kubeflow.org/mpijob": - KubeflowMPIWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "kubeflow.org/pytorchjob": - KubeflowPyTorchJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "kubeflow.org/tfjob": - KubeflowTensorFlowJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) 
- case "kubeflow.org/xgboostjob": - KubeflowXGBoostJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "workload.codeflare.dev/appwrapper": - AppWrapperWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "pod": - PodWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "statefulset": - StatefulsetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "deployment": - DeploymentWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - case "leaderworkerset.x-k8s.io/leaderworkerset": - LeaderWorkerSetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Add(0) - } - + WorkloadIntegrationsCounter.WithLabelValues(val).Add(0) } } func ReportExternalFrameworksSupported(externalFrameworks []string) { - var externalFramework FrameworkBool = FrameworkDisabled - if len(externalFrameworks) > 0 { - externalFramework = FrameworkEnabled + for _, val := range externalFrameworks { + ExternalFrameworksWorkloadCounter.WithLabelValues(val).Add(0) } - ExternalFrameworksWorkloadCounter.WithLabelValues(string(externalFramework)).Add(0) -} - -func CountExternalFramework() { - ExternalFrameworksWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountJobWorkloads() { - JobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountJobSetWorkloads() { - JobSetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountRayJobWorkloads() { - RayJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountRayClusterWorkloads() { - RayClusterWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountMPIJobWorkloads() { - KubeflowMPIWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountPyTorchJobWorkloads() { - KubeflowPyTorchJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountPaddleJobWorkloads() { - 
KubeflowPaddleJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountXGBoostJobWorkloads() { - KubeflowXGBoostJobWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountPodWorkloads() { - PodWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() -} - -func CountStatefulSetWorkloads() { - StatefulsetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() } -func CountDeploymentWorkloads() { - DeploymentWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +func CountExternalFramework(externalGVK string) { + ExternalFrameworksWorkloadCounter.WithLabelValues(externalGVK).Inc() } -func CountLeaderWorkerSetWorkloads() { - LeaderWorkerSetWorkloadCounter.WithLabelValues(string(FrameworkEnabled)).Inc() +func CountFramework(framework string) { + WorkloadIntegrationsCounter.WithLabelValues(framework).Inc() } func LQRefFromWorkload(wl *kueue.Workload) LocalQueueReference { @@ -879,6 +667,8 @@ func Register() { ClusterQueueResourceBorrowingLimit, ClusterQueueResourceLendingLimit, ClusterQueueWeightedShare, + WorkloadIntegrationsCounter, + ExternalFrameworksWorkloadCounter, ) if features.Enabled(features.LocalQueueMetrics) { RegisterLQMetrics() diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 0904731a5d..f61118dee8 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -176,14 +176,15 @@ func TestReportAndCleanupClusterQueuePreemptedNumber(t *testing.T) { func TestReportExternalFrameworkMetricsEnabled(t *testing.T) { ReportExternalFrameworksSupported([]string{"test"}) - expectFilteredMetricsCount(t, ExternalFrameworksWorkloadCounter, 1, "enabled", string(FrameworkEnabled)) + expectFilteredMetricsCount(t, ExternalFrameworksWorkloadCounter, 1, "workload", "test") } func TestReportExternalFrameworkMetricsDisabled(t *testing.T) { ReportExternalFrameworksSupported([]string{}) - expectFilteredMetricsCount(t, ExternalFrameworksWorkloadCounter, 1, "enabled", 
string(FrameworkDisabled)) + expectFilteredMetricsCount(t, ExternalFrameworksWorkloadCounter, 0) } func TestReportIntegrationsFrameworksSupported(t *testing.T) { - ReportIntegrationsFrameworksSupported([]string{}) + ReportIntegrationsFrameworksSupported([]string{"batch/job"}) + expectFilteredMetricsCount(t, WorkloadIntegrationsCounter, 1, "workload", "batch/job") }