Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: add metrics for workloads count #4350

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cmd/kueue/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,10 @@ func setupControllers(ctx context.Context, mgr ctrl.Manager, cCache *cache.Cache
setupLog.Error(err, "Unable to create controller or webhook", "kubernetesVersion", serverVersionFetcher.GetServerVersion())
os.Exit(1)
}

// Set metrics for workloads enablement
metrics.ReportExternalFrameworksSupported(cfg.Integrations.ExternalFrameworks)
metrics.ReportIntegrationsFrameworksSupported(cfg.Integrations.Frameworks)
}

// setupProbeEndpoints registers the health endpoints
Expand Down
44 changes: 44 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (

// AdmissionResult is a string-backed type.
// NOTE(review): presumably used as the value of an admission-result metric
// label; its uses are outside the visible part of the file — confirm.
type AdmissionResult string

// ClusterQueueStatus is a string-backed type.
// NOTE(review): presumably used as the value of a ClusterQueue status metric
// label; its uses are outside the visible part of the file — confirm.
type ClusterQueueStatus string

// FrameworkBool is a string-backed type.
// NOTE(review): nothing in the visible changes references this type; confirm
// it is needed before merging.
type FrameworkBool string

type LocalQueueReference struct {
Name string
Expand Down Expand Up @@ -369,6 +370,27 @@ If the ClusterQueue has a weight of zero, this will return 9223372036854775807,
the maximum possible share value.`,
}, []string{"cluster_queue"},
)

// Metrics counting usage of frameworks

// WorkloadIntegrationsCounter counts the jobs managed by Kueue. Series are
// partitioned by the "workload" label, which carries the framework name
// passed to ReportIntegrationsFrameworksSupported and CountFramework.
WorkloadIntegrationsCounter = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: constants.KueueName,
		Name:      "workload_integrations",
		Help: `The number of batch jobs that Kueue is/has managed.
The "workload" label specifies the kind of workload.`,
	}, []string{"workload"},
)

// ExternalFrameworksWorkloadCounter is the counter vector for external
// frameworks. Series are partitioned by the "workload" label, which receives
// the values passed to ReportExternalFrameworksSupported and
// CountExternalFramework.
ExternalFrameworksWorkloadCounter = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name:      "workload_external_integrations",
		Subsystem: constants.KueueName,
		Help: `The number of external frameworks that Kueue is/has managed.
Workload will specify what kind of workload`,
	},
	[]string{"workload"},
)
)

func generateExponentialBuckets(count int) []float64 {
Expand Down Expand Up @@ -431,6 +453,26 @@ func ReportPreemption(preemptingCqName, preemptingReason, targetCqName string) {
ReportEvictedWorkloads(targetCqName, kueue.WorkloadEvictedByPreemption)
}

// ReportIntegrationsFrameworksSupported touches WorkloadIntegrationsCounter
// once per entry in frameworks, using the entry as the "workload" label value
// and adding 0 so the counter's value is left unchanged.
// NOTE(review): the intent appears to be exposing a zero-valued series for
// every enabled framework before any job is counted — confirm.
func ReportIntegrationsFrameworksSupported(frameworks []string) {
	for i := range frameworks {
		series := WorkloadIntegrationsCounter.WithLabelValues(frameworks[i])
		series.Add(0)
	}
}

// ReportExternalFrameworksSupported touches ExternalFrameworksWorkloadCounter
// once per entry in externalFrameworks, using the entry as the "workload"
// label value and adding 0 so the counter's value is left unchanged.
// NOTE(review): the intent appears to be exposing a zero-valued series for
// every configured external framework — confirm.
func ReportExternalFrameworksSupported(externalFrameworks []string) {
	for i := range externalFrameworks {
		series := ExternalFrameworksWorkloadCounter.WithLabelValues(externalFrameworks[i])
		series.Add(0)
	}
}

// CountExternalFramework increments ExternalFrameworksWorkloadCounter by one
// for the series whose "workload" label equals externalGVK.
func CountExternalFramework(externalGVK string) {
	series := ExternalFrameworksWorkloadCounter.WithLabelValues(externalGVK)
	series.Inc()
}

// CountFramework increments WorkloadIntegrationsCounter by one for the series
// whose "workload" label equals framework.
func CountFramework(framework string) {
	series := WorkloadIntegrationsCounter.WithLabelValues(framework)
	series.Inc()
}

func LQRefFromWorkload(wl *kueue.Workload) LocalQueueReference {
return LocalQueueReference{
Name: wl.Spec.QueueName,
Expand Down Expand Up @@ -625,6 +667,8 @@ func Register() {
ClusterQueueResourceBorrowingLimit,
ClusterQueueResourceLendingLimit,
ClusterQueueWeightedShare,
WorkloadIntegrationsCounter,
ExternalFrameworksWorkloadCounter,
)
if features.Enabled(features.LocalQueueMetrics) {
RegisterLQMetrics()
Expand Down
15 changes: 15 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,18 @@ func TestReportAndCleanupClusterQueuePreemptedNumber(t *testing.T) {
expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 0, "preempting_cluster_queue", "cluster_queue1")
expectFilteredMetricsCount(t, EvictedWorkloadsTotal, 0, "cluster_queue", "cluster_queue1")
}

func TestReportExternalFrameworkMetricsEnabled(t *testing.T) {
ReportExternalFrameworksSupported([]string{"test"})
expectFilteredMetricsCount(t, ExternalFrameworksWorkloadCounter, 1, "workload", "test")
}

func TestReportExternalFrameworkMetricsDisabled(t *testing.T) {
ReportExternalFrameworksSupported([]string{})
expectFilteredMetricsCount(t, ExternalFrameworksWorkloadCounter, 0)
}

func TestReportIntegrationsFrameworksSupported(t *testing.T) {
ReportIntegrationsFrameworksSupported([]string{"batch/job"})
expectFilteredMetricsCount(t, WorkloadIntegrationsCounter, 1, "workload", "batch/job")
}