Skip to content

Commit 9e4ce84

Browse files
committed
Refactor rayCluster metrics
Signed-off-by: win5923 <ken89@kimo.com>
1 parent 3d383d6 commit 9e4ce84

File tree

8 files changed

+350
-135
lines changed

8 files changed

+350
-135
lines changed

ray-operator/controllers/ray/common/metrics_test.go

Lines changed: 0 additions & 34 deletions
This file was deleted.

ray-operator/controllers/ray/metrics/ray_cluster_metrics.go

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,16 @@ import (
44
"github.com/prometheus/client_golang/prometheus"
55
)
66

7-
type RayClusterMetricCollector struct {
7+
// RayClusterCollector implements the prometheus.Collector and ray.RayClusterMetricsCollector interface to collect ray cluster metrics.
8+
type RayClusterCollector struct {
89
// Metrics
910
rayClusterProvisionedDuration *prometheus.GaugeVec
1011
rayClusterHeadPodReadyDuration *prometheus.GaugeVec
1112
}
1213

13-
func NewRayClusterMetricCollector() *RayClusterMetricCollector {
14-
collector := &RayClusterMetricCollector{
14+
// NewRayClusterCollector creates a new RayClusterCollector instance.
15+
func NewRayClusterCollector() *RayClusterCollector {
16+
collector := &RayClusterCollector{
1517
rayClusterProvisionedDuration: prometheus.NewGaugeVec(
1618
prometheus.GaugeOpts{
1719
Name: "kuberay_cluster_provisioned_duration_seconds",
@@ -31,23 +33,35 @@ func NewRayClusterMetricCollector() *RayClusterMetricCollector {
3133
}
3234

3335
// Describe implements prometheus.Collector interface Describe method.
34-
func (c *RayClusterMetricCollector) Describe(ch chan<- *prometheus.Desc) {
36+
func (c *RayClusterCollector) Describe(ch chan<- *prometheus.Desc) {
3537
c.rayClusterProvisionedDuration.Describe(ch)
3638
c.rayClusterHeadPodReadyDuration.Describe(ch)
3739
}
3840

3941
// Collect implements prometheus.Collector interface Collect method.
40-
func (c *RayClusterMetricCollector) Collect(ch chan<- prometheus.Metric) {
42+
func (c *RayClusterCollector) Collect(ch chan<- prometheus.Metric) {
4143
c.rayClusterProvisionedDuration.Collect(ch)
4244
c.rayClusterHeadPodReadyDuration.Collect(ch)
4345
}
4446

45-
// ObserveRayClusterProvisionedDuration observes the duration of RayCluster from creation to provisioned
46-
func (c *RayClusterMetricCollector) ObserveRayClusterProvisionedDuration(name, namespace string, duration float64) {
47+
// ObserveRayClusterProvisionedDuration observes the duration of RayCluster's status transition from false (or unset) to true
48+
func (c *RayClusterCollector) ObserveRayClusterProvisionedDuration(name, namespace string, duration float64) {
4749
c.rayClusterProvisionedDuration.WithLabelValues(name, namespace).Set(duration)
4850
}
4951

5052
// ObserveRayClusterHeadPodReadyDuration observes the duration of RayCluster from creation to head pod ready
51-
func (c *RayClusterMetricCollector) ObserveRayClusterHeadPodReadyDuration(name, namespace string, duration float64) {
53+
func (c *RayClusterCollector) ObserveRayClusterHeadPodReadyDuration(name, namespace string, duration float64) {
5254
c.rayClusterHeadPodReadyDuration.WithLabelValues(name, namespace).Set(duration)
5355
}
56+
57+
type RayClusterNoopCollector struct{}
58+
59+
func NewRayClusterNoopCollector() *RayClusterNoopCollector {
60+
return &RayClusterNoopCollector{}
61+
}
62+
63+
func (c *RayClusterNoopCollector) ObserveRayClusterProvisionedDuration(_ string, _ string, _ float64) {
64+
}
65+
66+
func (c *RayClusterNoopCollector) ObserveRayClusterHeadPodReadyDuration(_ string, _ string, _ float64) {
67+
}

ray-operator/controllers/ray/mocks/raycluster_controller_mock.go

Lines changed: 58 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 24 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,6 @@ import (
1111
"strings"
1212
"time"
1313

14-
"k8s.io/apimachinery/pkg/api/meta"
15-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
16-
"k8s.io/utils/ptr"
17-
18-
configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1"
19-
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler"
20-
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
21-
"github.com/ray-project/kuberay/ray-operator/controllers/ray/expectations"
22-
"github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics"
23-
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
24-
"github.com/ray-project/kuberay/ray-operator/pkg/features"
25-
26-
batchv1 "k8s.io/api/batch/v1"
27-
rbacv1 "k8s.io/api/rbac/v1"
28-
29-
"k8s.io/client-go/tools/record"
30-
31-
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
32-
3314
"github.com/go-logr/logr"
3415
routev1 "github.com/openshift/api/route/v1"
3516
batchv1 "k8s.io/api/batch/v1"
@@ -57,7 +38,6 @@ import (
5738
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler"
5839
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
5940
"github.com/ray-project/kuberay/ray-operator/controllers/ray/expectations"
60-
"github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics"
6141
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
6242
"github.com/ray-project/kuberay/ray-operator/pkg/features"
6343
)
@@ -110,8 +90,13 @@ type RayClusterReconciler struct {
11090
options RayClusterReconcilerOptions
11191
}
11292

93+
type RayClusterMetricsCollector interface {
94+
ObserveRayClusterProvisionedDuration(name, namespace string, duration float64)
95+
ObserveRayClusterHeadPodReadyDuration(name, namespace string, duration float64)
96+
}
97+
11398
type RayClusterReconcilerOptions struct {
114-
RayClusterMetricCollector *metrics.RayClusterMetricCollector
99+
RayClusterMetricCollector RayClusterMetricsCollector
115100
HeadSidecarContainers []corev1.Container
116101
WorkerSidecarContainers []corev1.Container
117102
IsOpenShift bool
@@ -348,6 +333,8 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, instance
348333
inconsistent, updateErr = r.updateRayClusterStatus(ctx, originalRayClusterInstance, newInstance)
349334
}
350335

336+
emitRayClusterMetrics(r.options.RayClusterMetricCollector, newInstance.Name, newInstance.Namespace, originalRayClusterInstance.Status, newInstance.Status, newInstance.CreationTimestamp.Time)
337+
351338
// Return error based on order.
352339
var err error
353340
if reconcileErr != nil {
@@ -1288,16 +1275,6 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
12881275
})
12891276
} else {
12901277
headPodReadyCondition := utils.FindHeadPodReadyCondition(headPod)
1291-
1292-
// Record ray_cluster_head_pod_ready_duration_seconds metric
1293-
// Calculate the time from RayClusters created to head pod ready
1294-
// if headPodReadyCondition.Status == metav1.ConditionTrue {
1295-
// if !meta.IsStatusConditionTrue(newInstance.Status.Conditions, string(rayv1.HeadPodReady)) {
1296-
// readyDuration := time.Since(instance.CreationTimestamp.Time)
1297-
// common.ObserveRayClusterHeadPodReadyDuration(instance.Namespace, readyDuration)
1298-
// }
1299-
// }
1300-
13011278
meta.SetStatusCondition(&newInstance.Status.Conditions, headPodReadyCondition)
13021279
}
13031280

@@ -1637,21 +1614,28 @@ func (r *RayClusterReconciler) updateRayClusterStatus(ctx context.Context, origi
16371614
if err != nil {
16381615
logger.Info("Error updating status", "name", originalRayClusterInstance.Name, "error", err, "RayCluster", newInstance)
16391616
}
1640-
collectRayClusterMetrics(r.options.RayClusterMetricCollector, newInstance.Name, newInstance.Namespace, newInstance.Status, originalRayClusterInstance.Status, originalRayClusterInstance.CreationTimestamp.Time)
16411617

16421618
return inconsistent, err
16431619
}
16441620

1645-
// collectRayClusterMetrics records metrics related to the RayCluster.
1646-
func collectRayClusterMetrics(collector *metrics.RayClusterMetricCollector, name, namespace string, newClusterStatus, oldClusterStatus rayv1.RayClusterStatus, creationTimestamp time.Time) {
1647-
if collector == nil {
1648-
return
1621+
func emitRayClusterMetrics(rayClusterMetricsCollector RayClusterMetricsCollector, rayClusterName, rayClusterNamespace string, originRayClusterStatus, rayClusterStatus rayv1.RayClusterStatus, creationTimestamp time.Time) {
1622+
emitRayClusterProvisionedDuration(rayClusterMetricsCollector, rayClusterName, rayClusterNamespace, originRayClusterStatus, rayClusterStatus, creationTimestamp)
1623+
emitRayClusterHeadPodReadyDuration(rayClusterMetricsCollector, rayClusterName, rayClusterNamespace, originRayClusterStatus, rayClusterStatus, creationTimestamp)
1624+
}
1625+
1626+
func emitRayClusterProvisionedDuration(collector RayClusterMetricsCollector, rayClusterName, rayClusterNamespace string, originRayClusterStatus, rayClusterStatus rayv1.RayClusterStatus, creationTimestamp time.Time) {
1627+
// Emit kuberay_cluster_provisioned_duration_seconds when a RayCluster's RayClusterProvisioned status transitions from false (or unset) to true
1628+
if !meta.IsStatusConditionTrue(originRayClusterStatus.Conditions, string(rayv1.RayClusterProvisioned)) &&
1629+
meta.IsStatusConditionTrue(rayClusterStatus.Conditions, string(rayv1.RayClusterProvisioned)) {
1630+
collector.ObserveRayClusterProvisionedDuration(rayClusterName, rayClusterNamespace, time.Since(creationTimestamp).Seconds())
16491631
}
1632+
}
16501633

1651-
// Record `kuberay_cluster_provisioned_duration_seconds` metric if just provisioned
1652-
if meta.IsStatusConditionTrue(newClusterStatus.Conditions, string(rayv1.RayClusterProvisioned)) &&
1653-
!meta.IsStatusConditionTrue(oldClusterStatus.Conditions, string(rayv1.RayClusterProvisioned)) {
1654-
collector.ObserveRayClusterProvisionedDuration(name, namespace, time.Since(creationTimestamp).Seconds())
1634+
func emitRayClusterHeadPodReadyDuration(collector RayClusterMetricsCollector, rayClusterName, rayClusterNamespace string, originRayClusterStatus, rayClusterStatus rayv1.RayClusterStatus, creationTimestamp time.Time) {
1635+
// Emit kuberay_cluster_head_pod_ready_duration_seconds when a RayCluster's HeadPodReady status transitions from false (or unset) to true
1636+
if !meta.IsStatusConditionTrue(originRayClusterStatus.Conditions, string(rayv1.HeadPodReady)) &&
1637+
meta.IsStatusConditionTrue(rayClusterStatus.Conditions, string(rayv1.HeadPodReady)) {
1638+
collector.ObserveRayClusterHeadPodReadyDuration(rayClusterName, rayClusterNamespace, time.Since(creationTimestamp).Seconds())
16551639
}
16561640
}
16571641

0 commit comments

Comments
 (0)