Skip to content

Commit 9236cad

Browse files
committed
feat: add action duration metric
1 parent deb8d51 commit 9236cad

File tree

3 files changed

+22
-5
lines changed

3 files changed

+22
-5
lines changed

internal/controller/controller.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,12 +156,15 @@ func (s *Controller) handleActions(ctx context.Context, clusterActions []*castai
156156

157157
var err error
158158

159+
startTime := time.Now()
159160
handleErr := s.handleAction(ctx, action)
160161
if errors.Is(handleErr, context.Canceled) {
161162
// Action should be handled again on context canceled errors.
162163
return
163164
}
164-
ackErr := s.ackAction(ctx, action, handleErr)
165+
166+
handleDuration := time.Since(startTime)
167+
ackErr := s.ackAction(ctx, action, handleErr, handleDuration)
165168
if handleErr != nil {
166169
err = handleErr
167170
}
@@ -235,7 +238,7 @@ func (s *Controller) handleAction(ctx context.Context, action *castai.ClusterAct
235238
return nil
236239
}
237240

238-
func (s *Controller) ackAction(ctx context.Context, action *castai.ClusterAction, handleErr error) error {
241+
func (s *Controller) ackAction(ctx context.Context, action *castai.ClusterAction, handleErr error, handleDuration time.Duration) error {
239242
actionType := action.GetType()
240243
actionError := getHandlerError(handleErr)
241244
s.log.WithFields(logrus.Fields{
@@ -244,7 +247,7 @@ func (s *Controller) ackAction(ctx context.Context, action *castai.ClusterAction
244247
"successful": actionError == nil,
245248
}).Info("ack action")
246249

247-
metrics.ActionFinished(actionType, actionError == nil)
250+
metrics.ActionFinished(actionType, actionError == nil, handleDuration)
248251

249252
boff := waitext.NewConstantBackoff(s.cfg.AckRetryWait)
250253

internal/metrics/custom_metrics.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package metrics
22

33
import (
44
"strconv"
5+
"time"
56

67
"github.com/prometheus/client_golang/prometheus"
78
)
@@ -15,6 +16,17 @@ var actionCounter = prometheus.NewCounterVec(
1516
[]string{"success", "type"},
1617
)
1718

18-
func ActionFinished(actionType string, success bool) {
19-
actionCounter.With(prometheus.Labels{"success": strconv.FormatBool(success), "type": actionType}).Inc()
19+
// actionExecutedDuration tracks the duration of actions executed by the cluster controller.
20+
var actionExecutedDuration = prometheus.NewHistogramVec(
21+
prometheus.HistogramOpts{
22+
Name: "action_executed_duration_seconds",
23+
Help: "Duration of action handle execution in seconds.",
24+
},
25+
[]string{"success", "type"},
26+
)
27+
28+
func ActionFinished(actionType string, success bool, duration time.Duration) {
29+
labels := prometheus.Labels{"success": strconv.FormatBool(success), "type": actionType}
30+
actionCounter.With(labels).Inc()
31+
actionExecutedDuration.With(labels).Observe(duration.Seconds())
2032
}

internal/monitor/monitor.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ func (m *monitor) metadataUpdated(ctx context.Context, metadata Metadata) {
5959
func (m *monitor) reportPodDiagnostics(ctx context.Context, prevLastStart int64) {
6060
m.log.Errorf("unexpected controller restart detected, fetching k8s events for %s/%s", m.pod.Namespace, m.pod.Name)
6161

62+
// TODO: log pod restart reason from pod status
63+
6264
// log pod-related warnings
6365
m.logEvents(ctx, m.log.WithField("events_group", fmt.Sprintf("%s/%s", m.pod.Namespace, m.pod.Name)), m.pod.Namespace, &metav1.ListOptions{
6466
FieldSelector: "involvedObject.name=" + m.pod.Name,

0 commit comments

Comments
 (0)