Skip to content

Commit a349ee8

Browse files
authored
KUBE-417: Add metrics to CC. (#171)
* Add metrics server. * Extract metrics to package * Adjust metrics registration to avoid superfluous http header error * Update promhttp * Add custom action counter. * Linter * Remove the enable/disable for metrics, no point in not doing it always and all castware does this pattern. * Add test and remove obsolete env var.
1 parent 3d18ce1 commit a349ee8

File tree

9 files changed

+109
-18
lines changed

9 files changed

+109
-18
lines changed

cmd/controller/run.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
"github.com/castai/cluster-controller/internal/controller/logexporter"
3232
"github.com/castai/cluster-controller/internal/helm"
3333
"github.com/castai/cluster-controller/internal/k8sversion"
34+
"github.com/castai/cluster-controller/internal/metrics"
3435
"github.com/castai/cluster-controller/internal/monitor"
3536
"github.com/castai/cluster-controller/internal/waitext"
3637
)
@@ -172,13 +173,29 @@ func runController(
172173
addr := fmt.Sprintf(":%d", cfg.PprofPort)
173174
log.Infof("starting pprof server on %s", addr)
174175

175-
//TODO: remove nolint when we have a proper solution for this
176+
// https://deepsource.com/directory/go/issues/GO-S2114
177+
// => This is not a public API and runs in customer cluster; risk should be OK.
176178
//nolint:gosec
177179
if err := http.ListenAndServe(addr, httpMux); err != nil {
178180
log.Errorf("failed to start pprof http server: %v", err)
179181
}
180182
}()
181183

184+
// Start http server for metrics
185+
go func() {
186+
addr := fmt.Sprintf(":%d", cfg.Metrics.Port)
187+
log.Infof("starting metrics on %s", addr)
188+
189+
metrics.RegisterCustomMetrics()
190+
metricsMux := metrics.NewMetricsMux()
191+
// https://deepsource.com/directory/go/issues/GO-S2114
192+
// => This is not a public API and runs in customer cluster; risk should be OK.
193+
//nolint:gosec
194+
if err := http.ListenAndServe(addr, metricsMux); err != nil {
195+
log.Errorf("failed to start metrics http server: %v", err)
196+
}
197+
}()
198+
182199
if err := saveMetadata(cfg.ClusterID, cfg, log); err != nil {
183200
return err
184201
}

go.mod

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ require (
1212
github.com/golang/mock v1.6.0
1313
github.com/google/uuid v1.6.0
1414
github.com/kelseyhightower/envconfig v1.4.0
15+
github.com/prometheus/client_golang v1.21.1
1516
github.com/samber/lo v1.47.0
1617
github.com/sirupsen/logrus v1.9.3
1718
github.com/spf13/cobra v1.8.1
@@ -25,6 +26,7 @@ require (
2526
k8s.io/apiserver v0.32.1
2627
k8s.io/cli-runtime v0.32.1
2728
k8s.io/client-go v0.32.1
29+
k8s.io/component-base v0.32.1
2830
k8s.io/klog/v2 v2.130.1
2931
k8s.io/kubectl v0.32.1
3032
sigs.k8s.io/controller-runtime v0.19.0
@@ -92,7 +94,7 @@ require (
9294
github.com/jmoiron/sqlx v1.4.0 // indirect
9395
github.com/josharian/intern v1.0.0 // indirect
9496
github.com/json-iterator/go v1.1.12 // indirect
95-
github.com/klauspost/compress v1.17.2 // indirect
97+
github.com/klauspost/compress v1.17.11 // indirect
9698
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
9799
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
98100
github.com/lib/pq v1.10.9 // indirect
@@ -120,9 +122,8 @@ require (
120122
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
121123
github.com/pkg/errors v0.9.1 // indirect
122124
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
123-
github.com/prometheus/client_golang v1.19.1 // indirect
124125
github.com/prometheus/client_model v0.6.1 // indirect
125-
github.com/prometheus/common v0.55.0 // indirect
126+
github.com/prometheus/common v0.62.0 // indirect
126127
github.com/prometheus/procfs v0.15.1 // indirect
127128
github.com/rubenv/sql-migrate v1.7.1 // indirect
128129
github.com/russross/blackfriday/v2 v2.1.0 // indirect
@@ -146,21 +147,20 @@ require (
146147
go.uber.org/multierr v1.11.0 // indirect
147148
golang.org/x/crypto v0.32.0 // indirect
148149
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
149-
golang.org/x/oauth2 v0.23.0 // indirect
150+
golang.org/x/oauth2 v0.24.0 // indirect
150151
golang.org/x/sync v0.10.0 // indirect
151152
golang.org/x/sys v0.29.0 // indirect
152153
golang.org/x/term v0.28.0 // indirect
153154
golang.org/x/text v0.21.0 // indirect
154155
golang.org/x/time v0.7.0 // indirect
155156
google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 // indirect
156157
google.golang.org/grpc v1.65.0 // indirect
157-
google.golang.org/protobuf v1.35.1 // indirect
158+
google.golang.org/protobuf v1.36.1 // indirect
158159
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
159160
gopkg.in/inf.v0 v0.9.1 // indirect
160161
gopkg.in/ini.v1 v1.67.0 // indirect
161162
gopkg.in/yaml.v3 v3.0.1 // indirect
162163
k8s.io/apiextensions-apiserver v0.32.1 // indirect
163-
k8s.io/component-base v0.32.1 // indirect
164164
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect
165165
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect
166166
oras.land/oras-go v1.2.5 // indirect

go.sum

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,8 @@ github.com/kelseyhightower/envconfig v1.4.0 h1:Im6hONhd3pLkfDFsbRgu68RDNkGF1r3dv
244244
github.com/kelseyhightower/envconfig v1.4.0/go.mod h1:cccZRl6mQpaq41TPp5QxidR+Sa3axMbJDNb//FQX6Gg=
245245
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
246246
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
247-
github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=
248-
github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
247+
github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
248+
github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
249249
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
250250
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
251251
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
@@ -257,6 +257,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
257257
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
258258
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
259259
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
260+
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
261+
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
260262
github.com/labstack/echo/v4 v4.7.2/go.mod h1:xkCDAdFCIf8jsFQ5NnbK7oqaF/yU1A1X20Ltm0OvSks=
261263
github.com/labstack/gommon v0.3.1/go.mod h1:uW6kP17uPlLJsD3ijUYn3/M5bAxtlZhMI6m3MFxTMTM=
262264
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw=
@@ -360,16 +362,16 @@ github.com/poy/onpar v1.1.2/go.mod h1:6X8FLNoxyr9kkmnlqpK6LSoiOtrO6MICtWwEuWkLjz
360362
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
361363
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
362364
github.com/prometheus/client_golang v1.1.0/go.mod h1:I1FGZT9+L76gKKOs5djB6ezCbFQP1xR9D75/vuwEF3g=
363-
github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE=
364-
github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho=
365+
github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk=
366+
github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg=
365367
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
366368
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
367369
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
368370
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
369371
github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
370372
github.com/prometheus/common v0.6.0/go.mod h1:eBmuwkDJBwy6iBfxCBob6t6dR6ENT/y+J+Zk0j9GMYc=
371-
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
372-
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
373+
github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io=
374+
github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I=
373375
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
374376
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
375377
github.com/prometheus/procfs v0.0.3/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ=
@@ -504,8 +506,8 @@ golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qx
504506
golang.org/x/net v0.0.0-20220513224357-95641704303c/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
505507
golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0=
506508
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
507-
golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs=
508-
golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
509+
golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE=
510+
golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
509511
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
510512
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
511513
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -573,8 +575,8 @@ google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjr
573575
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
574576
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
575577
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
576-
google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA=
577-
google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
578+
google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk=
579+
google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
578580
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
579581
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
580582
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

internal/config/config.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type Config struct {
2525
KubeClient KubeClient
2626
ClusterID string
2727
PprofPort int
28+
Metrics Metrics
2829
LeaderElection LeaderElection
2930
// MaxActionsInProgress serves as a safeguard to limit the number of Goroutines in progress.
3031
MaxActionsInProgress int
@@ -52,6 +53,10 @@ type TLS struct {
5253
CACert string
5354
}
5455

56+
// Metrics holds the configuration of the Prometheus metrics HTTP endpoint.
type Metrics struct {
57+
// Port is the TCP port the metrics server listens on. It is bound to the
// METRICS_PORT environment variable; Get falls back to 9090 when it is 0.
Port int
58+
}
59+
5560
type LeaderElection struct {
5661
Enabled bool
5762
LockName string
@@ -94,6 +99,7 @@ func Get() Config {
9499
_ = viper.BindEnv("self_pod.name", "KUBERNETES_POD")
95100
_ = viper.BindEnv("self_pod.namespace", "LEADER_ELECTION_NAMESPACE")
96101
_ = viper.BindEnv("max_action_in_progress", "MAX_ACTIONS_IN_PROGRESS")
102+
_ = viper.BindEnv("metrics.port", "METRICS_PORT")
97103

98104
cfg = &Config{}
99105
if err := viper.Unmarshal(&cfg); err != nil {
@@ -146,6 +152,10 @@ func Get() Config {
146152
cfg.MaxActionsInProgress = 1000
147153
}
148154

155+
if cfg.Metrics.Port == 0 {
156+
cfg.Metrics.Port = 9090
157+
}
158+
149159
return *cfg
150160
}
151161

internal/config/config_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ func TestConfig(t *testing.T) {
2121
require.NoError(t, os.Setenv("LEADER_ELECTION_LOCK_NAME", "castai-cluster-controller"))
2222
require.NoError(t, os.Setenv("LEADER_ELECTION_LEASE_DURATION", "25s"))
2323
require.NoError(t, os.Setenv("LEADER_ELECTION_LEASE_RENEW_DEADLINE", "20s"))
24+
require.NoError(t, os.Setenv("METRICS_PORT", "16000"))
2425

2526
cfg := Get()
2627

@@ -49,6 +50,7 @@ func TestConfig(t *testing.T) {
4950
Burst: 150,
5051
},
5152
MaxActionsInProgress: 1000,
53+
Metrics: Metrics{Port: 16000},
5254
}
5355

5456
require.Equal(t, expected, cfg)

internal/controller/controller.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/castai/cluster-controller/internal/actions"
1919
"github.com/castai/cluster-controller/internal/castai"
2020
"github.com/castai/cluster-controller/internal/helm"
21+
"github.com/castai/cluster-controller/internal/metrics"
2122
"github.com/castai/cluster-controller/internal/waitext"
2223
)
2324

@@ -230,18 +231,21 @@ func (s *Controller) handleAction(ctx context.Context, action *castai.ClusterAct
230231

231232
func (s *Controller) ackAction(ctx context.Context, action *castai.ClusterAction, handleErr error) error {
232233
actionType := reflect.TypeOf(action.Data())
234+
actionError := getHandlerError(handleErr)
233235
s.log.WithFields(logrus.Fields{
234236
actions.ActionIDLogField: action.ID,
235237
"type": actionType.String(),
236238
}).Info("ack action")
237239

240+
metrics.ActionFinished(actionType.String(), actionError == nil)
241+
238242
boff := waitext.NewConstantBackoff(s.cfg.AckRetryWait)
239243

240244
return waitext.Retry(ctx, boff, s.cfg.AckRetriesCount, func(ctx context.Context) (bool, error) {
241245
ctx, cancel := context.WithTimeout(ctx, s.cfg.AckTimeout)
242246
defer cancel()
243247
return true, s.castAIClient.AckAction(ctx, action.ID, &castai.AckClusterActionRequest{
244-
Error: getHandlerError(handleErr),
248+
Error: actionError,
245249
})
246250
}, func(err error) {
247251
s.log.Debugf("ack failed, will retry: %v", err)

internal/metrics/custom_metrics.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package metrics
2+
3+
import (
4+
"strconv"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
)
8+
9+
// actionCounter tracks actions executed by the cluster controller.
// It is a counter vector partitioned by two labels:
//   - "success": "true" or "false" (written by ActionFinished via strconv.FormatBool),
//   - "type": the action type string passed to ActionFinished.
10+
var actionCounter = prometheus.NewCounterVec(
11+
prometheus.CounterOpts{
12+
Name: "action_executed_total",
13+
Help: "Count of successful and unsuccessful actions executed by type.",
14+
},
15+
[]string{"success", "type"},
16+
)
17+
18+
func ActionFinished(actionType string, success bool) {
19+
actionCounter.With(prometheus.Labels{"success": strconv.FormatBool(success), "type": actionType}).Inc()
20+
}

internal/metrics/metrics.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package metrics
2+
3+
import (
4+
"net/http"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
"github.com/prometheus/client_golang/prometheus/promhttp"
8+
"k8s.io/component-base/metrics/legacyregistry"
9+
)
10+
11+
// registry is the package-level Prometheus registry for this controller's own
// metrics; it is kept separate from the Kubernetes legacy registry.
// NOTE(review): an earlier variant apparently used metrics.NewKubeRegistry()
// here instead of a plain Prometheus registry — confirm before switching back.
12+
var registry = prometheus.NewRegistry()
13+
14+
func NewMetricsMux() *http.ServeMux {
15+
// Implementation inspired from https://github.com/kubernetes/kubernetes/pull/118081 and metrics-server.
16+
// Client-go doesn't really have good docs on exporting metrics...
17+
metricsMux := http.NewServeMux()
18+
19+
metricsMux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
20+
// Handles clientgo and other metrics
21+
legacyregistry.Handler().ServeHTTP(w, r)
22+
// Handles other metrics like go runtime, our custom metrics, etc.
23+
promhttp.HandlerFor(registry, promhttp.HandlerOpts{}).ServeHTTP(w, r)
24+
})
25+
26+
return metricsMux
27+
}

internal/metrics/register.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package metrics
2+
3+
import (
4+
_ "k8s.io/component-base/metrics/prometheus/clientgo" // client-go metrics registration
5+
)
6+
7+
// RegisterCustomMetrics registers the controller's custom collectors
// (currently only actionCounter) on the package-level registry.
// It uses MustRegister, which panics if registration fails — for example when
// the same collector is registered a second time — so call it once at startup.
func RegisterCustomMetrics() {
8+
registry.MustRegister(actionCounter)
9+
}

0 commit comments

Comments
 (0)