diff --git a/cmd/aws-k8s-agent/main.go b/cmd/aws-k8s-agent/main.go index 469dc6e632..4f13a81873 100644 --- a/cmd/aws-k8s-agent/main.go +++ b/cmd/aws-k8s-agent/main.go @@ -17,6 +17,7 @@ package main import ( "context" "os" + "strconv" "time" "github.com/aws/amazon-vpc-cni-k8s/pkg/ipamd" @@ -90,6 +91,9 @@ func startBackgroundAPIServerCheck(ipamContext *ipamd.IPAMContext) { } func _main() int { + // Start measuring full startup duration + startupStartTime := time.Now() + // Do not add anything before initializing logger log := logger.Get() @@ -105,6 +109,8 @@ func _main() int { log.Info("SGP, custom networking or pod annotation feature is in use, waiting for API server connectivity to start IPAMD") if err := k8sapi.CheckAPIServerConnectivity(); err != nil { log.Errorf("Failed to check API server connectivity: %s", err) + // Record failed startup + metrics.IpamdStartupDuration.WithLabelValues("false", strconv.FormatBool(withApiServer), "api_server_connectivity").Observe(time.Since(startupStartTime).Seconds()) return 1 } else { log.Info("API server connectivity established.") @@ -131,12 +137,22 @@ func _main() int { log.Errorf("Failed to create event recorder: %s", err) log.Warn("Skipping event recorder initialization") } + // Measure node initialization duration + IPAMDNodeInitStartTime := time.Now() ipamContext, err := ipamd.New(k8sClient, withApiServer) + IPAMDNodeInitDuration := time.Since(IPAMDNodeInitStartTime).Seconds() + if err != nil { log.Errorf("Initialization failure: %v", err) + // Record failed IPAMD initialization and failed startup + metrics.IpamdNodeInitDuration.WithLabelValues("false").Observe(IPAMDNodeInitDuration) + metrics.IpamdStartupDuration.WithLabelValues("false", strconv.FormatBool(withApiServer), "node_initialization").Observe(time.Since(startupStartTime).Seconds()) return 1 } + // Record successful AWS initialization + metrics.IpamdNodeInitDuration.WithLabelValues("true").Observe(IPAMDNodeInitDuration) + // If not connected to API server yet, start background checks if !withApiServer { startBackgroundAPIServerCheck(ipamContext) @@ -155,7 +171,10 @@ func _main() int { go ipamContext.ServeIntrospection() } - // Start the RPC listener + // Record successful startup duration before the blocking RPC handler call + metrics.IpamdStartupDuration.WithLabelValues("true", strconv.FormatBool(withApiServer), "").Observe(time.Since(startupStartTime).Seconds()) + + // Start the RPC listener (this is a blocking call) err = ipamContext.RunRPCHandler(version.Version) if err != nil { log.Errorf("Failed to set up gRPC handler: %v", err) diff --git a/utils/prometheusmetrics/prometheusmetrics.go b/utils/prometheusmetrics/prometheusmetrics.go index fc5adc0463..cefefc9577 100644 --- a/utils/prometheusmetrics/prometheusmetrics.go +++ b/utils/prometheusmetrics/prometheusmetrics.go @@ -172,6 +172,20 @@ var ( }, []string{"eni"}, ) + IpamdStartupDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "awscni_ipamd_startup_duration_seconds", + Help: "The duration of IPAMD startup from process start to ready to serve CNI requests", + }, + []string{"success", "with_api_server", "failure_reason"}, + ) + IpamdNodeInitDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "awscni_ipamd_node_initialization_duration_seconds", + Help: "The duration of node initialization during IPAMD startup", + }, + []string{"success"}, + ) ) // ServeMetrics sets up ipamd metrics and introspection endpoints @@ -225,6 +239,8 @@ func PrometheusRegister() { prometheus.MustRegister(IpsPerCidr) prometheus.MustRegister(NoAvailableIPAddrs) prometheus.MustRegister(EniIPsInUse) + prometheus.MustRegister(IpamdStartupDuration) + prometheus.MustRegister(IpamdNodeInitDuration) } // This can be enhanced to get it programatically.