Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion cmd/aws-k8s-agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package main
import (
"context"
"os"
"strconv"
"time"

"github.com/aws/amazon-vpc-cni-k8s/pkg/ipamd"
Expand Down Expand Up @@ -90,6 +91,9 @@ func startBackgroundAPIServerCheck(ipamContext *ipamd.IPAMContext) {
}

func _main() int {
// Start measuring full startup duration
startupStartTime := time.Now()

// Do not add anything before initializing logger
log := logger.Get()

Expand All @@ -105,6 +109,8 @@ func _main() int {
log.Info("SGP, custom networking or pod annotation feature is in use, waiting for API server connectivity to start IPAMD")
if err := k8sapi.CheckAPIServerConnectivity(); err != nil {
log.Errorf("Failed to check API server connectivity: %s", err)
// Record failed startup
metrics.IpamdStartupDuration.WithLabelValues("false", strconv.FormatBool(withApiServer), "api_server_connectivity").Observe(time.Since(startupStartTime).Seconds())
return 1
} else {
log.Info("API server connectivity established.")
Expand All @@ -131,12 +137,22 @@ func _main() int {
log.Errorf("Failed to create event recorder: %s", err)
log.Warn("Skipping event recorder initialization")
}
// Measure node initialization duration
IPAMDNodeInitStartTime := time.Now()
ipamContext, err := ipamd.New(k8sClient, withApiServer)
IPAMDNodeInitDuration := time.Since(IPAMDNodeInitStartTime).Seconds()

if err != nil {
log.Errorf("Initialization failure: %v", err)
// Record failed IPAMD initialization and failed startup
metrics.IpamdNodeInitDuration.WithLabelValues("false").Observe(IPAMDNodeInitDuration)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This metric might not be that useful for the `success: false` case: when ipamd initialization fails, ipamd will be in a continuous crash loop.

metrics.IpamdStartupDuration.WithLabelValues("false", strconv.FormatBool(withApiServer), "node_initialization").Observe(time.Since(startupStartTime).Seconds())
return 1
}

// Record successful node initialization
metrics.IpamdNodeInitDuration.WithLabelValues("true").Observe(IPAMDNodeInitDuration)

// If not connected to API server yet, start background checks
if !withApiServer {
startBackgroundAPIServerCheck(ipamContext)
Expand All @@ -155,7 +171,10 @@ func _main() int {
go ipamContext.ServeIntrospection()
}

// Start the RPC listener
// Record successful startup duration before the blocking RPC handler call
metrics.IpamdStartupDuration.WithLabelValues("true", strconv.FormatBool(withApiServer), "").Observe(time.Since(startupStartTime).Seconds())

// Start the RPC listener (this is a blocking call)
err = ipamContext.RunRPCHandler(version.Version)
if err != nil {
log.Errorf("Failed to set up gRPC handler: %v", err)
Expand Down
16 changes: 16 additions & 0 deletions utils/prometheusmetrics/prometheusmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,20 @@ var (
},
[]string{"eni"},
)
// IpamdStartupDuration is a histogram of IPAMD startup durations in seconds,
// partitioned by the "success", "with_api_server" and "failure_reason" labels.
// NOTE(review): no Buckets are configured here, so the client library's
// default buckets presumably apply — confirm they cover realistic startup times.
IpamdStartupDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "awscni_ipamd_startup_duration_seconds",
Help: "The duration of IPAMD startup from process start to ready to serve CNI requests",
},
[]string{"success", "with_api_server", "failure_reason"},
)
// IpamdNodeInitDuration is a histogram of node initialization durations in
// seconds during IPAMD startup, partitioned by the "success" label.
// NOTE(review): no Buckets are configured here, so the client library's
// default buckets presumably apply — confirm they suit the expected range.
IpamdNodeInitDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "awscni_ipamd_node_initialization_duration_seconds",
Help: "The duration of node initialization during IPAMD startup",
},
[]string{"success"},
)
)

// ServeMetrics sets up ipamd metrics and introspection endpoints
Expand Down Expand Up @@ -225,6 +239,8 @@ func PrometheusRegister() {
prometheus.MustRegister(IpsPerCidr)
prometheus.MustRegister(NoAvailableIPAddrs)
prometheus.MustRegister(EniIPsInUse)
prometheus.MustRegister(IpamdStartupDuration)
prometheus.MustRegister(IpamdNodeInitDuration)
}

// This can be enhanced to get it programatically.
Expand Down
Loading