diff --git a/helm/oci-native-ingress-controller/templates/deployment.yaml b/helm/oci-native-ingress-controller/templates/deployment.yaml
index 52dbd4b5..f54a82fe 100644
--- a/helm/oci-native-ingress-controller/templates/deployment.yaml
+++ b/helm/oci-native-ingress-controller/templates/deployment.yaml
@@ -86,6 +86,24 @@ spec:
               protocol: TCP
             - name: metrics-server
               containerPort: 2223
+          readinessProbe:
+            httpGet:
+              path: /healthz/ready
+              port: metrics-server
+              scheme: HTTP
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 3
+          livenessProbe:
+            httpGet:
+              path: /healthz/live
+              port: metrics-server
+              scheme: HTTP
+            initialDelaySeconds: 60
+            periodSeconds: 20
+            timeoutSeconds: 5
+            failureThreshold: 3
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
           volumeMounts:
diff --git a/helm/oci-native-ingress-controller/values.yaml b/helm/oci-native-ingress-controller/values.yaml
index e8abac48..daa66585 100644
--- a/helm/oci-native-ingress-controller/values.yaml
+++ b/helm/oci-native-ingress-controller/values.yaml
@@ -120,6 +120,10 @@ objectSelector:
   matchLabels:
   # key: value
 
+# Metrics server configuration
+# Health probes for operational reliability and Cloud Guard compliance
+# Readiness probe: HTTP GET /healthz/ready on metrics-server port (initialDelaySeconds: 30)
+# Liveness probe: HTTP GET /healthz/live on metrics-server port (initialDelaySeconds: 60)
 metrics:
   backend: prometheus
   port: 2223
diff --git a/main.go b/main.go
index 73e3a893..83aed622 100644
--- a/main.go
+++ b/main.go
@@ -279,5 +279,9 @@ func setupInformers(informerFactory informers.SharedInformerFactory, ctx context
 		klog.Fatal("failed to sync informers")
 	}
 
+	// Mark caches as synced for health checks
+	healthChecker := server.GetHealthChecker()
+	healthChecker.SetCachesSynced(true)
+
 	return ingressClassInformer, ingressInformer, serviceInformer, secretInformer, endpointInformer, podInformer, nodeInformer, serviceAccountInformer
 }
diff --git a/pkg/server/health.go b/pkg/server/health.go
new file mode 100644
index 00000000..6e867b19
--- /dev/null
+++ b/pkg/server/health.go
@@ -0,0 +1,105 @@
+/*
+ *
+ * * OCI Native Ingress Controller
+ * *
+ * * Copyright (c) 2023 Oracle America, Inc. and its affiliates.
+ * * Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+ *
+ */
+
+package server
+
+import (
+	"encoding/json"
+	"net/http"
+	"sync"
+	"sync/atomic"
+)
+
+// HealthChecker tracks the startup milestones reported by the health
+// endpoints. The flags are atomics, so setters and handlers may be called
+// concurrently without extra locking.
+type HealthChecker struct {
+	cachesSynced     atomic.Bool
+	controllersReady atomic.Bool
+}
+
+// HealthStatus is the JSON body returned by the health endpoints.
+type HealthStatus struct {
+	Status           string `json:"status"`
+	CachesSynced     bool   `json:"cachesSynced"`
+	ControllersReady bool   `json:"controllersReady"`
+}
+
+var (
+	globalHealthChecker     *HealthChecker
+	globalHealthCheckerOnce sync.Once
+)
+
+// NewHealthChecker returns a HealthChecker with both flags unset.
+func NewHealthChecker() *HealthChecker {
+	return &HealthChecker{}
+}
+
+// GetHealthChecker returns the process-wide HealthChecker, creating it on
+// first use. sync.Once keeps the lazy initialization safe when several
+// goroutines ask for the checker at the same time.
+func GetHealthChecker() *HealthChecker {
+	globalHealthCheckerOnce.Do(func() {
+		globalHealthChecker = NewHealthChecker()
+	})
+	return globalHealthChecker
+}
+
+// SetCachesSynced records whether the informer caches have synced.
+func (hc *HealthChecker) SetCachesSynced(synced bool) {
+	hc.cachesSynced.Store(synced)
+}
+
+// SetControllersReady records whether the controllers are ready.
+func (hc *HealthChecker) SetControllersReady(ready bool) {
+	hc.controllersReady.Store(ready)
+}
+
+// HandleReadiness serves the readiness probe: 200 once the caches are synced
+// and the controllers are ready, 503 otherwise.
+func (hc *HealthChecker) HandleReadiness(w http.ResponseWriter, r *http.Request) {
+	status := hc.snapshot("unhealthy")
+	code := http.StatusServiceUnavailable
+	if status.CachesSynced && status.ControllersReady {
+		status.Status = "healthy"
+		code = http.StatusOK
+	}
+	writeHealthStatus(w, code, status)
+}
+
+// HandleLiveness serves the liveness probe. It always answers 200, so the
+// probe only fails when the HTTP server itself stops responding.
+func (hc *HealthChecker) HandleLiveness(w http.ResponseWriter, r *http.Request) {
+	writeHealthStatus(w, http.StatusOK, hc.snapshot("alive"))
+}
+
+// snapshot reads both flags into a HealthStatus labelled with status.
+func (hc *HealthChecker) snapshot(status string) HealthStatus {
+	return HealthStatus{
+		Status:           status,
+		CachesSynced:     hc.cachesSynced.Load(),
+		ControllersReady: hc.controllersReady.Load(),
+	}
+}
+
+// writeHealthStatus sends status as JSON with the given HTTP status code.
+func writeHealthStatus(w http.ResponseWriter, code int, status HealthStatus) {
+	body, err := json.Marshal(status)
+	if err != nil {
+		http.Error(w, "failed to encode health status", http.StatusInternalServerError)
+		return
+	}
+	// Headers must be set before WriteHeader; anything added afterwards is
+	// silently dropped from the response.
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(code)
+	// A failed write means the prober disconnected; there is nobody left to
+	// report the error to.
+	_, _ = w.Write(append(body, '\n'))
+}
diff --git a/pkg/server/server.go b/pkg/server/server.go
index a4d01a17..a1a6f01d 100644
--- a/pkg/server/server.go
+++ b/pkg/server/server.go
@@ -12,11 +12,12 @@ package server
 import (
 	"context"
 	"crypto/tls"
-	"github.com/oracle/oci-native-ingress-controller/pkg/task/certificatecleanup"
 	"net/http"
 	"os"
 	"time"
 
+	"github.com/oracle/oci-native-ingress-controller/pkg/task/certificatecleanup"
+
 	ctrcache "sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/webhook"
 
@@ -163,6 +164,13 @@ func SetUpControllers(opts types.IngressOpts, ingressClassInformer networkinginf
 			)
 			go certificateCleanUpTask.Run(ctx.Done())
 		}
+
+		// Mark controllers as ready for health checks
+		go func() {
+			time.Sleep(2 * time.Second) // Give controllers a moment to start
+			GetHealthChecker().SetControllersReady(true)
+			klog.Info("Controllers marked as ready for health checks")
+		}()
 	}
 }
 
@@ -234,5 +242,10 @@ func SetupMetricsServer(metricsBackend string, metricsPort int, mux *http.ServeM
 	}
 	metric.RegisterMetrics(reg, mux)
 
+	// Register health check endpoints
+	hc := GetHealthChecker()
+	mux.HandleFunc("/healthz/ready", hc.HandleReadiness)
+	mux.HandleFunc("/healthz/live", hc.HandleLiveness)
+
 	return reg, nil
 }