Skip to content
Merged
18 changes: 18 additions & 0 deletions helm/oci-native-ingress-controller/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,24 @@ spec:
protocol: TCP
- name: metrics-server
containerPort: 2223
readinessProbe:
httpGet:
path: /healthz/ready
port: metrics-server
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
livenessProbe:
httpGet:
path: /healthz/live
port: metrics-server
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
Expand Down
4 changes: 4 additions & 0 deletions helm/oci-native-ingress-controller/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ podDisruptionBudget: {}
# maxUnavailable: 1

# The TCP port the Webhook server binds to. (default 9443)
# Health probes for operational reliability and Cloud Guard compliance
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is misplaced: it sits on the webhook port setting.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have moved it to the metrics server section, @nirpai.

Requesting approval on this PR, @nirpai @AkarshES.

# Readiness probe: HTTP GET /healthz/ready on metrics-server port 2223 (initialDelaySeconds: 30)
# Liveness probe: HTTP GET /healthz/live on metrics-server port 2223 (initialDelaySeconds: 60)
# Probes use the metrics server and do not target the webhook port
webhookBindPort: 9443

# Supported auths - instance(default), user
Expand Down
5 changes: 5 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,5 +279,10 @@ func setupInformers(informerFactory informers.SharedInformerFactory, ctx context

klog.Fatal("failed to sync informers")
}

// Mark caches as synced for health checks
healthChecker := server.GetHealthChecker()
healthChecker.SetCachesSynced(true)

return ingressClassInformer, ingressInformer, serviceInformer, secretInformer, endpointInformer, podInformer, nodeInformer, serviceAccountInformer
}
85 changes: 85 additions & 0 deletions pkg/server/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
*
* * OCI Native Ingress Controller
* *
* * Copyright (c) 2023 Oracle America, Inc. and its affiliates.
* * Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
*
*/

package server

import (
"encoding/json"
"net/http"
"sync"
"sync/atomic"
)

// HealthChecker tracks the start-up state reported by the readiness and
// liveness HTTP endpoints. Both flags are atomic values, so the setters and the
// handlers may be called from different goroutines without extra locking.
type HealthChecker struct {
	cachesSynced     atomic.Bool
	controllersReady atomic.Bool
}

// HealthStatus is the JSON document written by the health endpoints.
type HealthStatus struct {
	Status           string `json:"status"`
	CachesSynced     bool   `json:"cachesSynced"`
	ControllersReady bool   `json:"controllersReady"`
}

var (
	// globalHealthChecker is the process-wide instance handed out by
	// GetHealthChecker.
	globalHealthChecker *HealthChecker
	// globalHealthCheckerOnce guards the lazy creation of globalHealthChecker so
	// that concurrent first calls to GetHealthChecker do not race.
	globalHealthCheckerOnce sync.Once
)

// NewHealthChecker returns a HealthChecker whose flags are both false.
func NewHealthChecker() *HealthChecker {
	// The zero value of atomic.Bool is false, so no explicit field
	// initialisation is needed.
	return &HealthChecker{}
}

// GetHealthChecker returns the shared HealthChecker, creating it on first use.
// It is safe to call from multiple goroutines.
func GetHealthChecker() *HealthChecker {
	globalHealthCheckerOnce.Do(func() {
		if globalHealthChecker == nil {
			globalHealthChecker = NewHealthChecker()
		}
	})
	return globalHealthChecker
}

// SetCachesSynced records whether the informer caches have synced.
func (hc *HealthChecker) SetCachesSynced(synced bool) {
	hc.cachesSynced.Store(synced)
}

// SetControllersReady records whether the controllers have been started.
func (hc *HealthChecker) SetControllersReady(ready bool) {
	hc.controllersReady.Store(ready)
}

// HandleReadiness serves the readiness probe. It responds 200 with status
// "healthy" only when both the caches-synced and controllers-ready flags are
// set; otherwise it responds 503 with status "unhealthy". The body is a
// JSON-encoded HealthStatus in either case.
func (hc *HealthChecker) HandleReadiness(w http.ResponseWriter, r *http.Request) {
	status := HealthStatus{
		Status:           "unhealthy",
		CachesSynced:     hc.cachesSynced.Load(),
		ControllersReady: hc.controllersReady.Load(),
	}

	code := http.StatusServiceUnavailable
	if status.CachesSynced && status.ControllersReady {
		status.Status = "healthy"
		code = http.StatusOK
	}

	writeHealthStatus(w, code, status)
}

// HandleLiveness serves the liveness probe. It always responds 200 with status
// "alive"; the current flag values are included in the JSON body for
// information only and do not affect the status code.
func (hc *HealthChecker) HandleLiveness(w http.ResponseWriter, r *http.Request) {
	status := HealthStatus{
		Status:           "alive",
		CachesSynced:     hc.cachesSynced.Load(),
		ControllersReady: hc.controllersReady.Load(),
	}

	writeHealthStatus(w, http.StatusOK, status)
}

// writeHealthStatus writes status as a JSON response with the given HTTP code.
func writeHealthStatus(w http.ResponseWriter, code int, status HealthStatus) {
	// Headers must be set before WriteHeader; changes made afterwards are not
	// sent to the client.
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)

	// The status line has already been written, so an encoding or write failure
	// here cannot be reported to the client; the error is deliberately
	// discarded and the probe result is carried by the status code alone.
	_ = json.NewEncoder(w).Encode(status)
}
15 changes: 14 additions & 1 deletion pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ package server
import (
"context"
"crypto/tls"
"github.com/oracle/oci-native-ingress-controller/pkg/task/certificatecleanup"
"net/http"
"os"
"time"

"github.com/oracle/oci-native-ingress-controller/pkg/task/certificatecleanup"

ctrcache "sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/webhook"

Expand Down Expand Up @@ -163,6 +164,13 @@ func SetUpControllers(opts types.IngressOpts, ingressClassInformer networkinginf
)
go certificateCleanUpTask.Run(ctx.Done())
}

// Mark controllers as ready for health checks
go func() {
time.Sleep(2 * time.Second) // Give controllers a moment to start
GetHealthChecker().SetControllersReady(true)
klog.Info("Controllers marked as ready for health checks")
}()
}
}

Expand Down Expand Up @@ -234,5 +242,10 @@ func SetupMetricsServer(metricsBackend string, metricsPort int, mux *http.ServeM
}
metric.RegisterMetrics(reg, mux)

// Register health check endpoints
hc := GetHealthChecker()
mux.HandleFunc("/healthz/ready", hc.HandleReadiness)
mux.HandleFunc("/healthz/live", hc.HandleLiveness)

return reg, nil
}