diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml
index 41c6e4ac7..77d4b9695 100644
--- a/.github/workflows/e2e-test.yaml
+++ b/.github/workflows/e2e-test.yaml
@@ -47,6 +47,8 @@ jobs:
test_pattern: "^Test_SO"
- test_name: Topology_Aware_Scheduling
test_pattern: "^Test_TAS"
+ - test_name: cert_management
+ test_pattern: "^Test_CM"
name: E2E - ${{ matrix.test_name }}
steps:
# print runner specs so we have a record incase of failures
diff --git a/docs/api-reference/operator-api.md b/docs/api-reference/operator-api.md
index 1c9f6bfa8..f8b6a9c2e 100644
--- a/docs/api-reference/operator-api.md
+++ b/docs/api-reference/operator-api.md
@@ -671,6 +671,24 @@ _Appears in:_
| `exemptServiceAccountUserNames` _string array_ | ExemptServiceAccountUserNames is a list of service account usernames that are exempt from authorizer checks.
Each service account username name in ExemptServiceAccountUserNames should be of the following format:
system:serviceaccount::. ServiceAccounts are represented in this
format when checking the username in authenticationv1.UserInfo.Name. | | |
+#### CertProvisionMode
+
+_Underlying type:_ _string_
+
+CertProvisionMode defines how webhook certificates are provisioned.
+
+_Validation:_
+- Enum: [auto manual]
+
+_Appears in:_
+- [WebhookServer](#webhookserver)
+
+| Field | Description |
+| --- | --- |
+| `auto` | CertProvisionModeAuto enables automatic certificate generation and management via cert-controller.
cert-controller automatically generates self-signed certificates and stores them in the Secret.
|
+| `manual` | CertProvisionModeManual expects certificates to be provided externally (e.g., by cert-manager, cluster admin).
|
+
+
#### ClientConnectionConfiguration
@@ -916,5 +934,7 @@ _Appears in:_
| `bindAddress` _string_ | BindAddress is the IP address on which to listen for the specified port. | | |
| `port` _integer_ | Port is the port on which to serve requests. | | |
| `serverCertDir` _string_ | ServerCertDir is the directory containing the server certificate and key. | | |
+| `secretName` _string_ | SecretName is the name of the Kubernetes Secret containing webhook certificates.
The Secret must contain tls.crt, tls.key, and ca.crt. | grove-webhook-server-cert | |
+| `certProvisionMode` _[CertProvisionMode](#certprovisionmode)_ | CertProvisionMode controls how webhook certificates are provisioned. | auto | Enum: [auto manual]
|
diff --git a/docs/installation.md b/docs/installation.md
index 901deff90..3d935c7aa 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -102,6 +102,12 @@ This make target leverages Grove [Helm](https://helm.sh/) charts and [Skaffold](
- Grove Scheduler CRDs - `podgangs.scheduler.grove.io`.
- All Grove operator resources defined as a part of [Grove Helm chart templates](../operator/charts/templates).
+## Certificate Management
+
+By default, Grove automatically generates and manages TLS certificates for its webhook server. For production environments, you may want to use certificates from your organization's PKI or a certificate manager like cert-manager.
+
+See the [Certificate Management Guide](user-guide/certificate-management.md) for detailed configuration options.
+
## Verify Installation
Follow the instructions in the [quickstart guide](quickstart.md) to deploy a PodCliqueSet and validate your installation.
diff --git a/docs/user-guide/certificate-management.md b/docs/user-guide/certificate-management.md
new file mode 100644
index 000000000..2c8326bd7
--- /dev/null
+++ b/docs/user-guide/certificate-management.md
@@ -0,0 +1,298 @@
+# Certificate Management
+
+Grove's webhook server requires TLS certificates to secure communication with the Kubernetes API server. This guide explains how to configure certificate management for your Grove deployment.
+
+## Overview
+
+Grove supports two certificate management modes:
+
+| Mode | Description | Best For |
+|------|-------------|----------|
+| **Auto-provisioned** (default) | Grove automatically generates and manages self-signed certificates | Development, testing, quick setup |
+| **External certificates** | You provide certificates from an external source (e.g., cert-manager, manual) | Production, enterprise PKI integration |
+
+## Auto-Provisioned Certificates (Default)
+
+By default, Grove's built-in cert-controller automatically:
+- Generates self-signed CA and server certificates
+- Stores them in a Kubernetes Secret
+- Injects the CA bundle into webhook configurations
+- Rotates certificates before expiry
+
+This mode requires no additional configuration. Simply deploy Grove:
+
+```bash
+helm upgrade -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts:
+```
+
+### How It Works
+
+1. On startup, the cert-controller checks if the webhook certificate Secret exists
+2. If missing or expired, it generates new certificates
+3. The CA bundle is automatically injected into `ValidatingWebhookConfiguration` and `MutatingWebhookConfiguration` resources
+4. Certificates are stored in the Secret specified by `config.server.webhooks.secretName` (default: `grove-webhook-server-cert`)
+
+## External Certificate Management
+
+For production environments, you may want to use certificates from your organization's PKI or a certificate manager like [cert-manager](https://cert-manager.io/).
+
+### Prerequisites
+
+Before deploying Grove with external certificates, ensure:
+1. Your certificate Secret exists in the target namespace
+2. The Secret contains the required keys: `tls.crt`, `tls.key`, and `ca.crt`
+3. The certificate's Subject Alternative Names (SANs) include the webhook service DNS name
+
+### Required Certificate SANs
+
+The webhook server certificate must include these SANs:
+```
+grove-operator..svc
+grove-operator..svc.cluster.local
+```
+
+Replace `` with your deployment namespace (default: `default`).
+
+### Configuration
+
+To use external certificates, configure the following Helm values:
+
+```yaml
+config:
+ server:
+ webhooks:
+ # Use manual certificate provisioning (external)
+ certProvisionMode: manual
+ # Name of the Secret containing your certificates
+ secretName: "grove-webhook-server-cert"
+
+webhooks:
+ # Base64-encoded CA certificate that signed the webhook server certificate
+ caBundle: ""
+```
+
+### Using cert-manager
+
+[cert-manager](https://cert-manager.io/) is a popular choice for managing certificates in Kubernetes. Here's how to configure Grove with cert-manager.
+
+#### Step 1: Install cert-manager
+
+If not already installed:
+
+```bash
+kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.yaml
+```
+
+Wait for cert-manager to be ready:
+
+```bash
+kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
+kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s
+```
+
+#### Step 2: Create a Certificate Issuer
+
+Create a self-signed issuer (or use your organization's issuer):
+
+```yaml
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+ name: grove-selfsigned-issuer
+ namespace: # Your Grove deployment namespace
+spec:
+ selfSigned: {}
+---
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+ name: grove-ca
+ namespace:
+spec:
+ isCA: true
+ commonName: grove-ca
+ secretName: grove-ca-secret
+ privateKey:
+ algorithm: ECDSA
+ size: 256
+ issuerRef:
+ name: grove-selfsigned-issuer
+ kind: Issuer
+---
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+ name: grove-ca-issuer
+ namespace:
+spec:
+ ca:
+ secretName: grove-ca-secret
+```
+
+#### Step 3: Create a Certificate for the Webhook
+
+```yaml
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+ name: grove-webhook-cert
+ namespace:
+spec:
+ secretName: grove-webhook-server-cert
+ duration: 8760h # 1 year
+ renewBefore: 720h # 30 days
+ subject:
+ organizations:
+ - grove
+ isCA: false
+ privateKey:
+ algorithm: RSA
+ encoding: PKCS1
+ size: 2048
+ usages:
+ - server auth
+ - digital signature
+ - key encipherment
+ dnsNames:
+ - grove-operator
+ - grove-operator.
+ - grove-operator..svc
+ - grove-operator..svc.cluster.local
+ issuerRef:
+ name: grove-ca-issuer
+ kind: Issuer
+```
+
+#### Step 4: Configure Webhook Annotations
+
+cert-manager can automatically inject the CA bundle into webhook configurations. Add these annotations to your Helm values:
+
+```yaml
+webhooks:
+ podCliqueSetValidationWebhook:
+ annotations:
+ cert-manager.io/inject-ca-from: /grove-webhook-cert
+ podCliqueSetDefaultingWebhook:
+ annotations:
+ cert-manager.io/inject-ca-from: /grove-webhook-cert
+ authorizerWebhook:
+ annotations:
+ cert-manager.io/inject-ca-from: /grove-webhook-cert
+```
+
+#### Step 5: Deploy Grove
+
+```bash
+helm upgrade -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts: \
+ --set config.server.webhooks.certProvisionMode=manual \
+ --set webhooks.podCliqueSetValidationWebhook.annotations."cert-manager\.io/inject-ca-from"="/grove-webhook-cert" \
+ --set webhooks.podCliqueSetDefaultingWebhook.annotations."cert-manager\.io/inject-ca-from"="/grove-webhook-cert" \
+ --set webhooks.authorizerWebhook.annotations."cert-manager\.io/inject-ca-from"="/grove-webhook-cert"
+```
+
+Or create a `values.yaml` file with the configuration shown above and deploy:
+
+```bash
+helm upgrade -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts: -f values.yaml
+```
+
+### Manual Certificate Management
+
+If you prefer to manage certificates manually:
+
+#### Step 1: Generate Certificates
+
+```bash
+# Create a CA
+openssl genrsa -out ca.key 2048
+openssl req -x509 -new -nodes -key ca.key -sha256 -days 365 \
+ -out ca.crt -subj "/CN=grove-ca"
+
+# Create server certificate
+openssl genrsa -out tls.key 2048
+
+# Create CSR config
+cat > csr.conf <
+DNS.3 = grove-operator..svc
+DNS.4 = grove-operator..svc.cluster.local
+EOF
+
+# Generate CSR and certificate
+openssl req -new -key tls.key -out tls.csr -config csr.conf
+openssl x509 -req -in tls.csr -CA ca.crt -CAkey ca.key -CAcreateserial \
+ -out tls.crt -days 365 -extensions req_ext -extfile csr.conf
+```
+
+#### Step 2: Create the Secret
+
+```bash
+kubectl create secret generic grove-webhook-server-cert \
+ --from-file=tls.crt=tls.crt \
+ --from-file=tls.key=tls.key \
+ --from-file=ca.crt=ca.crt \
+ -n
+```
+
+#### Step 3: Deploy Grove with the CA Bundle
+
+```bash
+CA_BUNDLE=$(cat ca.crt | base64 | tr -d '\n')
+
+helm upgrade -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts: \
+ --set config.server.webhooks.certProvisionMode=manual \
+ --set webhooks.caBundle="${CA_BUNDLE}"
+```
+
+## Switching Between Modes
+
+### From Auto-Provision to External
+
+1. Set up your external certificate infrastructure (cert-manager or manual)
+2. Update your Helm values to disable auto-provisioning
+3. Upgrade the Grove deployment
+
+```bash
+helm upgrade grove oci://ghcr.io/ai-dynamo/grove/grove-charts: \
+ --set config.server.webhooks.certProvisionMode=manual \
+ --set webhooks.caBundle="${CA_BUNDLE}"
+```
+
+### From External to Auto-Provision
+
+1. Update your Helm values to enable auto-provisioning
+2. Remove the `webhooks.caBundle` value (or set it to empty)
+3. Upgrade the Grove deployment
+
+```bash
+helm upgrade grove oci://ghcr.io/ai-dynamo/grove/grove-charts: \
+ --set config.server.webhooks.certProvisionMode=auto \
+ --set webhooks.caBundle=""
+```
+
+The cert-controller will generate new certificates on the next operator restart.
+
+## Configuration Reference
+
+| Helm Value | Type | Default | Description |
+|------------|------|---------|-------------|
+| `config.server.webhooks.certProvisionMode` | string | `auto` | Certificate provisioning mode: `auto` (cert-controller generates certs) or `manual` (external certs) |
+| `config.server.webhooks.secretName` | string | `grove-webhook-server-cert` | Name of the Secret containing certificates |
+| `webhooks.caBundle` | string | `""` | Base64-encoded CA certificate for webhook configurations |
+| `webhooks.podCliqueSetValidationWebhook.annotations` | map | `{}` | Annotations for the validation webhook (e.g., cert-manager injection) |
+| `webhooks.podCliqueSetDefaultingWebhook.annotations` | map | `{}` | Annotations for the defaulting webhook |
+| `webhooks.authorizerWebhook.annotations` | map | `{}` | Annotations for the authorizer webhook |
diff --git a/operator/api/config/v1alpha1/defaults.go b/operator/api/config/v1alpha1/defaults.go
index e2752790d..c1d97a428 100644
--- a/operator/api/config/v1alpha1/defaults.go
+++ b/operator/api/config/v1alpha1/defaults.go
@@ -79,6 +79,14 @@ func SetDefaults_ServerConfiguration(serverConfig *ServerConfiguration) {
serverConfig.Webhooks.ServerCertDir = defaultWebhookServerTLSServerCertDir
}
+ if serverConfig.Webhooks.SecretName == "" {
+ serverConfig.Webhooks.SecretName = "grove-webhook-server-cert"
+ }
+
+ if serverConfig.Webhooks.CertProvisionMode == "" {
+ serverConfig.Webhooks.CertProvisionMode = CertProvisionModeAuto
+ }
+
if serverConfig.HealthProbes == nil {
serverConfig.HealthProbes = &Server{}
}
diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go
index 82b8e8365..f932a7bed 100644
--- a/operator/api/config/v1alpha1/types.go
+++ b/operator/api/config/v1alpha1/types.go
@@ -140,8 +140,29 @@ type WebhookServer struct {
Server `json:",inline"`
// ServerCertDir is the directory containing the server certificate and key.
ServerCertDir string `json:"serverCertDir"`
+ // SecretName is the name of the Kubernetes Secret containing webhook certificates.
+ // The Secret must contain tls.crt, tls.key, and ca.crt.
+ // +optional
+ // +kubebuilder:default="grove-webhook-server-cert"
+ SecretName string `json:"secretName,omitempty"`
+ // CertProvisionMode controls how webhook certificates are provisioned.
+ // +optional
+ // +kubebuilder:default="auto"
+ CertProvisionMode CertProvisionMode `json:"certProvisionMode,omitempty"`
}
+// CertProvisionMode defines how webhook certificates are provisioned.
+// +kubebuilder:validation:Enum=auto;manual
+type CertProvisionMode string
+
+const (
+ // CertProvisionModeAuto enables automatic certificate generation and management via cert-controller.
+ // cert-controller automatically generates self-signed certificates and stores them in the Secret.
+ CertProvisionModeAuto CertProvisionMode = "auto"
+ // CertProvisionModeManual expects certificates to be provided externally (e.g., by cert-manager, cluster admin).
+ CertProvisionModeManual CertProvisionMode = "manual"
+)
+
// Server contains information for HTTP(S) server configuration.
type Server struct {
// BindAddress is the IP address on which to listen for the specified port.
diff --git a/operator/charts/templates/_helpers.tpl b/operator/charts/templates/_helpers.tpl
index 5c42f26d3..d76d4954f 100644
--- a/operator/charts/templates/_helpers.tpl
+++ b/operator/charts/templates/_helpers.tpl
@@ -16,6 +16,9 @@ config.yaml: |
server:
webhooks:
port: {{ .Values.config.server.webhooks.port }}
+ serverCertDir: {{ .Values.config.server.webhooks.serverCertDir }}
+ secretName: {{ .Values.config.server.webhooks.secretName | default "grove-webhook-server-cert" }}
+ certProvisionMode: {{ .Values.config.server.webhooks.certProvisionMode | default "auto" }}
healthProbes:
port: {{ .Values.config.server.healthProbes.port }}
metrics:
diff --git a/operator/charts/templates/authorizer-webhook-config.yaml b/operator/charts/templates/authorizer-webhook-config.yaml
index e54f2cbdd..cc05e35c2 100644
--- a/operator/charts/templates/authorizer-webhook-config.yaml
+++ b/operator/charts/templates/authorizer-webhook-config.yaml
@@ -4,12 +4,20 @@ apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: authorizer-webhook
+ {{- /* Only include annotations in manual cert mode (certProvisionMode=manual) */ -}}
+ {{- if and (eq .Values.config.server.webhooks.certProvisionMode "manual") .Values.webhooks.authorizerWebhook.annotations }}
+ annotations:
+ {{- toYaml .Values.webhooks.authorizerWebhook.annotations | nindent 4 }}
+ {{- end }}
labels:
{{- include "operator.authorizer.webhook.labels" . | nindent 4 }}
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
+ {{- if .Values.webhooks.caBundle }}
+ caBundle: {{ .Values.webhooks.caBundle }}
+ {{- end }}
service:
name: {{ required ".Values.service.name is required" .Values.service.name }}
namespace: {{ .Release.Namespace }}
diff --git a/operator/charts/templates/deployment.yaml b/operator/charts/templates/deployment.yaml
index ed9cdd59b..51d4ca7e5 100644
--- a/operator/charts/templates/deployment.yaml
+++ b/operator/charts/templates/deployment.yaml
@@ -61,7 +61,7 @@ spec:
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
readOnly: true
- name: grove-webhook-server-cert
- mountPath: /etc/grove-operator/webhook-certs
+ mountPath: {{ .Values.config.server.webhooks.serverCertDir | default "/etc/grove-operator/webhook-certs" }}
readOnly: true
env:
- name: GROVE_OPERATOR_SERVICE_ACCOUNT_NAME
@@ -92,6 +92,7 @@ spec:
configMap:
name: {{ include "operator.config.name" . }}
- name: grove-webhook-server-cert
+ # Mount the Secret volume containing webhook certificates
secret:
- secretName: grove-webhook-server-cert
+ secretName: {{ .Values.config.server.webhooks.secretName | default "grove-webhook-server-cert" }}
defaultMode: 420
diff --git a/operator/charts/templates/pcs-defaulting-webhook-config.yaml b/operator/charts/templates/pcs-defaulting-webhook-config.yaml
index 98bb34571..634e219d7 100644
--- a/operator/charts/templates/pcs-defaulting-webhook-config.yaml
+++ b/operator/charts/templates/pcs-defaulting-webhook-config.yaml
@@ -3,12 +3,20 @@ apiVersion: admissionregistration.k8s.io/v1
kind: MutatingWebhookConfiguration
metadata:
name: podcliqueset-defaulting-webhook
+ {{- /* Only include annotations in manual cert mode (certProvisionMode=manual) */ -}}
+ {{- if and (eq .Values.config.server.webhooks.certProvisionMode "manual") .Values.webhooks.podCliqueSetDefaultingWebhook.annotations }}
+ annotations:
+ {{- toYaml .Values.webhooks.podCliqueSetDefaultingWebhook.annotations | nindent 4 }}
+ {{- end }}
labels:
{{- include "operator.pcs.defaulting.webhook.labels" . | nindent 4 }}
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
+ {{- if .Values.webhooks.caBundle }}
+ caBundle: {{ .Values.webhooks.caBundle }}
+ {{- end }}
service:
name: {{ required ".Values.service.name is required" .Values.service.name }}
namespace: {{ .Release.Namespace }}
diff --git a/operator/charts/templates/pcs-validating-webhook-config.yaml b/operator/charts/templates/pcs-validating-webhook-config.yaml
index 92c170935..3a629edb4 100644
--- a/operator/charts/templates/pcs-validating-webhook-config.yaml
+++ b/operator/charts/templates/pcs-validating-webhook-config.yaml
@@ -3,12 +3,20 @@ apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: podcliqueset-validating-webhook
+ {{- /* Only include annotations in manual cert mode (certProvisionMode=manual) */ -}}
+ {{- if and (eq .Values.config.server.webhooks.certProvisionMode "manual") .Values.webhooks.podCliqueSetValidationWebhook.annotations }}
+ annotations:
+ {{- toYaml .Values.webhooks.podCliqueSetValidationWebhook.annotations | nindent 4 }}
+ {{- end }}
labels:
{{- include "operator.pcs.validating.webhook.labels" . | nindent 4 }}
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
+ {{- if .Values.webhooks.caBundle }}
+ caBundle: {{ .Values.webhooks.caBundle }}
+ {{- end }}
service:
name: {{ required ".Values.service.name is required" .Values.service.name }}
namespace: {{ .Release.Namespace }}
diff --git a/operator/charts/templates/webhook-server-cert-secret.yaml b/operator/charts/templates/webhook-server-cert-secret.yaml
index 7c9064718..b220ea7df 100644
--- a/operator/charts/templates/webhook-server-cert-secret.yaml
+++ b/operator/charts/templates/webhook-server-cert-secret.yaml
@@ -1,3 +1,9 @@
+{{- /*
+ Create this Secret if certProvisionMode is "auto" (default).
+ This allows the operator to auto-generate webhook TLS certs at startup.
+*/ -}}
+{{- if ne (.Values.config.server.webhooks.certProvisionMode | default "auto") "manual" }}
+---
apiVersion: v1
kind: Secret
metadata:
@@ -9,3 +15,4 @@ type: kubernetes.io/tls
data:
tls.crt: ""
tls.key: ""
+{{- end }}
diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml
index be1ab4425..42003b7d0 100644
--- a/operator/charts/values.yaml
+++ b/operator/charts/values.yaml
@@ -46,6 +46,25 @@ config:
server:
webhooks:
port: 9443
+ # serverCertDir is the directory path where certificate files are located.
+ # The operator will look for tls.crt and tls.key in this directory.
+ # Default: /etc/grove-operator/webhook-certs
+ serverCertDir: /etc/grove-operator/webhook-certs
+ # secretName is the name of the Kubernetes Secret containing webhook certificates.
+ # The Secret must contain tls.crt, tls.key, and ca.crt.
+ # Default: "grove-webhook-server-cert"
+ secretName: "grove-webhook-server-cert"
+ # certProvisionMode controls how webhook certificates are provisioned.
+ # Values:
+ # "auto" (default): cert-controller automatically generates self-signed certificates.
+ # - Best for development/testing
+ # - No manual certificate management needed
+ # - Certificates auto-rotate before expiry
+ # "manual": certificates must be provided externally (e.g., cert-manager, cluster admin).
+ # - Best for production
+ # - Use your own CA or Let's Encrypt
+ # - Requires Secret to exist before operator starts
+ certProvisionMode: auto
healthProbes:
enable: false
port: 9444
@@ -145,21 +164,27 @@ leaseRoleBinding:
app.kubernetes.io/part-of: grove
webhooks:
+ # Shared CA bundle for all webhooks. This is the base64-encoded CA certificate
+ # that signed the webhook server's TLS certificate.
+ caBundle: ""
podCliqueSetValidationWebhook:
labels:
app.kubernetes.io/component: operator-pcs-validating-webhook
app.kubernetes.io/name: pcs-validating-webhook
app.kubernetes.io/part-of: grove
+ annotations: {}
podCliqueSetDefaultingWebhook:
labels:
app.kubernetes.io/component: operator-pcs-defaulting-webhook
app.kubernetes.io/name: pcs-defaulting-webhook
app.kubernetes.io/part-of: grove
+ annotations: {}
authorizerWebhook:
labels:
app.kubernetes.io/component: operator-authorizer-webhook
app.kubernetes.io/name: authorizer-webhook
app.kubernetes.io/part-of: grove
+ annotations: {}
webhookServerSecret:
labels:
diff --git a/operator/cmd/main.go b/operator/cmd/main.go
index d79007dfb..b6fd8dbde 100644
--- a/operator/cmd/main.go
+++ b/operator/cmd/main.go
@@ -94,7 +94,14 @@ func main() {
}
webhookCertsReadyCh := make(chan struct{})
- if err = cert.ManageWebhookCerts(mgr, operatorConfig.Server.Webhooks.ServerCertDir, operatorConfig.Authorizer.Enabled, webhookCertsReadyCh); err != nil {
+ if err = cert.ManageWebhookCerts(
+ mgr,
+ operatorConfig.Server.Webhooks.ServerCertDir,
+ operatorConfig.Server.Webhooks.SecretName,
+ operatorConfig.Authorizer.Enabled,
+ operatorConfig.Server.Webhooks.CertProvisionMode,
+ webhookCertsReadyCh,
+ ); err != nil {
logger.Error(err, "failed to setup cert rotation")
handleErrorAndExit(err, cli.ExitErrInitializeManager)
}
diff --git a/operator/e2e/dependencies.go b/operator/e2e/dependencies.go
index 687d7588c..1704efbd9 100644
--- a/operator/e2e/dependencies.go
+++ b/operator/e2e/dependencies.go
@@ -51,6 +51,7 @@ type HelmChartDependency struct {
type HelmCharts struct {
KaiScheduler HelmChartDependency `yaml:"kaiScheduler"`
GPUOperator HelmChartDependency `yaml:"gpuOperator"`
+ CertManager HelmChartDependency `yaml:"certManager"`
}
// Dependencies represents all E2E test dependencies
diff --git a/operator/e2e/dependencies.yaml b/operator/e2e/dependencies.yaml
index dd15f8ee3..1bd01a55a 100644
--- a/operator/e2e/dependencies.yaml
+++ b/operator/e2e/dependencies.yaml
@@ -32,6 +32,16 @@ images:
- name: ghcr.io/nvidia/kai-scheduler/scheduler
version: v0.13.0-rc1
+ # Cert-manager
+ - name: quay.io/jetstack/cert-manager-controller
+ version: v1.14.4
+ - name: quay.io/jetstack/cert-manager-webhook
+ version: v1.14.4
+ - name: quay.io/jetstack/cert-manager-cainjector
+ version: v1.14.4
+ - name: quay.io/jetstack/cert-manager-ctl
+ version: v1.14.4
+
# Helm charts used in E2E tests
helmCharts:
# Kai Scheduler - gang scheduling for Kubernetes
@@ -40,3 +50,11 @@ helmCharts:
chartRef: oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler
version: v0.13.0-rc1
namespace: kai-scheduler
+
+ # cert-manager - for certificate management
+ certManager:
+ releaseName: cert-manager
+ chartRef: cert-manager
+ version: v1.14.4
+ namespace: cert-manager
+ repoURL: https://charts.jetstack.io
diff --git a/operator/e2e/setup/grove.go b/operator/e2e/setup/grove.go
new file mode 100644
index 000000000..74b5b1a6e
--- /dev/null
+++ b/operator/e2e/setup/grove.go
@@ -0,0 +1,178 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+// Package setup provides internal testing utilities for configuring and managing
+// Grove operator installations during e2e tests. This package is not intended for
+// production use and its API may change without notice.
+//
+// Functions are exported to allow access from e2e test packages (e.g., operator/e2e/tests)
+// which need to modify Grove configuration during test scenarios.
+package setup
+
+import (
+ "context"
+ "fmt"
+ "os"
+ "path/filepath"
+ "runtime"
+
+ configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
+ "github.com/ai-dynamo/grove/operator/e2e/utils"
+ "gopkg.in/yaml.v3"
+ "k8s.io/client-go/rest"
+)
+
+// GroveConfig holds typed configuration options for updating the Grove operator.
+// This struct provides a user-friendly interface that gets translated to Helm values internally.
+type GroveConfig struct {
+ // InstallCRDs controls whether CRDs should be installed/updated.
+ InstallCRDs bool
+ // Webhooks contains webhook-specific configuration.
+ Webhooks WebhooksConfig
+}
+
+// WebhooksConfig holds configuration for Grove's webhook server.
+type WebhooksConfig struct {
+ // CertProvisionMode controls how webhook certificates are provisioned.
+ // Use configv1alpha1.CertProvisionModeAuto for automatic provisioning,
+ // configv1alpha1.CertProvisionModeManual for external cert management.
+ CertProvisionMode configv1alpha1.CertProvisionMode
+ // SecretName is the name of the Kubernetes secret containing TLS certificates.
+ SecretName string
+ // Annotations to apply to webhook configurations (e.g., for cert-manager CA injection).
+ // These annotations are applied to all webhook configurations.
+ Annotations map[string]string
+}
+
+// toHelmValues converts the typed GroveConfig to a Helm values map.
+func (c *GroveConfig) toHelmValues() map[string]interface{} {
+ // Convert annotations to interface{} map for Helm
+ annotations := make(map[string]interface{})
+ for k, v := range c.Webhooks.Annotations {
+ annotations[k] = v
+ }
+
+ webhookAnnotations := map[string]interface{}{"annotations": annotations}
+
+ return map[string]interface{}{
+ "installCRDs": c.InstallCRDs,
+ "config": map[string]interface{}{
+ "server": map[string]interface{}{
+ "webhooks": map[string]interface{}{
+ "certProvisionMode": string(c.Webhooks.CertProvisionMode),
+ "secretName": c.Webhooks.SecretName,
+ },
+ },
+ },
+ "webhooks": map[string]interface{}{
+ "podCliqueSetValidationWebhook": webhookAnnotations,
+ "podCliqueSetDefaultingWebhook": webhookAnnotations,
+ "authorizerWebhook": webhookAnnotations,
+ },
+ }
+}
+
+// UpdateGroveConfiguration updates the Grove operator configuration.
+//
+// This uses Helm upgrade (rather than Skaffold) because:
+// 1. Grove is initially installed via Skaffold, which uses Helm under the hood
+// 2. For config-only changes (like switching cert modes), rebuilding images is unnecessary
+// 3. Helm upgrade with ReuseValues preserves the image configuration that Skaffold set
+//
+// This approach avoids wasteful rebuilds while staying compatible with the Skaffold installation.
+func UpdateGroveConfiguration(ctx context.Context, restConfig *rest.Config, config *GroveConfig, logger *utils.Logger) error {
+ chartDir, err := getGroveChartDir()
+ if err != nil {
+ return fmt.Errorf("failed to get Grove chart directory: %w", err)
+ }
+
+ chartVersion, err := getChartVersion(chartDir)
+ if err != nil {
+ return fmt.Errorf("failed to get chart version: %w", err)
+ }
+
+ // Configure Helm upgrade using shared constants to stay in sync with Skaffold installation.
+ // - ReleaseName: Uses OperatorDeploymentName from constants.go (matches skaffold.yaml's deploy.helm.releases[0].name)
+ // - ChartVersion: Read from Chart.yaml to avoid version string duplication
+ // - ReuseValues: Preserves image configuration that Skaffold set during initial install
+ helmConfig := &HelmInstallConfig{
+ RestConfig: restConfig,
+ ReleaseName: OperatorDeploymentName,
+ ChartRef: chartDir,
+ ChartVersion: chartVersion,
+ Namespace: OperatorNamespace,
+ ReuseValues: true,
+ Values: config.toHelmValues(),
+ HelmLoggerFunc: logger.Debugf,
+ Logger: logger,
+ }
+
+ logger.Debug("Updating Grove operator configuration")
+
+ if _, err := UpgradeHelmChart(helmConfig); err != nil {
+ return fmt.Errorf("helm upgrade failed: %w", err)
+ }
+
+ // Wait for Grove operator pod to be ready after upgrade
+ if err := utils.WaitForPodsInNamespace(ctx, OperatorNamespace, restConfig, 1, defaultPollTimeout, defaultPollInterval, logger); err != nil {
+ return fmt.Errorf("grove operator pod not ready after upgrade: %w", err)
+ }
+
+ logger.Debug("Grove configuration update completed successfully")
+ return nil
+}
+
+// chartYAML represents the structure of Chart.yaml for version extraction
+type chartYAML struct {
+ Version string `yaml:"version"`
+}
+
+// getChartVersion reads the version from Chart.yaml in the given chart directory.
+// The chartDir parameter should be the path to a Helm chart directory. Chart.yaml is
+// a required file per the Helm chart specification and will always exist for valid charts.
+// We read from Chart.yaml rather than hardcoding the version to maintain a single source
+// of truth, avoiding configuration drift between the chart definition and the e2e test code.
+func getChartVersion(chartDir string) (string, error) {
+ chartFile := filepath.Join(chartDir, "Chart.yaml")
+ data, err := os.ReadFile(chartFile)
+ if err != nil {
+ return "", fmt.Errorf("failed to read %s: %w", chartFile, err)
+ }
+
+ var chart chartYAML
+ if err := yaml.Unmarshal(data, &chart); err != nil {
+ return "", fmt.Errorf("failed to parse %s: %w", chartFile, err)
+ }
+
+ if chart.Version == "" {
+ return "", fmt.Errorf("version not found in %s", chartFile)
+ }
+
+ return chart.Version, nil
+}
+
+// getGroveChartDir returns the absolute path to the Grove Helm chart directory.
+// It uses runtime.Caller to find the path relative to this source file.
+func getGroveChartDir() (string, error) {
+ _, currentFile, _, ok := runtime.Caller(0)
+ if !ok {
+ return "", fmt.Errorf("failed to get current file path")
+ }
+ // This file is at operator/e2e/setup/grove.go
+ // Chart directory is at operator/charts
+ chartDir := filepath.Join(filepath.Dir(currentFile), "../../charts")
+ return filepath.Abs(chartDir)
+}
diff --git a/operator/e2e/setup/helm.go b/operator/e2e/setup/helm.go
index e899a634c..32657223a 100644
--- a/operator/e2e/setup/helm.go
+++ b/operator/e2e/setup/helm.go
@@ -23,6 +23,7 @@ import (
"os"
"path/filepath"
"strings"
+ "time"
"github.com/ai-dynamo/grove/operator/e2e/utils"
"helm.sh/helm/v3/pkg/action"
@@ -60,6 +61,10 @@ type HelmInstallConfig struct {
Logger *utils.Logger
// RepoURL is the base URL of the Helm repository (optional, for direct chart downloads).
RepoURL string
+ // ReuseValues reuses the last release's values and merges in the new values.
+ ReuseValues bool
+ // Timeout is the time to wait for Kubernetes operations (default: 5 minutes).
+ Timeout time.Duration
}
// Validate validates and sets defaults for the configuration.
@@ -133,6 +138,81 @@ func InstallHelmChart(config *HelmInstallConfig) (*release.Release, error) {
return rel, nil
}
+// UpgradeHelmChart upgrades a Helm chart with the given configuration.
+func UpgradeHelmChart(config *HelmInstallConfig) (*release.Release, error) {
+ if err := config.Validate(); err != nil {
+ return nil, err
+ }
+
+ // Initialize Helm action configuration
+ config.HelmLoggerFunc("Setting up Helm configuration for upgrade of %s...", config.ReleaseName)
+ actionConfig, err := setupHelmAction(config)
+ if err != nil {
+ return nil, err
+ }
+
+ // Resolve chart location (download from HTTP or locate via Helm)
+ chartPath, err := resolveChart(actionConfig, config)
+ if err != nil {
+ return nil, err
+ }
+
+ // Load and validate the chart
+ config.HelmLoggerFunc("Loading chart from %s...", chartPath)
+ chart, err := loader.Load(chartPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to load chart: %w", err)
+ }
+
+ // Upgrade the chart
+ config.HelmLoggerFunc("Upgrading release %s in namespace %s...", config.ReleaseName, config.Namespace)
+ upgradeClient := newUpgradeClient(actionConfig, config)
+ rel, err := upgradeClient.Run(config.ReleaseName, chart, config.Values)
+ if err != nil {
+ return nil, fmt.Errorf("helm upgrade failed: %w", err)
+ }
+
+ config.HelmLoggerFunc("✅ Release '%s' upgraded successfully. Status: %s", rel.Name, rel.Info.Status)
+ return rel, nil
+}
+
+// UninstallHelmChart uninstalls a Helm release.
+func UninstallHelmChart(config *HelmInstallConfig) error {
+ if config.ReleaseName == "" {
+ return fmt.Errorf("release name is required for uninstall")
+ }
+ if config.Namespace == "" {
+ return fmt.Errorf("namespace is required for uninstall")
+ }
+
+ // Initialize Helm action configuration
+ if config.HelmLoggerFunc == nil {
+ config.HelmLoggerFunc = func(_ string, _ ...interface{}) {}
+ }
+ config.HelmLoggerFunc("Setting up Helm configuration for uninstall of %s...", config.ReleaseName)
+ actionConfig, err := setupHelmAction(config)
+ if err != nil {
+ return err
+ }
+
+ // Uninstall the release
+ config.HelmLoggerFunc("Uninstalling release %s from namespace %s...", config.ReleaseName, config.Namespace)
+ uninstallClient := action.NewUninstall(actionConfig)
+
+ // Set timeout if provided
+ if config.Timeout > 0 {
+ uninstallClient.Timeout = config.Timeout
+ }
+
+ _, err = uninstallClient.Run(config.ReleaseName)
+ if err != nil {
+ return fmt.Errorf("helm uninstall failed: %w", err)
+ }
+
+ config.HelmLoggerFunc("✅ Release '%s' uninstalled successfully", config.ReleaseName)
+ return nil
+}
+
// setupHelmAction sets up Helm action configuration.
func setupHelmAction(config *HelmInstallConfig) (*action.Configuration, error) {
actionConfig := new(action.Configuration)
@@ -226,6 +306,27 @@ func newInstallClient(actionConfig *action.Configuration, config *HelmInstallCon
client.Version = config.ChartVersion
client.Replace = true // Allow replacing failed releases on retry
+ // Set timeout
+ if config.Timeout > 0 {
+ client.Timeout = config.Timeout
+ }
+
+ return client
+}
+
+// newUpgradeClient creates and configures a Helm upgrade action client from the provided configuration.
+func newUpgradeClient(actionConfig *action.Configuration, config *HelmInstallConfig) *action.Upgrade {
+ client := action.NewUpgrade(actionConfig)
+ client.Namespace = config.Namespace
+ client.Wait = config.Wait
+ client.Version = config.ChartVersion
+ client.ReuseValues = config.ReuseValues
+
+ // Set timeout
+ if config.Timeout > 0 {
+ client.Timeout = config.Timeout
+ }
+
return client
}
diff --git a/operator/e2e/tests/cert_management_test.go b/operator/e2e/tests/cert_management_test.go
new file mode 100644
index 000000000..b1d346a9a
--- /dev/null
+++ b/operator/e2e/tests/cert_management_test.go
@@ -0,0 +1,744 @@
+//go:build e2e
+
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package tests
+
+import (
+ "bytes"
+ "context"
+ "crypto/tls"
+ "crypto/x509"
+ "encoding/pem"
+ "fmt"
+ "net"
+ "net/http"
+ "net/url"
+ "path/filepath"
+ "runtime"
+ "testing"
+ "time"
+
+ configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
+ "github.com/ai-dynamo/grove/operator/e2e"
+ "github.com/ai-dynamo/grove/operator/e2e/setup"
+ "github.com/ai-dynamo/grove/operator/e2e/utils"
+ "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+ "k8s.io/apimachinery/pkg/runtime/schema"
+ "k8s.io/client-go/dynamic"
+ "k8s.io/client-go/kubernetes"
+ "k8s.io/client-go/rest"
+ "k8s.io/client-go/tools/portforward"
+ "k8s.io/client-go/transport/spdy"
+)
+
+// certManagerGroveConfig returns Grove configuration for cert-manager mode.
+// This uses manual cert provisioning and configures webhook annotations for cert-manager CA injection.
+func certManagerGroveConfig() *setup.GroveConfig {
+ return &setup.GroveConfig{
+ InstallCRDs: true,
+ Webhooks: setup.WebhooksConfig{
+ CertProvisionMode: configv1alpha1.CertProvisionModeManual,
+ SecretName: "grove-webhook-server-cert",
+ Annotations: map[string]string{
+ "cert-manager.io/inject-ca-from": "grove-system/grove-webhook-server-cert",
+ },
+ },
+ }
+}
+
+// autoProvisionGroveConfig returns Grove configuration for auto-provision mode (default).
+// Note: Annotations are intentionally omitted - the Helm templates only include annotations
+// when certProvisionMode is "manual", so any previous cert-manager annotations are automatically
+// cleared when switching back to auto mode.
+func autoProvisionGroveConfig() *setup.GroveConfig {
+ return &setup.GroveConfig{
+ InstallCRDs: true,
+ Webhooks: setup.WebhooksConfig{
+ CertProvisionMode: configv1alpha1.CertProvisionModeAuto,
+ SecretName: "grove-webhook-server-cert",
+ },
+ }
+}
+
+// updateGroveToCertManager updates Grove to use cert-manager for certificate management.
+func updateGroveToCertManager(t *testing.T, ctx context.Context, restConfig *rest.Config) {
+ t.Helper()
+ if err := setup.UpdateGroveConfiguration(ctx, restConfig, certManagerGroveConfig(), logger); err != nil {
+ t.Fatalf("Failed to update Grove to cert-manager mode: %v", err)
+ }
+}
+
+// updateGroveToAutoProvision updates Grove to use auto-provisioned certificates.
+func updateGroveToAutoProvision(t *testing.T, ctx context.Context, restConfig *rest.Config) {
+ t.Helper()
+ if err := setup.UpdateGroveConfiguration(ctx, restConfig, autoProvisionGroveConfig(), logger); err != nil {
+ t.Fatalf("Failed to update Grove to auto-provision mode: %v", err)
+ }
+}
+
+const (
+ certManagerIssuerYAML = `
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+ name: selfsigned-issuer
+spec:
+ selfSigned: {}
+`
+ certManagerCertificateYAML = `
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+ name: grove-webhook-server-cert
+ namespace: grove-system
+spec:
+ secretName: grove-webhook-server-cert
+ duration: 2160h # 90d
+ renewBefore: 360h # 15d
+ issuerRef:
+ name: selfsigned-issuer
+ kind: ClusterIssuer
+ dnsNames:
+ - grove-operator
+ - grove-operator.grove-system
+ - grove-operator.grove-system.svc
+ - grove-operator.grove-system.svc.cluster.local
+`
+)
+
+// Test_CM1_CertManagementRoundTrip tests the full certificate management round-trip:
+// auto-provision -> cert-manager -> auto-provision
+// Scenario CM-1:
+// 1. Initialize Grove with auto-provision mode
+// 2. Install cert-manager and create Certificate
+// 3. Upgrade Grove to use cert-manager (certProvisionMode=manual)
+// 4. Verify cert-manager mode is active
+// 5. Deploy and verify workload with cert-manager certs
+// 6. Remove cert-manager resources
+// 7. Upgrade Grove back to auto-provision mode
+// 8. Verify auto-provision mode is active
+// 9. Delete and redeploy workload to test webhooks with auto-provisioned certs
+func Test_CM1_CertManagementRoundTrip(t *testing.T) {
+ ctx := context.Background()
+
+ logger.Info("1. Initialize Grove with auto-provision mode (10 nodes for workload)")
+ clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 10)
+
+ logger.Info("2. Install cert-manager and create Certificate")
+ deps, _ := e2e.GetDependencies()
+ installCertManager(t, ctx, restConfig, deps)
+ defer uninstallCertManager(t, restConfig, deps)
+ defer cleanup()
+
+ // Create Issuer and Certificate
+ if _, err := utils.ApplyYAMLData(ctx, []byte(certManagerIssuerYAML), "", restConfig, logger); err != nil {
+ t.Fatalf("Failed to apply ClusterIssuer: %v", err)
+ }
+ waitForClusterIssuer(t, ctx, dynamicClient, "selfsigned-issuer")
+
+ if _, err := utils.ApplyYAMLData(ctx, []byte(certManagerCertificateYAML), "", restConfig, logger); err != nil {
+ t.Fatalf("Failed to apply Certificate: %v", err)
+ }
+
+ // Wait for cert-manager to actually take over the secret.
+ // This is critical because the secret may already exist from auto-provision mode,
+ // and we need to wait for cert-manager to update it (not just check existence).
+ waitForSecretManagedByCertManager(t, ctx, clientset, "grove-webhook-server-cert")
+
+ logger.Info("3. Upgrade Grove to use cert-manager (certProvisionMode=manual)")
+ updateGroveToCertManager(t, ctx, restConfig)
+
+ logger.Info("4. Verify cert-manager mode is active")
+ verifyCertManagerMode(t, ctx, clientset)
+ verifyWebhookServingCertificate(t, ctx, clientset, restConfig)
+
+ logger.Info("5. Deploy and verify workload with cert-manager certs")
+ tc := createTestContext(t, ctx, clientset, dynamicClient, restConfig)
+ if _, err := deployAndVerifyWorkload(tc); err != nil {
+ t.Fatalf("Failed to deploy workload in Cert-Manager mode: %v", err)
+ }
+
+ // Wait for all pods to become ready to verify the webhook is working end-to-end
+ if err := waitForPods(tc, tc.Workload.ExpectedPods); err != nil {
+ t.Fatalf("Failed to wait for workload pods to be ready: %v", err)
+ }
+
+ logger.Info("6. Remove cert-manager resources")
+ deleteCertManagerResources(ctx, clientset, dynamicClient)
+ waitForSecret(t, ctx, clientset, "grove-webhook-server-cert", false)
+
+ logger.Info("7. Upgrade Grove back to auto-provision mode")
+ updateGroveToAutoProvision(t, ctx, restConfig)
+ waitForSecret(t, ctx, clientset, "grove-webhook-server-cert", true)
+
+ logger.Info("8. Verify auto-provision mode is active")
+ verifyAutoProvisionMode(t, ctx, clientset)
+ verifyWebhookServingCertificate(t, ctx, clientset, restConfig)
+
+ logger.Info("9. Delete and redeploy workload to test webhooks with auto-provisioned certs")
+ // Delete the existing workload to test that webhooks work with new certs
+ deleteWorkload(t, ctx, dynamicClient, tc.Workload.Name, tc.Namespace)
+
+ // Redeploy workload - this will exercise the validating and mutating webhooks
+ if _, err := deployAndVerifyWorkload(tc); err != nil {
+ t.Fatalf("Failed to deploy workload with auto-provisioned certs: %v", err)
+ }
+
+ // Wait for all pods to become ready
+ if err := waitForPods(tc, tc.Workload.ExpectedPods); err != nil {
+ t.Fatalf("Workload pods not ready after redeploying with auto-provisioned certs: %v", err)
+ }
+
+ logger.Info("🎉 Certificate management round-trip test completed successfully")
+}
+
+func deleteWorkload(t *testing.T, ctx context.Context, dynamicClient dynamic.Interface, name, namespace string) {
+ t.Helper()
+
+ pcsGVR := schema.GroupVersionResource{
+ Group: "grove.io",
+ Version: "v1alpha1",
+ Resource: "podcliquesets",
+ }
+
+ // Delete the PodCliqueSet
+ logger.Debugf("Deleting PodCliqueSet %s/%s", namespace, name)
+ err := dynamicClient.Resource(pcsGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{})
+ if err != nil && !errors.IsNotFound(err) {
+ t.Fatalf("Failed to delete PodCliqueSet %s: %v", name, err)
+ }
+
+ // Wait for deletion to complete
+ err = utils.PollForCondition(ctx, defaultPollTimeout, defaultPollInterval, func() (bool, error) {
+ _, err := dynamicClient.Resource(pcsGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{})
+ return errors.IsNotFound(err), nil
+ })
+ if err != nil {
+ t.Fatalf("Timeout waiting for PodCliqueSet %s to be deleted: %v", name, err)
+ }
+ logger.Debugf("PodCliqueSet %s/%s deleted", namespace, name)
+}
+
+func waitForSecret(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset, name string, shouldExist bool) {
+ t.Helper()
+
+ err := utils.PollForCondition(ctx, defaultPollTimeout, defaultPollInterval, func() (bool, error) {
+ _, err := clientset.CoreV1().Secrets("grove-system").Get(ctx, name, metav1.GetOptions{})
+ if shouldExist {
+ return err == nil, nil
+ }
+ return errors.IsNotFound(err), nil
+ })
+ if err != nil {
+ t.Fatalf("Timeout waiting for secret %s (shouldExist=%v): %v", name, shouldExist, err)
+ }
+}
+
+// waitForSecretManagedByCertManager waits for a secret to exist AND be managed by cert-manager.
+// This is important because when transitioning from auto-provision to cert-manager mode,
+// the secret may already exist from auto-provision, and we need to wait for cert-manager
+// to actually update it (which is indicated by the cert-manager.io/certificate-name annotation).
+func waitForSecretManagedByCertManager(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset, name string) {
+ t.Helper()
+
+ logger.Debugf("Waiting for secret %s to be managed by cert-manager...", name)
+ err := utils.PollForCondition(ctx, defaultPollTimeout, defaultPollInterval, func() (bool, error) {
+ secret, err := clientset.CoreV1().Secrets("grove-system").Get(ctx, name, metav1.GetOptions{})
+ if err != nil {
+ return false, nil // Secret doesn't exist yet, keep waiting
+ }
+ // Check if cert-manager has taken ownership of this secret
+ certName := secret.Annotations[certManagerCertNameAnnotation]
+ if certName != "" {
+ logger.Debugf("Secret %s is now managed by cert-manager (certificate: %s)", name, certName)
+ return true, nil
+ }
+ return false, nil
+ })
+ if err != nil {
+ t.Fatalf("Timeout waiting for secret %s to be managed by cert-manager: %v", name, err)
+ }
+}
+
+func deleteCertManagerResources(ctx context.Context, clientset *kubernetes.Clientset, dynamicClient dynamic.Interface) {
+ certGVR := schema.GroupVersionResource{Group: "cert-manager.io", Version: "v1", Resource: "certificates"}
+ issuerGVR := schema.GroupVersionResource{Group: "cert-manager.io", Version: "v1", Resource: "clusterissuers"}
+
+ // Delete Certificate first (cert-manager resource)
+ if err := dynamicClient.Resource(certGVR).Namespace("grove-system").Delete(ctx, "grove-webhook-server-cert", metav1.DeleteOptions{}); err != nil {
+ logger.Warnf("Failed to delete Certificate (may not exist): %v", err)
+ }
+
+ // Delete the Secret managed by cert-manager
+ if err := clientset.CoreV1().Secrets("grove-system").Delete(ctx, "grove-webhook-server-cert", metav1.DeleteOptions{}); err != nil {
+ logger.Warnf("Failed to delete Secret (may not exist): %v", err)
+ }
+
+ // Delete ClusterIssuer last
+ if err := dynamicClient.Resource(issuerGVR).Delete(ctx, "selfsigned-issuer", metav1.DeleteOptions{}); err != nil {
+ logger.Warnf("Failed to delete ClusterIssuer (may not exist): %v", err)
+ }
+}
+
+func installCertManager(t *testing.T, ctx context.Context, restConfig *rest.Config, deps *e2e.Dependencies) {
+ t.Helper()
+
+ cmConfig := &setup.HelmInstallConfig{
+ RestConfig: restConfig,
+ ReleaseName: deps.HelmCharts.CertManager.ReleaseName,
+ ChartRef: deps.HelmCharts.CertManager.ChartRef,
+ ChartVersion: deps.HelmCharts.CertManager.Version,
+ Namespace: deps.HelmCharts.CertManager.Namespace,
+ CreateNamespace: true,
+ RepoURL: deps.HelmCharts.CertManager.RepoURL,
+ Values: map[string]interface{}{
+ "installCRDs": true,
+ },
+ HelmLoggerFunc: logger.Debugf,
+ Logger: logger,
+ }
+
+ if _, err := setup.InstallHelmChart(cmConfig); err != nil {
+ t.Fatalf("Failed to install cert-manager: %v", err)
+ }
+
+ // Ensure cert-manager is actually up and running before returning
+ if err := utils.WaitForPodsInNamespace(ctx, cmConfig.Namespace, restConfig, 3, defaultPollTimeout, defaultPollInterval, logger); err != nil {
+ t.Fatalf("cert-manager pods failed to become ready: %v", err)
+ }
+}
+
+func createTestContext(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset, dynamicClient dynamic.Interface, restConfig *rest.Config) TestContext {
+ t.Helper()
+
+ _, currentFile, _, _ := runtime.Caller(0)
+ workloadPath := filepath.Join(filepath.Dir(currentFile), "../yaml/workload1.yaml")
+
+ return TestContext{
+ T: t,
+ Ctx: ctx,
+ Clientset: clientset,
+ DynamicClient: dynamicClient,
+ RestConfig: restConfig,
+ Namespace: "default",
+ Timeout: defaultPollTimeout,
+ Interval: defaultPollInterval,
+ Workload: &WorkloadConfig{
+ Name: "workload1",
+ YAMLPath: workloadPath,
+ Namespace: "default",
+ ExpectedPods: 10,
+ },
+ }
+}
+
+func waitForClusterIssuer(t *testing.T, ctx context.Context, dynamicClient dynamic.Interface, name string) {
+ t.Helper()
+
+ issuerGVR := schema.GroupVersionResource{
+ Group: "cert-manager.io",
+ Version: "v1",
+ Resource: "clusterissuers",
+ }
+
+ err := utils.PollForCondition(ctx, 30*time.Second, 1*time.Second, func() (bool, error) {
+ issuer, err := dynamicClient.Resource(issuerGVR).Get(ctx, name, metav1.GetOptions{})
+ if err != nil {
+ return false, nil
+ }
+
+ return checkReadyStatus(issuer), nil
+ })
+
+ if err != nil {
+ t.Fatalf("ClusterIssuer %s failed to become Ready: %v", name, err)
+ }
+}
+
+func checkReadyStatus(obj *unstructured.Unstructured) bool {
+ conditions, found, err := unstructured.NestedSlice(obj.Object, "status", "conditions")
+ if err != nil || !found {
+ return false
+ }
+
+ for _, c := range conditions {
+ condition, ok := c.(map[string]interface{})
+ if !ok {
+ continue
+ }
+
+ if condition["type"] == "Ready" && condition["status"] == string(metav1.ConditionTrue) {
+ return true
+ }
+ }
+ return false
+}
+
+func uninstallCertManager(t *testing.T, restConfig *rest.Config, deps *e2e.Dependencies) {
+ t.Helper()
+
+ cmConfig := &setup.HelmInstallConfig{
+ RestConfig: restConfig,
+ ReleaseName: deps.HelmCharts.CertManager.ReleaseName,
+ Namespace: deps.HelmCharts.CertManager.Namespace,
+ HelmLoggerFunc: logger.Debugf,
+ Logger: logger,
+ }
+
+ if err := setup.UninstallHelmChart(cmConfig); err != nil {
+ logger.Warnf("Failed to uninstall cert-manager (may not exist): %v", err)
+ }
+}
+
+const (
+ // certManagerInjectAnnotation is the annotation cert-manager uses to inject CA bundles
+ certManagerInjectAnnotation = "cert-manager.io/inject-ca-from"
+ // certManagerCertNameAnnotation is the annotation cert-manager adds to secrets it manages
+ certManagerCertNameAnnotation = "cert-manager.io/certificate-name"
+ // groveWebhookSecretName is the name of the Grove webhook certificate secret
+ groveWebhookSecretName = "grove-webhook-server-cert"
+ // groveNamespace is the namespace where Grove is installed
+ groveNamespace = "grove-system"
+)
+
+// webhookNames lists the ValidatingWebhookConfiguration and MutatingWebhookConfiguration names to check
+var webhookConfigNames = []string{
+ "podcliqueset-validating-webhook",
+ "podcliqueset-defaulting-webhook",
+}
+
+// verifyCertManagerMode verifies that Grove is using cert-manager for certificate management.
+// It checks:
+// 1. Webhook configurations have the cert-manager.io/inject-ca-from annotation
+// 2. The certificate secret has cert-manager annotations
+func verifyCertManagerMode(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset) {
+ t.Helper()
+
+ logger.Debug("Verifying cert-manager mode is active...")
+
+ // Check webhook configurations have cert-manager annotation
+ for _, webhookName := range webhookConfigNames {
+ if err := verifyWebhookHasCertManagerAnnotation(ctx, clientset, webhookName, true); err != nil {
+ t.Fatalf("Webhook %s does not have cert-manager annotation: %v", webhookName, err)
+ }
+ }
+
+ // Check secret has cert-manager annotations
+ if err := verifyWebhookSecretCertManagerStatus(ctx, clientset, true); err != nil {
+ t.Fatalf("Secret verification for cert-manager mode failed: %v", err)
+ }
+
+ logger.Debug("Verified: Grove is using cert-manager certificates")
+}
+
+// verifyAutoProvisionMode verifies that Grove is using auto-provisioned certificates.
+func verifyAutoProvisionMode(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset) {
+ t.Helper()
+
+ logger.Debug("Verifying auto-provision mode is active...")
+
+ // Check secret does NOT have cert-manager annotations (proving it's managed by Grove's cert-controller)
+ if err := verifyWebhookSecretCertManagerStatus(ctx, clientset, false); err != nil {
+ t.Fatalf("Secret verification for auto-provision mode failed: %v", err)
+ }
+
+ logger.Debug("Verified: Grove is using auto-provisioned certificates (secret not managed by cert-manager)")
+}
+
+// verifyWebhookHasCertManagerAnnotation checks if a webhook configuration has the cert-manager annotation.
+// If shouldHave is true, it verifies the annotation exists; if false, it verifies the annotation is absent or empty.
+func verifyWebhookHasCertManagerAnnotation(ctx context.Context, clientset *kubernetes.Clientset, webhookName string, shouldHave bool) error {
+ // Try ValidatingWebhookConfiguration first
+ vwc, err := clientset.AdmissionregistrationV1().ValidatingWebhookConfigurations().Get(ctx, webhookName, metav1.GetOptions{})
+ if err == nil {
+ annotationValue := vwc.Annotations[certManagerInjectAnnotation]
+ hasAnnotation := annotationValue != ""
+
+ if shouldHave && !hasAnnotation {
+ return fmt.Errorf("ValidatingWebhookConfiguration %s missing cert-manager annotation", webhookName)
+ }
+ if !shouldHave && hasAnnotation {
+ return fmt.Errorf("ValidatingWebhookConfiguration %s has unexpected cert-manager annotation: %s", webhookName, annotationValue)
+ }
+ logger.Debugf("ValidatingWebhookConfiguration %s: cert-manager annotation present=%v (expected=%v)", webhookName, hasAnnotation, shouldHave)
+ return nil
+ }
+
+ // Try MutatingWebhookConfiguration
+ mwc, err := clientset.AdmissionregistrationV1().MutatingWebhookConfigurations().Get(ctx, webhookName, metav1.GetOptions{})
+ if err == nil {
+ annotationValue := mwc.Annotations[certManagerInjectAnnotation]
+ hasAnnotation := annotationValue != ""
+
+ if shouldHave && !hasAnnotation {
+ return fmt.Errorf("MutatingWebhookConfiguration %s missing cert-manager annotation", webhookName)
+ }
+ if !shouldHave && hasAnnotation {
+ return fmt.Errorf("MutatingWebhookConfiguration %s has unexpected cert-manager annotation: %s", webhookName, annotationValue)
+ }
+ logger.Debugf("MutatingWebhookConfiguration %s: cert-manager annotation present=%v (expected=%v)", webhookName, hasAnnotation, shouldHave)
+ return nil
+ }
+
+ return fmt.Errorf("webhook configuration %s not found as ValidatingWebhookConfiguration or MutatingWebhookConfiguration", webhookName)
+}
+
+// verifyWebhookSecretCertManagerStatus checks if the webhook certificate secret's cert-manager management status matches expectations.
+// If expectManaged is true, it verifies cert-manager annotations exist; if false, it verifies they are absent.
+func verifyWebhookSecretCertManagerStatus(ctx context.Context, clientset *kubernetes.Clientset, expectManaged bool) error {
+ secret, err := clientset.CoreV1().Secrets(groveNamespace).Get(ctx, groveWebhookSecretName, metav1.GetOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to get secret %s/%s: %w", groveNamespace, groveWebhookSecretName, err)
+ }
+
+ // Check for cert-manager certificate name annotation
+ certName := secret.Annotations[certManagerCertNameAnnotation]
+ isManagedByCertManager := certName != ""
+
+ if expectManaged && !isManagedByCertManager {
+ return fmt.Errorf("secret %s is not managed by cert-manager (missing %s annotation)", groveWebhookSecretName, certManagerCertNameAnnotation)
+ }
+ if !expectManaged && isManagedByCertManager {
+ return fmt.Errorf("secret %s is unexpectedly managed by cert-manager (has %s=%s)", groveWebhookSecretName, certManagerCertNameAnnotation, certName)
+ }
+
+ logger.Debugf("Secret %s: managed by cert-manager=%v (expected=%v)", groveWebhookSecretName, isManagedByCertManager, expectManaged)
+ return nil
+}
+
+// verifyWebhookServingCertificate verifies that the webhook is actually serving the certificate from the Secret.
+// This connects to the webhook endpoint via TLS and compares the served certificate with the one in the Secret.
+// It includes retry logic to handle timing issues with:
+// - Kubernetes secret volume propagation delays
+// - The certwatcher's 10-second polling interval for detecting certificate changes
+func verifyWebhookServingCertificate(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset, restConfig *rest.Config) {
+ t.Helper()
+
+ logger.Debug("Verifying webhook is serving the correct certificate (with retries for cert reload timing)...")
+
+ // Retry for up to 30 seconds to account for:
+ // - Kubernetes secret volume update propagation (can take up to the kubelet sync period)
+ // - certwatcher 10-second polling interval
+ var lastExpectedSerial, lastServedSerial string
+ err := utils.PollForCondition(ctx, 30*time.Second, 2*time.Second, func() (bool, error) {
+ // Get the certificate from the Secret
+ expectedCert, err := getCertificateFromSecret(ctx, clientset)
+ if err != nil {
+ logger.Debugf("Failed to get certificate from secret: %v", err)
+ return false, nil
+ }
+ lastExpectedSerial = expectedCert.SerialNumber.String()
+
+ // Get the certificate the webhook is actually serving
+ servedCert, err := getServedCertificate(ctx, clientset, restConfig)
+ if err != nil {
+ logger.Debugf("Failed to get served certificate from webhook: %v", err)
+ return false, nil
+ }
+ lastServedSerial = servedCert.SerialNumber.String()
+
+ // Compare the certificates
+ if certificatesMatch(expectedCert, servedCert) {
+ logger.Debugf("Certificate match! Serial: %s", lastExpectedSerial)
+ return true, nil
+ }
+
+ logger.Debugf("Certificate mismatch (will retry): expected serial=%s, served serial=%s",
+ lastExpectedSerial, lastServedSerial)
+ return false, nil
+ })
+
+ if err != nil {
+ t.Fatalf("Certificate mismatch: webhook is not serving the expected certificate after retries.\n"+
+ "Expected serial: %s\n"+
+ "Served serial: %s\n"+
+ "This may indicate the operator has not reloaded the certificate from the secret.",
+ lastExpectedSerial, lastServedSerial)
+ }
+
+ logger.Debug("Verified: webhook is serving the correct certificate from the Secret")
+}
+
+// getCertificateFromSecret retrieves and parses the TLS certificate from the webhook Secret.
+func getCertificateFromSecret(ctx context.Context, clientset *kubernetes.Clientset) (*x509.Certificate, error) {
+ secret, err := clientset.CoreV1().Secrets(groveNamespace).Get(ctx, groveWebhookSecretName, metav1.GetOptions{})
+ if err != nil {
+ return nil, fmt.Errorf("failed to get secret: %w", err)
+ }
+
+ certPEM, ok := secret.Data["tls.crt"]
+ if !ok {
+ return nil, fmt.Errorf("secret does not contain tls.crt")
+ }
+
+ block, _ := pem.Decode(certPEM)
+ if block == nil {
+ return nil, fmt.Errorf("failed to decode PEM block from tls.crt")
+ }
+
+ cert, err := x509.ParseCertificate(block.Bytes)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse certificate: %w", err)
+ }
+
+ return cert, nil
+}
+
+// getServedCertificate connects to the webhook service and retrieves the certificate it's serving.
+// It uses port-forwarding to connect to the operator pod.
+func getServedCertificate(ctx context.Context, clientset *kubernetes.Clientset, restConfig *rest.Config) (*x509.Certificate, error) {
+ // Find the grove-operator pod
+ pods, err := clientset.CoreV1().Pods(groveNamespace).List(ctx, metav1.ListOptions{
+ LabelSelector: "app.kubernetes.io/name=grove-operator",
+ })
+ if err != nil {
+ return nil, fmt.Errorf("failed to list operator pods: %w", err)
+ }
+ if len(pods.Items) == 0 {
+ return nil, fmt.Errorf("no grove-operator pods found")
+ }
+
+ podName := pods.Items[0].Name
+ logger.Debugf("Using operator pod: %s", podName)
+
+ // Set up port forwarding
+ localPort, stopChan, err := setupPortForward(ctx, restConfig, groveNamespace, podName, 9443)
+ if err != nil {
+ return nil, fmt.Errorf("failed to set up port forwarding: %w", err)
+ }
+ defer close(stopChan)
+
+ // Give port-forward a moment to establish
+ time.Sleep(500 * time.Millisecond)
+
+ // Connect via TLS and get the certificate
+ cert, err := getTLSCertificate(fmt.Sprintf("localhost:%d", localPort))
+ if err != nil {
+ return nil, fmt.Errorf("failed to get TLS certificate: %w", err)
+ }
+
+ return cert, nil
+}
+
+// setupPortForward creates a port-forward to the specified pod and returns the local port.
+func setupPortForward(ctx context.Context, restConfig *rest.Config, namespace, podName string, remotePort int) (int, chan struct{}, error) {
+ // Find a free local port
+ listener, err := net.Listen("tcp", "localhost:0")
+ if err != nil {
+ return 0, nil, fmt.Errorf("failed to find free port: %w", err)
+ }
+ localPort := listener.Addr().(*net.TCPAddr).Port
+ listener.Close()
+
+ // Create the port-forward URL
+ path := fmt.Sprintf("/api/v1/namespaces/%s/pods/%s/portforward", namespace, podName)
+
+ transport, upgrader, err := spdy.RoundTripperFor(restConfig)
+ if err != nil {
+ return 0, nil, fmt.Errorf("failed to create round tripper: %w", err)
+ }
+
+ // Parse the URL properly
+ parsedURL, err := parsePortForwardURL(restConfig.Host, path)
+ if err != nil {
+ return 0, nil, fmt.Errorf("failed to parse URL: %w", err)
+ }
+ dialer := spdy.NewDialer(upgrader, &http.Client{Transport: transport}, http.MethodPost, parsedURL)
+
+ stopChan := make(chan struct{}, 1)
+ readyChan := make(chan struct{})
+
+ ports := []string{fmt.Sprintf("%d:%d", localPort, remotePort)}
+
+ // Create a buffer to capture output (we don't need to display it)
+ out := &bytes.Buffer{}
+ errOut := &bytes.Buffer{}
+
+ pf, err := portforward.New(dialer, ports, stopChan, readyChan, out, errOut)
+ if err != nil {
+ return 0, nil, fmt.Errorf("failed to create port forwarder: %w", err)
+ }
+
+ // Start port forwarding in a goroutine
+ errChan := make(chan error, 1)
+ go func() {
+ if err := pf.ForwardPorts(); err != nil {
+ errChan <- err
+ }
+ }()
+
+ // Wait for port-forward to be ready or error
+ select {
+ case <-readyChan:
+ logger.Debugf("Port forward established: localhost:%d -> %s:%d", localPort, podName, remotePort)
+ case err := <-errChan:
+ return 0, nil, fmt.Errorf("port forward failed: %w", err)
+ case <-time.After(10 * time.Second):
+ close(stopChan)
+ return 0, nil, fmt.Errorf("timeout waiting for port forward to be ready")
+ case <-ctx.Done():
+ close(stopChan)
+ return 0, nil, ctx.Err()
+ }
+
+ // Check for any errors in errOut
+ if errOut.Len() > 0 {
+ logger.Warnf("Port forward stderr: %s", errOut.String())
+ }
+
+ return localPort, stopChan, nil
+}
+
+// parsePortForwardURL parses the host and path into a proper URL for port forwarding.
+func parsePortForwardURL(host, path string) (*url.URL, error) {
+ u, err := url.Parse(host)
+ if err != nil {
+ return nil, err
+ }
+ u.Path = path
+ return u, nil
+}
+
+// getTLSCertificate connects to the specified address via TLS and returns the server's certificate.
+func getTLSCertificate(address string) (*x509.Certificate, error) {
+ // Connect with InsecureSkipVerify since we're just extracting the certificate
+ conn, err := tls.Dial("tcp", address, &tls.Config{
+ InsecureSkipVerify: true, //nolint:gosec // We need to connect without verification to extract the cert
+ })
+ if err != nil {
+ return nil, fmt.Errorf("failed to connect: %w", err)
+ }
+ defer conn.Close()
+
+ // Get the peer certificates
+ certs := conn.ConnectionState().PeerCertificates
+ if len(certs) == 0 {
+ return nil, fmt.Errorf("no certificates received from server")
+ }
+
+ // Return the leaf certificate (first one)
+ return certs[0], nil
+}
+
+// certificatesMatch compares two certificates to determine if they are the same.
+// We compare the raw DER bytes for an exact match.
+func certificatesMatch(cert1, cert2 *x509.Certificate) bool {
+ return bytes.Equal(cert1.Raw, cert2.Raw)
+}
diff --git a/operator/e2e/utils/k8s_client.go b/operator/e2e/utils/k8s_client.go
index e39d217cf..4bec4278b 100644
--- a/operator/e2e/utils/k8s_client.go
+++ b/operator/e2e/utils/k8s_client.go
@@ -64,7 +64,7 @@ func ApplyYAMLFile(ctx context.Context, yamlFilePath string, namespace string, r
return nil, fmt.Errorf("failed to read YAML file %s: %w", yamlFilePath, err)
}
- return applyYAMLData(ctx, yamlData, namespace, restConfig, logger)
+ return ApplyYAMLData(ctx, yamlData, namespace, restConfig, logger)
}
// WaitForPods waits for pods to be ready in the specified namespaces
@@ -129,8 +129,8 @@ func WaitForPods(ctx context.Context, restConfig *rest.Config, namespaces []stri
})
}
-// applyYAMLData is the common function that applies YAML data to Kubernetes
-func applyYAMLData(ctx context.Context, yamlData []byte, namespace string, restConfig *rest.Config, logger *Logger) ([]AppliedResource, error) {
+// ApplyYAMLData is the common function that applies YAML data to Kubernetes
+func ApplyYAMLData(ctx context.Context, yamlData []byte, namespace string, restConfig *rest.Config, logger *Logger) ([]AppliedResource, error) {
dynamicClient, restMapper, err := createKubernetesClients(restConfig)
if err != nil {
return nil, err
diff --git a/operator/internal/controller/cert/cert.go b/operator/internal/controller/cert/cert.go
index 2cd60b4d2..753ea616f 100644
--- a/operator/internal/controller/cert/cert.go
+++ b/operator/internal/controller/cert/cert.go
@@ -21,6 +21,7 @@ import (
"os"
"strings"
+ configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
"github.com/ai-dynamo/grove/operator/internal/constants"
authorizationwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization"
defaultingwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/defaulting"
@@ -38,17 +39,31 @@ const (
certificateAuthorityOrganization = "Grove"
)
-// ManageWebhookCerts registers the cert-controller with the manager which will be used to manage
-// webhook certificates.
-func ManageWebhookCerts(mgr ctrl.Manager, certDir string, authorizerEnabled bool, certsReadyCh chan struct{}) error {
+// ManageWebhookCerts manages webhook certificates based on the CertProvisionMode configuration.
+// When mode=auto: uses cert-controller for automatic certificate generation and management.
+// When mode=manual: waits for externally provided certificates (e.g., from cert-manager, cluster admin).
+func ManageWebhookCerts(mgr ctrl.Manager, certDir string, secretName string, authorizerEnabled bool, certProvisionMode configv1alpha1.CertProvisionMode, certsReadyCh chan struct{}) error {
namespace, err := getOperatorNamespace()
if err != nil {
return err
}
+
+ logger := ctrl.Log.WithName("cert-management")
+
+ if certProvisionMode == configv1alpha1.CertProvisionModeManual {
+ logger.Info("Using externally provided certificates (manual mode)",
+ "certDir", certDir, "secretName", secretName)
+ // Certificates are managed externally, signal ready immediately
+ close(certsReadyCh)
+ return nil
+ }
+
+ logger.Info("Auto-provisioning certificates using cert-controller",
+ "secretName", secretName, "certDir", certDir, "mode", certProvisionMode)
rotator := &cert.CertRotator{
SecretKey: types.NamespacedName{
Namespace: namespace,
- Name: "grove-webhook-server-cert",
+ Name: secretName,
},
CertDir: certDir,
CAName: certificateAuthorityName,