diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 41c6e4ac7..77d4b9695 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -47,6 +47,8 @@ jobs: test_pattern: "^Test_SO" - test_name: Topology_Aware_Scheduling test_pattern: "^Test_TAS" + - test_name: cert_management + test_pattern: "^Test_CM" name: E2E - ${{ matrix.test_name }} steps: # print runner specs so we have a record incase of failures diff --git a/docs/api-reference/operator-api.md b/docs/api-reference/operator-api.md index 1c9f6bfa8..f8b6a9c2e 100644 --- a/docs/api-reference/operator-api.md +++ b/docs/api-reference/operator-api.md @@ -671,6 +671,24 @@ _Appears in:_ | `exemptServiceAccountUserNames` _string array_ | ExemptServiceAccountUserNames is a list of service account usernames that are exempt from authorizer checks.
Each service account username name in ExemptServiceAccountUserNames should be of the following format:
system:serviceaccount::. ServiceAccounts are represented in this
format when checking the username in authenticationv1.UserInfo.Name. | | | +#### CertProvisionMode + +_Underlying type:_ _string_ + +CertProvisionMode defines how webhook certificates are provisioned. + +_Validation:_ +- Enum: [auto manual] + +_Appears in:_ +- [WebhookServer](#webhookserver) + +| Field | Description | +| --- | --- | +| `auto` | CertProvisionModeAuto enables automatic certificate generation and management via cert-controller.
cert-controller automatically generates self-signed certificates and stores them in the Secret.
| +| `manual` | CertProvisionModeManual expects certificates to be provided externally (e.g., by cert-manager, cluster admin).
| + + #### ClientConnectionConfiguration @@ -916,5 +934,7 @@ _Appears in:_ | `bindAddress` _string_ | BindAddress is the IP address on which to listen for the specified port. | | | | `port` _integer_ | Port is the port on which to serve requests. | | | | `serverCertDir` _string_ | ServerCertDir is the directory containing the server certificate and key. | | | +| `secretName` _string_ | SecretName is the name of the Kubernetes Secret containing webhook certificates.
The Secret must contain tls.crt, tls.key, and ca.crt. | grove-webhook-server-cert | | +| `certProvisionMode` _[CertProvisionMode](#certprovisionmode)_ | CertProvisionMode controls how webhook certificates are provisioned. | auto | Enum: [auto manual]
| diff --git a/docs/installation.md b/docs/installation.md index 901deff90..3d935c7aa 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -102,6 +102,12 @@ This make target leverages Grove [Helm](https://helm.sh/) charts and [Skaffold]( - Grove Scheduler CRDs - `podgangs.scheduler.grove.io`. - All Grove operator resources defined as a part of [Grove Helm chart templates](../operator/charts/templates). +## Certificate Management + +By default, Grove automatically generates and manages TLS certificates for its webhook server. For production environments, you may want to use certificates from your organization's PKI or a certificate manager like cert-manager. + +See the [Certificate Management Guide](user-guide/certificate-management.md) for detailed configuration options. + ## Verify Installation Follow the instructions in the [quickstart guide](quickstart.md) to deploy a PodCliqueSet and validate your installation. diff --git a/docs/user-guide/certificate-management.md b/docs/user-guide/certificate-management.md new file mode 100644 index 000000000..2c8326bd7 --- /dev/null +++ b/docs/user-guide/certificate-management.md @@ -0,0 +1,298 @@ +# Certificate Management + +Grove's webhook server requires TLS certificates to secure communication with the Kubernetes API server. This guide explains how to configure certificate management for your Grove deployment. + +## Overview + +Grove supports two certificate management modes: + +| Mode | Description | Best For | +|------|-------------|----------| +| **Auto-provisioned** (default) | Grove automatically generates and manages self-signed certificates | Development, testing, quick setup | +| **External certificates** | You provide certificates from an external source (e.g., cert-manager, manual) | Production, enterprise PKI integration | + +## Auto-Provisioned Certificates (Default) + +By default, Grove's built-in cert-controller automatically: +- Generates self-signed CA and server certificates +- Stores them in a Kubernetes Secret +- Injects the CA bundle into webhook configurations +- Rotates certificates before expiry + +This mode requires no additional configuration. Simply deploy Grove: + +```bash +helm upgrade -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts: +``` + +### How It Works + +1. On startup, the cert-controller checks if the webhook certificate Secret exists +2. If missing or expired, it generates new certificates +3. The CA bundle is automatically injected into `ValidatingWebhookConfiguration` and `MutatingWebhookConfiguration` resources +4. Certificates are stored in the Secret specified by `config.server.webhooks.secretName` (default: `grove-webhook-server-cert`) + +## External Certificate Management + +For production environments, you may want to use certificates from your organization's PKI or a certificate manager like [cert-manager](https://cert-manager.io/). + +### Prerequisites + +Before deploying Grove with external certificates, ensure: +1. Your certificate Secret exists in the target namespace +2. The Secret contains the required keys: `tls.crt`, `tls.key`, and `ca.crt` +3. The certificate's Subject Alternative Names (SANs) include the webhook service DNS name + +### Required Certificate SANs + +The webhook server certificate must include these SANs: +``` +grove-operator..svc +grove-operator..svc.cluster.local +``` + +Replace `` with your deployment namespace (default: `default`). + +### Configuration + +To use external certificates, configure the following Helm values: + +```yaml +config: + server: + webhooks: + # Use manual certificate provisioning (external) + certProvisionMode: manual + # Name of the Secret containing your certificates + secretName: "grove-webhook-server-cert" + +webhooks: + # Base64-encoded CA certificate that signed the webhook server certificate + caBundle: "" +``` + +### Using cert-manager + +[cert-manager](https://cert-manager.io/) is a popular choice for managing certificates in Kubernetes. Here's how to configure Grove with cert-manager. + +#### Step 1: Install cert-manager + +If not already installed: + +```bash +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.yaml +``` + +Wait for cert-manager to be ready: + +```bash +kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s +kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s +``` + +#### Step 2: Create a Certificate Issuer + +Create a self-signed issuer (or use your organization's issuer): + +```yaml +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: grove-selfsigned-issuer + namespace: # Your Grove deployment namespace +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: grove-ca + namespace: +spec: + isCA: true + commonName: grove-ca + secretName: grove-ca-secret + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: grove-selfsigned-issuer + kind: Issuer +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: grove-ca-issuer + namespace: +spec: + ca: + secretName: grove-ca-secret +``` + +#### Step 3: Create a Certificate for the Webhook + +```yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: grove-webhook-cert + namespace: +spec: + secretName: grove-webhook-server-cert + duration: 8760h # 1 year + renewBefore: 720h # 30 days + subject: + organizations: + - grove + isCA: false + privateKey: + algorithm: RSA + encoding: PKCS1 + size: 2048 + usages: + - server auth + - digital signature + - key encipherment + dnsNames: + - grove-operator + - grove-operator. + - grove-operator..svc + - grove-operator..svc.cluster.local + issuerRef: + name: grove-ca-issuer + kind: Issuer +``` + +#### Step 4: Configure Webhook Annotations + +cert-manager can automatically inject the CA bundle into webhook configurations. Add these annotations to your Helm values: + +```yaml +webhooks: + podCliqueSetValidationWebhook: + annotations: + cert-manager.io/inject-ca-from: /grove-webhook-cert + podCliqueSetDefaultingWebhook: + annotations: + cert-manager.io/inject-ca-from: /grove-webhook-cert + authorizerWebhook: + annotations: + cert-manager.io/inject-ca-from: /grove-webhook-cert +``` + +#### Step 5: Deploy Grove + +```bash +helm upgrade -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts: \ + --set config.server.webhooks.certProvisionMode=manual \ + --set webhooks.podCliqueSetValidationWebhook.annotations."cert-manager\.io/inject-ca-from"="/grove-webhook-cert" \ + --set webhooks.podCliqueSetDefaultingWebhook.annotations."cert-manager\.io/inject-ca-from"="/grove-webhook-cert" \ + --set webhooks.authorizerWebhook.annotations."cert-manager\.io/inject-ca-from"="/grove-webhook-cert" +``` + +Or create a `values.yaml` file with the configuration shown above and deploy: + +```bash +helm upgrade -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts: -f values.yaml +``` + +### Manual Certificate Management + +If you prefer to manage certificates manually: + +#### Step 1: Generate Certificates + +```bash +# Create a CA +openssl genrsa -out ca.key 2048 +openssl req -x509 -new -nodes -key ca.key -sha256 -days 365 \ + -out ca.crt -subj "/CN=grove-ca" + +# Create server certificate +openssl genrsa -out tls.key 2048 + +# Create CSR config +cat > csr.conf < +DNS.3 = grove-operator..svc +DNS.4 = grove-operator..svc.cluster.local +EOF + +# Generate CSR and certificate +openssl req -new -key tls.key -out tls.csr -config csr.conf +openssl x509 -req -in tls.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out tls.crt -days 365 -extensions req_ext -extfile csr.conf +``` + +#### Step 2: Create the Secret + +```bash +kubectl create secret generic grove-webhook-server-cert \ + --from-file=tls.crt=tls.crt \ + --from-file=tls.key=tls.key \ + --from-file=ca.crt=ca.crt \ + -n +``` + +#### Step 3: Deploy Grove with the CA Bundle + +```bash +CA_BUNDLE=$(cat ca.crt | base64 | tr -d '\n') + +helm upgrade -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts: \ + --set config.server.webhooks.certProvisionMode=manual \ + --set webhooks.caBundle="${CA_BUNDLE}" +``` + +## Switching Between Modes + +### From Auto-Provision to External + +1. Set up your external certificate infrastructure (cert-manager or manual) +2. Update your Helm values to disable auto-provisioning +3. Upgrade the Grove deployment + +```bash +helm upgrade grove oci://ghcr.io/ai-dynamo/grove/grove-charts: \ + --set config.server.webhooks.certProvisionMode=manual \ + --set webhooks.caBundle="${CA_BUNDLE}" +``` + +### From External to Auto-Provision + +1. Update your Helm values to enable auto-provisioning +2. Remove the `webhooks.caBundle` value (or set it to empty) +3. Upgrade the Grove deployment + +```bash +helm upgrade grove oci://ghcr.io/ai-dynamo/grove/grove-charts: \ + --set config.server.webhooks.certProvisionMode=auto \ + --set webhooks.caBundle="" +``` + +The cert-controller will generate new certificates on the next operator restart. + +## Configuration Reference + +| Helm Value | Type | Default | Description | +|------------|------|---------|-------------| +| `config.server.webhooks.certProvisionMode` | string | `auto` | Certificate provisioning mode: `auto` (cert-controller generates certs) or `manual` (external certs) | +| `config.server.webhooks.secretName` | string | `grove-webhook-server-cert` | Name of the Secret containing certificates | +| `webhooks.caBundle` | string | `""` | Base64-encoded CA certificate for webhook configurations | +| `webhooks.podCliqueSetValidationWebhook.annotations` | map | `{}` | Annotations for the validation webhook (e.g., cert-manager injection) | +| `webhooks.podCliqueSetDefaultingWebhook.annotations` | map | `{}` | Annotations for the defaulting webhook | +| `webhooks.authorizerWebhook.annotations` | map | `{}` | Annotations for the authorizer webhook | diff --git a/operator/api/config/v1alpha1/defaults.go b/operator/api/config/v1alpha1/defaults.go index e2752790d..c1d97a428 100644 --- a/operator/api/config/v1alpha1/defaults.go +++ b/operator/api/config/v1alpha1/defaults.go @@ -79,6 +79,14 @@ func SetDefaults_ServerConfiguration(serverConfig *ServerConfiguration) { serverConfig.Webhooks.ServerCertDir = defaultWebhookServerTLSServerCertDir } + if serverConfig.Webhooks.SecretName == "" { + serverConfig.Webhooks.SecretName = "grove-webhook-server-cert" + } + + if serverConfig.Webhooks.CertProvisionMode == "" { + serverConfig.Webhooks.CertProvisionMode = CertProvisionModeAuto + } + if serverConfig.HealthProbes == nil { serverConfig.HealthProbes = &Server{} } diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go index 82b8e8365..f932a7bed 100644 --- a/operator/api/config/v1alpha1/types.go +++ b/operator/api/config/v1alpha1/types.go @@ -140,8 +140,29 @@ type WebhookServer struct { Server `json:",inline"` // ServerCertDir is the directory containing the server certificate and key. ServerCertDir string `json:"serverCertDir"` + // SecretName is the name of the Kubernetes Secret containing webhook certificates. + // The Secret must contain tls.crt, tls.key, and ca.crt. + // +optional + // +kubebuilder:default="grove-webhook-server-cert" + SecretName string `json:"secretName,omitempty"` + // CertProvisionMode controls how webhook certificates are provisioned. + // +optional + // +kubebuilder:default="auto" + CertProvisionMode CertProvisionMode `json:"certProvisionMode,omitempty"` } +// CertProvisionMode defines how webhook certificates are provisioned. +// +kubebuilder:validation:Enum=auto;manual +type CertProvisionMode string + +const ( + // CertProvisionModeAuto enables automatic certificate generation and management via cert-controller. + // cert-controller automatically generates self-signed certificates and stores them in the Secret. + CertProvisionModeAuto CertProvisionMode = "auto" + // CertProvisionModeManual expects certificates to be provided externally (e.g., by cert-manager, cluster admin). + CertProvisionModeManual CertProvisionMode = "manual" +) + // Server contains information for HTTP(S) server configuration. type Server struct { // BindAddress is the IP address on which to listen for the specified port. diff --git a/operator/charts/templates/_helpers.tpl b/operator/charts/templates/_helpers.tpl index 5c42f26d3..d76d4954f 100644 --- a/operator/charts/templates/_helpers.tpl +++ b/operator/charts/templates/_helpers.tpl @@ -16,6 +16,9 @@ config.yaml: | server: webhooks: port: {{ .Values.config.server.webhooks.port }} + serverCertDir: {{ .Values.config.server.webhooks.serverCertDir }} + secretName: {{ .Values.config.server.webhooks.secretName | default "grove-webhook-server-cert" }} + certProvisionMode: {{ .Values.config.server.webhooks.certProvisionMode | default "auto" }} healthProbes: port: {{ .Values.config.server.healthProbes.port }} metrics: diff --git a/operator/charts/templates/authorizer-webhook-config.yaml b/operator/charts/templates/authorizer-webhook-config.yaml index e54f2cbdd..cc05e35c2 100644 --- a/operator/charts/templates/authorizer-webhook-config.yaml +++ b/operator/charts/templates/authorizer-webhook-config.yaml @@ -4,12 +4,20 @@ apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: name: authorizer-webhook + {{- /* Only include annotations in manual cert mode (certProvisionMode=manual) */ -}} + {{- if and (eq .Values.config.server.webhooks.certProvisionMode "manual") .Values.webhooks.authorizerWebhook.annotations }} + annotations: + {{- toYaml .Values.webhooks.authorizerWebhook.annotations | nindent 4 }} + {{- end }} labels: {{- include "operator.authorizer.webhook.labels" . | nindent 4 }} webhooks: - admissionReviewVersions: - v1 clientConfig: + {{- if .Values.webhooks.caBundle }} + caBundle: {{ .Values.webhooks.caBundle }} + {{- end }} service: name: {{ required ".Values.service.name is required" .Values.service.name }} namespace: {{ .Release.Namespace }} diff --git a/operator/charts/templates/deployment.yaml b/operator/charts/templates/deployment.yaml index ed9cdd59b..51d4ca7e5 100644 --- a/operator/charts/templates/deployment.yaml +++ b/operator/charts/templates/deployment.yaml @@ -61,7 +61,7 @@ spec: mountPath: /var/run/secrets/kubernetes.io/serviceaccount readOnly: true - name: grove-webhook-server-cert - mountPath: /etc/grove-operator/webhook-certs + mountPath: {{ .Values.config.server.webhooks.serverCertDir | default "/etc/grove-operator/webhook-certs" }} readOnly: true env: - name: GROVE_OPERATOR_SERVICE_ACCOUNT_NAME @@ -92,6 +92,7 @@ spec: configMap: name: {{ include "operator.config.name" . }} - name: grove-webhook-server-cert + # Mount the Secret volume containing webhook certificates secret: - secretName: grove-webhook-server-cert + secretName: {{ .Values.config.server.webhooks.secretName | default "grove-webhook-server-cert" }} defaultMode: 420 diff --git a/operator/charts/templates/pcs-defaulting-webhook-config.yaml b/operator/charts/templates/pcs-defaulting-webhook-config.yaml index 98bb34571..634e219d7 100644 --- a/operator/charts/templates/pcs-defaulting-webhook-config.yaml +++ b/operator/charts/templates/pcs-defaulting-webhook-config.yaml @@ -3,12 +3,20 @@ apiVersion: admissionregistration.k8s.io/v1 kind: MutatingWebhookConfiguration metadata: name: podcliqueset-defaulting-webhook + {{- /* Only include annotations in manual cert mode (certProvisionMode=manual) */ -}} + {{- if and (eq .Values.config.server.webhooks.certProvisionMode "manual") .Values.webhooks.podCliqueSetDefaultingWebhook.annotations }} + annotations: + {{- toYaml .Values.webhooks.podCliqueSetDefaultingWebhook.annotations | nindent 4 }} + {{- end }} labels: {{- include "operator.pcs.defaulting.webhook.labels" . | nindent 4 }} webhooks: - admissionReviewVersions: - v1 clientConfig: + {{- if .Values.webhooks.caBundle }} + caBundle: {{ .Values.webhooks.caBundle }} + {{- end }} service: name: {{ required ".Values.service.name is required" .Values.service.name }} namespace: {{ .Release.Namespace }} diff --git a/operator/charts/templates/pcs-validating-webhook-config.yaml b/operator/charts/templates/pcs-validating-webhook-config.yaml index 92c170935..3a629edb4 100644 --- a/operator/charts/templates/pcs-validating-webhook-config.yaml +++ b/operator/charts/templates/pcs-validating-webhook-config.yaml @@ -3,12 +3,20 @@ apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: name: podcliqueset-validating-webhook + {{- /* Only include annotations in manual cert mode (certProvisionMode=manual) */ -}} + {{- if and (eq .Values.config.server.webhooks.certProvisionMode "manual") .Values.webhooks.podCliqueSetValidationWebhook.annotations }} + annotations: + {{- toYaml .Values.webhooks.podCliqueSetValidationWebhook.annotations | nindent 4 }} + {{- end }} labels: {{- include "operator.pcs.validating.webhook.labels" . | nindent 4 }} webhooks: - admissionReviewVersions: - v1 clientConfig: + {{- if .Values.webhooks.caBundle }} + caBundle: {{ .Values.webhooks.caBundle }} + {{- end }} service: name: {{ required ".Values.service.name is required" .Values.service.name }} namespace: {{ .Release.Namespace }} diff --git a/operator/charts/templates/webhook-server-cert-secret.yaml b/operator/charts/templates/webhook-server-cert-secret.yaml index 7c9064718..b220ea7df 100644 --- a/operator/charts/templates/webhook-server-cert-secret.yaml +++ b/operator/charts/templates/webhook-server-cert-secret.yaml @@ -1,3 +1,9 @@ +{{- /* + Create this Secret if certProvisionMode is "auto" (default). + This allows the operator to auto-generate webhook TLS certs at startup. +*/ -}} +{{- if ne (.Values.config.server.webhooks.certProvisionMode | default "auto") "manual" }} +--- apiVersion: v1 kind: Secret metadata: @@ -9,3 +15,4 @@ type: kubernetes.io/tls data: tls.crt: "" tls.key: "" +{{- end }} diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml index be1ab4425..42003b7d0 100644 --- a/operator/charts/values.yaml +++ b/operator/charts/values.yaml @@ -46,6 +46,25 @@ config: server: webhooks: port: 9443 + # serverCertDir is the directory path where certificate files are located. + # The operator will look for tls.crt and tls.key in this directory. + # Default: /etc/grove-operator/webhook-certs + serverCertDir: /etc/grove-operator/webhook-certs + # secretName is the name of the Kubernetes Secret containing webhook certificates. + # The Secret must contain tls.crt, tls.key, and ca.crt. + # Default: "grove-webhook-server-cert" + secretName: "grove-webhook-server-cert" + # certProvisionMode controls how webhook certificates are provisioned. + # Values: + # "auto" (default): cert-controller automatically generates self-signed certificates. + # - Best for development/testing + # - No manual certificate management needed + # - Certificates auto-rotate before expiry + # "manual": certificates must be provided externally (e.g., cert-manager, cluster admin). + # - Best for production + # - Use your own CA or Let's Encrypt + # - Requires Secret to exist before operator starts + certProvisionMode: auto healthProbes: enable: false port: 9444 @@ -145,21 +164,27 @@ leaseRoleBinding: app.kubernetes.io/part-of: grove webhooks: + # Shared CA bundle for all webhooks. This is the base64-encoded CA certificate + # that signed the webhook server's TLS certificate. + caBundle: "" podCliqueSetValidationWebhook: labels: app.kubernetes.io/component: operator-pcs-validating-webhook app.kubernetes.io/name: pcs-validating-webhook app.kubernetes.io/part-of: grove + annotations: {} podCliqueSetDefaultingWebhook: labels: app.kubernetes.io/component: operator-pcs-defaulting-webhook app.kubernetes.io/name: pcs-defaulting-webhook app.kubernetes.io/part-of: grove + annotations: {} authorizerWebhook: labels: app.kubernetes.io/component: operator-authorizer-webhook app.kubernetes.io/name: authorizer-webhook app.kubernetes.io/part-of: grove + annotations: {} webhookServerSecret: labels: diff --git a/operator/cmd/main.go b/operator/cmd/main.go index d79007dfb..b6fd8dbde 100644 --- a/operator/cmd/main.go +++ b/operator/cmd/main.go @@ -94,7 +94,14 @@ func main() { } webhookCertsReadyCh := make(chan struct{}) - if err = cert.ManageWebhookCerts(mgr, operatorConfig.Server.Webhooks.ServerCertDir, operatorConfig.Authorizer.Enabled, webhookCertsReadyCh); err != nil { + if err = cert.ManageWebhookCerts( + mgr, + operatorConfig.Server.Webhooks.ServerCertDir, + operatorConfig.Server.Webhooks.SecretName, + operatorConfig.Authorizer.Enabled, + operatorConfig.Server.Webhooks.CertProvisionMode, + webhookCertsReadyCh, + ); err != nil { logger.Error(err, "failed to setup cert rotation") handleErrorAndExit(err, cli.ExitErrInitializeManager) } diff --git a/operator/e2e/dependencies.go b/operator/e2e/dependencies.go index 687d7588c..1704efbd9 100644 --- a/operator/e2e/dependencies.go +++ b/operator/e2e/dependencies.go @@ -51,6 +51,7 @@ type HelmChartDependency struct { type HelmCharts struct { KaiScheduler HelmChartDependency `yaml:"kaiScheduler"` GPUOperator HelmChartDependency `yaml:"gpuOperator"` + CertManager HelmChartDependency `yaml:"certManager"` } // Dependencies represents all E2E test dependencies diff --git a/operator/e2e/dependencies.yaml b/operator/e2e/dependencies.yaml index dd15f8ee3..1bd01a55a 100644 --- a/operator/e2e/dependencies.yaml +++ b/operator/e2e/dependencies.yaml @@ -32,6 +32,16 @@ images: - name: ghcr.io/nvidia/kai-scheduler/scheduler version: v0.13.0-rc1 + # Cert-manager + - name: quay.io/jetstack/cert-manager-controller + version: v1.14.4 + - name: quay.io/jetstack/cert-manager-webhook + version: v1.14.4 + - name: quay.io/jetstack/cert-manager-cainjector + version: v1.14.4 + - name: quay.io/jetstack/cert-manager-ctl + version: v1.14.4 + # Helm charts used in E2E tests helmCharts: # Kai Scheduler - gang scheduling for Kubernetes @@ -40,3 +50,11 @@ helmCharts: chartRef: oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler version: v0.13.0-rc1 namespace: kai-scheduler + + # cert-manager - for certificate management + certManager: + releaseName: cert-manager + chartRef: cert-manager + version: v1.14.4 + namespace: cert-manager + repoURL: https://charts.jetstack.io diff --git a/operator/e2e/setup/grove.go b/operator/e2e/setup/grove.go new file mode 100644 index 000000000..74b5b1a6e --- /dev/null +++ b/operator/e2e/setup/grove.go @@ -0,0 +1,178 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +// Package setup provides internal testing utilities for configuring and managing +// Grove operator installations during e2e tests. This package is not intended for +// production use and its API may change without notice. +// +// Functions are exported to allow access from e2e test packages (e.g., operator/e2e/tests) +// which need to modify Grove configuration during test scenarios. +package setup + +import ( + "context" + "fmt" + "os" + "path/filepath" + "runtime" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + "github.com/ai-dynamo/grove/operator/e2e/utils" + "gopkg.in/yaml.v3" + "k8s.io/client-go/rest" +) + +// GroveConfig holds typed configuration options for updating the Grove operator. +// This struct provides a user-friendly interface that gets translated to Helm values internally. +type GroveConfig struct { + // InstallCRDs controls whether CRDs should be installed/updated. + InstallCRDs bool + // Webhooks contains webhook-specific configuration. + Webhooks WebhooksConfig +} + +// WebhooksConfig holds configuration for Grove's webhook server. +type WebhooksConfig struct { + // CertProvisionMode controls how webhook certificates are provisioned. + // Use configv1alpha1.CertProvisionModeAuto for automatic provisioning, + // configv1alpha1.CertProvisionModeManual for external cert management. + CertProvisionMode configv1alpha1.CertProvisionMode + // SecretName is the name of the Kubernetes secret containing TLS certificates. + SecretName string + // Annotations to apply to webhook configurations (e.g., for cert-manager CA injection). + // These annotations are applied to all webhook configurations. + Annotations map[string]string +} + +// toHelmValues converts the typed GroveConfig to a Helm values map. +func (c *GroveConfig) toHelmValues() map[string]interface{} { + // Convert annotations to interface{} map for Helm + annotations := make(map[string]interface{}) + for k, v := range c.Webhooks.Annotations { + annotations[k] = v + } + + webhookAnnotations := map[string]interface{}{"annotations": annotations} + + return map[string]interface{}{ + "installCRDs": c.InstallCRDs, + "config": map[string]interface{}{ + "server": map[string]interface{}{ + "webhooks": map[string]interface{}{ + "certProvisionMode": string(c.Webhooks.CertProvisionMode), + "secretName": c.Webhooks.SecretName, + }, + }, + }, + "webhooks": map[string]interface{}{ + "podCliqueSetValidationWebhook": webhookAnnotations, + "podCliqueSetDefaultingWebhook": webhookAnnotations, + "authorizerWebhook": webhookAnnotations, + }, + } +} + +// UpdateGroveConfiguration updates the Grove operator configuration. +// +// This uses Helm upgrade (rather than Skaffold) because: +// 1. Grove is initially installed via Skaffold, which uses Helm under the hood +// 2. For config-only changes (like switching cert modes), rebuilding images is unnecessary +// 3. Helm upgrade with ReuseValues preserves the image configuration that Skaffold set +// +// This approach avoids wasteful rebuilds while staying compatible with the Skaffold installation. +func UpdateGroveConfiguration(ctx context.Context, restConfig *rest.Config, config *GroveConfig, logger *utils.Logger) error { + chartDir, err := getGroveChartDir() + if err != nil { + return fmt.Errorf("failed to get Grove chart directory: %w", err) + } + + chartVersion, err := getChartVersion(chartDir) + if err != nil { + return fmt.Errorf("failed to get chart version: %w", err) + } + + // Configure Helm upgrade using shared constants to stay in sync with Skaffold installation. + // - ReleaseName: Uses OperatorDeploymentName from constants.go (matches skaffold.yaml's deploy.helm.releases[0].name) + // - ChartVersion: Read from Chart.yaml to avoid version string duplication + // - ReuseValues: Preserves image configuration that Skaffold set during initial install + helmConfig := &HelmInstallConfig{ + RestConfig: restConfig, + ReleaseName: OperatorDeploymentName, + ChartRef: chartDir, + ChartVersion: chartVersion, + Namespace: OperatorNamespace, + ReuseValues: true, + Values: config.toHelmValues(), + HelmLoggerFunc: logger.Debugf, + Logger: logger, + } + + logger.Debug("Updating Grove operator configuration") + + if _, err := UpgradeHelmChart(helmConfig); err != nil { + return fmt.Errorf("helm upgrade failed: %w", err) + } + + // Wait for Grove operator pod to be ready after upgrade + if err := utils.WaitForPodsInNamespace(ctx, OperatorNamespace, restConfig, 1, defaultPollTimeout, defaultPollInterval, logger); err != nil { + return fmt.Errorf("grove operator pod not ready after upgrade: %w", err) + } + + logger.Debug("Grove configuration update completed successfully") + return nil +} + +// chartYAML represents the structure of Chart.yaml for version extraction +type chartYAML struct { + Version string `yaml:"version"` +} + +// getChartVersion reads the version from Chart.yaml in the given chart directory. +// The chartDir parameter should be the path to a Helm chart directory. Chart.yaml is +// a required file per the Helm chart specification and will always exist for valid charts. +// We read from Chart.yaml rather than hardcoding the version to maintain a single source +// of truth, avoiding configuration drift between the chart definition and the e2e test code. +func getChartVersion(chartDir string) (string, error) { + chartFile := filepath.Join(chartDir, "Chart.yaml") + data, err := os.ReadFile(chartFile) + if err != nil { + return "", fmt.Errorf("failed to read %s: %w", chartFile, err) + } + + var chart chartYAML + if err := yaml.Unmarshal(data, &chart); err != nil { + return "", fmt.Errorf("failed to parse %s: %w", chartFile, err) + } + + if chart.Version == "" { + return "", fmt.Errorf("version not found in %s", chartFile) + } + + return chart.Version, nil +} + +// getGroveChartDir returns the absolute path to the Grove Helm chart directory. +// It uses runtime.Caller to find the path relative to this source file. +func getGroveChartDir() (string, error) { + _, currentFile, _, ok := runtime.Caller(0) + if !ok { + return "", fmt.Errorf("failed to get current file path") + } + // This file is at operator/e2e/setup/grove.go + // Chart directory is at operator/charts + chartDir := filepath.Join(filepath.Dir(currentFile), "../../charts") + return filepath.Abs(chartDir) +} diff --git a/operator/e2e/setup/helm.go b/operator/e2e/setup/helm.go index e899a634c..32657223a 100644 --- a/operator/e2e/setup/helm.go +++ b/operator/e2e/setup/helm.go @@ -23,6 +23,7 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/ai-dynamo/grove/operator/e2e/utils" "helm.sh/helm/v3/pkg/action" @@ -60,6 +61,10 @@ type HelmInstallConfig struct { Logger *utils.Logger // RepoURL is the base URL of the Helm repository (optional, for direct chart downloads). RepoURL string + // ReuseValues reuses the last release's values and merges in the new values. + ReuseValues bool + // Timeout is the time to wait for Kubernetes operations (default: 5 minutes). + Timeout time.Duration } // Validate validates and sets defaults for the configuration. @@ -133,6 +138,81 @@ func InstallHelmChart(config *HelmInstallConfig) (*release.Release, error) { return rel, nil } +// UpgradeHelmChart upgrades a Helm chart with the given configuration. +func UpgradeHelmChart(config *HelmInstallConfig) (*release.Release, error) { + if err := config.Validate(); err != nil { + return nil, err + } + + // Initialize Helm action configuration + config.HelmLoggerFunc("Setting up Helm configuration for upgrade of %s...", config.ReleaseName) + actionConfig, err := setupHelmAction(config) + if err != nil { + return nil, err + } + + // Resolve chart location (download from HTTP or locate via Helm) + chartPath, err := resolveChart(actionConfig, config) + if err != nil { + return nil, err + } + + // Load and validate the chart + config.HelmLoggerFunc("Loading chart from %s...", chartPath) + chart, err := loader.Load(chartPath) + if err != nil { + return nil, fmt.Errorf("failed to load chart: %w", err) + } + + // Upgrade the chart + config.HelmLoggerFunc("Upgrading release %s in namespace %s...", config.ReleaseName, config.Namespace) + upgradeClient := newUpgradeClient(actionConfig, config) + rel, err := upgradeClient.Run(config.ReleaseName, chart, config.Values) + if err != nil { + return nil, fmt.Errorf("helm upgrade failed: %w", err) + } + + config.HelmLoggerFunc("✅ Release '%s' upgraded successfully. Status: %s", rel.Name, rel.Info.Status) + return rel, nil +} + +// UninstallHelmChart uninstalls a Helm release. +func UninstallHelmChart(config *HelmInstallConfig) error { + if config.ReleaseName == "" { + return fmt.Errorf("release name is required for uninstall") + } + if config.Namespace == "" { + return fmt.Errorf("namespace is required for uninstall") + } + + // Initialize Helm action configuration + if config.HelmLoggerFunc == nil { + config.HelmLoggerFunc = func(_ string, _ ...interface{}) {} + } + config.HelmLoggerFunc("Setting up Helm configuration for uninstall of %s...", config.ReleaseName) + actionConfig, err := setupHelmAction(config) + if err != nil { + return err + } + + // Uninstall the release + config.HelmLoggerFunc("Uninstalling release %s from namespace %s...", config.ReleaseName, config.Namespace) + uninstallClient := action.NewUninstall(actionConfig) + + // Set timeout if provided + if config.Timeout > 0 { + uninstallClient.Timeout = config.Timeout + } + + _, err = uninstallClient.Run(config.ReleaseName) + if err != nil { + return fmt.Errorf("helm uninstall failed: %w", err) + } + + config.HelmLoggerFunc("✅ Release '%s' uninstalled successfully", config.ReleaseName) + return nil +} + // setupHelmAction sets up Helm action configuration. func setupHelmAction(config *HelmInstallConfig) (*action.Configuration, error) { actionConfig := new(action.Configuration) @@ -226,6 +306,27 @@ func newInstallClient(actionConfig *action.Configuration, config *HelmInstallCon client.Version = config.ChartVersion client.Replace = true // Allow replacing failed releases on retry + // Set timeout + if config.Timeout > 0 { + client.Timeout = config.Timeout + } + + return client +} + +// newUpgradeClient creates and configures a Helm upgrade action client from the provided configuration. +func newUpgradeClient(actionConfig *action.Configuration, config *HelmInstallConfig) *action.Upgrade { + client := action.NewUpgrade(actionConfig) + client.Namespace = config.Namespace + client.Wait = config.Wait + client.Version = config.ChartVersion + client.ReuseValues = config.ReuseValues + + // Set timeout + if config.Timeout > 0 { + client.Timeout = config.Timeout + } + return client } diff --git a/operator/e2e/tests/cert_management_test.go b/operator/e2e/tests/cert_management_test.go new file mode 100644 index 000000000..b1d346a9a --- /dev/null +++ b/operator/e2e/tests/cert_management_test.go @@ -0,0 +1,744 @@ +//go:build e2e + +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package tests + +import ( + "bytes" + "context" + "crypto/tls" + "crypto/x509" + "encoding/pem" + "fmt" + "net" + "net/http" + "net/url" + "path/filepath" + "runtime" + "testing" + "time" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + "github.com/ai-dynamo/grove/operator/e2e" + "github.com/ai-dynamo/grove/operator/e2e/setup" + "github.com/ai-dynamo/grove/operator/e2e/utils" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/portforward" + "k8s.io/client-go/transport/spdy" +) + +// certManagerGroveConfig returns Grove configuration for cert-manager mode. +// This uses manual cert provisioning and configures webhook annotations for cert-manager CA injection. +func certManagerGroveConfig() *setup.GroveConfig { + return &setup.GroveConfig{ + InstallCRDs: true, + Webhooks: setup.WebhooksConfig{ + CertProvisionMode: configv1alpha1.CertProvisionModeManual, + SecretName: "grove-webhook-server-cert", + Annotations: map[string]string{ + "cert-manager.io/inject-ca-from": "grove-system/grove-webhook-server-cert", + }, + }, + } +} + +// autoProvisionGroveConfig returns Grove configuration for auto-provision mode (default). +// Note: Annotations are intentionally omitted - the Helm templates only include annotations +// when certProvisionMode is "manual", so any previous cert-manager annotations are automatically +// cleared when switching back to auto mode. +func autoProvisionGroveConfig() *setup.GroveConfig { + return &setup.GroveConfig{ + InstallCRDs: true, + Webhooks: setup.WebhooksConfig{ + CertProvisionMode: configv1alpha1.CertProvisionModeAuto, + SecretName: "grove-webhook-server-cert", + }, + } +} + +// updateGroveToCertManager updates Grove to use cert-manager for certificate management. +func updateGroveToCertManager(t *testing.T, ctx context.Context, restConfig *rest.Config) { + t.Helper() + if err := setup.UpdateGroveConfiguration(ctx, restConfig, certManagerGroveConfig(), logger); err != nil { + t.Fatalf("Failed to update Grove to cert-manager mode: %v", err) + } +} + +// updateGroveToAutoProvision updates Grove to use auto-provisioned certificates. +func updateGroveToAutoProvision(t *testing.T, ctx context.Context, restConfig *rest.Config) { + t.Helper() + if err := setup.UpdateGroveConfiguration(ctx, restConfig, autoProvisionGroveConfig(), logger); err != nil { + t.Fatalf("Failed to update Grove to auto-provision mode: %v", err) + } +} + +const ( + certManagerIssuerYAML = ` +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: selfsigned-issuer +spec: + selfSigned: {} +` + certManagerCertificateYAML = ` +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: grove-webhook-server-cert + namespace: grove-system +spec: + secretName: grove-webhook-server-cert + duration: 2160h # 90d + renewBefore: 360h # 15d + issuerRef: + name: selfsigned-issuer + kind: ClusterIssuer + dnsNames: + - grove-operator + - grove-operator.grove-system + - grove-operator.grove-system.svc + - grove-operator.grove-system.svc.cluster.local +` +) + +// Test_CM1_CertManagementRoundTrip tests the full certificate management round-trip: +// auto-provision -> cert-manager -> auto-provision +// Scenario CM-1: +// 1. Initialize Grove with auto-provision mode +// 2. Install cert-manager and create Certificate +// 3. Upgrade Grove to use cert-manager (certProvisionMode=manual) +// 4. Verify cert-manager mode is active +// 5. Deploy and verify workload with cert-manager certs +// 6. Remove cert-manager resources +// 7. Upgrade Grove back to auto-provision mode +// 8. Verify auto-provision mode is active +// 9. Delete and redeploy workload to test webhooks with auto-provisioned certs +func Test_CM1_CertManagementRoundTrip(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize Grove with auto-provision mode (10 nodes for workload)") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 10) + + logger.Info("2. Install cert-manager and create Certificate") + deps, _ := e2e.GetDependencies() + installCertManager(t, ctx, restConfig, deps) + defer uninstallCertManager(t, restConfig, deps) + defer cleanup() + + // Create Issuer and Certificate + if _, err := utils.ApplyYAMLData(ctx, []byte(certManagerIssuerYAML), "", restConfig, logger); err != nil { + t.Fatalf("Failed to apply ClusterIssuer: %v", err) + } + waitForClusterIssuer(t, ctx, dynamicClient, "selfsigned-issuer") + + if _, err := utils.ApplyYAMLData(ctx, []byte(certManagerCertificateYAML), "", restConfig, logger); err != nil { + t.Fatalf("Failed to apply Certificate: %v", err) + } + + // Wait for cert-manager to actually take over the secret. + // This is critical because the secret may already exist from auto-provision mode, + // and we need to wait for cert-manager to update it (not just check existence). + waitForSecretManagedByCertManager(t, ctx, clientset, "grove-webhook-server-cert") + + logger.Info("3. Upgrade Grove to use cert-manager (certProvisionMode=manual)") + updateGroveToCertManager(t, ctx, restConfig) + + logger.Info("4. Verify cert-manager mode is active") + verifyCertManagerMode(t, ctx, clientset) + verifyWebhookServingCertificate(t, ctx, clientset, restConfig) + + logger.Info("5. Deploy and verify workload with cert-manager certs") + tc := createTestContext(t, ctx, clientset, dynamicClient, restConfig) + if _, err := deployAndVerifyWorkload(tc); err != nil { + t.Fatalf("Failed to deploy workload in Cert-Manager mode: %v", err) + } + + // Wait for all pods to become ready to verify the webhook is working end-to-end + if err := waitForPods(tc, tc.Workload.ExpectedPods); err != nil { + t.Fatalf("Failed to wait for workload pods to be ready: %v", err) + } + + logger.Info("6. Remove cert-manager resources") + deleteCertManagerResources(ctx, clientset, dynamicClient) + waitForSecret(t, ctx, clientset, "grove-webhook-server-cert", false) + + logger.Info("7. Upgrade Grove back to auto-provision mode") + updateGroveToAutoProvision(t, ctx, restConfig) + waitForSecret(t, ctx, clientset, "grove-webhook-server-cert", true) + + logger.Info("8. Verify auto-provision mode is active") + verifyAutoProvisionMode(t, ctx, clientset) + verifyWebhookServingCertificate(t, ctx, clientset, restConfig) + + logger.Info("9. Delete and redeploy workload to test webhooks with auto-provisioned certs") + // Delete the existing workload to test that webhooks work with new certs + deleteWorkload(t, ctx, dynamicClient, tc.Workload.Name, tc.Namespace) + + // Redeploy workload - this will exercise the validating and mutating webhooks + if _, err := deployAndVerifyWorkload(tc); err != nil { + t.Fatalf("Failed to deploy workload with auto-provisioned certs: %v", err) + } + + // Wait for all pods to become ready + if err := waitForPods(tc, tc.Workload.ExpectedPods); err != nil { + t.Fatalf("Workload pods not ready after redeploying with auto-provisioned certs: %v", err) + } + + logger.Info("🎉 Certificate management round-trip test completed successfully") +} + +func deleteWorkload(t *testing.T, ctx context.Context, dynamicClient dynamic.Interface, name, namespace string) { + t.Helper() + + pcsGVR := schema.GroupVersionResource{ + Group: "grove.io", + Version: "v1alpha1", + Resource: "podcliquesets", + } + + // Delete the PodCliqueSet + logger.Debugf("Deleting PodCliqueSet %s/%s", namespace, name) + err := dynamicClient.Resource(pcsGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}) + if err != nil && !errors.IsNotFound(err) { + t.Fatalf("Failed to delete PodCliqueSet %s: %v", name, err) + } + + // Wait for deletion to complete + err = utils.PollForCondition(ctx, defaultPollTimeout, defaultPollInterval, func() (bool, error) { + _, err := dynamicClient.Resource(pcsGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) + return errors.IsNotFound(err), nil + }) + if err != nil { + t.Fatalf("Timeout waiting for PodCliqueSet %s to be deleted: %v", name, err) + } + logger.Debugf("PodCliqueSet %s/%s deleted", namespace, name) +} + +func waitForSecret(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset, name string, shouldExist bool) { + t.Helper() + + err := utils.PollForCondition(ctx, defaultPollTimeout, defaultPollInterval, func() (bool, error) { + _, err := clientset.CoreV1().Secrets("grove-system").Get(ctx, name, metav1.GetOptions{}) + if shouldExist { + return err == nil, nil + } + return errors.IsNotFound(err), nil + }) + if err != nil { + t.Fatalf("Timeout waiting for secret %s (shouldExist=%v): %v", name, shouldExist, err) + } +} + +// waitForSecretManagedByCertManager waits for a secret to exist AND be managed by cert-manager. +// This is important because when transitioning from auto-provision to cert-manager mode, +// the secret may already exist from auto-provision, and we need to wait for cert-manager +// to actually update it (which is indicated by the cert-manager.io/certificate-name annotation). +func waitForSecretManagedByCertManager(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset, name string) { + t.Helper() + + logger.Debugf("Waiting for secret %s to be managed by cert-manager...", name) + err := utils.PollForCondition(ctx, defaultPollTimeout, defaultPollInterval, func() (bool, error) { + secret, err := clientset.CoreV1().Secrets("grove-system").Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return false, nil // Secret doesn't exist yet, keep waiting + } + // Check if cert-manager has taken ownership of this secret + certName := secret.Annotations[certManagerCertNameAnnotation] + if certName != "" { + logger.Debugf("Secret %s is now managed by cert-manager (certificate: %s)", name, certName) + return true, nil + } + return false, nil + }) + if err != nil { + t.Fatalf("Timeout waiting for secret %s to be managed by cert-manager: %v", name, err) + } +} + +func deleteCertManagerResources(ctx context.Context, clientset *kubernetes.Clientset, dynamicClient dynamic.Interface) { + certGVR := schema.GroupVersionResource{Group: "cert-manager.io", Version: "v1", Resource: "certificates"} + issuerGVR := schema.GroupVersionResource{Group: "cert-manager.io", Version: "v1", Resource: "clusterissuers"} + + // Delete Certificate first (cert-manager resource) + if err := dynamicClient.Resource(certGVR).Namespace("grove-system").Delete(ctx, "grove-webhook-server-cert", metav1.DeleteOptions{}); err != nil { + logger.Warnf("Failed to delete Certificate (may not exist): %v", err) + } + + // Delete the Secret managed by cert-manager + if err := clientset.CoreV1().Secrets("grove-system").Delete(ctx, "grove-webhook-server-cert", metav1.DeleteOptions{}); err != nil { + logger.Warnf("Failed to delete Secret (may not exist): %v", err) + } + + // Delete ClusterIssuer last + if err := dynamicClient.Resource(issuerGVR).Delete(ctx, "selfsigned-issuer", metav1.DeleteOptions{}); err != nil { + logger.Warnf("Failed to delete ClusterIssuer (may not exist): %v", err) + } +} + +func installCertManager(t *testing.T, ctx context.Context, restConfig *rest.Config, deps *e2e.Dependencies) { + t.Helper() + + cmConfig := &setup.HelmInstallConfig{ + RestConfig: restConfig, + ReleaseName: deps.HelmCharts.CertManager.ReleaseName, + ChartRef: deps.HelmCharts.CertManager.ChartRef, + ChartVersion: deps.HelmCharts.CertManager.Version, + Namespace: deps.HelmCharts.CertManager.Namespace, + CreateNamespace: true, + RepoURL: deps.HelmCharts.CertManager.RepoURL, + Values: map[string]interface{}{ + "installCRDs": true, + }, + HelmLoggerFunc: logger.Debugf, + Logger: logger, + } + + if _, err := setup.InstallHelmChart(cmConfig); err != nil { + t.Fatalf("Failed to install cert-manager: %v", err) + } + + // Ensure cert-manager is actually up and running before returning + if err := utils.WaitForPodsInNamespace(ctx, cmConfig.Namespace, restConfig, 3, defaultPollTimeout, defaultPollInterval, logger); err != nil { + t.Fatalf("cert-manager pods failed to become ready: %v", err) + } +} + +func createTestContext(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset, dynamicClient dynamic.Interface, restConfig *rest.Config) TestContext { + t.Helper() + + _, currentFile, _, _ := runtime.Caller(0) + workloadPath := filepath.Join(filepath.Dir(currentFile), "../yaml/workload1.yaml") + + return TestContext{ + T: t, + Ctx: ctx, + Clientset: clientset, + DynamicClient: dynamicClient, + RestConfig: restConfig, + Namespace: "default", + Timeout: defaultPollTimeout, + Interval: defaultPollInterval, + Workload: &WorkloadConfig{ + Name: "workload1", + YAMLPath: workloadPath, + Namespace: "default", + ExpectedPods: 10, + }, + } +} + +func waitForClusterIssuer(t *testing.T, ctx context.Context, dynamicClient dynamic.Interface, name string) { + t.Helper() + + issuerGVR := schema.GroupVersionResource{ + Group: "cert-manager.io", + Version: "v1", + Resource: "clusterissuers", + } + + err := utils.PollForCondition(ctx, 30*time.Second, 1*time.Second, func() (bool, error) { + issuer, err := dynamicClient.Resource(issuerGVR).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return false, nil + } + + return checkReadyStatus(issuer), nil + }) + + if err != nil { + t.Fatalf("ClusterIssuer %s failed to become Ready: %v", name, err) + } +} + +func checkReadyStatus(obj *unstructured.Unstructured) bool { + conditions, found, err := unstructured.NestedSlice(obj.Object, "status", "conditions") + if err != nil || !found { + return false + } + + for _, c := range conditions { + condition, ok := c.(map[string]interface{}) + if !ok { + continue + } + + if condition["type"] == "Ready" && condition["status"] == string(metav1.ConditionTrue) { + return true + } + } + return false +} + +func uninstallCertManager(t *testing.T, restConfig *rest.Config, deps *e2e.Dependencies) { + t.Helper() + + cmConfig := &setup.HelmInstallConfig{ + RestConfig: restConfig, + ReleaseName: deps.HelmCharts.CertManager.ReleaseName, + Namespace: deps.HelmCharts.CertManager.Namespace, + HelmLoggerFunc: logger.Debugf, + Logger: logger, + } + + if err := setup.UninstallHelmChart(cmConfig); err != nil { + logger.Warnf("Failed to uninstall cert-manager (may not exist): %v", err) + } +} + +const ( + // certManagerInjectAnnotation is the annotation cert-manager uses to inject CA bundles + certManagerInjectAnnotation = "cert-manager.io/inject-ca-from" + // certManagerCertNameAnnotation is the annotation cert-manager adds to secrets it manages + certManagerCertNameAnnotation = "cert-manager.io/certificate-name" + // groveWebhookSecretName is the name of the Grove webhook certificate secret + groveWebhookSecretName = "grove-webhook-server-cert" + // groveNamespace is the namespace where Grove is installed + groveNamespace = "grove-system" +) + +// webhookNames lists the ValidatingWebhookConfiguration and MutatingWebhookConfiguration names to check +var webhookConfigNames = []string{ + "podcliqueset-validating-webhook", + "podcliqueset-defaulting-webhook", +} + +// verifyCertManagerMode verifies that Grove is using cert-manager for certificate management. +// It checks: +// 1. Webhook configurations have the cert-manager.io/inject-ca-from annotation +// 2. The certificate secret has cert-manager annotations +func verifyCertManagerMode(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset) { + t.Helper() + + logger.Debug("Verifying cert-manager mode is active...") + + // Check webhook configurations have cert-manager annotation + for _, webhookName := range webhookConfigNames { + if err := verifyWebhookHasCertManagerAnnotation(ctx, clientset, webhookName, true); err != nil { + t.Fatalf("Webhook %s does not have cert-manager annotation: %v", webhookName, err) + } + } + + // Check secret has cert-manager annotations + if err := verifyWebhookSecretCertManagerStatus(ctx, clientset, true); err != nil { + t.Fatalf("Secret verification for cert-manager mode failed: %v", err) + } + + logger.Debug("Verified: Grove is using cert-manager certificates") +} + +// verifyAutoProvisionMode verifies that Grove is using auto-provisioned certificates. +func verifyAutoProvisionMode(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset) { + t.Helper() + + logger.Debug("Verifying auto-provision mode is active...") + + // Check secret does NOT have cert-manager annotations (proving it's managed by Grove's cert-controller) + if err := verifyWebhookSecretCertManagerStatus(ctx, clientset, false); err != nil { + t.Fatalf("Secret verification for auto-provision mode failed: %v", err) + } + + logger.Debug("Verified: Grove is using auto-provisioned certificates (secret not managed by cert-manager)") +} + +// verifyWebhookHasCertManagerAnnotation checks if a webhook configuration has the cert-manager annotation. +// If shouldHave is true, it verifies the annotation exists; if false, it verifies the annotation is absent or empty. +func verifyWebhookHasCertManagerAnnotation(ctx context.Context, clientset *kubernetes.Clientset, webhookName string, shouldHave bool) error { + // Try ValidatingWebhookConfiguration first + vwc, err := clientset.AdmissionregistrationV1().ValidatingWebhookConfigurations().Get(ctx, webhookName, metav1.GetOptions{}) + if err == nil { + annotationValue := vwc.Annotations[certManagerInjectAnnotation] + hasAnnotation := annotationValue != "" + + if shouldHave && !hasAnnotation { + return fmt.Errorf("ValidatingWebhookConfiguration %s missing cert-manager annotation", webhookName) + } + if !shouldHave && hasAnnotation { + return fmt.Errorf("ValidatingWebhookConfiguration %s has unexpected cert-manager annotation: %s", webhookName, annotationValue) + } + logger.Debugf("ValidatingWebhookConfiguration %s: cert-manager annotation present=%v (expected=%v)", webhookName, hasAnnotation, shouldHave) + return nil + } + + // Try MutatingWebhookConfiguration + mwc, err := clientset.AdmissionregistrationV1().MutatingWebhookConfigurations().Get(ctx, webhookName, metav1.GetOptions{}) + if err == nil { + annotationValue := mwc.Annotations[certManagerInjectAnnotation] + hasAnnotation := annotationValue != "" + + if shouldHave && !hasAnnotation { + return fmt.Errorf("MutatingWebhookConfiguration %s missing cert-manager annotation", webhookName) + } + if !shouldHave && hasAnnotation { + return fmt.Errorf("MutatingWebhookConfiguration %s has unexpected cert-manager annotation: %s", webhookName, annotationValue) + } + logger.Debugf("MutatingWebhookConfiguration %s: cert-manager annotation present=%v (expected=%v)", webhookName, hasAnnotation, shouldHave) + return nil + } + + return fmt.Errorf("webhook configuration %s not found as ValidatingWebhookConfiguration or MutatingWebhookConfiguration", webhookName) +} + +// verifyWebhookSecretCertManagerStatus checks if the webhook certificate secret's cert-manager management status matches expectations. +// If expectManaged is true, it verifies cert-manager annotations exist; if false, it verifies they are absent. +func verifyWebhookSecretCertManagerStatus(ctx context.Context, clientset *kubernetes.Clientset, expectManaged bool) error { + secret, err := clientset.CoreV1().Secrets(groveNamespace).Get(ctx, groveWebhookSecretName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get secret %s/%s: %w", groveNamespace, groveWebhookSecretName, err) + } + + // Check for cert-manager certificate name annotation + certName := secret.Annotations[certManagerCertNameAnnotation] + isManagedByCertManager := certName != "" + + if expectManaged && !isManagedByCertManager { + return fmt.Errorf("secret %s is not managed by cert-manager (missing %s annotation)", groveWebhookSecretName, certManagerCertNameAnnotation) + } + if !expectManaged && isManagedByCertManager { + return fmt.Errorf("secret %s is unexpectedly managed by cert-manager (has %s=%s)", groveWebhookSecretName, certManagerCertNameAnnotation, certName) + } + + logger.Debugf("Secret %s: managed by cert-manager=%v (expected=%v)", groveWebhookSecretName, isManagedByCertManager, expectManaged) + return nil +} + +// verifyWebhookServingCertificate verifies that the webhook is actually serving the certificate from the Secret. +// This connects to the webhook endpoint via TLS and compares the served certificate with the one in the Secret. +// It includes retry logic to handle timing issues with: +// - Kubernetes secret volume propagation delays +// - The certwatcher's 10-second polling interval for detecting certificate changes +func verifyWebhookServingCertificate(t *testing.T, ctx context.Context, clientset *kubernetes.Clientset, restConfig *rest.Config) { + t.Helper() + + logger.Debug("Verifying webhook is serving the correct certificate (with retries for cert reload timing)...") + + // Retry for up to 30 seconds to account for: + // - Kubernetes secret volume update propagation (can take up to the kubelet sync period) + // - certwatcher 10-second polling interval + var lastExpectedSerial, lastServedSerial string + err := utils.PollForCondition(ctx, 30*time.Second, 2*time.Second, func() (bool, error) { + // Get the certificate from the Secret + expectedCert, err := getCertificateFromSecret(ctx, clientset) + if err != nil { + logger.Debugf("Failed to get certificate from secret: %v", err) + return false, nil + } + lastExpectedSerial = expectedCert.SerialNumber.String() + + // Get the certificate the webhook is actually serving + servedCert, err := getServedCertificate(ctx, clientset, restConfig) + if err != nil { + logger.Debugf("Failed to get served certificate from webhook: %v", err) + return false, nil + } + lastServedSerial = servedCert.SerialNumber.String() + + // Compare the certificates + if certificatesMatch(expectedCert, servedCert) { + logger.Debugf("Certificate match! Serial: %s", lastExpectedSerial) + return true, nil + } + + logger.Debugf("Certificate mismatch (will retry): expected serial=%s, served serial=%s", + lastExpectedSerial, lastServedSerial) + return false, nil + }) + + if err != nil { + t.Fatalf("Certificate mismatch: webhook is not serving the expected certificate after retries.\n"+ + "Expected serial: %s\n"+ + "Served serial: %s\n"+ + "This may indicate the operator has not reloaded the certificate from the secret.", + lastExpectedSerial, lastServedSerial) + } + + logger.Debug("Verified: webhook is serving the correct certificate from the Secret") +} + +// getCertificateFromSecret retrieves and parses the TLS certificate from the webhook Secret. +func getCertificateFromSecret(ctx context.Context, clientset *kubernetes.Clientset) (*x509.Certificate, error) { + secret, err := clientset.CoreV1().Secrets(groveNamespace).Get(ctx, groveWebhookSecretName, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to get secret: %w", err) + } + + certPEM, ok := secret.Data["tls.crt"] + if !ok { + return nil, fmt.Errorf("secret does not contain tls.crt") + } + + block, _ := pem.Decode(certPEM) + if block == nil { + return nil, fmt.Errorf("failed to decode PEM block from tls.crt") + } + + cert, err := x509.ParseCertificate(block.Bytes) + if err != nil { + return nil, fmt.Errorf("failed to parse certificate: %w", err) + } + + return cert, nil +} + +// getServedCertificate connects to the webhook service and retrieves the certificate it's serving. +// It uses port-forwarding to connect to the operator pod. +func getServedCertificate(ctx context.Context, clientset *kubernetes.Clientset, restConfig *rest.Config) (*x509.Certificate, error) { + // Find the grove-operator pod + pods, err := clientset.CoreV1().Pods(groveNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/name=grove-operator", + }) + if err != nil { + return nil, fmt.Errorf("failed to list operator pods: %w", err) + } + if len(pods.Items) == 0 { + return nil, fmt.Errorf("no grove-operator pods found") + } + + podName := pods.Items[0].Name + logger.Debugf("Using operator pod: %s", podName) + + // Set up port forwarding + localPort, stopChan, err := setupPortForward(ctx, restConfig, groveNamespace, podName, 9443) + if err != nil { + return nil, fmt.Errorf("failed to set up port forwarding: %w", err) + } + defer close(stopChan) + + // Give port-forward a moment to establish + time.Sleep(500 * time.Millisecond) + + // Connect via TLS and get the certificate + cert, err := getTLSCertificate(fmt.Sprintf("localhost:%d", localPort)) + if err != nil { + return nil, fmt.Errorf("failed to get TLS certificate: %w", err) + } + + return cert, nil +} + +// setupPortForward creates a port-forward to the specified pod and returns the local port. +func setupPortForward(ctx context.Context, restConfig *rest.Config, namespace, podName string, remotePort int) (int, chan struct{}, error) { + // Find a free local port + listener, err := net.Listen("tcp", "localhost:0") + if err != nil { + return 0, nil, fmt.Errorf("failed to find free port: %w", err) + } + localPort := listener.Addr().(*net.TCPAddr).Port + listener.Close() + + // Create the port-forward URL + path := fmt.Sprintf("/api/v1/namespaces/%s/pods/%s/portforward", namespace, podName) + + transport, upgrader, err := spdy.RoundTripperFor(restConfig) + if err != nil { + return 0, nil, fmt.Errorf("failed to create round tripper: %w", err) + } + + // Parse the URL properly + parsedURL, err := parsePortForwardURL(restConfig.Host, path) + if err != nil { + return 0, nil, fmt.Errorf("failed to parse URL: %w", err) + } + dialer := spdy.NewDialer(upgrader, &http.Client{Transport: transport}, http.MethodPost, parsedURL) + + stopChan := make(chan struct{}, 1) + readyChan := make(chan struct{}) + + ports := []string{fmt.Sprintf("%d:%d", localPort, remotePort)} + + // Create a buffer to capture output (we don't need to display it) + out := &bytes.Buffer{} + errOut := &bytes.Buffer{} + + pf, err := portforward.New(dialer, ports, stopChan, readyChan, out, errOut) + if err != nil { + return 0, nil, fmt.Errorf("failed to create port forwarder: %w", err) + } + + // Start port forwarding in a goroutine + errChan := make(chan error, 1) + go func() { + if err := pf.ForwardPorts(); err != nil { + errChan <- err + } + }() + + // Wait for port-forward to be ready or error + select { + case <-readyChan: + logger.Debugf("Port forward established: localhost:%d -> %s:%d", localPort, podName, remotePort) + case err := <-errChan: + return 0, nil, fmt.Errorf("port forward failed: %w", err) + case <-time.After(10 * time.Second): + close(stopChan) + return 0, nil, fmt.Errorf("timeout waiting for port forward to be ready") + case <-ctx.Done(): + close(stopChan) + return 0, nil, ctx.Err() + } + + // Check for any errors in errOut + if errOut.Len() > 0 { + logger.Warnf("Port forward stderr: %s", errOut.String()) + } + + return localPort, stopChan, nil +} + +// parsePortForwardURL parses the host and path into a proper URL for port forwarding. +func parsePortForwardURL(host, path string) (*url.URL, error) { + u, err := url.Parse(host) + if err != nil { + return nil, err + } + u.Path = path + return u, nil +} + +// getTLSCertificate connects to the specified address via TLS and returns the server's certificate. +func getTLSCertificate(address string) (*x509.Certificate, error) { + // Connect with InsecureSkipVerify since we're just extracting the certificate + conn, err := tls.Dial("tcp", address, &tls.Config{ + InsecureSkipVerify: true, //nolint:gosec // We need to connect without verification to extract the cert + }) + if err != nil { + return nil, fmt.Errorf("failed to connect: %w", err) + } + defer conn.Close() + + // Get the peer certificates + certs := conn.ConnectionState().PeerCertificates + if len(certs) == 0 { + return nil, fmt.Errorf("no certificates received from server") + } + + // Return the leaf certificate (first one) + return certs[0], nil +} + +// certificatesMatch compares two certificates to determine if they are the same. +// We compare the raw DER bytes for an exact match. +func certificatesMatch(cert1, cert2 *x509.Certificate) bool { + return bytes.Equal(cert1.Raw, cert2.Raw) +} diff --git a/operator/e2e/utils/k8s_client.go b/operator/e2e/utils/k8s_client.go index e39d217cf..4bec4278b 100644 --- a/operator/e2e/utils/k8s_client.go +++ b/operator/e2e/utils/k8s_client.go @@ -64,7 +64,7 @@ func ApplyYAMLFile(ctx context.Context, yamlFilePath string, namespace string, r return nil, fmt.Errorf("failed to read YAML file %s: %w", yamlFilePath, err) } - return applyYAMLData(ctx, yamlData, namespace, restConfig, logger) + return ApplyYAMLData(ctx, yamlData, namespace, restConfig, logger) } // WaitForPods waits for pods to be ready in the specified namespaces @@ -129,8 +129,8 @@ func WaitForPods(ctx context.Context, restConfig *rest.Config, namespaces []stri }) } -// applyYAMLData is the common function that applies YAML data to Kubernetes -func applyYAMLData(ctx context.Context, yamlData []byte, namespace string, restConfig *rest.Config, logger *Logger) ([]AppliedResource, error) { +// ApplyYAMLData is the common function that applies YAML data to Kubernetes +func ApplyYAMLData(ctx context.Context, yamlData []byte, namespace string, restConfig *rest.Config, logger *Logger) ([]AppliedResource, error) { dynamicClient, restMapper, err := createKubernetesClients(restConfig) if err != nil { return nil, err diff --git a/operator/internal/controller/cert/cert.go b/operator/internal/controller/cert/cert.go index 2cd60b4d2..753ea616f 100644 --- a/operator/internal/controller/cert/cert.go +++ b/operator/internal/controller/cert/cert.go @@ -21,6 +21,7 @@ import ( "os" "strings" + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" "github.com/ai-dynamo/grove/operator/internal/constants" authorizationwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization" defaultingwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/defaulting" @@ -38,17 +39,31 @@ const ( certificateAuthorityOrganization = "Grove" ) -// ManageWebhookCerts registers the cert-controller with the manager which will be used to manage -// webhook certificates. -func ManageWebhookCerts(mgr ctrl.Manager, certDir string, authorizerEnabled bool, certsReadyCh chan struct{}) error { +// ManageWebhookCerts manages webhook certificates based on the CertProvisionMode configuration. +// When mode=auto: uses cert-controller for automatic certificate generation and management. +// When mode=manual: waits for externally provided certificates (e.g., from cert-manager, cluster admin). +func ManageWebhookCerts(mgr ctrl.Manager, certDir string, secretName string, authorizerEnabled bool, certProvisionMode configv1alpha1.CertProvisionMode, certsReadyCh chan struct{}) error { namespace, err := getOperatorNamespace() if err != nil { return err } + + logger := ctrl.Log.WithName("cert-management") + + if certProvisionMode == configv1alpha1.CertProvisionModeManual { + logger.Info("Using externally provided certificates (manual mode)", + "certDir", certDir, "secretName", secretName) + // Certificates are managed externally, signal ready immediately + close(certsReadyCh) + return nil + } + + logger.Info("Auto-provisioning certificates using cert-controller", + "secretName", secretName, "certDir", certDir, "mode", certProvisionMode) rotator := &cert.CertRotator{ SecretKey: types.NamespacedName{ Namespace: namespace, - Name: "grove-webhook-server-cert", + Name: secretName, }, CertDir: certDir, CAName: certificateAuthorityName,