Skip to content

Commit c2b8257

Browse files
committed
fix(lmeval): merge multiple CA sources into single bundle for HTTPS pods
The previous CA injection mounted a single key from odh-trusted-ca-bundle, which contains only public/system CAs. Cluster-internal services that use OpenShift service-serving certificates (*.svc.cluster.local) are signed by a different CA, which is published in the openshift-service-ca.crt ConfigMap, so the pod still failed with SSLCertVerificationError. Additionally, REQUESTS_CA_BUNDLE replaces Python's default trust store rather than appending to it, so mounting only one CA source loses trust in all others.

Fix: replace findCABundle with findAndMergeCABundle, which collects PEM data from both odh-trusted-ca-bundle and openshift-service-ca.crt (best-effort, each skipped if absent), concatenates them, and creates a per-job merged ConfigMap (<jobName>-ca-bundle) with an owner reference for automatic GC. The pod mounts this merged ConfigMap so that REQUESTS_CA_BUNDLE contains all relevant CAs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
1 parent be2e76c commit c2b8257

5 files changed

Lines changed: 346 additions & 85 deletions

File tree

config/components/lmes/rbac/manager-rbac.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ rules:
5858
- get
5959
- watch
6060
- list
61+
- create
62+
- update
6163
- apiGroups:
6264
- ""
6365
resources:

controllers/lmes/constants.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,20 @@ const (
4747
ServiceName = "LMES"
4848

4949
// DefaultCABundleConfigMapName is the standard RHOAI ConfigMap that holds the cluster CA bundle.
50-
// It is injected into managed namespaces by the RHOAI operator and trusted by cluster-internal
51-
// HTTPS services (e.g., KServe external routes using self-signed certs).
50+
// It is injected into managed namespaces by the RHOAI operator.
5251
DefaultCABundleConfigMapName = "odh-trusted-ca-bundle"
53-
// CABundleVolumeName is the volume name used when auto-mounting the cluster CA bundle.
52+
// ServiceCAConfigMapName is the well-known ConfigMap that contains the OpenShift
53+
// service-serving CA, used by cluster-internal HTTPS services (*.svc.cluster.local).
54+
ServiceCAConfigMapName = "openshift-service-ca.crt"
55+
// ServiceCAKey is the standard key within the service-serving CA ConfigMap.
56+
ServiceCAKey = "service-ca.crt"
57+
// MergedCABundleKey is the key used in the per-job merged CA ConfigMap.
58+
MergedCABundleKey = "merged-ca-bundle.crt"
59+
// MergedCAConfigMapSuffix is appended to the job name to form the merged CA ConfigMap name.
60+
MergedCAConfigMapSuffix = "-ca-bundle"
61+
// CABundleVolumeName is the volume name used when auto-mounting the merged CA bundle.
5462
CABundleVolumeName = "odh-ca-bundle"
55-
// CABundleMountPath is the file path at which the CA bundle is mounted inside the lm-eval pod.
63+
// CABundleMountPath is the file path at which the merged CA bundle is mounted inside the lm-eval pod.
5664
// REQUESTS_CA_BUNDLE is set to this path so Python's requests library picks it up automatically.
5765
CABundleMountPath = "/etc/ssl/certs/odh-ca-bundle.crt"
5866

controllers/lmes/lmevaljob_controller.go

Lines changed: 134 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package lmes
1919
import (
2020
"bytes"
2121
"context"
22+
"errors"
2223
"fmt"
2324
"maps"
2425
"slices"
@@ -31,6 +32,7 @@ import (
3132
"github.com/trustyai-explainability/trustyai-service-operator/controllers/utils"
3233

3334
corev1 "k8s.io/api/core/v1"
35+
apierrors "k8s.io/apimachinery/pkg/api/errors"
3436
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3537
"k8s.io/apimachinery/pkg/runtime"
3638
"k8s.io/apimachinery/pkg/types"
@@ -96,6 +98,8 @@ var (
9698
},
9799
},
98100
}
101+
102+
errNoCAData = errors.New("no CA bundle data found")
99103
)
100104

101105
// maintain a list of key-time pair data.
@@ -170,7 +174,7 @@ func (q *syncedMap4Reconciler) remove(key string) {
170174
// +kubebuilder:rbac:groups=trustyai.opendatahub.io,resources=lmevaljobs/finalizers,verbs=update
171175
// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;delete
172176
// +kubebuilder:rbac:groups="",resources=pods/exec,verbs=get;list;watch;create;delete
173-
// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;watch;list
177+
// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;watch;list;create;update
174178
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;watch;list
175179
// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=list;get;watch
176180
// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=list;get;watch;create;update;patch;delete
@@ -189,9 +193,9 @@ func (r *LMEvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
189193
return r.handleDeletion(ctx, job, log)
190194
}
191195

192-
// Bug 3: When a completed job's spec is edited its metadata.Generation is
193-
// incremented by the API server. Detect that and reset the status so the job
194-
// re-runs with the updated configuration.
196+
// When a completed job's spec is edited, metadata.Generation is incremented
197+
// by the API server. Detect that and reset the status so the job re-runs
198+
// with the updated configuration.
195199
if job.Status.State == lmesv1alpha1.CompleteJobState {
196200
if lastGen := getLastScheduledGeneration(job); lastGen > 0 && job.Generation > lastGen {
197201
// Delete the completed pod first. The replacement pod reuses the same
@@ -563,38 +567,12 @@ func (r *LMEvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger,
563567
return ctrl.Result{}, err
564568
}
565569

566-
// Bug 1: Auto-inject the cluster CA bundle when the model endpoint uses HTTPS
567-
// and verify_certificate is not already set. This handles the common RHOAI case
568-
// where KServe external routes use self-signed certs trusted by the cluster but
569-
// not by Python's default trust store inside the pod.
570-
var caBundle *corev1.ConfigMap
571-
var caBundleKey string
572-
if hasHTTPSBaseURL(job) && !hasExplicitVerifyCertificate(job) {
573-
if cm, key, err := r.findCABundle(ctx, job.Namespace); err == nil {
574-
caBundle = cm
575-
caBundleKey = key
576-
log.Info("auto-injecting cluster CA bundle for HTTPS endpoint",
577-
"configmap", DefaultCABundleConfigMapName, "key", key)
578-
} else {
579-
log.Info("HTTPS base_url detected but cluster CA bundle not found, proceeding without auto-injection",
580-
"error", err.Error())
581-
}
582-
}
583-
584-
// Bug 3: Record the spec generation being scheduled. Reconcile reads this back
585-
// after completion to detect a spec change and reset the job for re-run.
586-
currentGenStr := strconv.FormatInt(job.Generation, 10)
587-
annotations := job.GetAnnotations()
588-
if annotations == nil {
589-
annotations = make(map[string]string)
570+
caBundle, caBundleKey, err := r.resolveCABundle(ctx, log, job)
571+
if err != nil {
572+
return ctrl.Result{}, err
590573
}
591-
if annotations[LastScheduledGenerationAnnotation] != currentGenStr {
592-
annotations[LastScheduledGenerationAnnotation] = currentGenStr
593-
job.SetAnnotations(annotations)
594-
if err := r.Update(ctx, job); err != nil {
595-
log.Error(err, "failed to update generation annotation")
596-
return ctrl.Result{}, err
597-
}
574+
if err := r.recordScheduledGeneration(ctx, job); err != nil {
575+
return ctrl.Result{}, err
598576
}
599577

600578
// construct a new pod and create a pod for the job
@@ -825,28 +803,12 @@ func (r *LMEvalJobReconciler) handleResume(ctx context.Context, log logr.Logger,
825803
return r.pullingJobs.addOrUpdate(string(job.GetUID()), Options.PodCheckingInterval), err
826804
}
827805

828-
// Bug 1: Apply the same CA bundle auto-injection on resume as on initial scheduling.
829-
var caBundle *corev1.ConfigMap
830-
var caBundleKey string
831-
if hasHTTPSBaseURL(job) && !hasExplicitVerifyCertificate(job) {
832-
if cm, key, err := r.findCABundle(ctx, job.Namespace); err == nil {
833-
caBundle = cm
834-
caBundleKey = key
835-
}
836-
}
837-
838-
currentGenStr := strconv.FormatInt(job.Generation, 10)
839-
annotations := job.GetAnnotations()
840-
if annotations == nil {
841-
annotations = make(map[string]string)
806+
caBundle, caBundleKey, err := r.resolveCABundle(ctx, log, job)
807+
if err != nil {
808+
return ctrl.Result{}, err
842809
}
843-
if annotations[LastScheduledGenerationAnnotation] != currentGenStr {
844-
annotations[LastScheduledGenerationAnnotation] = currentGenStr
845-
job.SetAnnotations(annotations)
846-
if err := r.Update(ctx, job); err != nil {
847-
log.Error(err, "failed to update generation annotation on resume")
848-
return ctrl.Result{}, err
849-
}
810+
if err := r.recordScheduledGeneration(ctx, job); err != nil {
811+
return ctrl.Result{}, err
850812
}
851813

852814
pod := CreatePod(Options, job, permConfig, caBundle, caBundleKey, log)
@@ -1349,9 +1311,8 @@ func CreatePod(svcOpts *serviceOptions, job *lmesv1alpha1.LMEvalJob, permConfig
13491311
volumes = append(volumes, job.Spec.Pod.GetVolumes()...)
13501312
volumeMounts = append(volumeMounts, job.Spec.Pod.GetContainer().GetVolumMounts()...)
13511313

1352-
// Bug 1: Mount the cluster CA bundle so Python's requests library can verify
1353-
// the self-signed certificate used by OpenShift external routes. REQUESTS_CA_BUNDLE
1354-
// is the standard env var that the requests library picks up automatically.
1314+
// Mount the merged CA bundle so REQUESTS_CA_BUNDLE lets Python's requests
1315+
// library verify certificates signed by cluster or service-serving CAs.
13551316
if caBundle != nil && caBundleKey != "" {
13561317
volumeMounts = append(volumeMounts, corev1.VolumeMount{
13571318
Name: CABundleVolumeName,
@@ -1792,6 +1753,43 @@ func removeProtectedEnvVars(envVars []corev1.EnvVar) []corev1.EnvVar {
17921753
return allowedEnvVars
17931754
}
17941755

1756+
// resolveCABundle looks up cluster CA sources and returns a merged ConfigMap
1757+
// when the job targets an HTTPS endpoint without an explicit verify_certificate.
1758+
func (r *LMEvalJobReconciler) resolveCABundle(ctx context.Context, log logr.Logger, job *lmesv1alpha1.LMEvalJob) (*corev1.ConfigMap, string, error) {
1759+
if !hasHTTPSBaseURL(job) || hasExplicitVerifyCertificate(job) {
1760+
return nil, "", nil
1761+
}
1762+
cm, key, err := r.findAndMergeCABundle(ctx, job)
1763+
if errors.Is(err, errNoCAData) {
1764+
log.Info("HTTPS base_url detected but no CA bundle data found, proceeding without auto-injection")
1765+
return nil, "", nil
1766+
}
1767+
if err != nil {
1768+
return nil, "", fmt.Errorf("failed to prepare CA bundle: %w", err)
1769+
}
1770+
log.Info("auto-injecting merged CA bundle for HTTPS endpoint",
1771+
"configmap", cm.Name, "key", key)
1772+
return cm, key, nil
1773+
}
1774+
1775+
// recordScheduledGeneration persists the current spec generation so Reconcile
1776+
// can detect a spec change on a completed job and reset it for re-run.
1777+
func (r *LMEvalJobReconciler) recordScheduledGeneration(ctx context.Context, job *lmesv1alpha1.LMEvalJob) error {
1778+
currentGenStr := strconv.FormatInt(job.Generation, 10)
1779+
annotations := job.GetAnnotations()
1780+
if annotations == nil {
1781+
annotations = make(map[string]string)
1782+
}
1783+
if annotations[LastScheduledGenerationAnnotation] != currentGenStr {
1784+
annotations[LastScheduledGenerationAnnotation] = currentGenStr
1785+
job.SetAnnotations(annotations)
1786+
if err := r.Update(ctx, job); err != nil {
1787+
return fmt.Errorf("failed to update generation annotation: %w", err)
1788+
}
1789+
}
1790+
return nil
1791+
}
1792+
17951793
// hasHTTPSBaseURL returns true when any modelArg named "base_url" uses HTTPS.
17961794
func hasHTTPSBaseURL(job *lmesv1alpha1.LMEvalJob) bool {
17971795
for _, arg := range job.Spec.ModelArgs {
@@ -1830,18 +1828,84 @@ func getLastScheduledGeneration(job *lmesv1alpha1.LMEvalJob) int64 {
18301828
return gen
18311829
}
18321830

1833-
// findCABundle looks for the standard RHOAI cluster CA bundle ConfigMap in the
1834-
// given namespace. It tries well-known key names and returns the ConfigMap and
1835-
// the matching key on success.
1836-
func (r *LMEvalJobReconciler) findCABundle(ctx context.Context, namespace string) (*corev1.ConfigMap, string, error) {
1837-
cm := &corev1.ConfigMap{}
1838-
if err := r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: DefaultCABundleConfigMapName}, cm); err != nil {
1839-
return nil, "", err
1831+
// findAndMergeCABundle collects CA certificates from well-known cluster sources
1832+
// and creates a per-job ConfigMap containing the merged bundle. This ensures
1833+
// that REQUESTS_CA_BUNDLE (which replaces the default trust store) contains
1834+
// both public CAs and the OpenShift service-serving CA.
1835+
//
1836+
// Sources checked (best-effort, each is skipped if not found):
1837+
// - odh-trusted-ca-bundle: ca-bundle.crt, odh-ca-bundle.crt (public/system CAs)
1838+
// - openshift-service-ca.crt: service-ca.crt (service-serving CA)
1839+
func (r *LMEvalJobReconciler) findAndMergeCABundle(ctx context.Context, job *lmesv1alpha1.LMEvalJob) (*corev1.ConfigMap, string, error) {
1840+
log := log.FromContext(ctx)
1841+
var pemBlocks []string
1842+
1843+
// Source 1: odh-trusted-ca-bundle (public/system CAs)
1844+
odhCM := &corev1.ConfigMap{}
1845+
if err := r.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: DefaultCABundleConfigMapName}, odhCM); err == nil {
1846+
for _, key := range []string{"ca-bundle.crt", "odh-ca-bundle.crt"} {
1847+
if data, ok := odhCM.Data[key]; ok && strings.TrimSpace(data) != "" {
1848+
pemBlocks = append(pemBlocks, data)
1849+
log.V(1).Info("collected CA data", "configmap", DefaultCABundleConfigMapName, "key", key)
1850+
}
1851+
}
1852+
} else if !apierrors.IsNotFound(err) {
1853+
log.Error(err, "error reading CA bundle ConfigMap", "name", DefaultCABundleConfigMapName)
18401854
}
1841-
for _, key := range []string{"ca-bundle.crt", "odh-ca-bundle.crt", "service-ca.crt"} {
1842-
if _, ok := cm.Data[key]; ok {
1843-
return cm, key, nil
1855+
1856+
// Source 2: openshift-service-ca.crt (service-serving CA)
1857+
svcCM := &corev1.ConfigMap{}
1858+
if err := r.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: ServiceCAConfigMapName}, svcCM); err == nil {
1859+
if data, ok := svcCM.Data[ServiceCAKey]; ok && strings.TrimSpace(data) != "" {
1860+
pemBlocks = append(pemBlocks, data)
1861+
log.V(1).Info("collected CA data", "configmap", ServiceCAConfigMapName, "key", ServiceCAKey)
1862+
}
1863+
} else if !apierrors.IsNotFound(err) {
1864+
log.Error(err, "error reading service CA ConfigMap", "name", ServiceCAConfigMapName)
1865+
}
1866+
1867+
if len(pemBlocks) == 0 {
1868+
return nil, "", errNoCAData
1869+
}
1870+
1871+
merged := strings.Join(pemBlocks, "\n")
1872+
mergedCMName := job.Name + MergedCAConfigMapSuffix
1873+
1874+
mergedCM := &corev1.ConfigMap{}
1875+
err := r.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: mergedCMName}, mergedCM)
1876+
if apierrors.IsNotFound(err) {
1877+
ownerRefController := true
1878+
mergedCM = &corev1.ConfigMap{
1879+
ObjectMeta: v1.ObjectMeta{
1880+
Name: mergedCMName,
1881+
Namespace: job.Namespace,
1882+
OwnerReferences: []v1.OwnerReference{
1883+
{
1884+
APIVersion: job.APIVersion,
1885+
Kind: job.Kind,
1886+
Name: job.Name,
1887+
Controller: &ownerRefController,
1888+
UID: job.UID,
1889+
},
1890+
},
1891+
},
1892+
Data: map[string]string{
1893+
MergedCABundleKey: merged,
1894+
},
1895+
}
1896+
if err := r.Create(ctx, mergedCM); err != nil {
1897+
return nil, "", fmt.Errorf("failed to create merged CA ConfigMap: %w", err)
1898+
}
1899+
} else if err != nil {
1900+
return nil, "", fmt.Errorf("failed to read merged CA ConfigMap: %w", err)
1901+
} else {
1902+
mergedCM.Data = map[string]string{
1903+
MergedCABundleKey: merged,
1904+
}
1905+
if err := r.Update(ctx, mergedCM); err != nil {
1906+
return nil, "", fmt.Errorf("failed to update merged CA ConfigMap: %w", err)
18441907
}
18451908
}
1846-
return nil, "", fmt.Errorf("ConfigMap %s has no recognised CA bundle key", DefaultCABundleConfigMapName)
1909+
1910+
return mergedCM, MergedCABundleKey, nil
18471911
}

0 commit comments

Comments
 (0)