Skip to content

Commit 554d56a

Browse files
committed
fix(lmeval): merge multiple CA sources into single bundle for HTTPS pods
The previous CA injection mounted a single key from odh-trusted-ca-bundle, which contains only public/system CAs. Cluster-internal services using OpenShift service-serving certificates (*.svc.cluster.local) are signed by a different CA, published in the openshift-service-ca.crt ConfigMap, so the pod still failed with SSLCertVerificationError. Additionally, REQUESTS_CA_BUNDLE replaces Python's default trust store rather than appending to it, so mounting only one CA source loses trust in all the others.

Fix: replace findCABundle with findAndMergeCABundle, which collects PEM data from both odh-trusted-ca-bundle and openshift-service-ca.crt (best-effort; each source is skipped if absent), concatenates them, and creates a per-job merged ConfigMap (<jobName>-ca-bundle) with an owner reference for automatic GC. The pod mounts this merged ConfigMap so that REQUESTS_CA_BUNDLE contains all relevant CAs.

Co-Authored-By: Claude Opus 4.6 <[email protected]>
rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
1 parent be2e76c commit 554d56a

3 files changed

Lines changed: 164 additions & 31 deletions

File tree

controllers/lmes/constants.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,20 @@ const (
4747
ServiceName = "LMES"
4848

4949
// DefaultCABundleConfigMapName is the standard RHOAI ConfigMap that holds the cluster CA bundle.
50-
// It is injected into managed namespaces by the RHOAI operator and trusted by cluster-internal
51-
// HTTPS services (e.g., KServe external routes using self-signed certs).
50+
// It is injected into managed namespaces by the RHOAI operator.
5251
DefaultCABundleConfigMapName = "odh-trusted-ca-bundle"
53-
// CABundleVolumeName is the volume name used when auto-mounting the cluster CA bundle.
52+
// ServiceCAConfigMapName is the well-known ConfigMap that contains the OpenShift
53+
// service-serving CA, used by cluster-internal HTTPS services (*.svc.cluster.local).
54+
ServiceCAConfigMapName = "openshift-service-ca.crt"
55+
// ServiceCAKey is the standard key within the service-serving CA ConfigMap.
56+
ServiceCAKey = "service-ca.crt"
57+
// MergedCABundleKey is the key used in the per-job merged CA ConfigMap.
58+
MergedCABundleKey = "merged-ca-bundle.crt"
59+
// MergedCAConfigMapSuffix is appended to the job name to form the merged CA ConfigMap name.
60+
MergedCAConfigMapSuffix = "-ca-bundle"
61+
// CABundleVolumeName is the volume name used when auto-mounting the merged CA bundle.
5462
CABundleVolumeName = "odh-ca-bundle"
55-
// CABundleMountPath is the file path at which the CA bundle is mounted inside the lm-eval pod.
63+
// CABundleMountPath is the file path at which the merged CA bundle is mounted inside the lm-eval pod.
5664
// REQUESTS_CA_BUNDLE is set to this path so Python's requests library picks it up automatically.
5765
CABundleMountPath = "/etc/ssl/certs/odh-ca-bundle.crt"
5866

controllers/lmes/lmevaljob_controller.go

Lines changed: 83 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
"github.com/trustyai-explainability/trustyai-service-operator/controllers/utils"
3232

3333
corev1 "k8s.io/api/core/v1"
34+
apierrors "k8s.io/apimachinery/pkg/api/errors"
3435
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3536
"k8s.io/apimachinery/pkg/runtime"
3637
"k8s.io/apimachinery/pkg/types"
@@ -563,20 +564,16 @@ func (r *LMEvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger,
563564
return ctrl.Result{}, err
564565
}
565566

566-
// Bug 1: Auto-inject the cluster CA bundle when the model endpoint uses HTTPS
567-
// and verify_certificate is not already set. This handles the common RHOAI case
568-
// where KServe external routes use self-signed certs trusted by the cluster but
569-
// not by Python's default trust store inside the pod.
570567
var caBundle *corev1.ConfigMap
571568
var caBundleKey string
572569
if hasHTTPSBaseURL(job) && !hasExplicitVerifyCertificate(job) {
573-
if cm, key, err := r.findCABundle(ctx, job.Namespace); err == nil {
570+
if cm, key, err := r.findAndMergeCABundle(ctx, job); err == nil {
574571
caBundle = cm
575572
caBundleKey = key
576-
log.Info("auto-injecting cluster CA bundle for HTTPS endpoint",
577-
"configmap", DefaultCABundleConfigMapName, "key", key)
573+
log.Info("auto-injecting merged CA bundle for HTTPS endpoint",
574+
"configmap", cm.Name, "key", key)
578575
} else {
579-
log.Info("HTTPS base_url detected but cluster CA bundle not found, proceeding without auto-injection",
576+
log.Info("HTTPS base_url detected but no CA bundle data found, proceeding without auto-injection",
580577
"error", err.Error())
581578
}
582579
}
@@ -825,11 +822,10 @@ func (r *LMEvalJobReconciler) handleResume(ctx context.Context, log logr.Logger,
825822
return r.pullingJobs.addOrUpdate(string(job.GetUID()), Options.PodCheckingInterval), err
826823
}
827824

828-
// Bug 1: Apply the same CA bundle auto-injection on resume as on initial scheduling.
829825
var caBundle *corev1.ConfigMap
830826
var caBundleKey string
831827
if hasHTTPSBaseURL(job) && !hasExplicitVerifyCertificate(job) {
832-
if cm, key, err := r.findCABundle(ctx, job.Namespace); err == nil {
828+
if cm, key, err := r.findAndMergeCABundle(ctx, job); err == nil {
833829
caBundle = cm
834830
caBundleKey = key
835831
}
@@ -1830,18 +1826,84 @@ func getLastScheduledGeneration(job *lmesv1alpha1.LMEvalJob) int64 {
18301826
return gen
18311827
}
18321828

1833-
// findCABundle looks for the standard RHOAI cluster CA bundle ConfigMap in the
1834-
// given namespace. It tries well-known key names and returns the ConfigMap and
1835-
// the matching key on success.
1836-
func (r *LMEvalJobReconciler) findCABundle(ctx context.Context, namespace string) (*corev1.ConfigMap, string, error) {
1837-
cm := &corev1.ConfigMap{}
1838-
if err := r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: DefaultCABundleConfigMapName}, cm); err != nil {
1839-
return nil, "", err
1829+
// findAndMergeCABundle collects CA certificates from well-known cluster sources
1830+
// and creates a per-job ConfigMap containing the merged bundle. This ensures
1831+
// that REQUESTS_CA_BUNDLE (which replaces the default trust store) contains
1832+
// both public CAs and the OpenShift service-serving CA.
1833+
//
1834+
// Sources checked (best-effort, each is skipped if not found):
1835+
// - odh-trusted-ca-bundle: ca-bundle.crt, odh-ca-bundle.crt (public/system CAs)
1836+
// - openshift-service-ca.crt: service-ca.crt (service-serving CA)
1837+
func (r *LMEvalJobReconciler) findAndMergeCABundle(ctx context.Context, job *lmesv1alpha1.LMEvalJob) (*corev1.ConfigMap, string, error) {
1838+
log := log.FromContext(ctx)
1839+
var pemBlocks []string
1840+
1841+
// Source 1: odh-trusted-ca-bundle (public/system CAs)
1842+
odhCM := &corev1.ConfigMap{}
1843+
if err := r.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: DefaultCABundleConfigMapName}, odhCM); err == nil {
1844+
for _, key := range []string{"ca-bundle.crt", "odh-ca-bundle.crt"} {
1845+
if data, ok := odhCM.Data[key]; ok && strings.TrimSpace(data) != "" {
1846+
pemBlocks = append(pemBlocks, data)
1847+
log.V(1).Info("collected CA data", "configmap", DefaultCABundleConfigMapName, "key", key)
1848+
}
1849+
}
1850+
} else if !apierrors.IsNotFound(err) {
1851+
log.Error(err, "error reading CA bundle ConfigMap", "name", DefaultCABundleConfigMapName)
1852+
}
1853+
1854+
// Source 2: openshift-service-ca.crt (service-serving CA)
1855+
svcCM := &corev1.ConfigMap{}
1856+
if err := r.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: ServiceCAConfigMapName}, svcCM); err == nil {
1857+
if data, ok := svcCM.Data[ServiceCAKey]; ok && strings.TrimSpace(data) != "" {
1858+
pemBlocks = append(pemBlocks, data)
1859+
log.V(1).Info("collected CA data", "configmap", ServiceCAConfigMapName, "key", ServiceCAKey)
1860+
}
1861+
} else if !apierrors.IsNotFound(err) {
1862+
log.Error(err, "error reading service CA ConfigMap", "name", ServiceCAConfigMapName)
1863+
}
1864+
1865+
if len(pemBlocks) == 0 {
1866+
return nil, "", fmt.Errorf("no CA bundle data found in namespace %s", job.Namespace)
18401867
}
1841-
for _, key := range []string{"ca-bundle.crt", "odh-ca-bundle.crt", "service-ca.crt"} {
1842-
if _, ok := cm.Data[key]; ok {
1843-
return cm, key, nil
1868+
1869+
merged := strings.Join(pemBlocks, "\n")
1870+
mergedCMName := job.Name + MergedCAConfigMapSuffix
1871+
1872+
mergedCM := &corev1.ConfigMap{}
1873+
err := r.Get(ctx, types.NamespacedName{Namespace: job.Namespace, Name: mergedCMName}, mergedCM)
1874+
if apierrors.IsNotFound(err) {
1875+
ownerRefController := true
1876+
mergedCM = &corev1.ConfigMap{
1877+
ObjectMeta: v1.ObjectMeta{
1878+
Name: mergedCMName,
1879+
Namespace: job.Namespace,
1880+
OwnerReferences: []v1.OwnerReference{
1881+
{
1882+
APIVersion: job.APIVersion,
1883+
Kind: job.Kind,
1884+
Name: job.Name,
1885+
Controller: &ownerRefController,
1886+
UID: job.UID,
1887+
},
1888+
},
1889+
},
1890+
Data: map[string]string{
1891+
MergedCABundleKey: merged,
1892+
},
1893+
}
1894+
if err := r.Create(ctx, mergedCM); err != nil {
1895+
return nil, "", fmt.Errorf("failed to create merged CA ConfigMap: %w", err)
1896+
}
1897+
} else if err != nil {
1898+
return nil, "", fmt.Errorf("failed to read merged CA ConfigMap: %w", err)
1899+
} else {
1900+
mergedCM.Data = map[string]string{
1901+
MergedCABundleKey: merged,
1902+
}
1903+
if err := r.Update(ctx, mergedCM); err != nil {
1904+
return nil, "", fmt.Errorf("failed to update merged CA ConfigMap: %w", err)
18441905
}
18451906
}
1846-
return nil, "", fmt.Errorf("ConfigMap %s has no recognised CA bundle key", DefaultCABundleConfigMapName)
1907+
1908+
return mergedCM, MergedCABundleKey, nil
18471909
}

controllers/lmes/lmevaljob_controller_suite_test.go

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -216,14 +216,14 @@ var _ = Describe("LMEvalJob CA bundle injection", func() {
216216
ctx := context.Background()
217217
trueB := true
218218

219-
It("injects CA bundle volume, mount, and env var when base_url uses HTTPS", func() {
219+
It("injects merged CA bundle when base_url uses HTTPS", func() {
220220
caConfigMap := &corev1.ConfigMap{
221221
ObjectMeta: metav1.ObjectMeta{
222222
Name: lmes.DefaultCABundleConfigMapName,
223223
Namespace: testNamespace,
224224
},
225225
Data: map[string]string{
226-
"ca-bundle.crt": "-----BEGIN CERTIFICATE-----\ntest\n-----END CERTIFICATE-----",
226+
"ca-bundle.crt": "-----BEGIN CERTIFICATE-----\npublic-ca\n-----END CERTIFICATE-----",
227227
},
228228
}
229229
Expect(k8sClient.Create(ctx, caConfigMap)).Should(Succeed())
@@ -252,32 +252,44 @@ var _ = Describe("LMEvalJob CA bundle injection", func() {
252252
}
253253
Expect(k8sClient.Create(ctx, job)).Should(Succeed())
254254

255+
mergedCMName := "test-ca-bundle" + lmes.MergedCAConfigMapSuffix
256+
257+
// Verify the merged ConfigMap was created
258+
mergedCM := &corev1.ConfigMap{}
259+
WaitFor(func() error {
260+
return k8sClient.Get(ctx, types.NamespacedName{
261+
Name: mergedCMName, Namespace: testNamespace,
262+
}, mergedCM)
263+
}, "merged CA ConfigMap was not created")
264+
Expect(mergedCM.Data).To(HaveKey(lmes.MergedCABundleKey))
265+
Expect(mergedCM.Data[lmes.MergedCABundleKey]).To(ContainSubstring("public-ca"))
266+
255267
pod := &corev1.Pod{}
256268
WaitFor(func() error {
257269
return k8sClient.Get(ctx, types.NamespacedName{
258270
Name: "test-ca-bundle", Namespace: testNamespace,
259271
}, pod)
260272
}, "pod was not created for HTTPS job")
261273

262-
// Verify CA bundle volume
274+
// Verify CA bundle volume points to merged ConfigMap
263275
foundVolume := false
264276
for _, v := range pod.Spec.Volumes {
265277
if v.Name == lmes.CABundleVolumeName {
266278
foundVolume = true
267279
Expect(v.VolumeSource.ConfigMap).NotTo(BeNil())
268-
Expect(v.VolumeSource.ConfigMap.Name).To(Equal(lmes.DefaultCABundleConfigMapName))
280+
Expect(v.VolumeSource.ConfigMap.Name).To(Equal(mergedCMName))
269281
}
270282
}
271283
Expect(foundVolume).To(BeTrue(), "CA bundle volume not found on pod")
272284

273-
// Verify CA bundle volume mount on main container
285+
// Verify volume mount uses merged key
274286
mainContainer := pod.Spec.Containers[0]
275287
foundMount := false
276288
for _, m := range mainContainer.VolumeMounts {
277289
if m.Name == lmes.CABundleVolumeName {
278290
foundMount = true
279291
Expect(m.MountPath).To(Equal(lmes.CABundleMountPath))
280-
Expect(m.SubPath).To(Equal("ca-bundle.crt"))
292+
Expect(m.SubPath).To(Equal(lmes.MergedCABundleKey))
281293
Expect(m.ReadOnly).To(BeTrue())
282294
}
283295
}
@@ -294,6 +306,57 @@ var _ = Describe("LMEvalJob CA bundle injection", func() {
294306
Expect(foundEnv).To(BeTrue(), "REQUESTS_CA_BUNDLE env var not found")
295307
})
296308

309+
It("merges both odh-trusted-ca-bundle and openshift-service-ca.crt when both exist", func() {
310+
svcCAConfigMap := &corev1.ConfigMap{
311+
ObjectMeta: metav1.ObjectMeta{
312+
Name: lmes.ServiceCAConfigMapName,
313+
Namespace: testNamespace,
314+
},
315+
Data: map[string]string{
316+
lmes.ServiceCAKey: "-----BEGIN CERTIFICATE-----\nservice-ca\n-----END CERTIFICATE-----",
317+
},
318+
}
319+
Expect(k8sClient.Create(ctx, svcCAConfigMap)).Should(Succeed())
320+
321+
job := &lmesv1alpha1.LMEvalJob{
322+
ObjectMeta: metav1.ObjectMeta{
323+
Name: "test-ca-merged",
324+
Namespace: testNamespace,
325+
},
326+
TypeMeta: metav1.TypeMeta{
327+
Kind: lmesv1alpha1.KindName,
328+
APIVersion: lmesv1alpha1.Version,
329+
},
330+
Spec: lmesv1alpha1.LMEvalJobSpec{
331+
AllowOnline: &trueB,
332+
AllowCodeExecution: &trueB,
333+
Model: "hf",
334+
ModelArgs: []lmesv1alpha1.Arg{
335+
{Name: "pretrained", Value: "google/flan-t5-base"},
336+
{Name: "base_url", Value: "https://model.svc.cluster.local:8443"},
337+
},
338+
TaskList: lmesv1alpha1.TaskList{
339+
TaskNames: []string{"task1"},
340+
},
341+
},
342+
}
343+
Expect(k8sClient.Create(ctx, job)).Should(Succeed())
344+
345+
mergedCMName := "test-ca-merged" + lmes.MergedCAConfigMapSuffix
346+
347+
mergedCM := &corev1.ConfigMap{}
348+
WaitFor(func() error {
349+
return k8sClient.Get(ctx, types.NamespacedName{
350+
Name: mergedCMName, Namespace: testNamespace,
351+
}, mergedCM)
352+
}, "merged CA ConfigMap was not created")
353+
354+
Expect(mergedCM.Data[lmes.MergedCABundleKey]).To(ContainSubstring("public-ca"),
355+
"merged bundle should contain public CAs from odh-trusted-ca-bundle")
356+
Expect(mergedCM.Data[lmes.MergedCABundleKey]).To(ContainSubstring("service-ca"),
357+
"merged bundle should contain service-serving CA from openshift-service-ca.crt")
358+
})
359+
297360
It("does not inject CA bundle when base_url uses HTTP", func() {
298361
job := &lmesv1alpha1.LMEvalJob{
299362
ObjectMeta: metav1.ObjectMeta{

0 commit comments

Comments
 (0)