@@ -19,6 +19,7 @@ package lmes
1919import (
2020 "bytes"
2121 "context"
22+ "errors"
2223 "fmt"
2324 "maps"
2425 "slices"
@@ -31,6 +32,7 @@ import (
3132 "github.com/trustyai-explainability/trustyai-service-operator/controllers/utils"
3233
3334 corev1 "k8s.io/api/core/v1"
35+ apierrors "k8s.io/apimachinery/pkg/api/errors"
3436 v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3537 "k8s.io/apimachinery/pkg/runtime"
3638 "k8s.io/apimachinery/pkg/types"
9698 },
9799 },
98100 }
101+
102+ errNoCAData = errors .New ("no CA bundle data found" )
99103)
100104
101105// maintain a list of key-time pair data.
@@ -170,7 +174,7 @@ func (q *syncedMap4Reconciler) remove(key string) {
170174// +kubebuilder:rbac:groups=trustyai.opendatahub.io,resources=lmevaljobs/finalizers,verbs=update
171175// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;delete
172176// +kubebuilder:rbac:groups="",resources=pods/exec,verbs=get;list;watch;create;delete
173- // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;watch;list
177+ // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;watch;list;create;update
174178// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;watch;list
175179// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=list;get;watch
176180// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=list;get;watch;create;update;patch;delete
@@ -189,9 +193,9 @@ func (r *LMEvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
189193 return r .handleDeletion (ctx , job , log )
190194 }
191195
192- // Bug 3: When a completed job's spec is edited its metadata.Generation is
193- // incremented by the API server. Detect that and reset the status so the job
194- // re-runs with the updated configuration.
196+ // When a completed job's spec is edited, metadata.Generation is incremented
197+ // by the API server. Detect that and reset the status so the job re-runs
198+ // with the updated configuration.
195199 if job .Status .State == lmesv1alpha1 .CompleteJobState {
196200 if lastGen := getLastScheduledGeneration (job ); lastGen > 0 && job .Generation > lastGen {
197201 // Delete the completed pod first. The replacement pod reuses the same
@@ -563,38 +567,12 @@ func (r *LMEvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger,
563567 return ctrl.Result {}, err
564568 }
565569
566- // Bug 1: Auto-inject the cluster CA bundle when the model endpoint uses HTTPS
567- // and verify_certificate is not already set. This handles the common RHOAI case
568- // where KServe external routes use self-signed certs trusted by the cluster but
569- // not by Python's default trust store inside the pod.
570- var caBundle * corev1.ConfigMap
571- var caBundleKey string
572- if hasHTTPSBaseURL (job ) && ! hasExplicitVerifyCertificate (job ) {
573- if cm , key , err := r .findCABundle (ctx , job .Namespace ); err == nil {
574- caBundle = cm
575- caBundleKey = key
576- log .Info ("auto-injecting cluster CA bundle for HTTPS endpoint" ,
577- "configmap" , DefaultCABundleConfigMapName , "key" , key )
578- } else {
579- log .Info ("HTTPS base_url detected but cluster CA bundle not found, proceeding without auto-injection" ,
580- "error" , err .Error ())
581- }
582- }
583-
584- // Bug 3: Record the spec generation being scheduled. Reconcile reads this back
585- // after completion to detect a spec change and reset the job for re-run.
586- currentGenStr := strconv .FormatInt (job .Generation , 10 )
587- annotations := job .GetAnnotations ()
588- if annotations == nil {
589- annotations = make (map [string ]string )
570+ caBundle , caBundleKey , err := r .resolveCABundle (ctx , log , job )
571+ if err != nil {
572+ return ctrl.Result {}, err
590573 }
591- if annotations [LastScheduledGenerationAnnotation ] != currentGenStr {
592- annotations [LastScheduledGenerationAnnotation ] = currentGenStr
593- job .SetAnnotations (annotations )
594- if err := r .Update (ctx , job ); err != nil {
595- log .Error (err , "failed to update generation annotation" )
596- return ctrl.Result {}, err
597- }
574+ if err := r .recordScheduledGeneration (ctx , job ); err != nil {
575+ return ctrl.Result {}, err
598576 }
599577
600578 // construct a new pod and create a pod for the job
@@ -825,28 +803,12 @@ func (r *LMEvalJobReconciler) handleResume(ctx context.Context, log logr.Logger,
825803 return r .pullingJobs .addOrUpdate (string (job .GetUID ()), Options .PodCheckingInterval ), err
826804 }
827805
828- // Bug 1: Apply the same CA bundle auto-injection on resume as on initial scheduling.
829- var caBundle * corev1.ConfigMap
830- var caBundleKey string
831- if hasHTTPSBaseURL (job ) && ! hasExplicitVerifyCertificate (job ) {
832- if cm , key , err := r .findCABundle (ctx , job .Namespace ); err == nil {
833- caBundle = cm
834- caBundleKey = key
835- }
836- }
837-
838- currentGenStr := strconv .FormatInt (job .Generation , 10 )
839- annotations := job .GetAnnotations ()
840- if annotations == nil {
841- annotations = make (map [string ]string )
806+ caBundle , caBundleKey , err := r .resolveCABundle (ctx , log , job )
807+ if err != nil {
808+ return ctrl.Result {}, err
842809 }
843- if annotations [LastScheduledGenerationAnnotation ] != currentGenStr {
844- annotations [LastScheduledGenerationAnnotation ] = currentGenStr
845- job .SetAnnotations (annotations )
846- if err := r .Update (ctx , job ); err != nil {
847- log .Error (err , "failed to update generation annotation on resume" )
848- return ctrl.Result {}, err
849- }
810+ if err := r .recordScheduledGeneration (ctx , job ); err != nil {
811+ return ctrl.Result {}, err
850812 }
851813
852814 pod := CreatePod (Options , job , permConfig , caBundle , caBundleKey , log )
@@ -1349,9 +1311,8 @@ func CreatePod(svcOpts *serviceOptions, job *lmesv1alpha1.LMEvalJob, permConfig
13491311 volumes = append (volumes , job .Spec .Pod .GetVolumes ()... )
13501312 volumeMounts = append (volumeMounts , job .Spec .Pod .GetContainer ().GetVolumMounts ()... )
13511313
1352- // Bug 1: Mount the cluster CA bundle so Python's requests library can verify
1353- // the self-signed certificate used by OpenShift external routes. REQUESTS_CA_BUNDLE
1354- // is the standard env var that the requests library picks up automatically.
1314+ // Mount the merged CA bundle so REQUESTS_CA_BUNDLE lets Python's requests
1315+ // library verify certificates signed by cluster or service-serving CAs.
13551316 if caBundle != nil && caBundleKey != "" {
13561317 volumeMounts = append (volumeMounts , corev1.VolumeMount {
13571318 Name : CABundleVolumeName ,
@@ -1792,6 +1753,43 @@ func removeProtectedEnvVars(envVars []corev1.EnvVar) []corev1.EnvVar {
17921753 return allowedEnvVars
17931754}
17941755
1756+ // resolveCABundle looks up cluster CA sources and returns a merged ConfigMap
1757+ // when the job targets an HTTPS endpoint without an explicit verify_certificate.
1758+ func (r * LMEvalJobReconciler ) resolveCABundle (ctx context.Context , log logr.Logger , job * lmesv1alpha1.LMEvalJob ) (* corev1.ConfigMap , string , error ) {
1759+ if ! hasHTTPSBaseURL (job ) || hasExplicitVerifyCertificate (job ) {
1760+ return nil , "" , nil
1761+ }
1762+ cm , key , err := r .findAndMergeCABundle (ctx , job )
1763+ if errors .Is (err , errNoCAData ) {
1764+ log .Info ("HTTPS base_url detected but no CA bundle data found, proceeding without auto-injection" )
1765+ return nil , "" , nil
1766+ }
1767+ if err != nil {
1768+ return nil , "" , fmt .Errorf ("failed to prepare CA bundle: %w" , err )
1769+ }
1770+ log .Info ("auto-injecting merged CA bundle for HTTPS endpoint" ,
1771+ "configmap" , cm .Name , "key" , key )
1772+ return cm , key , nil
1773+ }
1774+
1775+ // recordScheduledGeneration persists the current spec generation so Reconcile
1776+ // can detect a spec change on a completed job and reset it for re-run.
1777+ func (r * LMEvalJobReconciler ) recordScheduledGeneration (ctx context.Context , job * lmesv1alpha1.LMEvalJob ) error {
1778+ currentGenStr := strconv .FormatInt (job .Generation , 10 )
1779+ annotations := job .GetAnnotations ()
1780+ if annotations == nil {
1781+ annotations = make (map [string ]string )
1782+ }
1783+ if annotations [LastScheduledGenerationAnnotation ] != currentGenStr {
1784+ annotations [LastScheduledGenerationAnnotation ] = currentGenStr
1785+ job .SetAnnotations (annotations )
1786+ if err := r .Update (ctx , job ); err != nil {
1787+ return fmt .Errorf ("failed to update generation annotation: %w" , err )
1788+ }
1789+ }
1790+ return nil
1791+ }
1792+
17951793// hasHTTPSBaseURL returns true when any modelArg named "base_url" uses HTTPS.
17961794func hasHTTPSBaseURL (job * lmesv1alpha1.LMEvalJob ) bool {
17971795 for _ , arg := range job .Spec .ModelArgs {
@@ -1830,18 +1828,84 @@ func getLastScheduledGeneration(job *lmesv1alpha1.LMEvalJob) int64 {
18301828 return gen
18311829}
18321830
1833- // findCABundle looks for the standard RHOAI cluster CA bundle ConfigMap in the
1834- // given namespace. It tries well-known key names and returns the ConfigMap and
1835- // the matching key on success.
1836- func (r * LMEvalJobReconciler ) findCABundle (ctx context.Context , namespace string ) (* corev1.ConfigMap , string , error ) {
1837- cm := & corev1.ConfigMap {}
1838- if err := r .Get (ctx , types.NamespacedName {Namespace : namespace , Name : DefaultCABundleConfigMapName }, cm ); err != nil {
1839- return nil , "" , err
1831+ // findAndMergeCABundle collects CA certificates from well-known cluster sources
1832+ // and creates a per-job ConfigMap containing the merged bundle. This ensures
1833+ // that REQUESTS_CA_BUNDLE (which replaces the default trust store) contains
1834+ // both public CAs and the OpenShift service-serving CA.
1835+ //
1836+ // Sources checked (best-effort, each is skipped if not found):
1837+ // - odh-trusted-ca-bundle: ca-bundle.crt, odh-ca-bundle.crt (public/system CAs)
1838+ // - openshift-service-ca.crt: service-ca.crt (service-serving CA)
1839+ func (r * LMEvalJobReconciler ) findAndMergeCABundle (ctx context.Context , job * lmesv1alpha1.LMEvalJob ) (* corev1.ConfigMap , string , error ) {
1840+ log := log .FromContext (ctx )
1841+ var pemBlocks []string
1842+
1843+ // Source 1: odh-trusted-ca-bundle (public/system CAs)
1844+ odhCM := & corev1.ConfigMap {}
1845+ if err := r .Get (ctx , types.NamespacedName {Namespace : job .Namespace , Name : DefaultCABundleConfigMapName }, odhCM ); err == nil {
1846+ for _ , key := range []string {"ca-bundle.crt" , "odh-ca-bundle.crt" } {
1847+ if data , ok := odhCM .Data [key ]; ok && strings .TrimSpace (data ) != "" {
1848+ pemBlocks = append (pemBlocks , data )
1849+ log .V (1 ).Info ("collected CA data" , "configmap" , DefaultCABundleConfigMapName , "key" , key )
1850+ }
1851+ }
1852+ } else if ! apierrors .IsNotFound (err ) {
1853+ log .Error (err , "error reading CA bundle ConfigMap" , "name" , DefaultCABundleConfigMapName )
18401854 }
1841- for _ , key := range []string {"ca-bundle.crt" , "odh-ca-bundle.crt" , "service-ca.crt" } {
1842- if _ , ok := cm .Data [key ]; ok {
1843- return cm , key , nil
1855+
1856+ // Source 2: openshift-service-ca.crt (service-serving CA)
1857+ svcCM := & corev1.ConfigMap {}
1858+ if err := r .Get (ctx , types.NamespacedName {Namespace : job .Namespace , Name : ServiceCAConfigMapName }, svcCM ); err == nil {
1859+ if data , ok := svcCM .Data [ServiceCAKey ]; ok && strings .TrimSpace (data ) != "" {
1860+ pemBlocks = append (pemBlocks , data )
1861+ log .V (1 ).Info ("collected CA data" , "configmap" , ServiceCAConfigMapName , "key" , ServiceCAKey )
1862+ }
1863+ } else if ! apierrors .IsNotFound (err ) {
1864+ log .Error (err , "error reading service CA ConfigMap" , "name" , ServiceCAConfigMapName )
1865+ }
1866+
1867+ if len (pemBlocks ) == 0 {
1868+ return nil , "" , errNoCAData
1869+ }
1870+
1871+ merged := strings .Join (pemBlocks , "\n " )
1872+ mergedCMName := job .Name + MergedCAConfigMapSuffix
1873+
1874+ mergedCM := & corev1.ConfigMap {}
1875+ err := r .Get (ctx , types.NamespacedName {Namespace : job .Namespace , Name : mergedCMName }, mergedCM )
1876+ if apierrors .IsNotFound (err ) {
1877+ ownerRefController := true
1878+ mergedCM = & corev1.ConfigMap {
1879+ ObjectMeta : v1.ObjectMeta {
1880+ Name : mergedCMName ,
1881+ Namespace : job .Namespace ,
1882+ OwnerReferences : []v1.OwnerReference {
1883+ {
1884+ APIVersion : job .APIVersion ,
1885+ Kind : job .Kind ,
1886+ Name : job .Name ,
1887+ Controller : & ownerRefController ,
1888+ UID : job .UID ,
1889+ },
1890+ },
1891+ },
1892+ Data : map [string ]string {
1893+ MergedCABundleKey : merged ,
1894+ },
1895+ }
1896+ if err := r .Create (ctx , mergedCM ); err != nil {
1897+ return nil , "" , fmt .Errorf ("failed to create merged CA ConfigMap: %w" , err )
1898+ }
1899+ } else if err != nil {
1900+ return nil , "" , fmt .Errorf ("failed to read merged CA ConfigMap: %w" , err )
1901+ } else {
1902+ mergedCM .Data = map [string ]string {
1903+ MergedCABundleKey : merged ,
1904+ }
1905+ if err := r .Update (ctx , mergedCM ); err != nil {
1906+ return nil , "" , fmt .Errorf ("failed to update merged CA ConfigMap: %w" , err )
18441907 }
18451908 }
1846- return nil , "" , fmt .Errorf ("ConfigMap %s has no recognised CA bundle key" , DefaultCABundleConfigMapName )
1909+
1910+ return mergedCM , MergedCABundleKey , nil
18471911}
0 commit comments