@@ -15,6 +15,7 @@ import (
1515 "time"
1616
1717 batchv1 "k8s.io/api/batch/v1"
18+ corev1 "k8s.io/api/core/v1"
1819 apierrors "k8s.io/apimachinery/pkg/api/errors"
1920 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2021 "k8s.io/apimachinery/pkg/types"
@@ -36,6 +37,15 @@ const kueueWorkloadReasonInadmissible = "Inadmissible"
3637// failure on every reconcile.
3738const annotationKueueFailedWorkloadEventReported = "trustyai.opendatahub.io/evalhub-kueue-failed-workload-reported"
3839
// messageCodeGPUUnavailable is sent to EvalHub when Kueue cannot admit a workload because GPU
// resources required by the adapter are not available in the requested queue.
const messageCodeGPUUnavailable = "gpu_unavailable"

// gpuResourceSuffixes are the trailing parts of Kubernetes extended resource names that identify
// GPU accelerators (e.g. "nvidia.com/gpu", "amd.com/gpu"). We match by suffix to avoid hard-coding
// vendor-specific resource names and to remain forward-compatible.
var gpuResourceSuffixes = []string{"/gpu", ".gpu"}
3949// evalHubEvaluationFailedKueueWorkloadsControllerName matches ctrl.NewControllerManagedBy(mgr).Named(...).
4050const evalHubEvaluationFailedKueueWorkloadsControllerName = "evalhub-evaluation-failed-kueue-workloads"
4151
@@ -144,6 +154,71 @@ func jobOwnerFromWorkload(wl *kueue.Workload) (name string, uid types.UID, ok bo
144154 return "" , "" , false
145155}
146156
157+ // jobRequestsGPU returns true if any container in the Job's pod template requests GPU resources.
158+ func jobRequestsGPU (job * batchv1.Job ) bool {
159+ return podSpecRequestsGPU (& job .Spec .Template .Spec )
160+ }
161+
162+ func podSpecRequestsGPU (spec * corev1.PodSpec ) bool {
163+ for _ , c := range spec .InitContainers {
164+ if containerRequestsGPU (c ) {
165+ return true
166+ }
167+ }
168+ for _ , c := range spec .Containers {
169+ if containerRequestsGPU (c ) {
170+ return true
171+ }
172+ }
173+ return false
174+ }
175+
176+ func containerRequestsGPU (c corev1.Container ) bool {
177+ for name := range c .Resources .Requests {
178+ if isGPUResource (string (name )) {
179+ return true
180+ }
181+ }
182+ for name := range c .Resources .Limits {
183+ if isGPUResource (string (name )) {
184+ return true
185+ }
186+ }
187+ return false
188+ }
189+
190+ func isGPUResource (name string ) bool {
191+ for _ , suffix := range gpuResourceSuffixes {
192+ if strings .HasSuffix (name , suffix ) {
193+ return true
194+ }
195+ }
196+ return false
197+ }
198+
199+ // kueueConditionMentionsGPU returns true when the Kueue inadmissible condition message references
200+ // a GPU resource by name (e.g. "nvidia.com/gpu"). This lets us distinguish GPU-specific quota
201+ // failures from other admission failures without parsing structured data out of free-form text.
202+ func kueueConditionMentionsGPU (msg string ) bool {
203+ for _ , suffix := range gpuResourceSuffixes {
204+ if strings .Contains (msg , suffix ) {
205+ return true
206+ }
207+ }
208+ return false
209+ }
210+
211+ // classifyKueueAdmissionFailure analyses an inadmissible Kueue workload and the owning Job to
212+ // produce a user-facing failure message and EvalHub message code. It intentionally avoids exposing
213+ // internal cluster details (queue names, flavor names, raw quota numbers).
214+ func classifyKueueAdmissionFailure (job * batchv1.Job , cond * metav1.Condition ) (msg , messageCode string ) {
215+ if jobRequestsGPU (job ) && kueueConditionMentionsGPU (cond .Message ) {
216+ return "GPU resources required by this evaluation are not currently available in the requested queue. The job will run when GPU capacity becomes available." , messageCodeGPUUnavailable
217+ }
218+ // Non-GPU or unrecognised failure: surface a generic queue-error without internal detail.
219+ return "The evaluation job cannot be admitted to the requested queue. The job will run when sufficient resources become available." , messageCodeQueueError
220+ }
221+
147222func (r * EvalHubEvaluationFailedKueueWorkloadsReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
148223 log := log .FromContext (ctx )
149224 log .Info ("reconcile start" ,
@@ -238,12 +313,9 @@ func (r *EvalHubEvaluationFailedKueueWorkloadsReconciler) Reconcile(ctx context.
238313 return ctrl.Result {RequeueAfter : 30 * time .Second }, nil
239314 }
240315
241- msg := strings .TrimSpace (cond .Message )
242- if msg == "" {
243- msg = fmt .Sprintf ("Kueue workload (conditionType=%s, reason=%s)" , cond .Type , cond .Reason )
244- }
316+ failureMsg , failureCode := classifyKueueAdmissionFailure (& job , cond )
245317
246- if err := postEvalHubBenchmarkFailed (ctx , r .RESTConfig , baseURL , job .Namespace , jobID , providerID , benchmarkID , benchmarkIndex , msg , messageCodeQueueError ); err != nil {
318+ if err := postEvalHubBenchmarkFailed (ctx , r .RESTConfig , baseURL , job .Namespace , jobID , providerID , benchmarkID , benchmarkIndex , failureMsg , failureCode ); err != nil {
247319 log .Error (err , "failed to post EvalHub benchmark failure event for Kueue workload" ,
248320 append (evaluationFailedKueueWorkloadsLogFields (), "action" , "post_events_failed" ,
249321 "workload" , wl .Name , "workloadNamespace" , wl .Namespace , "queue" , wl .Spec .QueueName ,
@@ -263,7 +335,8 @@ func (r *EvalHubEvaluationFailedKueueWorkloadsReconciler) Reconcile(ctx context.
263335 "workload" , wl .Name , "workloadNamespace" , wl .Namespace , "queue" , wl .Spec .QueueName ,
264336 "job" , job .Name , "jobUid" , string (job .UID ),
265337 "evalJobID" , jobID , "providerID" , providerID , "benchmarkID" , benchmarkID ,
266- "conditionType" , cond .Type , "conditionReason" , cond .Reason )... )
338+ "conditionType" , cond .Type , "conditionReason" , cond .Reason ,
339+ "messageCode" , failureCode )... )
267340
268341 return ctrl.Result {}, nil
269342}
0 commit comments