
Commit a10ac4a

Ambient Code Bot and claude committed
feat(evalhub): classify Kueue GPU admission failures with user-facing messages
When a Kueue workload is inadmissible (QuotaReserved=False/Inadmissible), the reconciler now distinguishes GPU quota exhaustion from generic queue errors by inspecting the Job's pod spec and the Kueue condition message. GPU jobs that can't be admitted get message_code=gpu_unavailable with a human-readable explanation; all other admission failures use queue_error. This avoids surfacing raw cluster internals through the eval-hub API.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent d4d151a commit a10ac4a

2 files changed: 210 additions & 6 deletions
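To illustrate the behavior this commit introduces — a minimal sketch, assuming the controllers/evalhub package context where the new helpers live; the condition message is a made-up example of Kueue's free-form text, not a verbatim string:

	// Sketch: an inadmissible GPU workload is mapped to a user-facing message code.
	cond := &metav1.Condition{
		Type:    "QuotaReserved",
		Status:  metav1.ConditionFalse,
		Reason:  "Inadmissible",
		Message: "insufficient quota for nvidia.com/gpu in flavor default", // illustrative Kueue text
	}
	msg, code := classifyKueueAdmissionFailure(job, cond) // job's pod spec requests nvidia.com/gpu
	// code == "gpu_unavailable"; msg is generic user-facing prose with no queue or flavor names.
	// Any other inadmissible condition falls back to code == "queue_error".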


controllers/evalhub/evaluation_failed_kueue_workloads_reconciler.go

100644 → 100755
Lines changed: 79 additions & 6 deletions
@@ -15,6 +15,7 @@ import (
 	"time"

 	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"

@@ -36,6 +37,15 @@ const kueueWorkloadReasonInadmissible = "Inadmissible"
 // failure on every reconcile.
 const annotationKueueFailedWorkloadEventReported = "trustyai.opendatahub.io/evalhub-kueue-failed-workload-reported"

+// messageCodeGPUUnavailable is sent to EvalHub when Kueue cannot admit a workload because GPU
+// resources required by the adapter are not available in the requested queue.
+const messageCodeGPUUnavailable = "gpu_unavailable"
+
+// gpuResourceSuffixes are the trailing parts of Kubernetes extended resource names that identify
+// GPU accelerators (e.g. "nvidia.com/gpu", "amd.com/gpu"). We match by suffix to avoid hard-coding
+// vendor-specific resource names and to remain forward-compatible.
+var gpuResourceSuffixes = []string{"/gpu", ".gpu"}
+
 // evalHubEvaluationFailedKueueWorkloadsControllerName matches ctrl.NewControllerManagedBy(mgr).Named(...).
 const evalHubEvaluationFailedKueueWorkloadsControllerName = "evalhub-evaluation-failed-kueue-workloads"

@@ -144,6 +154,71 @@ func jobOwnerFromWorkload(wl *kueue.Workload) (name string, uid types.UID, ok bool) {
 	return "", "", false
 }

+// jobRequestsGPU returns true if any container in the Job's pod template requests GPU resources.
+func jobRequestsGPU(job *batchv1.Job) bool {
+	return podSpecRequestsGPU(&job.Spec.Template.Spec)
+}
+
+func podSpecRequestsGPU(spec *corev1.PodSpec) bool {
+	for _, c := range spec.InitContainers {
+		if containerRequestsGPU(c) {
+			return true
+		}
+	}
+	for _, c := range spec.Containers {
+		if containerRequestsGPU(c) {
+			return true
+		}
+	}
+	return false
+}
+
+func containerRequestsGPU(c corev1.Container) bool {
+	for name := range c.Resources.Requests {
+		if isGPUResource(string(name)) {
+			return true
+		}
+	}
+	for name := range c.Resources.Limits {
+		if isGPUResource(string(name)) {
+			return true
+		}
+	}
+	return false
+}
+
+func isGPUResource(name string) bool {
+	for _, suffix := range gpuResourceSuffixes {
+		if strings.HasSuffix(name, suffix) {
+			return true
+		}
+	}
+	return false
+}
+
+// kueueConditionMentionsGPU returns true when the Kueue inadmissible condition message references
+// a GPU resource by name (e.g. "nvidia.com/gpu"). This lets us distinguish GPU-specific quota
+// failures from other admission failures without parsing structured data out of free-form text.
+func kueueConditionMentionsGPU(msg string) bool {
+	for _, suffix := range gpuResourceSuffixes {
+		if strings.Contains(msg, suffix) {
+			return true
+		}
+	}
+	return false
+}
+
+// classifyKueueAdmissionFailure analyses an inadmissible Kueue workload and the owning Job to
+// produce a user-facing failure message and EvalHub message code. It intentionally avoids exposing
+// internal cluster details (queue names, flavor names, raw quota numbers).
+func classifyKueueAdmissionFailure(job *batchv1.Job, cond *metav1.Condition) (msg, messageCode string) {
+	if jobRequestsGPU(job) && kueueConditionMentionsGPU(cond.Message) {
+		return "GPU resources required by this evaluation are not currently available in the requested queue. The job will run when GPU capacity becomes available.", messageCodeGPUUnavailable
+	}
+	// Non-GPU or unrecognised failure: surface a generic queue-error without internal detail.
+	return "The evaluation job cannot be admitted to the requested queue. The job will run when sufficient resources become available.", messageCodeQueueError
+}
+
 func (r *EvalHubEvaluationFailedKueueWorkloadsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	log := log.FromContext(ctx)
 	log.Info("reconcile start",
@@ -238,12 +313,9 @@ func (r *EvalHubEvaluationFailedKueueWorkloadsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 		return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
 	}

-	msg := strings.TrimSpace(cond.Message)
-	if msg == "" {
-		msg = fmt.Sprintf("Kueue workload (conditionType=%s, reason=%s)", cond.Type, cond.Reason)
-	}
+	failureMsg, failureCode := classifyKueueAdmissionFailure(&job, cond)

-	if err := postEvalHubBenchmarkFailed(ctx, r.RESTConfig, baseURL, job.Namespace, jobID, providerID, benchmarkID, benchmarkIndex, msg, messageCodeQueueError); err != nil {
+	if err := postEvalHubBenchmarkFailed(ctx, r.RESTConfig, baseURL, job.Namespace, jobID, providerID, benchmarkID, benchmarkIndex, failureMsg, failureCode); err != nil {
 		log.Error(err, "failed to post EvalHub benchmark failure event for Kueue workload",
 			append(evaluationFailedKueueWorkloadsLogFields(), "action", "post_events_failed",
 				"workload", wl.Name, "workloadNamespace", wl.Namespace, "queue", wl.Spec.QueueName,

@@ -263,7 +335,8 @@ func (r *EvalHubEvaluationFailedKueueWorkloadsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 		"workload", wl.Name, "workloadNamespace", wl.Namespace, "queue", wl.Spec.QueueName,
 		"job", job.Name, "jobUid", string(job.UID),
 		"evalJobID", jobID, "providerID", providerID, "benchmarkID", benchmarkID,
-		"conditionType", cond.Type, "conditionReason", cond.Reason)...)
+		"conditionType", cond.Type, "conditionReason", cond.Reason,
+		"messageCode", failureCode)...)

 	return ctrl.Result{}, nil
 }
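For intuition on the suffix rule, here is a self-contained sketch that reproduces gpuResourceSuffixes and isGPUResource from the diff above; the sample resource names are illustrative, not an exhaustive list of real extended resources:

	package main

	import (
		"fmt"
		"strings"
	)

	// Reproduction of the commit's suffix rule, for illustration only.
	var gpuResourceSuffixes = []string{"/gpu", ".gpu"}

	func isGPUResource(name string) bool {
		for _, suffix := range gpuResourceSuffixes {
			if strings.HasSuffix(name, suffix) {
				return true
			}
		}
		return false
	}

	func main() {
		for _, name := range []string{"nvidia.com/gpu", "amd.com/gpu", "cpu", "example.com/accelerator"} {
			fmt.Printf("%-26s -> %v\n", name, isGPUResource(name)) // true, true, false, false
		}
	}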

controllers/evalhub/evaluation_failed_kueue_workloads_reconciler_test.go

100644 → 100755
Lines changed: 131 additions & 0 deletions
@@ -12,6 +12,8 @@ import (
 	. "github.com/onsi/gomega"

 	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
@@ -145,4 +147,133 @@ var _ = Describe("Kueue failed workload reconciler helpers", func() {
 			Expect(isEvalHubEvaluationJob(j)).To(BeFalse())
 		})
 	})
+
+	Describe("jobRequestsGPU", func() {
+		It("returns true when a container requests nvidia.com/gpu", func() {
+			j := gpuJob("nvidia.com/gpu", "1")
+			Expect(jobRequestsGPU(j)).To(BeTrue())
+		})
+
+		It("returns true when a container requests amd.com/gpu", func() {
+			j := gpuJob("amd.com/gpu", "1")
+			Expect(jobRequestsGPU(j)).To(BeTrue())
+		})
+
+		It("returns false when no GPU resources are requested", func() {
+			j := &batchv1.Job{
+				Spec: batchv1.JobSpec{
+					Template: corev1.PodTemplateSpec{
+						Spec: corev1.PodSpec{
+							Containers: []corev1.Container{
+								{
+									Name: "adapter",
+									Resources: corev1.ResourceRequirements{
+										Requests: corev1.ResourceList{
+											corev1.ResourceCPU:    resource.MustParse("250m"),
+											corev1.ResourceMemory: resource.MustParse("512Mi"),
+										},
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			Expect(jobRequestsGPU(j)).To(BeFalse())
+		})
+
+		It("returns false for an empty job", func() {
+			Expect(jobRequestsGPU(&batchv1.Job{})).To(BeFalse())
+		})
+	})
+
+	Describe("kueueConditionMentionsGPU", func() {
+		It("returns true when message contains /gpu suffix", func() {
+			Expect(kueueConditionMentionsGPU("insufficient quota for nvidia.com/gpu in flavor default")).To(BeTrue())
+		})
+
+		It("returns true when message contains amd.com/gpu", func() {
+			Expect(kueueConditionMentionsGPU("insufficient quota for amd.com/gpu in cluster queue")).To(BeTrue())
+		})
+
+		It("returns false when message has no GPU reference", func() {
+			Expect(kueueConditionMentionsGPU("ClusterQueue foo is stopped")).To(BeFalse())
+		})
+
+		It("returns false for empty message", func() {
+			Expect(kueueConditionMentionsGPU("")).To(BeFalse())
+		})
+	})
+
+	Describe("classifyKueueAdmissionFailure", func() {
+		It("returns gpu_unavailable code when job requests GPU and condition mentions GPU", func() {
+			job := gpuJob("nvidia.com/gpu", "1")
+			cond := &metav1.Condition{
+				Type:    "QuotaReserved",
+				Status:  metav1.ConditionFalse,
+				Reason:  "Inadmissible",
+				Message: "insufficient quota for nvidia.com/gpu in flavor default, requested: 1, used: 0, borrowable: 0",
+			}
+			msg, code := classifyKueueAdmissionFailure(job, cond)
+			Expect(code).To(Equal(messageCodeGPUUnavailable))
+			Expect(msg).NotTo(BeEmpty())
+			Expect(msg).NotTo(ContainSubstring("nvidia.com/gpu"))
+			Expect(msg).NotTo(ContainSubstring("flavor"))
+		})
+
+		It("returns queue_error code when job requests GPU but condition does not mention GPU", func() {
+			job := gpuJob("nvidia.com/gpu", "1")
+			cond := &metav1.Condition{
+				Type:    "QuotaReserved",
+				Status:  metav1.ConditionFalse,
+				Reason:  "Inadmissible",
+				Message: "ClusterQueue foo is stopped",
+			}
+			_, code := classifyKueueAdmissionFailure(job, cond)
+			Expect(code).To(Equal(messageCodeQueueError))
+		})
+
+		It("returns queue_error code for a CPU-only job even if condition mentions GPU", func() {
+			job := &batchv1.Job{}
+			cond := &metav1.Condition{
+				Message: "insufficient quota for nvidia.com/gpu",
+			}
+			_, code := classifyKueueAdmissionFailure(job, cond)
+			Expect(code).To(Equal(messageCodeQueueError))
+		})
+
+		It("returns queue_error code for CPU-only job with non-GPU failure", func() {
+			job := &batchv1.Job{}
+			cond := &metav1.Condition{
+				Message: "insufficient quota for cpu",
+			}
+			_, code := classifyKueueAdmissionFailure(job, cond)
+			Expect(code).To(Equal(messageCodeQueueError))
+		})
+	})
 })
+
+// gpuJob builds a minimal batchv1.Job that requests the given GPU resource.
+func gpuJob(resourceName, quantity string) *batchv1.Job {
+	return &batchv1.Job{
+		Spec: batchv1.JobSpec{
+			Template: corev1.PodTemplateSpec{
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{
+						{
+							Name: "adapter",
+							Resources: corev1.ResourceRequirements{
+								Requests: corev1.ResourceList{
+									corev1.ResourceName(resourceName): resource.MustParse(quantity),
+								},
+								Limits: corev1.ResourceList{
+									corev1.ResourceName(resourceName): resource.MustParse(quantity),
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+}
