Skip to content

Commit 15828b2

Browse files
fix(providers): refresh status message when ModelDeployment reaches Running (#326)
Signed-off-by: robert-cronin <robert.owen.cronin@gmail.com>
1 parent 40593eb commit 15828b2

10 files changed

Lines changed: 193 additions & 6 deletions

File tree

providers/dynamo/controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,10 @@ func (r *DynamoProviderReconciler) syncStatus(ctx context.Context, md *airunwayv
438438
md.Status.Phase = statusResult.Phase
439439
if statusResult.Message != "" {
440440
md.Status.Message = statusResult.Message
441+
} else if statusResult.Phase == airunwayv1alpha1.DeploymentPhaseRunning {
442+
// The translator reports no message for a healthy DynamoGraphDeployment;
443+
// replace the stale "waiting for pods" message so status reflects Running.
444+
md.Status.Message = "DynamoGraphDeployment created, pods are ready"
441445
}
442446
md.Status.Replicas = statusResult.Replicas
443447
md.Status.Endpoint = statusResult.Endpoint

providers/dynamo/controller_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package dynamo
33
import (
44
"context"
55
"fmt"
6+
"strings"
67
"testing"
78
"time"
89

@@ -691,6 +692,7 @@ func TestSyncStatusRunning(t *testing.T) {
691692
r := NewDynamoProviderReconciler(c, scheme, "")
692693

693694
md := &airunwayv1alpha1.ModelDeployment{}
695+
md.Status.Message = "DynamoGraphDeployment created, waiting for pods to be ready"
694696
desired := &unstructured.Unstructured{}
695697
setDGDGVK(desired)
696698
desired.SetName("test")
@@ -703,6 +705,13 @@ func TestSyncStatusRunning(t *testing.T) {
703705
if md.Status.Phase != airunwayv1alpha1.DeploymentPhaseRunning {
704706
t.Errorf("expected Running phase, got %s", md.Status.Phase)
705707
}
708+
// Issue #289: a Running deployment must not keep the "waiting for pods" message.
709+
if strings.Contains(md.Status.Message, "waiting for pods") {
710+
t.Errorf("status message still claims waiting for pods while Running: %q", md.Status.Message)
711+
}
712+
if md.Status.Message == "" {
713+
t.Errorf("expected a non-empty status message in Running phase")
714+
}
706715
}
707716

708717
func TestSyncStatusFailed(t *testing.T) {

providers/kaito/controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,10 @@ func (r *KaitoProviderReconciler) syncStatus(ctx context.Context, md *airunwayv1
667667
md.Status.Phase = statusResult.Phase
668668
if statusResult.Message != "" {
669669
md.Status.Message = statusResult.Message
670+
} else if statusResult.Phase == airunwayv1alpha1.DeploymentPhaseRunning {
671+
// The translator reports no message for a healthy Workspace; replace the
672+
// stale "waiting for pods" message so status reflects the Running phase.
673+
md.Status.Message = "Workspace created, pods are ready"
670674
}
671675
md.Status.Replicas = statusResult.Replicas
672676
md.Status.Endpoint = statusResult.Endpoint

providers/kaito/controller_test.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,71 @@ func TestReconcileAlreadyRunning(t *testing.T) {
469469
}
470470
}
471471

472+
// TestReconcileRunningUpdatesMessage reproduces issue #289: once the Workspace
473+
// is ready the phase flips to Running, but the status message must no longer
474+
// claim it is "waiting for pods to be ready".
475+
func TestReconcileRunningUpdatesMessage(t *testing.T) {
476+
scheme := newScheme()
477+
md := newMDForController("test", "default")
478+
md.UID = "test-uid"
479+
controllerutil.AddFinalizer(md, FinalizerName)
480+
// Simulate a prior reconcile loop that left the deploying-phase message.
481+
md.Status.Phase = airunwayv1alpha1.DeploymentPhaseDeploying
482+
md.Status.Message = "Workspace created, waiting for pods to be ready"
483+
484+
ws := &unstructured.Unstructured{}
485+
setWorkspaceGVK(ws)
486+
ws.SetName("test")
487+
ws.SetNamespace("default")
488+
ws.SetOwnerReferences([]metav1.OwnerReference{
489+
{UID: "test-uid", APIVersion: "airunway.ai/v1alpha1", Kind: "ModelDeployment", Name: "test"},
490+
})
491+
ws.Object["resource"] = map[string]interface{}{
492+
"count": int64(1),
493+
"labelSelector": map[string]interface{}{
494+
"matchLabels": map[string]interface{}{
495+
"kubernetes.io/os": "linux",
496+
},
497+
},
498+
}
499+
ws.Object["inference"] = map[string]interface{}{
500+
"preset": map[string]interface{}{
501+
"name": "test-model",
502+
},
503+
}
504+
ws.Object["status"] = map[string]interface{}{
505+
"conditions": []interface{}{
506+
map[string]interface{}{
507+
"type": "WorkspaceSucceeded",
508+
"status": "True",
509+
},
510+
},
511+
}
512+
513+
deploy := newReadyKaitoDeployment()
514+
directC := probeClientBuilderWithWorkspace(t).WithObjects(deploy).Build()
515+
c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(md, ws).WithStatusSubresource(md).Build()
516+
r := NewKaitoProviderReconciler(c, scheme, directC, record.NewFakeRecorder(10))
517+
518+
if _, err := r.Reconcile(context.Background(), ctrl.Request{
519+
NamespacedName: types.NamespacedName{Name: "test", Namespace: "default"},
520+
}); err != nil {
521+
t.Fatalf("unexpected error: %v", err)
522+
}
523+
524+
var updated airunwayv1alpha1.ModelDeployment
525+
_ = c.Get(context.Background(), types.NamespacedName{Name: "test", Namespace: "default"}, &updated)
526+
if updated.Status.Phase != airunwayv1alpha1.DeploymentPhaseRunning {
527+
t.Fatalf("expected Running phase, got %s", updated.Status.Phase)
528+
}
529+
if strings.Contains(updated.Status.Message, "waiting for pods") {
530+
t.Errorf("status message still claims waiting for pods while Running: %q", updated.Status.Message)
531+
}
532+
if updated.Status.Message == "" {
533+
t.Errorf("expected a non-empty status message in Running phase")
534+
}
535+
}
536+
472537
func TestReconcileHandleDeletion(t *testing.T) {
473538
scheme := newScheme()
474539
md := newMDForController("test", "default")

providers/kuberay/controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,10 @@ func (r *KubeRayProviderReconciler) syncStatus(ctx context.Context, md *airunway
309309
md.Status.Phase = statusResult.Phase
310310
if statusResult.Message != "" {
311311
md.Status.Message = statusResult.Message
312+
} else if statusResult.Phase == airunwayv1alpha1.DeploymentPhaseRunning {
313+
// The translator reports no message for a healthy RayService; replace the
314+
// stale "waiting for pods" message so status reflects the Running phase.
315+
md.Status.Message = "RayService created, pods are ready"
312316
}
313317
md.Status.Replicas = statusResult.Replicas
314318
md.Status.Endpoint = statusResult.Endpoint

providers/kuberay/controller_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package kuberay
22

33
import (
44
"context"
5+
"strings"
56
"testing"
67
"time"
78

@@ -610,6 +611,7 @@ func TestSyncStatusRunning(t *testing.T) {
610611
r := NewKubeRayProviderReconciler(c, scheme)
611612

612613
md := &airunwayv1alpha1.ModelDeployment{}
614+
md.Status.Message = "RayService created, waiting for pods to be ready"
613615
desired := &unstructured.Unstructured{}
614616
setRayServiceGVK(desired)
615617
desired.SetName("test")
@@ -622,6 +624,13 @@ func TestSyncStatusRunning(t *testing.T) {
622624
if md.Status.Phase != airunwayv1alpha1.DeploymentPhaseRunning {
623625
t.Errorf("expected Running, got %s", md.Status.Phase)
624626
}
627+
// Issue #289: a Running deployment must not keep the "waiting for pods" message.
628+
if strings.Contains(md.Status.Message, "waiting for pods") {
629+
t.Errorf("status message still claims waiting for pods while Running: %q", md.Status.Message)
630+
}
631+
if md.Status.Message == "" {
632+
t.Errorf("expected a non-empty status message in Running phase")
633+
}
625634
}
626635

627636
func TestSyncStatusFailed(t *testing.T) {

providers/llmd/controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,10 @@ func (r *LLMDProviderReconciler) syncStatus(ctx context.Context, md *airunwayv1a
305305
md.Status.Phase = statusResult.Phase
306306
if statusResult.Message != "" {
307307
md.Status.Message = statusResult.Message
308+
} else if statusResult.Phase == airunwayv1alpha1.DeploymentPhaseRunning {
309+
// The translator reports no message for a healthy Deployment; replace the
310+
// stale "waiting for pods" message so status reflects the Running phase.
311+
md.Status.Message = "Deployments created, pods are ready"
308312
}
309313
md.Status.Replicas = statusResult.Replicas
310314
md.Status.Endpoint = statusResult.Endpoint

providers/llmd/controller_test.go

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@ package llmd
22

33
import (
44
"context"
5+
"strings"
56
"testing"
67

78
airunwayv1alpha1 "github.com/kaito-project/airunway/controller/api/v1alpha1"
89
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
911
"k8s.io/apimachinery/pkg/runtime"
1012
"k8s.io/apimachinery/pkg/types"
1113
ctrl "sigs.k8s.io/controller-runtime"
@@ -107,7 +109,7 @@ func TestValidateCompatibility(t *testing.T) {
107109
name: "disaggregated without prefill is incompatible",
108110
md: &airunwayv1alpha1.ModelDeployment{
109111
Spec: airunwayv1alpha1.ModelDeploymentSpec{
110-
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
112+
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
111113
Serving: &airunwayv1alpha1.ServingSpec{Mode: airunwayv1alpha1.ServingModeDisaggregated},
112114
Scaling: &airunwayv1alpha1.ScalingSpec{
113115
Decode: &airunwayv1alpha1.ComponentScalingSpec{
@@ -123,7 +125,7 @@ func TestValidateCompatibility(t *testing.T) {
123125
name: "disaggregated without decode is incompatible",
124126
md: &airunwayv1alpha1.ModelDeployment{
125127
Spec: airunwayv1alpha1.ModelDeploymentSpec{
126-
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
128+
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
127129
Serving: &airunwayv1alpha1.ServingSpec{Mode: airunwayv1alpha1.ServingModeDisaggregated},
128130
Scaling: &airunwayv1alpha1.ScalingSpec{
129131
Prefill: &airunwayv1alpha1.ComponentScalingSpec{
@@ -139,7 +141,7 @@ func TestValidateCompatibility(t *testing.T) {
139141
name: "disaggregated with both prefill and decode is compatible",
140142
md: &airunwayv1alpha1.ModelDeployment{
141143
Spec: airunwayv1alpha1.ModelDeploymentSpec{
142-
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
144+
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
143145
Serving: &airunwayv1alpha1.ServingSpec{Mode: airunwayv1alpha1.ServingModeDisaggregated},
144146
Scaling: &airunwayv1alpha1.ScalingSpec{
145147
Prefill: &airunwayv1alpha1.ComponentScalingSpec{
@@ -159,7 +161,7 @@ func TestValidateCompatibility(t *testing.T) {
159161
name: "disaggregated without GPU on prefill is incompatible",
160162
md: &airunwayv1alpha1.ModelDeployment{
161163
Spec: airunwayv1alpha1.ModelDeploymentSpec{
162-
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
164+
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
163165
Serving: &airunwayv1alpha1.ServingSpec{Mode: airunwayv1alpha1.ServingModeDisaggregated},
164166
Scaling: &airunwayv1alpha1.ScalingSpec{
165167
Prefill: &airunwayv1alpha1.ComponentScalingSpec{
@@ -178,7 +180,7 @@ func TestValidateCompatibility(t *testing.T) {
178180
name: "disaggregated without GPU on decode is incompatible",
179181
md: &airunwayv1alpha1.ModelDeployment{
180182
Spec: airunwayv1alpha1.ModelDeploymentSpec{
181-
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
183+
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
182184
Serving: &airunwayv1alpha1.ServingSpec{Mode: airunwayv1alpha1.ServingModeDisaggregated},
183185
Scaling: &airunwayv1alpha1.ScalingSpec{
184186
Prefill: &airunwayv1alpha1.ComponentScalingSpec{
@@ -197,7 +199,7 @@ func TestValidateCompatibility(t *testing.T) {
197199
name: "disaggregated without top-level resources is compatible",
198200
md: &airunwayv1alpha1.ModelDeployment{
199201
Spec: airunwayv1alpha1.ModelDeploymentSpec{
200-
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
202+
Engine: airunwayv1alpha1.EngineSpec{Type: airunwayv1alpha1.EngineTypeVLLM},
201203
Serving: &airunwayv1alpha1.ServingSpec{Mode: airunwayv1alpha1.ServingModeDisaggregated},
202204
Scaling: &airunwayv1alpha1.ScalingSpec{
203205
Prefill: &airunwayv1alpha1.ComponentScalingSpec{
@@ -273,3 +275,46 @@ func TestReconcileIgnoresNoProvider(t *testing.T) {
273275
t.Error("expected no requeue when no provider assigned")
274276
}
275277
}
278+
279+
// TestSyncStatusRunningUpdatesMessage reproduces issue #289: once the upstream
280+
// Deployment is Available the phase flips to Running, but the status message
281+
// must no longer claim it is "waiting for pods to be ready".
282+
func TestSyncStatusRunningUpdatesMessage(t *testing.T) {
283+
scheme := newScheme()
284+
285+
deploy := &unstructured.Unstructured{}
286+
deploy.SetAPIVersion("apps/v1")
287+
deploy.SetKind("Deployment")
288+
deploy.SetName("test")
289+
deploy.SetNamespace("default")
290+
deploy.Object["status"] = map[string]interface{}{
291+
"conditions": []interface{}{
292+
map[string]interface{}{"type": "Available", "status": "True"},
293+
},
294+
}
295+
296+
c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(deploy).Build()
297+
r := NewLLMDProviderReconciler(c, scheme)
298+
299+
md := &airunwayv1alpha1.ModelDeployment{}
300+
md.Status.Message = "Deployments created, waiting for pods to be ready"
301+
302+
desired := &unstructured.Unstructured{}
303+
desired.SetAPIVersion("apps/v1")
304+
desired.SetKind("Deployment")
305+
desired.SetName("test")
306+
desired.SetNamespace("default")
307+
308+
if err := r.syncStatus(context.Background(), md, desired); err != nil {
309+
t.Fatalf("unexpected error: %v", err)
310+
}
311+
if md.Status.Phase != airunwayv1alpha1.DeploymentPhaseRunning {
312+
t.Fatalf("expected Running phase, got %s", md.Status.Phase)
313+
}
314+
if strings.Contains(md.Status.Message, "waiting for pods") {
315+
t.Errorf("status message still claims waiting for pods while Running: %q", md.Status.Message)
316+
}
317+
if md.Status.Message == "" {
318+
t.Errorf("expected a non-empty status message in Running phase")
319+
}
320+
}

providers/vllm/controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,10 @@ func (r *VLLMProviderReconciler) syncStatus(ctx context.Context, md *airunwayv1a
347347
md.Status.Phase = statusResult.Phase
348348
if statusResult.Message != "" {
349349
md.Status.Message = statusResult.Message
350+
} else if statusResult.Phase == airunwayv1alpha1.DeploymentPhaseRunning {
351+
// The translator reports no message for a healthy Deployment; replace the
352+
// stale "waiting for pods" message so status reflects the Running phase.
353+
md.Status.Message = "Deployments created, pods are ready"
350354
}
351355
md.Status.Replicas = statusResult.Replicas
352356
md.Status.Endpoint = statusResult.Endpoint

providers/vllm/controller_test.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,3 +495,42 @@ func TestRemoteImageResolverRejectsEmptyAndInvalidRefs(t *testing.T) {
495495
t.Error("expected parse error for malformed image reference")
496496
}
497497
}
498+
499+
func TestSyncStatusRunningUpdatesMessage(t *testing.T) {
500+
scheme := newScheme()
501+
502+
deploy := &unstructured.Unstructured{}
503+
deploy.SetGroupVersionKind(deploymentGVK)
504+
deploy.SetName("test")
505+
deploy.SetNamespace("default")
506+
deploy.Object["status"] = map[string]interface{}{
507+
"conditions": []interface{}{
508+
map[string]interface{}{"type": "Available", "status": "True"},
509+
},
510+
}
511+
512+
c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(deploy).Build()
513+
r := NewVLLMProviderReconciler(c, scheme)
514+
515+
md := &airunwayv1alpha1.ModelDeployment{}
516+
// Simulate a prior reconcile loop that left the deploying-phase message.
517+
md.Status.Message = "Deployments created, waiting for pods to be ready"
518+
519+
desired := &unstructured.Unstructured{}
520+
desired.SetGroupVersionKind(deploymentGVK)
521+
desired.SetName("test")
522+
desired.SetNamespace("default")
523+
524+
if err := r.syncStatus(context.Background(), md, desired); err != nil {
525+
t.Fatalf("unexpected error: %v", err)
526+
}
527+
if md.Status.Phase != airunwayv1alpha1.DeploymentPhaseRunning {
528+
t.Fatalf("expected Running phase, got %s", md.Status.Phase)
529+
}
530+
if strings.Contains(md.Status.Message, "waiting for pods") {
531+
t.Errorf("status message still claims waiting for pods while Running: %q", md.Status.Message)
532+
}
533+
if md.Status.Message == "" {
534+
t.Errorf("expected a non-empty status message in Running phase")
535+
}
536+
}

0 commit comments

Comments
 (0)