Skip to content

Commit 7350000

Browse files
authored
fix nimservice sync issues with deployment status (#715)
Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
1 parent 2d93d3e commit 7350000

File tree

2 files changed

+37
-28
lines changed

2 files changed

+37
-28
lines changed

internal/controller/platform/standalone/nimservice.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
545545
// Update NIMServiceStatus with resource claims.
546546
updateErr := r.updateResourceClaimStatus(ctx, nimService, namedDraResources)
547547
if updateErr != nil {
548-
logger.Info("WARN: Resource claim status update failed, will retry in 5 seconds", "error", updateErr.Error())
548+
logger.V(4).Info("WARN: Resource claim status update failed, will retry in 5 seconds", "error", updateErr.Error())
549549
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
550550
}
551551
}
@@ -557,12 +557,12 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
557557
// Update status as NotReady
558558
err = r.updater.SetConditionsNotReady(ctx, nimService, conditions.NotReady, msg)
559559
r.GetEventRecorder().Eventf(nimService, corev1.EventTypeNormal, conditions.NotReady,
560-
"NIMService %s not ready yet, msg: %s", nimService.Name, msg)
560+
"NIMService %s not ready, msg: %s", nimService.Name, msg)
561561
} else {
562562
// Update NIMServiceStatus with model config.
563563
updateErr := r.updateModelStatus(ctx, nimService)
564564
if updateErr != nil {
565-
logger.Info("WARN: Model status update failed, will retry in 5 seconds", "error", updateErr.Error())
565+
logger.V(4).Info("WARN: Model status update failed, will retry in 5 seconds", "error", updateErr.Error())
566566
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
567567
}
568568

@@ -913,16 +913,16 @@ func (r *NIMServiceReconciler) isDeploymentReady(ctx context.Context, namespaced
913913
return "", false, err
914914
}
915915

916+
if deployment.Spec.Replicas == nil || *deployment.Spec.Replicas == 0 {
917+
return fmt.Sprintf("deployment %q is scaled down", deployment.Name), false, nil
918+
}
916919
cond := getDeploymentCondition(deployment.Status, appsv1.DeploymentProgressing)
917920
if cond != nil && cond.Reason == "ProgressDeadlineExceeded" {
918921
return fmt.Sprintf("deployment %q exceeded its progress deadline", deployment.Name), false, nil
919922
}
920-
if deployment.Spec.Replicas != nil && deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas {
923+
if deployment.Status.UpdatedReplicas < *deployment.Spec.Replicas {
921924
return fmt.Sprintf("Waiting for deployment %q rollout to finish: %d out of %d new replicas have been updated...\n", deployment.Name, deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas), false, nil
922925
}
923-
if deployment.Status.Replicas > deployment.Status.UpdatedReplicas {
924-
return fmt.Sprintf("Waiting for deployment %q rollout to finish: %d old replicas are pending termination...\n", deployment.Name, deployment.Status.Replicas-deployment.Status.UpdatedReplicas), false, nil
925-
}
926926
if deployment.Status.AvailableReplicas < deployment.Status.UpdatedReplicas {
927927
return fmt.Sprintf("Waiting for deployment %q rollout to finish: %d of %d updated replicas are available...\n", deployment.Name, deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas), false, nil
928928
}

internal/controller/platform/standalone/nimservice_test.go

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -976,70 +976,73 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
976976
}
977977
_ = client.Delete(context.TODO(), deployment)
978978
})
979-
It("Deployment exceeded in its progress", func() {
979+
980+
It("Deployment is scaled down", func() {
980981
nimServiceKey := types.NamespacedName{Name: nimService.Name, Namespace: nimService.Namespace}
981982
deployment := &appsv1.Deployment{
982983
ObjectMeta: metav1.ObjectMeta{
983984
Name: "test-nimservice",
984985
Namespace: "default",
985986
},
986-
Status: appsv1.DeploymentStatus{
987-
Conditions: []appsv1.DeploymentCondition{
988-
{
989-
Type: appsv1.DeploymentProgressing,
990-
Reason: "ProgressDeadlineExceeded",
991-
},
992-
},
987+
Spec: appsv1.DeploymentSpec{
988+
Replicas: &[]int32{0}[0],
993989
},
994990
}
995991
err := client.Create(context.TODO(), deployment)
996992
Expect(err).NotTo(HaveOccurred())
997993
msg, ready, err := reconciler.isDeploymentReady(context.TODO(), &nimServiceKey)
998994
Expect(err).ToNot(HaveOccurred())
999-
Expect(ready).To(Equal(false))
1000-
Expect(msg).To(Equal(fmt.Sprintf("deployment %q exceeded its progress deadline", deployment.Name)))
995+
Expect(ready).To(BeFalse())
996+
Expect(msg).To(Equal(fmt.Sprintf("deployment %q is scaled down", deployment.Name)))
1001997
})
1002998

1003-
It("Waiting for deployment rollout to finish: new replicas are coming up", func() {
999+
It("Deployment exceeded in its progress", func() {
10041000
nimServiceKey := types.NamespacedName{Name: nimService.Name, Namespace: nimService.Namespace}
10051001
deployment := &appsv1.Deployment{
10061002
ObjectMeta: metav1.ObjectMeta{
10071003
Name: "test-nimservice",
10081004
Namespace: "default",
10091005
},
10101006
Spec: appsv1.DeploymentSpec{
1011-
Replicas: &[]int32{4}[0],
1007+
Replicas: &[]int32{1}[0],
10121008
},
10131009
Status: appsv1.DeploymentStatus{
1014-
UpdatedReplicas: 1,
1010+
Conditions: []appsv1.DeploymentCondition{
1011+
{
1012+
Type: appsv1.DeploymentProgressing,
1013+
Reason: "ProgressDeadlineExceeded",
1014+
},
1015+
},
10151016
},
10161017
}
10171018
err := client.Create(context.TODO(), deployment)
10181019
Expect(err).NotTo(HaveOccurred())
10191020
msg, ready, err := reconciler.isDeploymentReady(context.TODO(), &nimServiceKey)
10201021
Expect(err).ToNot(HaveOccurred())
1021-
Expect(ready).To(Equal(false))
1022-
Expect(msg).To(Equal(fmt.Sprintf("Waiting for deployment %q rollout to finish: %d out of %d new replicas have been updated...\n", deployment.Name, deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas)))
1022+
Expect(ready).To(BeFalse())
1023+
Expect(msg).To(Equal(fmt.Sprintf("deployment %q exceeded its progress deadline", deployment.Name)))
10231024
})
10241025

1025-
It("Waiting for deployment rollout to finish: old replicas are pending termination", func() {
1026+
It("Waiting for deployment rollout to finish: new replicas are coming up", func() {
10261027
nimServiceKey := types.NamespacedName{Name: nimService.Name, Namespace: nimService.Namespace}
10271028
deployment := &appsv1.Deployment{
10281029
ObjectMeta: metav1.ObjectMeta{
10291030
Name: "test-nimservice",
10301031
Namespace: "default",
10311032
},
1033+
Spec: appsv1.DeploymentSpec{
1034+
Replicas: &[]int32{4}[0],
1035+
},
10321036
Status: appsv1.DeploymentStatus{
10331037
UpdatedReplicas: 1,
1034-
Replicas: 4,
10351038
},
10361039
}
10371040
err := client.Create(context.TODO(), deployment)
10381041
Expect(err).NotTo(HaveOccurred())
10391042
msg, ready, err := reconciler.isDeploymentReady(context.TODO(), &nimServiceKey)
10401043
Expect(err).ToNot(HaveOccurred())
1041-
Expect(ready).To(Equal(false))
1042-
Expect(msg).To(Equal(fmt.Sprintf("Waiting for deployment %q rollout to finish: %d old replicas are pending termination...\n", deployment.Name, deployment.Status.Replicas-deployment.Status.UpdatedReplicas)))
1044+
Expect(ready).To(BeFalse())
1045+
Expect(msg).To(Equal(fmt.Sprintf("Waiting for deployment %q rollout to finish: %d out of %d new replicas have been updated...\n", deployment.Name, deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas)))
10431046
})
10441047

10451048
It("Waiting for deployment rollout to finish:", func() {
@@ -1049,6 +1052,9 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
10491052
Name: "test-nimservice",
10501053
Namespace: "default",
10511054
},
1055+
Spec: appsv1.DeploymentSpec{
1056+
Replicas: &[]int32{4}[0],
1057+
},
10521058
Status: appsv1.DeploymentStatus{
10531059
UpdatedReplicas: 4,
10541060
AvailableReplicas: 1,
@@ -1058,7 +1064,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
10581064
Expect(err).NotTo(HaveOccurred())
10591065
msg, ready, err := reconciler.isDeploymentReady(context.TODO(), &nimServiceKey)
10601066
Expect(err).ToNot(HaveOccurred())
1061-
Expect(ready).To(Equal(false))
1067+
Expect(ready).To(BeFalse())
10621068
Expect(msg).To(Equal(fmt.Sprintf("Waiting for deployment %q rollout to finish: %d of %d updated replicas are available...\n", deployment.Name, deployment.Status.AvailableReplicas, deployment.Status.UpdatedReplicas)))
10631069
})
10641070

@@ -1069,6 +1075,9 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
10691075
Name: "test-nimservice",
10701076
Namespace: "default",
10711077
},
1078+
Spec: appsv1.DeploymentSpec{
1079+
Replicas: &[]int32{4}[0],
1080+
},
10721081
Status: appsv1.DeploymentStatus{
10731082
UpdatedReplicas: 4,
10741083
AvailableReplicas: 4,
@@ -1078,7 +1087,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
10781087
Expect(err).NotTo(HaveOccurred())
10791088
msg, ready, err := reconciler.isDeploymentReady(context.TODO(), &nimServiceKey)
10801089
Expect(err).ToNot(HaveOccurred())
1081-
Expect(ready).To(Equal(true))
1090+
Expect(ready).To(BeTrue())
10821091
Expect(msg).To(Equal(fmt.Sprintf("deployment %q successfully rolled out\n", deployment.Name)))
10831092
})
10841093
})

0 commit comments

Comments
 (0)