Skip to content

Commit 6a5e399

Browse files
committed
nimservice should be notready while nimcache is populating
Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
1 parent b630bd8 commit 6a5e399

3 files changed

Lines changed: 61 additions & 2 deletions

File tree

internal/conditions/conditions.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ const (
6161
ReasonStatefulSetFailed = "StatefulsetFailed"
6262
// ReasonSecretFailed indicates that the creation of secret has failed.
6363
ReasonSecretFailed = "SecretFailed"
64+
// ReasonNIMCacheFailed indicates that the NIMCache is in failed state.
65+
ReasonNIMCacheFailed = "NIMCacheFailed"
6466
)
6567

6668
// Updater is the condition updater.

internal/controller/platform/standalone/nimservice.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
networkingv1 "k8s.io/api/networking/v1"
3131
rbacv1 "k8s.io/api/rbac/v1"
3232
"k8s.io/apimachinery/pkg/api/errors"
33+
"k8s.io/apimachinery/pkg/api/meta"
3334
apiResource "k8s.io/apimachinery/pkg/api/resource"
3435
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3536
"k8s.io/apimachinery/pkg/runtime"
@@ -185,8 +186,35 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
185186

186187
deploymentParams.OrchestratorType = string(r.GetOrchestratorType())
187188

188-
// Select PVC for model store
189-
if nimService.GetNIMCacheName() != "" { // nolint:gocritic
189+
// Select PVC for model store
190+
nimCacheName := nimService.GetNIMCacheName()
191+
if len(nimCacheName) > 0 {
192+
nimCache := appsv1alpha1.NIMCache{}
193+
if err := r.Get(ctx, types.NamespacedName{Name: nimCacheName, Namespace: nimService.GetNamespace()}, &nimCache); err != nil {
194+
statusError := r.updater.SetConditionsFailed(ctx, nimService, conditions.ReasonNIMCacheFailed, err.Error())
195+
r.GetEventRecorder().Eventf(nimService, corev1.EventTypeWarning, conditions.Failed, err.Error())
196+
if statusError != nil {
197+
logger.Error(statusError, "failed to update status", "nimservice", nimService.Name)
198+
}
199+
return ctrl.Result{}, err
200+
}
201+
switch nimCache.Status.State {
202+
case appsv1alpha1.NimCacheStatusReady:
203+
logger.V(4).Info("NIMCache is ready", "nimcache", nimCacheName)
204+
case appsv1alpha1.NimCacheStatusFailed:
205+
msg := r.getNIMCacheFailedMessage(&nimCache)
206+
err = r.updater.SetConditionsFailed(ctx, nimService, conditions.ReasonNIMCacheFailed, msg)
207+
r.GetEventRecorder().Eventf(nimService, corev1.EventTypeWarning, conditions.Failed, msg)
208+
default:
209+
msg := fmt.Sprintf("NIMCache %s not ready", nimCacheName)
210+
err = r.updater.SetConditionsNotReady(ctx, nimService, conditions.NotReady, msg)
211+
r.GetEventRecorder().Eventf(nimService, corev1.EventTypeNormal, conditions.NotReady,
212+
"NIMService %s not ready yet, msg: %s", nimService.Name, msg)
213+
if err != nil {
214+
logger.Error(err, "failed to ", "nimservice", nimService.Name)
215+
}
216+
return ctrl.Result{RequeueAfter: 5 * time.Second}, err
217+
}
190218
// Fetch PVC for the associated NIMCache instance and mount it
191219
nimCachePVC, err := r.getNIMCachePVC(ctx, nimService)
192220
if err != nil {
@@ -724,3 +752,11 @@ func (r *NIMServiceReconciler) assignGPUResources(ctx context.Context, nimServic
724752

725753
return nil
726754
}
755+
756+
// getNIMCacheFailedMessage returns the message of the NIMCache status
// condition whose type is conditions.Failed. It returns an empty string
// when the NIMCache has no such condition.
func (r *NIMServiceReconciler) getNIMCacheFailedMessage(nimCache *appsv1alpha1.NIMCache) string {
757+
cond := meta.FindStatusCondition(nimCache.Status.Conditions, conditions.Failed)
758+
if cond != nil {
759+
return cond.Message
760+
}
761+
return ""
762+
}

internal/controller/platform/standalone/nimservice_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"path"
2828
"sort"
2929
"strings"
30+
"time"
3031

3132
"os"
3233

@@ -611,6 +612,26 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
611612

612613
})
613614

615+
// Verifies that reconciling a NIMService whose NIMCache reports NotReady
// returns no error, requests a requeue after 5 seconds, and leaves the
// NIMService in the NotReady state.
It("should be NotReady when nimcache is not ready", func() {
616+
// Put the NIMCache into the NotReady state before the NIMService is created.
nimCache.Status = appsv1alpha1.NIMCacheStatus{
617+
State: appsv1alpha1.NimCacheStatusNotReady,
618+
}
619+
Expect(client.Status().Update(context.TODO(), nimCache)).To(Succeed())
620+
err := client.Create(context.TODO(), nimService)
621+
Expect(err).NotTo(HaveOccurred())
622+
623+
// Reconcile is expected to succeed and ask to be requeued after 5 seconds.
result, err := reconciler.reconcileNIMService(context.TODO(), nimService)
624+
Expect(err).NotTo(HaveOccurred())
625+
Expect(result).To(Equal(ctrl.Result{RequeueAfter: 5 * time.Second}))
626+
627+
// Check that the NIMService is not ready.
628+
namespacedName := types.NamespacedName{Name: nimService.Name, Namespace: nimService.Namespace}
629+
obj := &appsv1alpha1.NIMService{}
630+
err = client.Get(context.TODO(), namespacedName, obj)
631+
Expect(err).NotTo(HaveOccurred())
632+
Expect(obj.Status.State).To(Equal(appsv1alpha1.NIMServiceStatusNotReady))
633+
})
634+
614635
Describe("isDeploymentReady for setting status on NIMService", func() {
615636
AfterEach(func() {
616637
// Clean up the Deployment instance

0 commit comments

Comments
 (0)