From cec23baaa425ece94eb54bd2f34f4aabeb8c809c Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Fri, 21 Feb 2025 08:59:55 -0800 Subject: [PATCH 1/9] add model info to nimservice status Signed-off-by: Varun Ramachandra Sekar --- api/apps/v1alpha1/nimservice_types.go | 14 ++++++ api/apps/v1alpha1/zz_generated.deepcopy.go | 16 +++++++ .../apps.nvidia.com_nimservices.yaml | 11 +++++ .../bases/apps.nvidia.com_nimservices.yaml | 11 +++++ .../crds/apps.nvidia.com_nimservices.yaml | 11 +++++ .../platform/standalone/nimservice.go | 43 +++++++++++++++++++ .../platform/standalone/standalone.go | 2 +- 7 files changed, 107 insertions(+), 1 deletion(-) diff --git a/api/apps/v1alpha1/nimservice_types.go b/api/apps/v1alpha1/nimservice_types.go index 3395b8546..08b9031e1 100644 --- a/api/apps/v1alpha1/nimservice_types.go +++ b/api/apps/v1alpha1/nimservice_types.go @@ -87,11 +87,25 @@ type NIMCacheVolSpec struct { Profile string `json:"profile,omitempty"` } +type ModelEngine string + +const ( + ModelEngineNIM ModelEngine = "nim" +) + // NIMServiceStatus defines the observed state of NIMService type NIMServiceStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty"` AvailableReplicas int32 `json:"availableReplicas,omitempty"` State string `json:"state,omitempty"` + Model ModelStatus `json:"model,omitempty"` +} + +// ModelStatus defines the configuration of the NIMService model. +type ModelStatus struct { + Name string `json:"name,omitempty"` + EndpointAddress string `json:"endpointAddress,omitempty"` + Engine ModelEngine `json:"engine,omitempty"` } // +genclient diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index db35cd7c2..0b123e728 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -406,6 +406,21 @@ func (in *ModelSpec) DeepCopy() *ModelSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelStatus) DeepCopyInto(out *ModelStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelStatus. +func (in *ModelStatus) DeepCopy() *ModelStatus { + if in == nil { + return nil + } + out := new(ModelStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NGCSource) DeepCopyInto(out *NGCSource) { *out = *in @@ -915,6 +930,7 @@ func (in *NIMServiceStatus) DeepCopyInto(out *NIMServiceStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + out.Model = in.Model } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMServiceStatus. diff --git a/bundle/manifests/apps.nvidia.com_nimservices.yaml b/bundle/manifests/apps.nvidia.com_nimservices.yaml index ab3be5c15..1cccfe73b 100644 --- a/bundle/manifests/apps.nvidia.com_nimservices.yaml +++ b/bundle/manifests/apps.nvidia.com_nimservices.yaml @@ -2284,6 +2284,17 @@ spec: - type type: object type: array + model: + description: ModelStatus defines the configuration of the NIMService + model. + properties: + endpointAddress: + type: string + engine: + type: string + name: + type: string + type: object state: type: string type: object diff --git a/config/crd/bases/apps.nvidia.com_nimservices.yaml b/config/crd/bases/apps.nvidia.com_nimservices.yaml index ab3be5c15..1cccfe73b 100644 --- a/config/crd/bases/apps.nvidia.com_nimservices.yaml +++ b/config/crd/bases/apps.nvidia.com_nimservices.yaml @@ -2284,6 +2284,17 @@ spec: - type type: object type: array + model: + description: ModelStatus defines the configuration of the NIMService + model. + properties: + endpointAddress: + type: string + engine: + type: string + name: + type: string + type: object state: type: string type: object diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml index ab3be5c15..1cccfe73b 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml @@ -2284,6 +2284,17 @@ spec: - type type: object type: array + model: + description: ModelStatus defines the configuration of the NIMService + model. + properties: + endpointAddress: + type: string + engine: + type: string + name: + type: string + type: object state: type: string type: object diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index 9510b9dee..7f36eda30 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -20,6 +20,7 @@ import ( "context" "fmt" + "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" @@ -261,6 +262,12 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi r.GetEventRecorder().Eventf(nimService, corev1.EventTypeNormal, conditions.NotReady, "NIMService %s not ready yet, msg: %s", nimService.Name, msg) } else { + // Update NIMServiceStatus with model config. + err = r.updateModelStatus(ctx, nimService) + if err != nil { + return ctrl.Result{}, err + } + // Update status as ready err = r.updater.SetConditionsReady(ctx, nimService, conditions.Ready, msg) r.GetEventRecorder().Eventf(nimService, corev1.EventTypeNormal, conditions.Ready, @@ -275,6 +282,42 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi return ctrl.Result{}, nil } +func (r *NIMServiceReconciler) updateModelStatus(ctx context.Context, nimService *v1alpha1.NIMService) error { + modelName, err := r.getNIMModelName(ctx, nimService) + if err != nil { + return err + } + nimService.Status.Model = v1alpha1.ModelStatus{ + Name: modelName, + EndpointAddress: fmt.Sprintf("%s.%s.svc.cluster.local:%d", nimService.GetName(), nimService.GetNamespace(), nimService.GetServicePort()), + Engine: v1alpha1.ModelEngineNIM, + } + + return nil +} + +func (r *NIMServiceReconciler) getNIMModelName(ctx context.Context, nimService *appsv1alpha1.NIMService) (string, error) { + logger := log.FromContext(ctx) + + if nimService.GetNIMCacheName() == "" { + // NIM cache is not used + return "", nil + } + + // Lookup NIMCache instance in the same namespace as the NIMService instance + nimCache := &appsv1alpha1.NIMCache{} + if err := r.Get(ctx, types.NamespacedName{Name: nimService.GetNIMCacheName(), Namespace: nimService.Namespace}, nimCache); err != nil { + logger.Error(err, "unable to fetch nimcache", "nimcache", nimService.GetNIMCacheName(), "nimservice", nimService.Name) + return "", err + } + + if nimCache.Spec.Source.DataStore != nil && nimCache.Spec.Source.DataStore.ModelName != nil { + return *nimCache.Spec.Source.DataStore.ModelName, nil + } + + return "", nil +} + func (r *NIMServiceReconciler) renderAndSyncResource(ctx context.Context, nimService *appsv1alpha1.NIMService, renderer *render.Renderer, obj client.Object, renderFunc func() (client.Object, error), conditionType string, reason string) error { logger := log.FromContext(ctx) diff --git a/internal/controller/platform/standalone/standalone.go b/internal/controller/platform/standalone/standalone.go index b71483865..c709ed053 100644 --- a/internal/controller/platform/standalone/standalone.go +++ b/internal/controller/platform/standalone/standalone.go @@ -111,7 +111,7 @@ func (s *Standalone) Sync(ctx context.Context, r shared.Reconciler, resource cli if nimService, ok := resource.(*appsv1alpha1.NIMService); ok { reconciler := NewNIMServiceReconciler(r) reconciler.renderer = render.NewRenderer(ManifestsDir) - logger.Info("Reconciling NIMService instance") + logger.Info("Reconciling NIMService instance", "nimservice", nimService.GetName()) result, err := reconciler.reconcileNIMService(ctx, nimService) if err != nil { r.GetEventRecorder().Eventf(nimService, corev1.EventTypeWarning, "ReconcileFailed", From db7b2bd57addd7a34a5396fdcb9b8377b065aa3e Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Mon, 24 Feb 2025 21:52:51 -0800 Subject: [PATCH 2/9] use clusterip for model endpoint Signed-off-by: Varun Ramachandra Sekar --- .../platform/standalone/nimservice.go | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index 7f36eda30..17c63b5ea 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -287,9 +287,13 @@ func (r *NIMServiceReconciler) updateModelStatus(ctx context.Context, nimService if err != nil { return err } + endpoint, err := r.getNIMModelEndpoint(ctx, nimService) + if err != nil { + return err + } nimService.Status.Model = v1alpha1.ModelStatus{ Name: modelName, - EndpointAddress: fmt.Sprintf("%s.%s.svc.cluster.local:%d", nimService.GetName(), nimService.GetNamespace(), nimService.GetServicePort()), + EndpointAddress: endpoint, Engine: v1alpha1.ModelEngineNIM, } @@ -306,8 +310,8 @@ func (r *NIMServiceReconciler) getNIMModelName(ctx context.Context, nimService * // Lookup NIMCache instance in the same namespace as the NIMService instance nimCache := &appsv1alpha1.NIMCache{} - if err := r.Get(ctx, types.NamespacedName{Name: nimService.GetNIMCacheName(), Namespace: nimService.Namespace}, nimCache); err != nil { - logger.Error(err, "unable to fetch nimcache", "nimcache", nimService.GetNIMCacheName(), "nimservice", nimService.Name) + if err := r.Get(ctx, types.NamespacedName{Name: nimService.GetNIMCacheName(), Namespace: nimService.GetNamespace()}, nimCache); err != nil { + logger.Error(err, "unable to fetch nimcache", "nimcache", nimService.GetNIMCacheName(), "nimservice", nimService.GetName()) return "", err } @@ -318,6 +322,19 @@ func (r *NIMServiceReconciler) getNIMModelName(ctx context.Context, nimService * return "", nil } +func (r *NIMServiceReconciler) getNIMModelEndpoint(ctx context.Context, nimService *appsv1alpha1.NIMService) (string, error) { + logger := log.FromContext(ctx) + + // Lookup NIMCache instance in the same namespace as the NIMService instance + svc := &corev1.Service{} + if err := r.Get(ctx, types.NamespacedName{Name: nimService.GetName(), Namespace: nimService.GetNamespace()}, svc); err != nil { + logger.Error(err, "unable to fetch k8s service", "nimservice", nimService.GetName()) + return "", err + } + + return fmt.Sprintf("%s:%v", svc.Spec.ClusterIP, nimService.GetServicePort()), nil +} + func (r *NIMServiceReconciler) renderAndSyncResource(ctx context.Context, nimService *appsv1alpha1.NIMService, renderer *render.Renderer, obj client.Object, renderFunc func() (client.Object, error), conditionType string, reason string) error { logger := log.FromContext(ctx) From 45cc4826f53c9871657ff9cad3b45a262501011d Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Mon, 24 Feb 2025 22:23:55 -0800 Subject: [PATCH 3/9] fetch model info from nimsrevice Signed-off-by: Varun Ramachandra Sekar --- .../platform/standalone/nimservice.go | 52 ++++++++++++++----- internal/nimmodels/models.go | 43 +++++++++++++++ 2 files changed, 82 insertions(+), 13 deletions(-) create mode 100644 internal/nimmodels/models.go diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index 17c63b5ea..f3f695e90 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -18,12 +18,17 @@ package standalone import ( "context" + "encoding/json" "fmt" + "io" + "net/http" + "time" "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" + "github.com/NVIDIA/k8s-nim-operator/internal/nimmodels" "github.com/NVIDIA/k8s-nim-operator/internal/render" rendertypes "github.com/NVIDIA/k8s-nim-operator/internal/render/types" "github.com/NVIDIA/k8s-nim-operator/internal/shared" @@ -283,11 +288,11 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi } func (r *NIMServiceReconciler) updateModelStatus(ctx context.Context, nimService *v1alpha1.NIMService) error { - modelName, err := r.getNIMModelName(ctx, nimService) + endpoint, err := r.getNIMModelEndpoint(ctx, nimService) if err != nil { return err } - endpoint, err := r.getNIMModelEndpoint(ctx, nimService) + modelName, err := r.getNIMModelName(ctx, endpoint) if err != nil { return err } @@ -300,26 +305,47 @@ func (r *NIMServiceReconciler) updateModelStatus(ctx context.Context, nimService return nil } -func (r *NIMServiceReconciler) getNIMModelName(ctx context.Context, nimService *appsv1alpha1.NIMService) (string, error) { +func (r *NIMServiceReconciler) getNIMModelName(ctx context.Context, nimServiceEndpoint string) (string, error) { logger := log.FromContext(ctx) - if nimService.GetNIMCacheName() == "" { - // NIM cache is not used - return "", nil + httpClient := http.Client{ + Timeout: 30 * time.Second, } + modelsURL := nimmodels.GetV1ModelsURL(nimServiceEndpoint) + modelsReq, err := http.NewRequest(http.MethodGet, modelsURL, nil) + if err != nil { + logger.Error(err, "failed to prepare request for models endpoint", "endpoint", nimServiceEndpoint) + return "", err + } + modelsReq.Header.Set("Content-Type", "application/json") - // Lookup NIMCache instance in the same namespace as the NIMService instance - nimCache := &appsv1alpha1.NIMCache{} - if err := r.Get(ctx, types.NamespacedName{Name: nimService.GetNIMCacheName(), Namespace: nimService.GetNamespace()}, nimCache); err != nil { - logger.Error(err, "unable to fetch nimcache", "nimcache", nimService.GetNIMCacheName(), "nimservice", nimService.GetName()) + modelsResp, err := httpClient.Do(modelsReq) + if err != nil { + logger.Error(err, "failed to make request for models endpoint", "url", modelsURL) return "", err } + modelsData, err := io.ReadAll(modelsResp.Body) + logger.Info("got models response", "data", string(modelsData)) - if nimCache.Spec.Source.DataStore != nil && nimCache.Spec.Source.DataStore.ModelName != nil { - return *nimCache.Spec.Source.DataStore.ModelName, nil + var modelsList nimmodels.ModelsV1List + err = json.Unmarshal(modelsData, &modelsList) + if err != nil { + logger.Error(err, "failed to unmarshal models response", "url", modelsURL) + return "", err + } + if modelsList.Object == nimmodels.ObjectTypeList { + for _, model := range modelsList.Data { + if model.Object != nimmodels.ObjectTypeModel { + continue + } + if model.Parent == nil { + continue + } + return model.Id, nil + } } - return "", nil + return "", fmt.Errorf("failed to detect model name from nimservice endpoint '%s'", nimServiceEndpoint) } func (r *NIMServiceReconciler) getNIMModelEndpoint(ctx context.Context, nimService *appsv1alpha1.NIMService) (string, error) { diff --git a/internal/nimmodels/models.go b/internal/nimmodels/models.go new file mode 100644 index 000000000..112fc5fb4 --- /dev/null +++ b/internal/nimmodels/models.go @@ -0,0 +1,43 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nimmodels + +type ObjectType string + +const ( + ObjectTypeList ObjectType = "list" + ObjectTypeModel ObjectType = "model" +) + +const ( + modelsV1URI = "/v1/models" +) + +type ModelsV1Info struct { + Id string `json:"id"` + Object ObjectType `json:"object"` + Parent *string `json:"parent,omitempty"` + Root *string `json:"root,omitempty"` +} +type ModelsV1List struct { + Object ObjectType `json:"object"` + Data []ModelsV1Info `json:"data,omitempty"` +} + +func GetV1ModelsURL(nimServiceEndpoint string) string { + return nimServiceEndpoint + modelsV1URI +} From 8249b9c6f6d08fd4343325830e14c81c27d6d578 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Tue, 25 Feb 2025 00:49:25 -0800 Subject: [PATCH 4/9] move models requests to separate package Signed-off-by: Varun Ramachandra Sekar --- api/apps/v1alpha1/nimservice_types.go | 2 +- api/apps/v1alpha1/zz_generated.deepcopy.go | 6 ++- .../platform/standalone/nimservice.go | 33 +------------ internal/nimmodels/models.go | 47 ++++++++++++++++++- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/api/apps/v1alpha1/nimservice_types.go b/api/apps/v1alpha1/nimservice_types.go index 08b9031e1..82a2a250c 100644 --- a/api/apps/v1alpha1/nimservice_types.go +++ b/api/apps/v1alpha1/nimservice_types.go @@ -98,7 +98,7 @@ type NIMServiceStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty"` AvailableReplicas int32 `json:"availableReplicas,omitempty"` State string `json:"state,omitempty"` - Model ModelStatus `json:"model,omitempty"` + Model *ModelStatus `json:"model,omitempty"` } // ModelStatus defines the configuration of the NIMService model. diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index 0b123e728..8c8d63895 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -930,7 +930,11 @@ func (in *NIMServiceStatus) DeepCopyInto(out *NIMServiceStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - out.Model = in.Model + if in.Model != nil { + in, out := &in.Model, &out.Model + *out = new(ModelStatus) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMServiceStatus. diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index f3f695e90..9232d1740 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -18,11 +18,7 @@ package standalone import ( "context" - "encoding/json" "fmt" - "io" - "net/http" - "time" "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" @@ -296,7 +292,7 @@ func (r *NIMServiceReconciler) updateModelStatus(ctx context.Context, nimService if err != nil { return err } - nimService.Status.Model = v1alpha1.ModelStatus{ + nimService.Status.Model = &v1alpha1.ModelStatus{ Name: modelName, EndpointAddress: endpoint, Engine: v1alpha1.ModelEngineNIM, @@ -306,41 +302,16 @@ func (r *NIMServiceReconciler) updateModelStatus(ctx context.Context, nimService } func (r *NIMServiceReconciler) getNIMModelName(ctx context.Context, nimServiceEndpoint string) (string, error) { - logger := log.FromContext(ctx) - - httpClient := http.Client{ - Timeout: 30 * time.Second, - } - modelsURL := nimmodels.GetV1ModelsURL(nimServiceEndpoint) - modelsReq, err := http.NewRequest(http.MethodGet, modelsURL, nil) - if err != nil { - logger.Error(err, "failed to prepare request for models endpoint", "endpoint", nimServiceEndpoint) - return "", err - } - modelsReq.Header.Set("Content-Type", "application/json") - - modelsResp, err := httpClient.Do(modelsReq) + modelsList, err := nimmodels.ListModelsV1(ctx, nimServiceEndpoint) if err != nil { - logger.Error(err, "failed to make request for models endpoint", "url", modelsURL) return "", err } - modelsData, err := io.ReadAll(modelsResp.Body) - logger.Info("got models response", "data", string(modelsData)) - var modelsList nimmodels.ModelsV1List - err = json.Unmarshal(modelsData, &modelsList) - if err != nil { - logger.Error(err, "failed to unmarshal models response", "url", modelsURL) - return "", err - } if modelsList.Object == nimmodels.ObjectTypeList { for _, model := range modelsList.Data { if model.Object != nimmodels.ObjectTypeModel { continue } - if model.Parent == nil { - continue - } return model.Id, nil } } diff --git a/internal/nimmodels/models.go b/internal/nimmodels/models.go index 112fc5fb4..5fd75b617 100644 --- a/internal/nimmodels/models.go +++ b/internal/nimmodels/models.go @@ -16,6 +16,17 @@ limitations under the License. package nimmodels +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "sigs.k8s.io/controller-runtime/pkg/log" +) + type ObjectType string const ( @@ -38,6 +49,38 @@ type ModelsV1List struct { Data []ModelsV1Info `json:"data,omitempty"` } -func GetV1ModelsURL(nimServiceEndpoint string) string { - return nimServiceEndpoint + modelsV1URI +func getModelsV1URL(nimServiceEndpoint string) string { + return fmt.Sprintf("http://%s%s", nimServiceEndpoint, modelsV1URI) +} + +func ListModelsV1(ctx context.Context, nimServiceEndpoint string) (*ModelsV1List, error) { + logger := log.FromContext(ctx) + + httpClient := http.Client{ + Timeout: 30 * time.Second, + } + modelsURL := getModelsV1URL(nimServiceEndpoint) + modelsReq, err := http.NewRequest(http.MethodGet, modelsURL, nil) + if err != nil { + logger.Error(err, "failed to prepare request for models endpoint", "url", modelsURL) + return nil, err + } + + modelsResp, err := httpClient.Do(modelsReq) + if err != nil { + logger.Error(err, "failed to make request for models endpoint", "url", modelsURL) + return nil, err + } + defer modelsResp.Body.Close() + + modelsData, err := io.ReadAll(modelsResp.Body) + + var modelsList ModelsV1List + err = json.Unmarshal(modelsData, &modelsList) + if err != nil { + logger.Error(err, "failed to unmarshal models response", "url", modelsURL) + return nil, err + } + + return &modelsList, nil } From dfa4a2be729b03c618c4629c45476213695b9b37 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Tue, 25 Feb 2025 00:49:33 -0800 Subject: [PATCH 5/9] unit tests for nimservice status api Signed-off-by: Varun Ramachandra Sekar --- .../platform/standalone/nimservice_test.go | 181 +++++++++++++++++- 1 file changed, 174 insertions(+), 7 deletions(-) diff --git a/internal/controller/platform/standalone/nimservice_test.go b/internal/controller/platform/standalone/nimservice_test.go index 6fba8a02b..1cc42f240 100644 --- a/internal/controller/platform/standalone/nimservice_test.go +++ b/internal/controller/platform/standalone/nimservice_test.go @@ -21,6 +21,9 @@ import ( "context" "fmt" "log" + "net/http" + "net/http/httptest" + "net/url" "path" "sort" "strings" @@ -71,15 +74,46 @@ func sortVolumes(volumes []corev1.Volume) { }) } +// Custom transport that redirects requests to a specific host +type mockTransport struct { + targetHost string + testServer *httptest.Server + originalTransport http.RoundTripper +} + +func (m *mockTransport) RoundTrip(req *http.Request) (*http.Response, error) { + // Check if this request is going to our target IP + if req.URL.Host == m.targetHost { + // Create a new URL pointing to our test server + testURL, _ := url.Parse(m.testServer.URL) + testURL.Path = req.URL.Path + testURL.RawQuery = req.URL.RawQuery + + // Create a new request to our test server + newReq := req.Clone(req.Context()) + newReq.URL = testURL + newReq.Host = m.targetHost // Preserve the original Host header + + // Send the request to our test server + return http.DefaultClient.Do(newReq) + } + + // For all other requests, use the original transport + return m.originalTransport.RoundTrip(req) +} + var _ = Describe("NIMServiceReconciler for a standalone platform", func() { var ( - client client.Client - reconciler *NIMServiceReconciler - scheme *runtime.Scheme - nimService *appsv1alpha1.NIMService - nimCache *appsv1alpha1.NIMCache - volumeMounts []corev1.VolumeMount - volumes []corev1.Volume + client client.Client + reconciler *NIMServiceReconciler + scheme *runtime.Scheme + nimService *appsv1alpha1.NIMService + nimCache *appsv1alpha1.NIMCache + volumeMounts []corev1.VolumeMount + volumes []corev1.Volume + testServerHandler http.HandlerFunc + testServer *httptest.Server + originalTransport = http.DefaultTransport ) BeforeEach(func() { scheme = runtime.NewScheme() @@ -352,9 +386,26 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { log.SetOutput(os.Stderr) }() + // Start mock test server to serve nimservice endpoint. + testServerHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) + }) + testServer = httptest.NewServer(testServerHandler) + /*testServer = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) + }))*/ + http.DefaultTransport = &mockTransport{ + targetHost: ":8123", + testServer: testServer, + originalTransport: originalTransport, + } }) AfterEach(func() { + defer func() { http.DefaultTransport = originalTransport }() + defer testServer.Close() // Clean up the NIMService instance nimService := &appsv1alpha1.NIMService{ ObjectMeta: metav1.ObjectMeta{ @@ -658,6 +709,122 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { }) }) + Describe("update model status on NIMService", func() { + AfterEach(func() { + // Clean up the Service instance + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimservice", + Namespace: "default", + }, + } + _ = client.Delete(context.TODO(), svc) + }) + + It("should fail when NIMService is unreachable", func() { + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimservice", + Namespace: "default", + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + ClusterIP: "bad.host", // not intercepted by testServer. + Ports: []corev1.ServicePort{ + { + Port: 8123, + Name: "service-port", + }, + }, + }, + } + _ = client.Create(context.TODO(), svc) + err := reconciler.updateModelStatus(context.Background(), nimService) + Expect(err).To(HaveOccurred()) + Expect(nimService.Status.Model).To(BeNil()) + }) + + It("should fail when models response is unmarshallable", func() { + testServer.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"value": "invalid response"}`)) + }) + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimservice", + Namespace: "default", + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + Ports: []corev1.ServicePort{ + { + Port: 8123, + Name: "service-port", + }, + }, + }, + } + _ = client.Create(context.TODO(), svc) + err := reconciler.updateModelStatus(context.Background(), nimService) + Expect(err).To(HaveOccurred()) + Expect(nimService.Status.Model).To(BeNil()) + }) + + It("should fail when model name cannot be inferred", func() { + testServer.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + // Set dummy object type for model. + w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "dummy", "root": "dummy-model", "parent": null}]}`)) + }) + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimservice", + Namespace: "default", + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + Ports: []corev1.ServicePort{ + { + Port: 8123, + Name: "service-port", + }, + }, + }, + } + _ = client.Create(context.TODO(), svc) + err := reconciler.updateModelStatus(context.Background(), nimService) + Expect(err).To(HaveOccurred()) + Expect(err).To(MatchError("failed to detect model name from nimservice endpoint ':8123'")) + Expect(nimService.Status.Model).To(BeNil()) + }) + + It("should set model status on NIMService", func() { + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimservice", + Namespace: "default", + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + Ports: []corev1.ServicePort{ + { + Port: 8123, + Name: "service-port", + }, + }, + }, + } + _ = client.Create(context.TODO(), svc) + err := reconciler.updateModelStatus(context.Background(), nimService) + Expect(err).ToNot(HaveOccurred()) + modelStatus := nimService.Status.Model + Expect(modelStatus).ToNot(BeNil()) + Expect(modelStatus.EndpointAddress).To(Equal(":8123")) + Expect(modelStatus.Name).To(Equal("dummy-model")) + Expect(modelStatus.Engine).To(Equal(appsv1alpha1.ModelEngineNIM)) + }) + }) + Describe("getNIMCacheProfile", func() { It("should return nil when NIMCache is not used", func() { nimService.Spec.Storage.NIMCache.Name = "" From 4957a7d6b4d56f12363ba67f1411367c235b4aa9 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Tue, 25 Feb 2025 14:23:06 -0800 Subject: [PATCH 6/9] reflect both cluster + external endpoint for nimservice Signed-off-by: Varun Ramachandra Sekar --- api/apps/v1alpha1/nimservice_types.go | 7 +++--- .../apps.nvidia.com_nimservices.yaml | 9 ++++++- .../bases/apps.nvidia.com_nimservices.yaml | 9 ++++++- .../crds/apps.nvidia.com_nimservices.yaml | 9 ++++++- .../platform/standalone/nimservice.go | 22 ++++++++++------ .../platform/standalone/nimservice_test.go | 25 +++++++++---------- internal/utils/utils.go | 7 ++++++ 7 files changed, 61 insertions(+), 27 deletions(-) diff --git a/api/apps/v1alpha1/nimservice_types.go b/api/apps/v1alpha1/nimservice_types.go index 82a2a250c..23cc47114 100644 --- a/api/apps/v1alpha1/nimservice_types.go +++ b/api/apps/v1alpha1/nimservice_types.go @@ -103,9 +103,10 @@ type NIMServiceStatus struct { // ModelStatus defines the configuration of the NIMService model. type ModelStatus struct { - Name string `json:"name,omitempty"` - EndpointAddress string `json:"endpointAddress,omitempty"` - Engine ModelEngine `json:"engine,omitempty"` + Name string `json:"name"` + ClusterEndpoint string `json:"clusterEndpoint"` + ExternalEndpoint string `json:"externalEndpoint"` + Engine ModelEngine `json:"engine"` } // +genclient diff --git a/bundle/manifests/apps.nvidia.com_nimservices.yaml b/bundle/manifests/apps.nvidia.com_nimservices.yaml index 1cccfe73b..4642eb772 100644 --- a/bundle/manifests/apps.nvidia.com_nimservices.yaml +++ b/bundle/manifests/apps.nvidia.com_nimservices.yaml @@ -2288,12 +2288,19 @@ spec: description: ModelStatus defines the configuration of the NIMService model. properties: - endpointAddress: + clusterEndpoint: type: string engine: type: string + externalEndpoint: + type: string name: type: string + required: + - clusterEndpoint + - engine + - externalEndpoint + - name type: object state: type: string diff --git a/config/crd/bases/apps.nvidia.com_nimservices.yaml b/config/crd/bases/apps.nvidia.com_nimservices.yaml index 1cccfe73b..4642eb772 100644 --- a/config/crd/bases/apps.nvidia.com_nimservices.yaml +++ b/config/crd/bases/apps.nvidia.com_nimservices.yaml @@ -2288,12 +2288,19 @@ spec: description: ModelStatus defines the configuration of the NIMService model. properties: - endpointAddress: + clusterEndpoint: type: string engine: type: string + externalEndpoint: + type: string name: type: string + required: + - clusterEndpoint + - engine + - externalEndpoint + - name type: object state: type: string diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml index 1cccfe73b..4642eb772 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml @@ -2288,12 +2288,19 @@ spec: description: ModelStatus defines the configuration of the NIMService model. properties: - endpointAddress: + clusterEndpoint: type: string engine: type: string + externalEndpoint: + type: string name: type: string + required: + - clusterEndpoint + - engine + - externalEndpoint + - name type: object state: type: string diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index 9232d1740..f1be2ebae 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -284,18 +284,19 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi } func (r *NIMServiceReconciler) updateModelStatus(ctx context.Context, nimService *v1alpha1.NIMService) error { - endpoint, err := r.getNIMModelEndpoint(ctx, nimService) + clusterEndpoint, externalEndpoint, err := r.getNIMModelEndpoints(ctx, nimService) if err != nil { return err } - modelName, err := r.getNIMModelName(ctx, endpoint) + modelName, err := r.getNIMModelName(ctx, clusterEndpoint) if err != nil { return err } nimService.Status.Model = &v1alpha1.ModelStatus{ - Name: modelName, - EndpointAddress: endpoint, - Engine: v1alpha1.ModelEngineNIM, + Name: modelName, + ClusterEndpoint: clusterEndpoint, + ExternalEndpoint: externalEndpoint, + Engine: v1alpha1.ModelEngineNIM, } return nil @@ -319,17 +320,22 @@ func (r *NIMServiceReconciler) getNIMModelName(ctx context.Context, nimServiceEn return "", fmt.Errorf("failed to detect model name from nimservice endpoint '%s'", nimServiceEndpoint) } -func (r *NIMServiceReconciler) getNIMModelEndpoint(ctx context.Context, nimService *appsv1alpha1.NIMService) (string, error) { +func (r *NIMServiceReconciler) getNIMModelEndpoints(ctx context.Context, nimService *appsv1alpha1.NIMService) (string, string, error) { logger := log.FromContext(ctx) // Lookup NIMCache instance in the same namespace as the NIMService instance svc := &corev1.Service{} if err := r.Get(ctx, types.NamespacedName{Name: nimService.GetName(), Namespace: nimService.GetNamespace()}, svc); err != nil { logger.Error(err, "unable to fetch k8s service", "nimservice", nimService.GetName()) - return "", err + return "", "", err } - return fmt.Sprintf("%s:%v", svc.Spec.ClusterIP, nimService.GetServicePort()), nil + var externalIP string + if svc.Spec.Type == corev1.ServiceTypeLoadBalancer { + externalIP = svc.Spec.LoadBalancerIP + } + port := nimService.GetServicePort() + return utils.FormatEndpoint(svc.Spec.ClusterIP, port), utils.FormatEndpoint(externalIP, port), nil } func (r *NIMServiceReconciler) renderAndSyncResource(ctx context.Context, nimService *appsv1alpha1.NIMService, renderer *render.Renderer, obj client.Object, renderFunc func() (client.Object, error), conditionType string, reason string) error { diff --git a/internal/controller/platform/standalone/nimservice_test.go b/internal/controller/platform/standalone/nimservice_test.go index 1cc42f240..5430e6a70 100644 --- a/internal/controller/platform/standalone/nimservice_test.go +++ b/internal/controller/platform/standalone/nimservice_test.go @@ -83,7 +83,8 @@ type mockTransport struct { func (m *mockTransport) RoundTrip(req *http.Request) (*http.Response, error) { // Check if this request is going to our target IP - if req.URL.Host == m.targetHost { + hostname := strings.Split(req.URL.Host, ":")[0] + if hostname == "" || req.URL.Host == m.targetHost { // Create a new URL pointing to our test server testURL, _ := url.Parse(m.testServer.URL) testURL.Path = req.URL.Path @@ -92,7 +93,7 @@ func (m *mockTransport) RoundTrip(req *http.Request) (*http.Response, error) { // Create a new request to our test server newReq := req.Clone(req.Context()) newReq.URL = testURL - newReq.Host = m.targetHost // Preserve the original Host header + newReq.Host = req.URL.Host // Preserve the original Host header // Send the request to our test server return http.DefaultClient.Do(newReq) @@ -392,12 +393,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) }) testServer = httptest.NewServer(testServerHandler) - /*testServer = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(http.StatusOK) - w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) - }))*/ http.DefaultTransport = &mockTransport{ - targetHost: ":8123", + targetHost: "127.0.0.1:8123", testServer: testServer, originalTransport: originalTransport, } @@ -591,7 +588,6 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { }) Describe("isDeploymentReady for setting status on NIMService", func() { - AfterEach(func() { // Clean up the Deployment instance deployment := &appsv1.Deployment{ @@ -755,7 +751,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { Namespace: "default", }, Spec: corev1.ServiceSpec{ - Type: corev1.ServiceTypeClusterIP, + Type: corev1.ServiceTypeClusterIP, + ClusterIP: "127.0.0.1", Ports: []corev1.ServicePort{ { Port: 8123, @@ -782,7 +779,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { Namespace: "default", }, Spec: corev1.ServiceSpec{ - Type: corev1.ServiceTypeClusterIP, + Type: corev1.ServiceTypeClusterIP, + ClusterIP: "127.0.0.1", Ports: []corev1.ServicePort{ { Port: 8123, @@ -794,7 +792,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { _ = client.Create(context.TODO(), svc) err := reconciler.updateModelStatus(context.Background(), nimService) Expect(err).To(HaveOccurred()) - Expect(err).To(MatchError("failed to detect model name from nimservice endpoint ':8123'")) + Expect(err).To(MatchError("failed to detect model name from nimservice endpoint '127.0.0.1:8123'")) Expect(nimService.Status.Model).To(BeNil()) }) @@ -805,7 +803,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { Namespace: "default", }, Spec: corev1.ServiceSpec{ - Type: corev1.ServiceTypeClusterIP, + Type: corev1.ServiceTypeClusterIP, + ClusterIP: "127.0.0.1", Ports: []corev1.ServicePort{ { Port: 8123, @@ -819,7 +818,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { Expect(err).ToNot(HaveOccurred()) modelStatus := nimService.Status.Model Expect(modelStatus).ToNot(BeNil()) - Expect(modelStatus.EndpointAddress).To(Equal(":8123")) + Expect(modelStatus.ClusterEndpoint).To(Equal("127.0.0.1:8123")) Expect(modelStatus.Name).To(Equal("dummy-model")) Expect(modelStatus.Engine).To(Equal(appsv1alpha1.ModelEngineNIM)) }) diff --git a/internal/utils/utils.go b/internal/utils/utils.go index 6ff30569b..95f42eb70 100644 --- a/internal/utils/utils.go +++ b/internal/utils/utils.go @@ -286,3 +286,10 @@ func CalculateSHA256(data string) string { hash := sha256.Sum256([]byte(data)) return hex.EncodeToString(hash[:]) } + +func FormatEndpoint(ip string, port int32) string { + if ip == "" { + return "" + } + return fmt.Sprintf("%s:%d", ip, port) +} From c94c587df128640494902a6ddc039631eb4e1272 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Tue, 25 Feb 2025 15:26:33 -0800 Subject: [PATCH 7/9] identify model name when lora adapters are present Signed-off-by: Varun Ramachandra Sekar --- .../platform/standalone/nimservice.go | 3 ++ .../platform/standalone/nimservice_test.go | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index f1be2ebae..7b54b2bed 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -313,6 +313,9 @@ func (r *NIMServiceReconciler) getNIMModelName(ctx context.Context, nimServiceEn if model.Object != nimmodels.ObjectTypeModel { continue } + if model.Root == nil || *model.Root != model.Id { + continue + } return model.Id, nil } } diff --git a/internal/controller/platform/standalone/nimservice_test.go b/internal/controller/platform/standalone/nimservice_test.go index 5430e6a70..32516f3c7 100644 --- a/internal/controller/platform/standalone/nimservice_test.go +++ b/internal/controller/platform/standalone/nimservice_test.go @@ -822,6 +822,39 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { Expect(modelStatus.Name).To(Equal("dummy-model")) Expect(modelStatus.Engine).To(Equal(appsv1alpha1.ModelEngineNIM)) }) + + It("should succeed when nimservice has lora adapter models attached", func() { + testServer.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + // Set dummy object type for model. + w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model-adapter1", "object": "model", "root": "dummy-model", "parent": null}, {"id": "dummy-model-adapter2", "object": "model", "root": "dummy-model", "parent": null}, {"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) + }) + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimservice", + Namespace: "default", + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + ClusterIP: "127.0.0.1", + Ports: []corev1.ServicePort{ + { + Port: 8123, + Name: "service-port", + }, + }, + }, + } + _ = client.Create(context.TODO(), svc) + err := reconciler.updateModelStatus(context.Background(), nimService) + Expect(err).ToNot(HaveOccurred()) + modelStatus := nimService.Status.Model + Expect(modelStatus).ToNot(BeNil()) + Expect(modelStatus.ClusterEndpoint).To(Equal("127.0.0.1:8123")) + Expect(modelStatus.Name).To(Equal("dummy-model")) + Expect(modelStatus.Engine).To(Equal(appsv1alpha1.ModelEngineNIM)) + }) + }) Describe("getNIMCacheProfile", func() { From 7c7639e66eccce8a170d6bf2a770497fe713cc80 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Wed, 26 Feb 2025 10:46:57 -0800 Subject: [PATCH 8/9] fix linter errors Signed-off-by: Varun Ramachandra Sekar --- .../platform/standalone/nimservice_test.go | 12 ++++++++---- internal/nimmodels/models.go | 10 +++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/internal/controller/platform/standalone/nimservice_test.go b/internal/controller/platform/standalone/nimservice_test.go index 32516f3c7..3c3d37320 100644 --- a/internal/controller/platform/standalone/nimservice_test.go +++ b/internal/controller/platform/standalone/nimservice_test.go @@ -390,7 +390,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { // Start mock test server to serve nimservice endpoint. testServerHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) - w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) + _, err := w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) + Expect(err).ToNot(HaveOccurred()) }) testServer = httptest.NewServer(testServerHandler) http.DefaultTransport = &mockTransport{ @@ -743,7 +744,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { It("should fail when models response is unmarshallable", func() { testServer.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) - w.Write([]byte(`{"value": "invalid response"}`)) + _, err := w.Write([]byte(`{"value": "invalid response"}`)) + Expect(err).ToNot(HaveOccurred()) }) svc := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ @@ -771,7 +773,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { testServer.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) // Set dummy object type for model. - w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "dummy", "root": "dummy-model", "parent": null}]}`)) + _, err := w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model", "object": "dummy", "root": "dummy-model", "parent": null}]}`)) + Expect(err).ToNot(HaveOccurred()) }) svc := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ @@ -827,7 +830,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { testServer.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) // Set dummy object type for model. - w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model-adapter1", "object": "model", "root": "dummy-model", "parent": null}, {"id": "dummy-model-adapter2", "object": "model", "root": "dummy-model", "parent": null}, {"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) + _, err := w.Write([]byte(`{"object": "list", "data":[{"id": "dummy-model-adapter1", "object": "model", "root": "dummy-model", "parent": null}, {"id": "dummy-model-adapter2", "object": "model", "root": "dummy-model", "parent": null}, {"id": "dummy-model", "object": "model", "root": "dummy-model", "parent": null}]}`)) + Expect(err).ToNot(HaveOccurred()) }) svc := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ diff --git a/internal/nimmodels/models.go b/internal/nimmodels/models.go index 5fd75b617..795863dd3 100644 --- a/internal/nimmodels/models.go +++ b/internal/nimmodels/models.go @@ -62,23 +62,27 @@ func ListModelsV1(ctx context.Context, nimServiceEndpoint string) (*ModelsV1List modelsURL := getModelsV1URL(nimServiceEndpoint) modelsReq, err := http.NewRequest(http.MethodGet, modelsURL, nil) if err != nil { - logger.Error(err, "failed to prepare request for models endpoint", "url", modelsURL) + logger.Error(err, "Failed to prepare request for models endpoint", "url", modelsURL) return nil, err } modelsResp, err := httpClient.Do(modelsReq) if err != nil { - logger.Error(err, "failed to make request for models endpoint", "url", modelsURL) + logger.Error(err, "Failed to make request for models endpoint", "url", modelsURL) return nil, err } defer modelsResp.Body.Close() modelsData, err := io.ReadAll(modelsResp.Body) + if err != nil { + logger.Error(err, "Failed to read models api response", "url", modelsURL) + return nil, err + } var modelsList ModelsV1List err = json.Unmarshal(modelsData, &modelsList) if err != nil { - logger.Error(err, "failed to unmarshal models response", "url", modelsURL) + logger.Error(err, "Failed to unmarshal models response", "url", modelsURL) return nil, err } From f5d618702f66d24c0b98724b1b831ded30b0ab67 Mon Sep 17 00:00:00 2001 From: Varun Ramachandra Sekar Date: Wed, 26 Feb 2025 14:27:32 -0800 Subject: [PATCH 9/9] set ingress endpoints in status when available Signed-off-by: Varun Ramachandra Sekar --- api/apps/v1alpha1/nimservice_types.go | 13 ++----- .../apps.nvidia.com_nimservices.yaml | 3 -- .../bases/apps.nvidia.com_nimservices.yaml | 3 -- .../crds/apps.nvidia.com_nimservices.yaml | 3 -- .../platform/standalone/nimservice.go | 35 +++++++++++++++---- .../platform/standalone/nimservice_test.go | 14 ++++++-- 6 files changed, 44 insertions(+), 27 deletions(-) diff --git a/api/apps/v1alpha1/nimservice_types.go b/api/apps/v1alpha1/nimservice_types.go index 23cc47114..d49525aa3 100644 --- a/api/apps/v1alpha1/nimservice_types.go +++ b/api/apps/v1alpha1/nimservice_types.go @@ -87,12 +87,6 @@ type NIMCacheVolSpec struct { Profile string `json:"profile,omitempty"` } -type ModelEngine string - -const ( - ModelEngineNIM ModelEngine = "nim" -) - // NIMServiceStatus defines the observed state of NIMService type NIMServiceStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty"` @@ -103,10 +97,9 @@ type NIMServiceStatus struct { // ModelStatus defines the configuration of the NIMService model. type ModelStatus struct { - Name string `json:"name"` - ClusterEndpoint string `json:"clusterEndpoint"` - ExternalEndpoint string `json:"externalEndpoint"` - Engine ModelEngine `json:"engine"` + Name string `json:"name"` + ClusterEndpoint string `json:"clusterEndpoint"` + ExternalEndpoint string `json:"externalEndpoint"` } // +genclient diff --git a/bundle/manifests/apps.nvidia.com_nimservices.yaml b/bundle/manifests/apps.nvidia.com_nimservices.yaml index 4642eb772..207309600 100644 --- a/bundle/manifests/apps.nvidia.com_nimservices.yaml +++ b/bundle/manifests/apps.nvidia.com_nimservices.yaml @@ -2290,15 +2290,12 @@ spec: properties: clusterEndpoint: type: string - engine: - type: string externalEndpoint: type: string name: type: string required: - clusterEndpoint - - engine - externalEndpoint - name type: object diff --git a/config/crd/bases/apps.nvidia.com_nimservices.yaml b/config/crd/bases/apps.nvidia.com_nimservices.yaml index 4642eb772..207309600 100644 --- a/config/crd/bases/apps.nvidia.com_nimservices.yaml +++ b/config/crd/bases/apps.nvidia.com_nimservices.yaml @@ -2290,15 +2290,12 @@ spec: properties: clusterEndpoint: type: string - engine: - type: string externalEndpoint: type: string name: type: string required: - clusterEndpoint - - engine - externalEndpoint - name type: object diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml index 4642eb772..207309600 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml @@ -2290,15 +2290,12 @@ spec: properties: clusterEndpoint: type: string - engine: - type: string externalEndpoint: type: string name: type: string required: - clusterEndpoint - - engine - externalEndpoint - name type: object diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index 7b54b2bed..ccddc8a2d 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -296,7 +296,6 @@ func (r *NIMServiceReconciler) updateModelStatus(ctx context.Context, nimService Name: modelName, ClusterEndpoint: clusterEndpoint, ExternalEndpoint: externalEndpoint, - Engine: v1alpha1.ModelEngineNIM, } return nil @@ -333,12 +332,36 @@ func (r *NIMServiceReconciler) getNIMModelEndpoints(ctx context.Context, nimServ return "", "", err } - var externalIP string - if svc.Spec.Type == corev1.ServiceTypeLoadBalancer { - externalIP = svc.Spec.LoadBalancerIP - } + var externalEndpoint string port := nimService.GetServicePort() - return utils.FormatEndpoint(svc.Spec.ClusterIP, port), utils.FormatEndpoint(externalIP, port), nil + if nimService.IsIngressEnabled() { + ingress := &networkingv1.Ingress{} + if err := r.Get(ctx, types.NamespacedName{Name: nimService.GetName(), Namespace: nimService.GetNamespace()}, ingress); err != nil { + logger.Error(err, "unable to fetch ingress", "nimservice", nimService.GetName()) + return "", "", err + } + + var found bool + for _, rule := range ingress.Spec.Rules { + if rule.HTTP == nil { + continue + } + for _, path := range rule.HTTP.Paths { + if path.Backend.Service != nil && path.Backend.Service.Name == nimService.GetName() { + externalEndpoint = rule.Host + found = true + break + } + } + if found { + break + } + } + } else if svc.Spec.Type == corev1.ServiceTypeLoadBalancer { + externalEndpoint = utils.FormatEndpoint(svc.Spec.LoadBalancerIP, port) + } + + return utils.FormatEndpoint(svc.Spec.ClusterIP, port), externalEndpoint, nil } func (r *NIMServiceReconciler) renderAndSyncResource(ctx context.Context, nimService *appsv1alpha1.NIMService, renderer *render.Renderer, obj client.Object, renderFunc func() (client.Object, error), conditionType string, reason string) error { diff --git a/internal/controller/platform/standalone/nimservice_test.go b/internal/controller/platform/standalone/nimservice_test.go index 3c3d37320..82eee0c50 100644 --- a/internal/controller/platform/standalone/nimservice_test.go +++ b/internal/controller/platform/standalone/nimservice_test.go @@ -707,6 +707,16 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { }) Describe("update model status on NIMService", func() { + BeforeEach(func() { + ingress := &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimservice", + Namespace: "default", + }, + Spec: nimService.GetIngressSpec(), + } + _ = client.Create(context.TODO(), ingress) + }) AfterEach(func() { // Clean up the Service instance svc := &corev1.Service{ @@ -822,8 +832,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { modelStatus := nimService.Status.Model Expect(modelStatus).ToNot(BeNil()) Expect(modelStatus.ClusterEndpoint).To(Equal("127.0.0.1:8123")) + Expect(modelStatus.ExternalEndpoint).To(Equal("test-nimservice.default.example.com")) Expect(modelStatus.Name).To(Equal("dummy-model")) - Expect(modelStatus.Engine).To(Equal(appsv1alpha1.ModelEngineNIM)) }) It("should succeed when nimservice has lora adapter models attached", func() { @@ -855,8 +865,8 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { modelStatus := nimService.Status.Model Expect(modelStatus).ToNot(BeNil()) Expect(modelStatus.ClusterEndpoint).To(Equal("127.0.0.1:8123")) + Expect(modelStatus.ExternalEndpoint).To(Equal("test-nimservice.default.example.com")) Expect(modelStatus.Name).To(Equal("dummy-model")) - Expect(modelStatus.Engine).To(Equal(appsv1alpha1.ModelEngineNIM)) }) })