From 5d58673ef85669686256c3d3a9292aac1375058d Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Mon, 9 Jun 2025 06:56:22 +0000 Subject: [PATCH 01/16] Adding support for building engine for buildable profiles Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/nimbuild_types.go | 114 +++ api/apps/v1alpha1/zz_generated.deepcopy.go | 130 ++++ .../typed/apps/v1alpha1/apps_client.go | 5 + .../apps/v1alpha1/fake/fake_apps_client.go | 4 + .../typed/apps/v1alpha1/fake/fake_nimbuild.go | 146 ++++ .../apps/v1alpha1/generated_expansion.go | 2 + api/versioned/typed/apps/v1alpha1/nimbuild.go | 68 ++ .../manifests/apps.nvidia.com_nimbuilds.yaml | 155 +++++ cmd/main.go | 10 + .../crd/bases/apps.nvidia.com_nimbuilds.yaml | 155 +++++ config/rbac/role.yaml | 3 + .../crds/apps.nvidia.com_nimbuilds.yaml | 155 +++++ .../templates/manager-rbac.yaml | 26 + internal/controller/nimbuild_controller.go | 657 ++++++++++++++++++ .../platform/standalone/nimservice.go | 20 +- .../platform/standalone/nimservice_test.go | 6 +- internal/utils/utils.go | 18 + 17 files changed, 1653 insertions(+), 21 deletions(-) create mode 100644 api/apps/v1alpha1/nimbuild_types.go create mode 100644 api/versioned/typed/apps/v1alpha1/fake/fake_nimbuild.go create mode 100644 api/versioned/typed/apps/v1alpha1/nimbuild.go create mode 100644 bundle/manifests/apps.nvidia.com_nimbuilds.yaml create mode 100644 config/crd/bases/apps.nvidia.com_nimbuilds.yaml create mode 100644 deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml create mode 100644 internal/controller/nimbuild_controller.go diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go new file mode 100644 index 000000000..3aa38be1d --- /dev/null +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -0,0 +1,114 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! +// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. + +// NIMBuildSpec to build optimized trtllm engines with given model config and weights. +type NIMBuildSpec struct { + // Required: Reference to the model weights from NIMCache + NIMCacheRef string `json:"nimCacheRef"` + // Required: Profile name for this engine build + ProfileName string `json:"profileName,omitempty"` + // Required: Number of GPUs to build the engine with + Parallelism int `json:"parallelism,omitempty"` + // Optional: Any user-defined tags for tracking builds + Tags map[string]string `json:"tags,omitempty"` + // Optional: Additional build params + BuildParams []string `json:"additionalBuildParams,omitempty"` +} + +// Parallelism defines the compute parallelism strategy for building the engine. +type Parallelism struct { + // Number of tensor parallelism shards (splits model across GPUs at tensor level) + TP int `json:"tp"` + // Optional: Number of pipeline parallelism stages (model split across layers) + PP int `json:"pp,omitempty"` +} + +// NIMBuildStatus defines the observed state of NIMBuild. +type NIMBuildStatus struct { + State string `json:"state,omitempty"` + Profiles []NIMProfile `json:"profiles,omitempty"` + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` +} + +// +genclient +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.state`,priority=0 +// +kubebuilder:printcolumn:name="Age",type="date",format="date-time",JSONPath=".metadata.creationTimestamp",priority=0 + +// NIMBuild is the Schema for the nimcaches API. +type NIMBuild struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec NIMBuildSpec `json:"spec,omitempty"` + Status NIMBuildStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true +// NIMBuildList contains a list of NIMBuild. +type NIMBuildList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []NIMBuild `json:"items"` +} + +const ( + // NimBuildConditionWaitForNimCache indicates NIMBuild progress is blocked until that the caching is complete. + NimBuildConditionWaitForNimCache = "NIM_BUILD_WAIT_FOR_NIM_CACHE_READY" + // NimBuildConditionReconcileFailed indicated that error occurred while reconciling NIMBuild object. + NimBuildConditionReconcileFailed = "NIM_BUILD_RECONCILE_FAILED" + // NimBuildConditionMultipleBuildableProfilesFound indicates that multiple buildable profiles are found for the NIMCache object. + NimBuildConditionMultipleBuildableProfilesFound = "NIM_BUILD_MULTIPLE_BUILDABLE_PROFILES_FOUND" + // NimBuildConditionSingleBuildableProfilesFound indicates that only one buildable profile is found for the NIMCache object. + NimBuildConditionSingleBuildableProfilesFound = "NIM_BUILD_SINGLE_BUILDABLE_PROFILE_FOUND" + // NimBuildConditionNoBuildableProfilesFound indicates that no buildable profiles are found for the NIMCache object. + NimBuildConditionNoBuildableProfilesFound = "NIM_BUILD_NO_BUILDABLE_PROFILE_FOUND" + + // NimBuildConditionEngineBuildPodCreated indicates that the engine build pod is created. + NimBuildConditionEngineBuildPodCreated = "NIM_BUILD_ENGINE_BUILD_POD_CREATED" + // NimBuildConditionEngineBuildJobCompleted indicates that the engine build pod is completed. + NimBuildConditionEngineBuildPodCompleted = "NIM_BUILD_ENGINE_BUILD_POD_COMPLETED" + // NimBuildConditionEngineBuildPodPending indicates that the engine build pod is in pending state. + NimBuildConditionEngineBuildPodPending = "NIM_BUILD_ENGINE_BUILD_POD_PENDING" + + // NimBuildStatusNotReady indicates that build is not ready. + NimBuildStatusNotReady = "NotReady" + + // NimBuildStatusStarted indicates that caching process is started. + NimBuildStatusStarted = "Started" + // NimBuildStatusReady indicates that cache is ready. + NimBuildStatusReady = "Ready" + // NimBuildStatusInProgress indicates that caching is in progress. + NimBuildStatusInProgress = "InProgress" + // NimBuildStatusPending indicates that building is not yet started. + NimBuildStatusPending = "Pending" + // NimBuildStatusFailed indicates that caching is failed. + NimBuildStatusFailed = "Failed" +) + +func init() { + SchemeBuilder.Register(&NIMBuild{}, &NIMBuildList{}) +} diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index 4403479ae..eb1ef23fe 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -692,6 +692,121 @@ func (in *NGCSource) DeepCopy() *NGCSource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NIMBuild) DeepCopyInto(out *NIMBuild) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMBuild. +func (in *NIMBuild) DeepCopy() *NIMBuild { + if in == nil { + return nil + } + out := new(NIMBuild) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *NIMBuild) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NIMBuildList) DeepCopyInto(out *NIMBuildList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]NIMBuild, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMBuildList. +func (in *NIMBuildList) DeepCopy() *NIMBuildList { + if in == nil { + return nil + } + out := new(NIMBuildList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *NIMBuildList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { + *out = *in + if in.Tags != nil { + in, out := &in.Tags, &out.Tags + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.BuildParams != nil { + in, out := &in.BuildParams, &out.BuildParams + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMBuildSpec. +func (in *NIMBuildSpec) DeepCopy() *NIMBuildSpec { + if in == nil { + return nil + } + out := new(NIMBuildSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NIMBuildStatus) DeepCopyInto(out *NIMBuildStatus) { + *out = *in + if in.Profiles != nil { + in, out := &in.Profiles, &out.Profiles + *out = make([]NIMProfile, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMBuildStatus. +func (in *NIMBuildStatus) DeepCopy() *NIMBuildStatus { + if in == nil { + return nil + } + out := new(NIMBuildStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NIMCache) DeepCopyInto(out *NIMCache) { *out = *in @@ -2288,6 +2403,21 @@ func (in *ObjectStoreCredentials) DeepCopy() *ObjectStoreCredentials { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Parallelism) DeepCopyInto(out *Parallelism) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Parallelism. +func (in *Parallelism) DeepCopy() *Parallelism { + if in == nil { + return nil + } + out := new(Parallelism) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PersistentVolumeClaim) DeepCopyInto(out *PersistentVolumeClaim) { *out = *in diff --git a/api/versioned/typed/apps/v1alpha1/apps_client.go b/api/versioned/typed/apps/v1alpha1/apps_client.go index 10db1e04b..2e25a7225 100644 --- a/api/versioned/typed/apps/v1alpha1/apps_client.go +++ b/api/versioned/typed/apps/v1alpha1/apps_client.go @@ -27,6 +27,7 @@ import ( type AppsV1alpha1Interface interface { RESTClient() rest.Interface + NIMBuildsGetter NIMCachesGetter NIMPipelinesGetter NIMServicesGetter @@ -42,6 +43,10 @@ type AppsV1alpha1Client struct { restClient rest.Interface } +func (c *AppsV1alpha1Client) NIMBuilds(namespace string) NIMBuildInterface { + return newNIMBuilds(c, namespace) +} + func (c *AppsV1alpha1Client) NIMCaches(namespace string) NIMCacheInterface { return newNIMCaches(c, namespace) } diff --git a/api/versioned/typed/apps/v1alpha1/fake/fake_apps_client.go b/api/versioned/typed/apps/v1alpha1/fake/fake_apps_client.go index 5246cc313..04e393ff2 100644 --- a/api/versioned/typed/apps/v1alpha1/fake/fake_apps_client.go +++ b/api/versioned/typed/apps/v1alpha1/fake/fake_apps_client.go @@ -27,6 +27,10 @@ type FakeAppsV1alpha1 struct { *testing.Fake } +func (c *FakeAppsV1alpha1) NIMBuilds(namespace string) v1alpha1.NIMBuildInterface { + return &FakeNIMBuilds{c, namespace} +} + func (c *FakeAppsV1alpha1) NIMCaches(namespace string) v1alpha1.NIMCacheInterface { return &FakeNIMCaches{c, namespace} } diff --git a/api/versioned/typed/apps/v1alpha1/fake/fake_nimbuild.go b/api/versioned/typed/apps/v1alpha1/fake/fake_nimbuild.go new file mode 100644 index 000000000..b82ff9723 --- /dev/null +++ b/api/versioned/typed/apps/v1alpha1/fake/fake_nimbuild.go @@ -0,0 +1,146 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + "context" + + v1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeNIMBuilds implements NIMBuildInterface +type FakeNIMBuilds struct { + Fake *FakeAppsV1alpha1 + ns string +} + +var nimbuildsResource = v1alpha1.SchemeGroupVersion.WithResource("nimbuilds") + +var nimbuildsKind = v1alpha1.SchemeGroupVersion.WithKind("NIMBuild") + +// Get takes name of the nIMBuild, and returns the corresponding nIMBuild object, and an error if there is any. +func (c *FakeNIMBuilds) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1alpha1.NIMBuild, err error) { + emptyResult := &v1alpha1.NIMBuild{} + obj, err := c.Fake. + Invokes(testing.NewGetActionWithOptions(nimbuildsResource, c.ns, name, options), emptyResult) + + if obj == nil { + return emptyResult, err + } + return obj.(*v1alpha1.NIMBuild), err +} + +// List takes label and field selectors, and returns the list of NIMBuilds that match those selectors. +func (c *FakeNIMBuilds) List(ctx context.Context, opts v1.ListOptions) (result *v1alpha1.NIMBuildList, err error) { + emptyResult := &v1alpha1.NIMBuildList{} + obj, err := c.Fake. + Invokes(testing.NewListActionWithOptions(nimbuildsResource, nimbuildsKind, c.ns, opts), emptyResult) + + if obj == nil { + return emptyResult, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &v1alpha1.NIMBuildList{ListMeta: obj.(*v1alpha1.NIMBuildList).ListMeta} + for _, item := range obj.(*v1alpha1.NIMBuildList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested nIMBuilds. +func (c *FakeNIMBuilds) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. + InvokesWatch(testing.NewWatchActionWithOptions(nimbuildsResource, c.ns, opts)) + +} + +// Create takes the representation of a nIMBuild and creates it. Returns the server's representation of the nIMBuild, and an error, if there is any. +func (c *FakeNIMBuilds) Create(ctx context.Context, nIMBuild *v1alpha1.NIMBuild, opts v1.CreateOptions) (result *v1alpha1.NIMBuild, err error) { + emptyResult := &v1alpha1.NIMBuild{} + obj, err := c.Fake. + Invokes(testing.NewCreateActionWithOptions(nimbuildsResource, c.ns, nIMBuild, opts), emptyResult) + + if obj == nil { + return emptyResult, err + } + return obj.(*v1alpha1.NIMBuild), err +} + +// Update takes the representation of a nIMBuild and updates it. Returns the server's representation of the nIMBuild, and an error, if there is any. +func (c *FakeNIMBuilds) Update(ctx context.Context, nIMBuild *v1alpha1.NIMBuild, opts v1.UpdateOptions) (result *v1alpha1.NIMBuild, err error) { + emptyResult := &v1alpha1.NIMBuild{} + obj, err := c.Fake. + Invokes(testing.NewUpdateActionWithOptions(nimbuildsResource, c.ns, nIMBuild, opts), emptyResult) + + if obj == nil { + return emptyResult, err + } + return obj.(*v1alpha1.NIMBuild), err +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). +func (c *FakeNIMBuilds) UpdateStatus(ctx context.Context, nIMBuild *v1alpha1.NIMBuild, opts v1.UpdateOptions) (result *v1alpha1.NIMBuild, err error) { + emptyResult := &v1alpha1.NIMBuild{} + obj, err := c.Fake. + Invokes(testing.NewUpdateSubresourceActionWithOptions(nimbuildsResource, "status", c.ns, nIMBuild, opts), emptyResult) + + if obj == nil { + return emptyResult, err + } + return obj.(*v1alpha1.NIMBuild), err +} + +// Delete takes name of the nIMBuild and deletes it. Returns an error if one occurs. +func (c *FakeNIMBuilds) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewDeleteActionWithOptions(nimbuildsResource, c.ns, name, opts), &v1alpha1.NIMBuild{}) + + return err +} + +// DeleteCollection deletes a collection of objects. +func (c *FakeNIMBuilds) DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error { + action := testing.NewDeleteCollectionActionWithOptions(nimbuildsResource, c.ns, opts, listOpts) + + _, err := c.Fake.Invokes(action, &v1alpha1.NIMBuildList{}) + return err +} + +// Patch applies the patch and returns the patched nIMBuild. +func (c *FakeNIMBuilds) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.NIMBuild, err error) { + emptyResult := &v1alpha1.NIMBuild{} + obj, err := c.Fake. + Invokes(testing.NewPatchSubresourceActionWithOptions(nimbuildsResource, c.ns, name, pt, data, opts, subresources...), emptyResult) + + if obj == nil { + return emptyResult, err + } + return obj.(*v1alpha1.NIMBuild), err +} diff --git a/api/versioned/typed/apps/v1alpha1/generated_expansion.go b/api/versioned/typed/apps/v1alpha1/generated_expansion.go index 848a9efb4..9a96fc487 100644 --- a/api/versioned/typed/apps/v1alpha1/generated_expansion.go +++ b/api/versioned/typed/apps/v1alpha1/generated_expansion.go @@ -17,6 +17,8 @@ limitations under the License. package v1alpha1 +type NIMBuildExpansion interface{} + type NIMCacheExpansion interface{} type NIMPipelineExpansion interface{} diff --git a/api/versioned/typed/apps/v1alpha1/nimbuild.go b/api/versioned/typed/apps/v1alpha1/nimbuild.go new file mode 100644 index 000000000..1f55455cd --- /dev/null +++ b/api/versioned/typed/apps/v1alpha1/nimbuild.go @@ -0,0 +1,68 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "context" + + v1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" + scheme "github.com/NVIDIA/k8s-nim-operator/api/versioned/scheme" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + gentype "k8s.io/client-go/gentype" +) + +// NIMBuildsGetter has a method to return a NIMBuildInterface. +// A group's client should implement this interface. +type NIMBuildsGetter interface { + NIMBuilds(namespace string) NIMBuildInterface +} + +// NIMBuildInterface has methods to work with NIMBuild resources. +type NIMBuildInterface interface { + Create(ctx context.Context, nIMBuild *v1alpha1.NIMBuild, opts v1.CreateOptions) (*v1alpha1.NIMBuild, error) + Update(ctx context.Context, nIMBuild *v1alpha1.NIMBuild, opts v1.UpdateOptions) (*v1alpha1.NIMBuild, error) + // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). + UpdateStatus(ctx context.Context, nIMBuild *v1alpha1.NIMBuild, opts v1.UpdateOptions) (*v1alpha1.NIMBuild, error) + Delete(ctx context.Context, name string, opts v1.DeleteOptions) error + DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error + Get(ctx context.Context, name string, opts v1.GetOptions) (*v1alpha1.NIMBuild, error) + List(ctx context.Context, opts v1.ListOptions) (*v1alpha1.NIMBuildList, error) + Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1alpha1.NIMBuild, err error) + NIMBuildExpansion +} + +// nIMBuilds implements NIMBuildInterface +type nIMBuilds struct { + *gentype.ClientWithList[*v1alpha1.NIMBuild, *v1alpha1.NIMBuildList] +} + +// newNIMBuilds returns a NIMBuilds +func newNIMBuilds(c *AppsV1alpha1Client, namespace string) *nIMBuilds { + return &nIMBuilds{ + gentype.NewClientWithList[*v1alpha1.NIMBuild, *v1alpha1.NIMBuildList]( + "nimbuilds", + c.RESTClient(), + scheme.ParameterCodec, + namespace, + func() *v1alpha1.NIMBuild { return &v1alpha1.NIMBuild{} }, + func() *v1alpha1.NIMBuildList { return &v1alpha1.NIMBuildList{} }), + } +} diff --git a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml new file mode 100644 index 000000000..a16b8c4ae --- /dev/null +++ b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml @@ -0,0 +1,155 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.2 + name: nimbuilds.apps.nvidia.com +spec: + group: apps.nvidia.com + names: + kind: NIMBuild + listKind: NIMBuildList + plural: nimbuilds + singular: nimbuild + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.state + name: Status + type: string + - format: date-time + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: NIMBuild is the Schema for the nimcaches API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NIMBuildSpec to build optimized trtllm engines with given + model config and weights. + properties: + additionalBuildParams: + description: 'Optional: Additional build params' + items: + type: string + type: array + nimCacheRef: + description: 'Required: Reference to the model weights from NIMCache' + type: string + parallelism: + description: 'Required: Number of GPUs to build the engine with' + type: integer + profileName: + description: 'Required: Profile name for this engine build' + type: string + tags: + additionalProperties: + type: string + description: 'Optional: Any user-defined tags for tracking builds' + type: object + required: + - nimCacheRef + type: object + status: + description: NIMBuildStatus defines the observed state of NIMBuild. + properties: + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + profiles: + items: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object + type: array + state: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/cmd/main.go b/cmd/main.go index 205a87d30..dba354e50 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -177,6 +177,16 @@ func main() { os.Exit(1) } + if err = controller.NewNIMBuildReconciler( + mgr.GetClient(), + mgr.GetScheme(), + ctrl.Log.WithName("controllers").WithName("NIMBuild"), + platformImpl, + ).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "NIMBuild") + os.Exit(1) + } + if err = controller.NewNemoGuardrailReconciler( mgr.GetClient(), mgr.GetScheme(), diff --git a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml new file mode 100644 index 000000000..a16b8c4ae --- /dev/null +++ b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml @@ -0,0 +1,155 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.2 + name: nimbuilds.apps.nvidia.com +spec: + group: apps.nvidia.com + names: + kind: NIMBuild + listKind: NIMBuildList + plural: nimbuilds + singular: nimbuild + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.state + name: Status + type: string + - format: date-time + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: NIMBuild is the Schema for the nimcaches API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NIMBuildSpec to build optimized trtllm engines with given + model config and weights. + properties: + additionalBuildParams: + description: 'Optional: Additional build params' + items: + type: string + type: array + nimCacheRef: + description: 'Required: Reference to the model weights from NIMCache' + type: string + parallelism: + description: 'Required: Number of GPUs to build the engine with' + type: integer + profileName: + description: 'Required: Profile name for this engine build' + type: string + tags: + additionalProperties: + type: string + description: 'Optional: Any user-defined tags for tracking builds' + type: object + required: + - nimCacheRef + type: object + status: + description: NIMBuildStatus defines the observed state of NIMBuild. + properties: + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + profiles: + items: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object + type: array + state: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 138147dda..d94050c91 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -67,6 +67,7 @@ rules: - nemoentitystores - nemoevaluators - nemoguardrails + - nimbuilds - nimcaches - nimpipelines - nimservices @@ -86,6 +87,7 @@ rules: - nemoentitystores/finalizers - nemoevaluators/finalizers - nemoguardrails/finalizers + - nimbuilds/finalizers - nimcaches/finalizers - nimpipelines/finalizers - nimservices/finalizers @@ -99,6 +101,7 @@ rules: - nemoentitystores/status - nemoevaluators/status - nemoguardrails/status + - nimbuilds/status - nimcaches/status - nimpipelines/status - nimservices/status diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml new file mode 100644 index 000000000..a16b8c4ae --- /dev/null +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml @@ -0,0 +1,155 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.2 + name: nimbuilds.apps.nvidia.com +spec: + group: apps.nvidia.com + names: + kind: NIMBuild + listKind: NIMBuildList + plural: nimbuilds + singular: nimbuild + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.state + name: Status + type: string + - format: date-time + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: NIMBuild is the Schema for the nimcaches API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NIMBuildSpec to build optimized trtllm engines with given + model config and weights. + properties: + additionalBuildParams: + description: 'Optional: Additional build params' + items: + type: string + type: array + nimCacheRef: + description: 'Required: Reference to the model weights from NIMCache' + type: string + parallelism: + description: 'Required: Number of GPUs to build the engine with' + type: integer + profileName: + description: 'Required: Profile name for this engine build' + type: string + tags: + additionalProperties: + type: string + description: 'Optional: Any user-defined tags for tracking builds' + type: object + required: + - nimCacheRef + type: object + status: + description: NIMBuildStatus defines the observed state of NIMBuild. + properties: + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + profiles: + items: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object + type: array + state: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml b/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml index e38016263..8335de9a0 100644 --- a/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml +++ b/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml @@ -232,6 +232,32 @@ rules: - get - patch - update +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds/finalizers + verbs: + - update +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds/status + verbs: + - get + - patch + - update - apiGroups: - apps.nvidia.com resources: diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go new file mode 100644 index 000000000..0c87bf955 --- /dev/null +++ b/internal/controller/nimbuild_controller.go @@ -0,0 +1,657 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "fmt" + "reflect" + "time" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + apiResource "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" + "github.com/NVIDIA/k8s-nim-operator/internal/conditions" + "github.com/NVIDIA/k8s-nim-operator/internal/controller/platform" + "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" + "github.com/NVIDIA/k8s-nim-operator/internal/render" + "github.com/NVIDIA/k8s-nim-operator/internal/shared" + "github.com/NVIDIA/k8s-nim-operator/internal/utils" +) + +const ( + // NIMBuildFinalizer is the finalizer annotation. + NIMBuildFinalizer = "finalizer.nimbuild.apps.nvidia.com" + + // NIMBuildContainerName returns the name of the container used for NIM Build operations. + NIMBuildContainerName = "nim-build-ctr" +) + +// NIMBuildReconciler reconciles a NIMBuild object. +type NIMBuildReconciler struct { + client.Client + scheme *runtime.Scheme + log logr.Logger + Platform platform.Platform + orchestratorType k8sutil.OrchestratorType + updater conditions.Updater + recorder record.EventRecorder +} + +// Ensure NIMBuildReconciler implements the Reconciler interface. +var _ shared.Reconciler = &NIMBuildReconciler{} + +// NewNIMBuildReconciler creates a new reconciler for NIMBuild with the given platform. +func NewNIMBuildReconciler(client client.Client, scheme *runtime.Scheme, log logr.Logger, platform platform.Platform) *NIMBuildReconciler { + return &NIMBuildReconciler{ + Client: client, + scheme: scheme, + log: log, + Platform: platform, + } +} + +// +kubebuilder:rbac:groups=apps.nvidia.com,resources=nimbuilds,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=apps.nvidia.com,resources=nimbuilds/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=apps.nvidia.com,resources=nimbuilds/finalizers,verbs=update +// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use,resourceNames=nonroot +// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="",resources=pods/log,verbs=get + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// TODO(user): Modify the Reconcile function to compare the state specified by +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.18.2/pkg/reconcile +func (r *NIMBuildReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + var err error + var result reconcile.Result + + // Fetch the NIMBuild instance + nimBuild := &appsv1alpha1.NIMBuild{} + if err = r.Get(ctx, req.NamespacedName, nimBuild); err != nil { + if client.IgnoreNotFound(err) != nil { + logger.Error(err, "unable to fetch NIMBuild", "name", req.NamespacedName) + } + return ctrl.Result{}, client.IgnoreNotFound(err) + } + logger.Info("Reconciling", "NIMBuild", nimBuild.Name) + previousStatusState := nimBuild.Status.State + + defer func() { + if err != nil { + r.GetEventRecorder().Eventf(nimBuild, corev1.EventTypeWarning, "ReconcileFailed", + "NIMBuild %s reconcile failed, msg: %s", nimBuild.Name, err.Error()) + } else if previousStatusState != nimBuild.Status.State { + r.GetEventRecorder().Eventf(nimBuild, corev1.EventTypeNormal, nimBuild.Status.State, + "NIMBuild %s reconcile success, new state: %s", nimBuild.Name, nimBuild.Status.State) + } + }() + // Check if the instance is marked for deletion + if nimBuild.DeletionTimestamp.IsZero() { + // Add finalizer if not present + if !controllerutil.ContainsFinalizer(nimBuild, NIMBuildFinalizer) { + controllerutil.AddFinalizer(nimBuild, NIMBuildFinalizer) + if err = r.Update(ctx, nimBuild); err != nil { + return ctrl.Result{}, err + } + } + } else { + // The instance is being deleted + if controllerutil.ContainsFinalizer(nimBuild, NIMBuildFinalizer) { + // Perform cleanup of resources + if err = r.cleanupNIMBuild(ctx, nimBuild); err != nil { + return ctrl.Result{}, err + } + + // Remove finalizer to allow for deletion + controllerutil.RemoveFinalizer(nimBuild, NIMBuildFinalizer) + if err := r.Update(ctx, nimBuild); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + } + + // Fetch container orchestrator type + _, err = r.GetOrchestratorType(ctx) + if err != nil { + return ctrl.Result{}, fmt.Errorf("unable to get container orchestrator type, %v", err) + } + + // Handle nim-cache reconciliation + result, err = r.reconcileNIMBuild(ctx, nimBuild) + if err != nil { + logger.Error(err, "error reconciling NIMBuild", "name", nimBuild.Name) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionReconcileFailed, metav1.ConditionTrue, "ReconcileFailed", err.Error()) + nimBuild.Status.State = appsv1alpha1.NimBuildStatusNotReady + + err := r.updateNIMBuildStatus(ctx, nimBuild) + if err != nil { + logger.Error(err, "Failed to update NIMBuild status", "NIMBuild", nimBuild.Name) + return ctrl.Result{}, err + } + return result, err + } + return result, nil +} + +// GetScheme returns the scheme of the reconciler. +func (r *NIMBuildReconciler) GetScheme() *runtime.Scheme { + return r.scheme +} + +func (r *NIMBuildReconciler) GetLogger() logr.Logger { + return r.log +} + +func (r *NIMBuildReconciler) GetClient() client.Client { + return r.Client +} + +func (r *NIMBuildReconciler) GetUpdater() conditions.Updater { + return r.updater +} + +func (r *NIMBuildReconciler) GetRenderer() render.Renderer { + return nil +} + +func (r *NIMBuildReconciler) GetEventRecorder() record.EventRecorder { + return r.recorder +} + +func (r *NIMBuildReconciler) GetOrchestratorType(ctx context.Context) (k8sutil.OrchestratorType, error) { + if r.orchestratorType == "" { + orchestratorType, err := k8sutil.GetOrchestratorType(ctx, r.GetClient()) + if err != nil { + return k8sutil.Unknown, fmt.Errorf("unable to get container orchestrator type, %v", err) + } + r.orchestratorType = orchestratorType + r.GetLogger().Info("Container orchestrator is successfully set", "type", orchestratorType) + } + return r.orchestratorType, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *NIMBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.recorder = mgr.GetEventRecorderFor("nimbuild-controller") + return ctrl.NewControllerManagedBy(mgr). + For(&appsv1alpha1.NIMBuild{}). + Owns(&corev1.Pod{}). + Owns(&corev1.PersistentVolumeClaim{}). + WithEventFilter(predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + // Type assert to NIMBuild + if oldNIMBuild, ok := e.ObjectOld.(*appsv1alpha1.NIMBuild); ok { + newNIMBuild, ok := e.ObjectNew.(*appsv1alpha1.NIMBuild) + if ok { + // Handle case where object is marked for deletion + if !newNIMBuild.ObjectMeta.DeletionTimestamp.IsZero() { + return true + } + + // Handle only spec updates + return !reflect.DeepEqual(oldNIMBuild.Spec, newNIMBuild.Spec) + } + } + // For other types we watch, reconcile them + return true + }, + }). + Complete(r) +} + +func (r *NIMBuildReconciler) cleanupNIMBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild) error { + var errList []error + logger := r.GetLogger() + + // All owned objects are garbage collected + + // Fetch the pod + podName := types.NamespacedName{Name: nimBuild.Name + "-engine-build-pod", Namespace: nimBuild.Namespace} + pod := &corev1.Pod{} + if err := r.Get(ctx, podName, pod); err != nil { + if errors.IsNotFound(err) { + return nil + } + logger.Error(err, "unable to fetch the pod for cleanup", "pod", podName) + return err + } + + if err := r.Delete(ctx, pod); err != nil { + if errors.IsNotFound(err) { + logger.Info("Pod already deleted", "pod", podName) + return nil + } + logger.Error(err, "unable to delete associated pods during cleanup", "job", podName, "pod", pod.Name) + errList = append(errList, err) + } + + if len(errList) > 0 { + return fmt.Errorf("failed to cleanup resources: %v", errList) + } + + return nil +} + +func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild) (reconcile.Result, error) { + logger := r.GetLogger() + nimCacheNamespacedName := types.NamespacedName{Name: nimBuild.Spec.NIMCacheRef, Namespace: nimBuild.GetNamespace()} + + nimCache := &appsv1alpha1.NIMCache{} + if err := r.Get(ctx, nimCacheNamespacedName, nimCache); err != nil { + logger.Error(err, "unable to fetch NIMCache for NIMBuild", "NIMCache", nimCacheNamespacedName) + return ctrl.Result{}, err + } + + if nimCache.Status.State == appsv1alpha1.NimCacheStatusReady { + err := r.reconcileEngineBuild(ctx, nimBuild, nimCache) + if err != nil { + logger.Error(err, "reconciliation of nimbuild failed") + return ctrl.Result{}, err + } + } else { + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionWaitForNimCache, metav1.ConditionTrue, "NIMCache not ready", "Waiting for NIMCache to be ready before building engines") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusPending + return ctrl.Result{RequeueAfter: time.Second * 30}, nil + + } + + err := r.updateNIMBuildStatus(ctx, nimBuild) + if err != nil { + logger.Error(err, "Failed to update NIMBuild status", "NIMBuild", nimBuild.Name) + return ctrl.Result{}, err + } + return ctrl.Result{}, nil +} + +func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache) error { + logger := r.GetLogger() + // If the NIMCache is not in a state to build engines, return early + // Wait for caching to complete before building engines + buildableProfile := appsv1alpha1.NIMProfile{} + if nimBuild.Spec.ProfileName == "" { + buildableProfiles := getBuildableProfiles(nimCache) + if buildableProfiles != nil { + switch { + case len(buildableProfiles) > 1: + logger.Info("Multiple buildable profiles found", "Profiles", buildableProfiles) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionMultipleBuildableProfilesFound, metav1.ConditionTrue, "MultipleBuildableProfilesFound", "Multiple buildable profiles found, please select one profile to build") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + return nil + case len(buildableProfiles) == 1: + logger.Info("Single buildable profile found", "Profile", buildableProfiles[0].Name) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile found") + buildableProfile = buildableProfiles[0] + default: + logger.Info("No buildable profiles found, skipping engine build") + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found for NIM Cache") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + return nil + } + + } else { + logger.Info("No buildable profiles found, skipping engine build") + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found for NIM Cache") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + } + } else { + foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.ProfileName) + if foundProfile == nil { + logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.ProfileName) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionMultipleBuildableProfilesFound, metav1.ConditionTrue, "MultipleBuildableProfilesFound", "Multiple buildable profiles found, please select one profile to build") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + return nil + } else { + logger.Info("Single buildable profile found", "Profile", foundProfile.Name) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile found") + buildableProfile = *foundProfile + } + } + + pod := &corev1.Pod{} + podName := types.NamespacedName{Name: nimBuild.Name + "-engine-build-pod", Namespace: nimBuild.GetNamespace()} + err := r.Get(ctx, podName, pod) + if err != nil && client.IgnoreNotFound(err) != nil { + return err + } + // If pod does not exist and caching is not complete, create a new one + if err != nil && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCreated) { + pod, err = r.constructEngineBuildPod(nimBuild, nimCache, r.orchestratorType, buildableProfile) + if err != nil { + logger.Error(err, "Failed to construct job") + return err + } + if err := controllerutil.SetControllerReference(nimBuild, pod, r.GetScheme()); err != nil { + return err + } + err = r.Create(ctx, pod) + if err != nil { + logger.Error(err, "Failed to create pod") + return err + } + logger.Info("Created pod for NIM Cache engine build", "pod", podName) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCreated, metav1.ConditionTrue, "EngineBuildPodCreated", "The pod to build engine has been created") + return nil + } + + // Reconcile the job status + if err := r.reconcileEngineBuildPodStatus(ctx, nimBuild, pod); err != nil { + return err + } + return nil +} + +func getBuildableProfiles(cache *appsv1alpha1.NIMCache) []appsv1alpha1.NIMProfile { + var buildableProfiles []appsv1alpha1.NIMProfile + for _, profile := range cache.Status.Profiles { + if profile.Config["trtllm_buildable"] == "true" { + buildableProfiles = append(buildableProfiles, profile) + } + } + return buildableProfiles +} + +func getBuildableProfileByName(cache *appsv1alpha1.NIMCache, profileName string) *appsv1alpha1.NIMProfile { + for _, profile := range cache.Status.Profiles { + if profile.Config["trtllm_buildable"] == "true" && profile.Name == profileName { + return &profile + } + } + return nil +} + +func (r *NIMBuildReconciler) reconcileEngineBuildPodStatus(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, pod *corev1.Pod) error { + logger := log.FromContext(ctx) + podName := pod.Name + + switch { + case isPodReady(pod) && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted): + logger.Info("Pod Ready", "pod", podName) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted, metav1.ConditionTrue, "PodReady", "The Pod to cache NIM has successfully completed") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusReady + if err := r.deletePod(ctx, pod); err != nil { + logger.Error(err, "Unable to delete NIM Cache build engine pod", "Name", pod.Name) + return err + } + + case pod.Status.Phase == corev1.PodFailed && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted): + logger.Info("Failed to cache NIM, build pod failed", "pod", pod) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCreated, metav1.ConditionFalse, "PodFailed", "The pod to build engine has failed") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + + case !isPodReady(pod) && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted): + logger.Info("Caching NIM is in progress, build engine pod running", "job", podName) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodPending, metav1.ConditionTrue, "PodRunning", "The Pod to cache NIM is in progress") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusInProgress + + } + + return nil +} + +func (r *NIMBuildReconciler) updateNIMBuildStatus(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild) error { + logger := r.GetLogger() + obj := &appsv1alpha1.NIMBuild{} + errGet := r.Get(ctx, types.NamespacedName{Name: nimBuild.Name, Namespace: nimBuild.GetNamespace()}, obj) + if errGet != nil { + logger.Error(errGet, "error getting NIMBuild", "name", nimBuild.Name) + return errGet + } + obj.Status = nimBuild.Status + if err := r.Status().Update(ctx, obj); err != nil { + logger.Error(err, "Failed to update status", "NIMBuild", nimBuild.Name) + return err + } + return nil +} + +func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache, platformType k8sutil.OrchestratorType, nimProfile appsv1alpha1.NIMProfile) (*corev1.Pod, error) { + logger := r.GetLogger() + pvcName := shared.GetPVCName(nimCache, nimCache.Spec.Storage.PVC) + labels := map[string]string{ + "app": "k8s-nim-operator", + "app.kubernetes.io/name": nimBuild.Name, + "app.kubernetes.io/managed-by": "k8s-nim-operator", + } + + annotations := map[string]string{ + "sidecar.istio.io/inject": "false", + } + + // If no user-provided GPU resource is found, proceed with auto-assignment + // Get tensorParallelism from the profile + tensorParallelism, err := utils.GetTensorParallelismByProfileTags(nimProfile.Config) + if err != nil { + logger.Error(err, "Failed to retrieve tensorParallelism") + return nil, err + } + + gpuQuantity, err := apiResource.ParseQuantity(tensorParallelism) + if err != nil { + return nil, fmt.Errorf("failed to parse tensorParallelism: %w", err) + } + + if platformType == k8sutil.OpenShift { + if nimCache.GetProxySpec() != nil { + annotations["openshift.io/scc"] = "anyuid" + } else { + annotations["openshift.io/scc"] = "nonroot" + } + } + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: nimBuild.Name + "-engine-build-pod", + Namespace: nimBuild.Namespace, + Labels: labels, + Annotations: annotations, + }, + Spec: corev1.PodSpec{ + RuntimeClassName: nimCache.GetRuntimeClassName(), + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: nimCache.GetUserID(), + FSGroup: nimCache.GetGroupID(), + RunAsNonRoot: ptr.To[bool](true), + }, + Containers: []corev1.Container{}, + RestartPolicy: corev1.RestartPolicyNever, + Volumes: []corev1.Volume{ + { + Name: "nim-cache-volume", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvcName, + }, + }, + }, + }, + ImagePullSecrets: []corev1.LocalObjectReference{}, + ServiceAccountName: NIMCacheServiceAccount, + Tolerations: nimCache.GetTolerations(), + NodeSelector: nimCache.GetNodeSelectors(), + }, + } + + // SeccompProfile must be set for TKGS + if platformType == k8sutil.TKGS { + pod.Spec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + } + } + + pod.Spec.Containers = []corev1.Container{ + { + Name: NIMBuildContainerName, + Image: nimCache.Spec.Source.NGC.ModelPuller, + EnvFrom: nimCache.Spec.Source.EnvFromSecrets(), + Env: []corev1.EnvVar{ + { + Name: "NIM_CACHE_PATH", + Value: "/model-store", + }, + { + Name: "NIM_SERVER_PORT", + Value: "8000", + }, + { + Name: "NIM_HTTP_API_PORT", + Value: "8000", + }, + { + Name: "NIM_CUSTOM_MODEL_NAME", + Value: nimCache.Name + "-" + nimProfile.Name, + }, + { + Name: "NGC_API_KEY", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: nimCache.Spec.Source.NGC.AuthSecret, + }, + Key: "NGC_API_KEY", + }, + }, + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "nim-cache-volume", + MountPath: "/model-store", + SubPath: nimCache.Spec.Storage.PVC.SubPath, + }, + }, + Resources: corev1.ResourceRequirements{ + Limits: map[corev1.ResourceName]apiResource.Quantity{ + "cpu": nimCache.Spec.Resources.CPU, + "memory": nimCache.Spec.Resources.Memory, + "nvidia.com/gpu": gpuQuantity, + }, + Requests: map[corev1.ResourceName]apiResource.Quantity{ + "cpu": nimCache.Spec.Resources.CPU, + "memory": nimCache.Spec.Resources.Memory, + "nvidia.com/gpu": gpuQuantity, + }, + }, + TerminationMessagePath: "/dev/termination-log", + TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError, + Ports: []corev1.ContainerPort{{ + Name: "api", + ContainerPort: 8000, + Protocol: corev1.ProtocolTCP, + }}, + ReadinessProbe: &corev1.Probe{ + InitialDelaySeconds: 15, + TimeoutSeconds: 1, + PeriodSeconds: 10, + SuccessThreshold: 1, + FailureThreshold: 3, + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/v1/health/ready", + Port: intstr.FromString("api"), + }, + }, + }, + LivenessProbe: &corev1.Probe{ + InitialDelaySeconds: 15, + TimeoutSeconds: 1, + PeriodSeconds: 10, + SuccessThreshold: 1, + FailureThreshold: 3, + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/v1/health/live", + Port: intstr.FromString("api"), + }, + }, + }, + StartupProbe: &corev1.Probe{ + InitialDelaySeconds: 30, + TimeoutSeconds: 1, + PeriodSeconds: 10, + SuccessThreshold: 1, + FailureThreshold: 30, + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/v1/health/ready", + Port: intstr.FromString("api"), + }, + }, + }, + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: ptr.To[bool](false), + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + RunAsNonRoot: ptr.To[bool](true), + RunAsGroup: nimCache.GetGroupID(), + RunAsUser: nimCache.GetUserID(), + }, + }, + } + pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{ + { + Name: nimCache.Spec.Source.NGC.PullSecret, + }, + } + + // Merge env with the user provided values + pod.Spec.Containers[0].Env = utils.MergeEnvVars(pod.Spec.Containers[0].Env, nimCache.Spec.Env) + + return pod, nil +} + +func isPodReady(pod *corev1.Pod) bool { + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady { + return cond.Status == corev1.ConditionTrue + } + } + return false +} + +func (r *NIMBuildReconciler) deletePod(ctx context.Context, pod *corev1.Pod) error { + logger := log.FromContext(ctx) + logger.Info("Deleting Engine Build Pod", "name", pod.Name, "namespace", pod.Namespace) + if err := r.Delete(ctx, pod); err != nil { + logger.Error(err, "Failed to Delete Engine Build Pod", "name", pod.Name) + return err + } + return nil +} diff --git a/internal/controller/platform/standalone/nimservice.go b/internal/controller/platform/standalone/nimservice.go index c2e611136..24c41bda9 100644 --- a/internal/controller/platform/standalone/nimservice.go +++ b/internal/controller/platform/standalone/nimservice.go @@ -686,24 +686,6 @@ func (r *NIMServiceReconciler) getNIMCacheProfile(ctx context.Context, nimServic return nil, nil } -// getTensorParallelismByProfile returns the value of tensor parallelism parameter in the given NIM profile. -func (r *NIMServiceReconciler) getTensorParallelismByProfile(ctx context.Context, profile *appsv1alpha1.NIMProfile) (string, error) { - // List of possible keys for tensor parallelism - possibleKeys := []string{"tensorParallelism", "tp"} - - tensorParallelism := "" - - // Iterate through possible keys and return the first valid value - for _, key := range possibleKeys { - if value, exists := profile.Config[key]; exists { - tensorParallelism = value - break - } - } - - return tensorParallelism, nil -} - // assignGPUResources automatically assigns GPU resources to the NIMService based on the provided profile, // but retains any user-specified GPU resources if they are explicitly provided. // @@ -737,7 +719,7 @@ func (r *NIMServiceReconciler) assignGPUResources(ctx context.Context, nimServic // If no user-provided GPU resource is found, proceed with auto-assignment // Get tensorParallelism from the profile - tensorParallelism, err := r.getTensorParallelismByProfile(ctx, profile) + tensorParallelism, err := utils.GetTensorParallelismByProfileTags(profile.Config) if err != nil { logger.Error(err, "Failed to retrieve tensorParallelism") return err diff --git a/internal/controller/platform/standalone/nimservice_test.go b/internal/controller/platform/standalone/nimservice_test.go index 45b84d5ff..62880dc7c 100644 --- a/internal/controller/platform/standalone/nimservice_test.go +++ b/internal/controller/platform/standalone/nimservice_test.go @@ -50,6 +50,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "github.com/NVIDIA/k8s-nim-operator/internal/utils" + appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil" @@ -1275,7 +1277,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { Config: map[string]string{"tp": "4"}, } - tensorParallelism, err := reconciler.getTensorParallelismByProfile(context.TODO(), profile) + tensorParallelism, err := utils.GetTensorParallelismByProfileTags(profile.Config) Expect(err).To(BeNil()) Expect(tensorParallelism).To(Equal("4")) }) @@ -1285,7 +1287,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() { Name: "test-profile", Config: map[string]string{}, } - tensorParallelism, err := reconciler.getTensorParallelismByProfile(context.TODO(), profile) + tensorParallelism, err := utils.GetTensorParallelismByProfileTags(profile.Config) Expect(err).To(BeNil()) Expect(tensorParallelism).To(BeEmpty()) }) diff --git a/internal/utils/utils.go b/internal/utils/utils.go index 05903d709..0a793847f 100644 --- a/internal/utils/utils.go +++ b/internal/utils/utils.go @@ -426,3 +426,21 @@ func updateRoleBinding(obj, desired *rbacv1.RoleBinding) *rbacv1.RoleBinding { obj.RoleRef = desired.RoleRef return obj } + +// GetTensorParallelismByProfileTags retrieves the tensor parallelism value from the provided tags. +func GetTensorParallelismByProfileTags(tags map[string]string) (string, error) { + // List of possible keys for tensor parallelism + possibleKeys := []string{"tensorParallelism", "tp"} + + tensorParallelism := "" + + // Iterate through possible keys and return the first valid value + for _, key := range possibleKeys { + if value, exists := tags[key]; exists { + tensorParallelism = value + break + } + } + + return tensorParallelism, nil +} From 607007770e658856b2211eeb1bd84b4cfd99e246 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 17 Jun 2025 07:36:21 +0000 Subject: [PATCH 02/16] Adding support for NIMBuild CRD for buildable profiles Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/nimbuild_types.go | 2 + internal/controller/nimbuild_controller.go | 329 +++++++++++++++++++-- internal/controller/nimcache_controller.go | 35 +-- internal/k8sutil/k8sutil.go | 33 +++ 4 files changed, 347 insertions(+), 52 deletions(-) diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go index 3aa38be1d..12acf0a7c 100644 --- a/api/apps/v1alpha1/nimbuild_types.go +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -93,6 +93,8 @@ const ( NimBuildConditionEngineBuildPodCompleted = "NIM_BUILD_ENGINE_BUILD_POD_COMPLETED" // NimBuildConditionEngineBuildPodPending indicates that the engine build pod is in pending state. NimBuildConditionEngineBuildPodPending = "NIM_BUILD_ENGINE_BUILD_POD_PENDING" + // NimBuildConditionModelManifestPodCompleted indicates that the model manifest pod is in completed state. + NimBuildConditionModelManifestPodCompleted = "NIM_BUILD_MODEL_MANIFEST_POD_COMPLETED" // NimBuildStatusNotReady indicates that build is not ready. NimBuildStatusNotReady = "NotReady" diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index 0c87bf955..8604400e0 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "reflect" + "strings" "time" "github.com/go-logr/logr" @@ -40,6 +41,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/yaml" + + "github.com/NVIDIA/k8s-nim-operator/internal/nimparser" + nimparserutils "github.com/NVIDIA/k8s-nim-operator/internal/nimparser/utils" appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" @@ -56,6 +61,8 @@ const ( // NIMBuildContainerName returns the name of the container used for NIM Build operations. NIMBuildContainerName = "nim-build-ctr" + + NIMBuildManifestContainerName = "nim-build-manifest-ctr" ) // NIMBuildReconciler reconciles a NIMBuild object. @@ -285,6 +292,15 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap logger.Error(err, "reconciliation of nimbuild failed") return ctrl.Result{}, err } + + if meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted) && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionModelManifestPodCompleted) { + err = r.reconcileLocalModelManifest(ctx, nimBuild, nimCache) + if err != nil { + logger.Error(err, "reconciliation of nimbuild failed") + return ctrl.Result{}, err + } + } + } else { conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionWaitForNimCache, metav1.ConditionTrue, "NIMCache not ready", "Waiting for NIMCache to be ready before building engines") nimBuild.Status.State = appsv1alpha1.NimBuildStatusPending @@ -302,8 +318,6 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache) error { logger := r.GetLogger() - // If the NIMCache is not in a state to build engines, return early - // Wait for caching to complete before building engines buildableProfile := appsv1alpha1.NIMProfile{} if nimBuild.Spec.ProfileName == "" { buildableProfiles := getBuildableProfiles(nimCache) @@ -311,16 +325,16 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild switch { case len(buildableProfiles) > 1: logger.Info("Multiple buildable profiles found", "Profiles", buildableProfiles) - conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionMultipleBuildableProfilesFound, metav1.ConditionTrue, "MultipleBuildableProfilesFound", "Multiple buildable profiles found, please select one profile to build") + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionMultipleBuildableProfilesFound, metav1.ConditionTrue, "MultipleBuildableProfilesFound", "Multiple buildable profiles found in NIM Cache, please select one profile to build") nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed return nil case len(buildableProfiles) == 1: - logger.Info("Single buildable profile found", "Profile", buildableProfiles[0].Name) - conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile found") + logger.Info("Selected buildable profile found", "Profile", buildableProfiles[0].Name) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile cached in NIM Cache") buildableProfile = buildableProfiles[0] default: logger.Info("No buildable profiles found, skipping engine build") - conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found for NIM Cache") + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found in NIM Cache") nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed return nil } @@ -334,11 +348,11 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.ProfileName) if foundProfile == nil { logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.ProfileName) - conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionMultipleBuildableProfilesFound, metav1.ConditionTrue, "MultipleBuildableProfilesFound", "Multiple buildable profiles found, please select one profile to build") + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found, please select a valid profile from the NIM cache") nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed return nil } else { - logger.Info("Single buildable profile found", "Profile", foundProfile.Name) + logger.Info("Selected buildable profile found", "Profile", foundProfile.Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile found") buildableProfile = *foundProfile } @@ -360,11 +374,13 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild if err := controllerutil.SetControllerReference(nimBuild, pod, r.GetScheme()); err != nil { return err } - err = r.Create(ctx, pod) + + err = r.createPod(ctx, pod) if err != nil { - logger.Error(err, "Failed to create pod") + logger.Error(err, "failed to create", "pod", pod.Name) return err } + logger.Info("Created pod for NIM Cache engine build", "pod", podName) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCreated, metav1.ConditionTrue, "EngineBuildPodCreated", "The pod to build engine has been created") return nil @@ -403,8 +419,7 @@ func (r *NIMBuildReconciler) reconcileEngineBuildPodStatus(ctx context.Context, switch { case isPodReady(pod) && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted): logger.Info("Pod Ready", "pod", podName) - conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted, metav1.ConditionTrue, "PodReady", "The Pod to cache NIM has successfully completed") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusReady + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted, metav1.ConditionTrue, "PodReady", "The Pod to build engine has successfully completed") if err := r.deletePod(ctx, pod); err != nil { logger.Error(err, "Unable to delete NIM Cache build engine pod", "Name", pod.Name) return err @@ -417,7 +432,7 @@ func (r *NIMBuildReconciler) reconcileEngineBuildPodStatus(ctx context.Context, case !isPodReady(pod) && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted): logger.Info("Caching NIM is in progress, build engine pod running", "job", podName) - conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodPending, metav1.ConditionTrue, "PodRunning", "The Pod to cache NIM is in progress") + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodPending, metav1.ConditionTrue, "PodRunning", "The Pod to build engine is in progress") nimBuild.Status.State = appsv1alpha1.NimBuildStatusInProgress } @@ -468,11 +483,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB } if platformType == k8sutil.OpenShift { - if nimCache.GetProxySpec() != nil { - annotations["openshift.io/scc"] = "anyuid" - } else { - annotations["openshift.io/scc"] = "nonroot" - } + annotations["openshift.io/scc"] = "nonroot" } pod := &corev1.Pod{ @@ -655,3 +666,285 @@ func (r *NIMBuildReconciler) deletePod(ctx context.Context, pod *corev1.Pod) err } return nil } + +func (r *NIMBuildReconciler) reconcileLocalModelManifest(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache) error { + logger := r.GetLogger() + + // Model manifest is available only for NGC model pullers + if nimCache.Spec.Source.NGC == nil { + return nil + } + + existingConfig := &corev1.ConfigMap{} + cmName := getManifestConfigName(nimCache) + err := r.Get(ctx, client.ObjectKey{Name: cmName, Namespace: nimBuild.Namespace}, existingConfig) + if err != nil { + logger.Error(err, "failed to get configmap of the local model manifest", "name", cmName) + return err + } + + // Create a temporary pod for parsing local model manifest + pod := constructLocalManifestPodSpec(nimCache, nimBuild, r.orchestratorType) + // Add nimBuild as owner for watching on status change + if err := controllerutil.SetControllerReference(nimBuild, pod, r.GetScheme()); err != nil { + return err + } + err = r.createPod(ctx, pod) + if err != nil { + logger.Error(err, "failed to create", "pod", pod.Name) + return err + } + + existingPod := &corev1.Pod{} + err = r.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: nimBuild.Namespace}, existingPod) + if err != nil { + logger.Error(err, "failed to get pod for model selection", "pod", pod.Name) + return err + } + + if existingPod.Status.Phase != corev1.PodRunning { + // requeue request with delay until the pod is ready + return nil + } + + // Extract manifest file + output, err := k8sutil.GetPodLogs(ctx, existingPod, NIMBuildManifestContainerName) + if err != nil { + logger.Error(err, "failed to get pod logs for parsing model manifest file", "pod", pod.Name) + return err + } + + if output == "" { + logger.Info("Requeuing to wait for the manifest to be copied from the container") + return nil + } + + parser := nimparserutils.GetNIMParser([]byte(output)) + // Parse the file + manifest, err := parser.ParseModelManifestFromRawOutput([]byte(output)) + if err != nil { + logger.Error(err, "Failed to parse model manifest from the pod") + return err + } + logger.V(2).Info("local manifest file", "nimbuild", nimBuild.Name, "manifest", manifest) + + // Create a ConfigMap with the model manifest file for re-use + err = r.updateManifestConfigMap(ctx, nimCache, &manifest) + if err != nil { + logger.Error(err, "Failed to create model manifest config map") + return err + } + + // Model manifest is successfully extracted, cleanup temporary pod + err = r.Delete(ctx, existingPod) + if err != nil && !errors.IsNotFound(err) { + logger.Error(err, "failed to delete", "pod", pod.Name) + // requeue request with delay until the pod is cleaned up + // this is required as NIM containers are resource heavy + return err + } + + // To Do: Explore changing the profile list to a map for faster lookups + for _, profileName := range manifest.GetProfilesList() { + for _, selectedProfile := range nimCache.Status.Profiles { + if selectedProfile.Name == profileName { + break + } + } + logger.Info("Adding profile to NIMCache status", "profileName", profileName) + // If the profile is not found, add it to the status + nimCache.Status.Profiles = append(nimCache.Status.Profiles, appsv1alpha1.NIMProfile{ + Name: profileName, + Model: manifest.GetProfileModel(profileName), + Config: manifest.GetProfileTags(profileName), + Release: manifest.GetProfileRelease(profileName), + }) + } + + // Update the NIMCache status with the new profiles + obj := &appsv1alpha1.NIMCache{} + errGet := r.Get(ctx, types.NamespacedName{Name: nimCache.Name, Namespace: nimCache.GetNamespace()}, obj) + if errGet != nil { + logger.Error(errGet, "error getting NIMCache", "name", nimCache.Name) + return errGet + } + obj.Status = nimCache.Status + if err := r.Status().Update(ctx, obj); err != nil { + logger.Error(err, "Failed to update status", "NIMCache", nimCache.Name) + return err + } + // Update the NIMBuild status + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionModelManifestPodCompleted, metav1.ConditionTrue, "PodCompleted", "The Pod to read local model manifest is completed") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusReady + + return nil +} + +func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *appsv1alpha1.NIMBuild, platformType k8sutil.OrchestratorType) *corev1.Pod { + + labels := map[string]string{ + "app": "k8s-nim-operator", + "app.kubernetes.io/name": nimBuild.Name, + "app.kubernetes.io/managed-by": "k8s-nim-operator", + } + + annotations := map[string]string{ + "sidecar.istio.io/inject": "false", + } + + if platformType == k8sutil.OpenShift { + annotations = map[string]string{ + "openshift.io/required-scc": "nonroot", + } + } + pvcName := shared.GetPVCName(nimCache, nimCache.Spec.Storage.PVC) + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: nimBuild.Name + "-local-manifest-pod", + Namespace: nimBuild.Namespace, + Labels: labels, + Annotations: annotations, + }, + Spec: corev1.PodSpec{ + RuntimeClassName: nimCache.GetRuntimeClassName(), + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: nimCache.GetUserID(), + FSGroup: nimCache.GetGroupID(), + RunAsNonRoot: ptr.To[bool](true), + }, + Containers: []corev1.Container{}, + RestartPolicy: corev1.RestartPolicyNever, + Volumes: []corev1.Volume{ + { + Name: "nim-cache-volume", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvcName, + }, + }, + }, + }, + ImagePullSecrets: []corev1.LocalObjectReference{}, + ServiceAccountName: NIMCacheServiceAccount, + Tolerations: nimCache.GetTolerations(), + NodeSelector: nimCache.GetNodeSelectors(), + }, + } + + // SeccompProfile must be set for TKGS + if platformType == k8sutil.TKGS { + pod.Spec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + } + } + + pod.Spec.Containers = []corev1.Container{ + { + Name: NIMBuildManifestContainerName, + Image: nimCache.Spec.Source.NGC.ModelPuller, + Command: getNIMBuildSidecarCommand(), + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: ptr.To[bool](false), + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + RunAsNonRoot: ptr.To[bool](true), + RunAsGroup: nimCache.GetGroupID(), + RunAsUser: nimCache.GetUserID(), + }, + Resources: corev1.ResourceRequirements{ + Limits: map[corev1.ResourceName]apiResource.Quantity{ + "cpu": nimCache.Spec.Resources.CPU, + "memory": nimCache.Spec.Resources.Memory, + }, + Requests: map[corev1.ResourceName]apiResource.Quantity{ + "cpu": nimCache.Spec.Resources.CPU, + "memory": nimCache.Spec.Resources.Memory, + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "nim-cache-volume", + MountPath: "/model-store", + SubPath: nimCache.Spec.Storage.PVC.SubPath, + }, + }, + }, + } + pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{ + { + Name: nimCache.Spec.Source.NGC.PullSecret, + }, + } + + // Merge env with the user provided values + pod.Spec.Containers[0].Env = utils.MergeEnvVars(pod.Spec.Containers[0].Env, nimCache.Spec.Env) + + return pod +} + +func getNIMBuildSidecarCommand() []string { + return []string{ + "sh", + "-c", + strings.Join([]string{ + "cat /model-store/local_cache/local_manifest.yaml;", + "sleep infinity", + }, " "), + } +} + +func (r *NIMBuildReconciler) createPod(ctx context.Context, pod *corev1.Pod) error { + // Create pod + err := r.Create(ctx, pod) + if err != nil && !errors.IsAlreadyExists(err) { + return err + } + return nil +} + +func (r *NIMBuildReconciler) updateManifestConfigMap(ctx context.Context, nimCache *appsv1alpha1.NIMCache, manifestData *nimparser.NIMManifestInterface) error { + // Convert manifestData to YAML + manifestBytes, err := yaml.Marshal(manifestData) + if err != nil { + return fmt.Errorf("failed to marshal manifest data: %w", err) + } + + // Pretty-print the YAML content + var prettyYAML interface{} + err = yaml.Unmarshal(manifestBytes, &prettyYAML) + if err != nil { + return fmt.Errorf("failed to unmarshal manifest data for pretty-printing: %w", err) + } + + prettyManifestBytes, err := yaml.Marshal(prettyYAML) + if err != nil { + return fmt.Errorf("failed to re-marshal manifest data for pretty-printing: %w", err) + } + + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: getManifestConfigName(nimCache), + Namespace: nimCache.GetNamespace(), + Labels: map[string]string{ + "app": nimCache.GetName(), + }, + }, + } + + // Fetch the existing ConfigMap if it exists + err = r.Get(ctx, client.ObjectKey{Name: configMap.Name, Namespace: configMap.Namespace}, configMap) + if err != nil { + return fmt.Errorf("failed to get ConfigMap %s: %w", configMap.Name, err) + } + + // Update the data + configMap.Data["local_model_manifest.yaml"] = string(prettyManifestBytes) + + // Create the ConfigMap + if err := r.Update(ctx, configMap); err != nil { + return fmt.Errorf("failed to create manifest ConfigMap %s: %w", configMap.Name, err) + } + return nil +} diff --git a/internal/controller/nimcache_controller.go b/internal/controller/nimcache_controller.go index c85b6abc0..c0c56a01d 100644 --- a/internal/controller/nimcache_controller.go +++ b/internal/controller/nimcache_controller.go @@ -17,11 +17,9 @@ limitations under the License. package controller import ( - "bytes" "context" "encoding/json" "fmt" - "io" "reflect" "slices" "strings" @@ -36,8 +34,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -630,7 +626,7 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach } // Extract manifest file - output, err := r.getPodLogs(ctx, existingPod) + output, err := k8sutil.GetPodLogs(ctx, existingPod, NIMCacheContainerName) if err != nil { logger.Error(err, "failed to get pod logs for parsing model manifest file", "pod", pod.Name) return false, err @@ -1013,35 +1009,6 @@ func constructPodSpec(nimCache *appsv1alpha1.NIMCache, platformType k8sutil.Orch return pod } -func (r *NIMCacheReconciler) getPodLogs(ctx context.Context, pod *corev1.Pod) (string, error) { - podLogOpts := corev1.PodLogOptions{Container: NIMCacheContainerName} - config, err := rest.InClusterConfig() - if err != nil { - return "", err - } - // create a clientset - clientset, err := kubernetes.NewForConfig(config) - if err != nil { - return "", err - } - req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts) - podLogs, err := req.Stream(ctx) - if err != nil { - return "", err - } - defer func(podLogs io.ReadCloser) { - _ = podLogs.Close() - }(podLogs) - - buf := new(bytes.Buffer) - _, err = io.Copy(buf, podLogs) - if err != nil { - return "", err - } - - return buf.String(), nil -} - func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache, platformType k8sutil.OrchestratorType) (*batchv1.Job, error) { logger := r.GetLogger() pvcName := shared.GetPVCName(nimCache, nimCache.Spec.Storage.PVC) diff --git a/internal/k8sutil/k8sutil.go b/internal/k8sutil/k8sutil.go index 667ee3580..cad37d769 100644 --- a/internal/k8sutil/k8sutil.go +++ b/internal/k8sutil/k8sutil.go @@ -17,14 +17,18 @@ limitations under the License. package k8sutil import ( + "bytes" "context" goerrors "errors" "fmt" + "io" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -292,3 +296,32 @@ func GetRawYAMLFromConfigMap(ctx context.Context, k8sClient client.Client, names return raw, nil } + +func GetPodLogs(ctx context.Context, pod *corev1.Pod, containerName string) (string, error) { + podLogOpts := corev1.PodLogOptions{Container: containerName} + config, err := rest.InClusterConfig() + if err != nil { + return "", err + } + // create a clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return "", err + } + req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts) + podLogs, err := req.Stream(ctx) + if err != nil { + return "", err + } + defer func(podLogs io.ReadCloser) { + _ = podLogs.Close() + }(podLogs) + + buf := new(bytes.Buffer) + _, err = io.Copy(buf, podLogs) + if err != nil { + return "", err + } + + return buf.String(), nil +} From 4449cf26b552b1d0780f674acd2c5c44c62f1179 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 17 Jun 2025 08:13:44 +0000 Subject: [PATCH 03/16] Fixing merge issues Signed-off-by: Vishesh Tanksale --- internal/controller/nimbuild_controller.go | 6 ++++++ internal/controller/nimcache_controller.go | 2 -- internal/controller/platform/standalone/nimservice_test.go | 2 +- internal/k8sutil/k8sutil.go | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index 8604400e0..f19f5fb30 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -32,6 +32,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/discovery" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -194,6 +195,11 @@ func (r *NIMBuildReconciler) GetUpdater() conditions.Updater { return r.updater } +// GetDiscoveryClient returns the discovery client instance. +func (r *NIMBuildReconciler) GetDiscoveryClient() discovery.DiscoveryInterface { + return nil +} + func (r *NIMBuildReconciler) GetRenderer() render.Renderer { return nil } diff --git a/internal/controller/nimcache_controller.go b/internal/controller/nimcache_controller.go index ac83a8b8f..a071c1b3c 100644 --- a/internal/controller/nimcache_controller.go +++ b/internal/controller/nimcache_controller.go @@ -35,8 +35,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/discovery" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" diff --git a/internal/controller/platform/standalone/nimservice_test.go b/internal/controller/platform/standalone/nimservice_test.go index d18aac091..9de7d2740 100644 --- a/internal/controller/platform/standalone/nimservice_test.go +++ b/internal/controller/platform/standalone/nimservice_test.go @@ -53,9 +53,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "k8s.io/apimachinery/pkg/version" "github.com/NVIDIA/k8s-nim-operator/internal/utils" - "k8s.io/apimachinery/pkg/version" appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1" "github.com/NVIDIA/k8s-nim-operator/internal/conditions" diff --git a/internal/k8sutil/k8sutil.go b/internal/k8sutil/k8sutil.go index 2b7573b6a..c614cc743 100644 --- a/internal/k8sutil/k8sutil.go +++ b/internal/k8sutil/k8sutil.go @@ -28,9 +28,9 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/discovery" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" - "k8s.io/client-go/discovery" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" From 7998c58dd23edbb3b88ece76b2ec650f32117e7e Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 17 Jun 2025 22:15:20 +0000 Subject: [PATCH 04/16] Addressing review comments Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/nimbuild_types.go | 39 ++-- api/apps/v1alpha1/zz_generated.deepcopy.go | 34 ++-- .../manifests/apps.nvidia.com_nimbuilds.yaml | 116 +++++++++++- .../crd/bases/apps.nvidia.com_nimbuilds.yaml | 116 +++++++++++- .../crds/apps.nvidia.com_nimbuilds.yaml | 116 +++++++++++- internal/controller/nimbuild_controller.go | 174 ++++++++++++------ 6 files changed, 487 insertions(+), 108 deletions(-) diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go index 12acf0a7c..1487b240a 100644 --- a/api/apps/v1alpha1/nimbuild_types.go +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -17,6 +17,7 @@ limitations under the License. package v1alpha1 import ( + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -27,22 +28,18 @@ import ( type NIMBuildSpec struct { // Required: Reference to the model weights from NIMCache NIMCacheRef string `json:"nimCacheRef"` - // Required: Profile name for this engine build - ProfileName string `json:"profileName,omitempty"` - // Required: Number of GPUs to build the engine with - Parallelism int `json:"parallelism,omitempty"` - // Optional: Any user-defined tags for tracking builds + // Profile name for this engine build + Profile string `json:"profile,omitempty"` + // Any user-defined tags for tracking builds Tags map[string]string `json:"tags,omitempty"` - // Optional: Additional build params + // Additional build params BuildParams []string `json:"additionalBuildParams,omitempty"` -} - -// Parallelism defines the compute parallelism strategy for building the engine. -type Parallelism struct { - // Number of tensor parallelism shards (splits model across GPUs at tensor level) - TP int `json:"tp"` - // Optional: Number of pipeline parallelism stages (model split across layers) - PP int `json:"pp,omitempty"` + // Resources is the resource requirements for the NIMBuild pod. + Resources *corev1.ResourceRequirements `json:"resources,omitempty"` + // Tolerations for running the job to cache the NIM model + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + // NodeSelector is the node selector labels to schedule the caching job. + NodeSelector map[string]string `json:"nodeSelector,omitempty"` } // NIMBuildStatus defines the observed state of NIMBuild. @@ -96,6 +93,10 @@ const ( // NimBuildConditionModelManifestPodCompleted indicates that the model manifest pod is in completed state. NimBuildConditionModelManifestPodCompleted = "NIM_BUILD_MODEL_MANIFEST_POD_COMPLETED" + NimBuildConditionNIMCacheNotFound = "NIM_BUILD_NIM_CACHE_NOT_FOUND" + + NimBuildConditionNimCacheFailed = "NIM_BUILD_NIM_CACHE_FAILED" + // NimBuildStatusNotReady indicates that build is not ready. NimBuildStatusNotReady = "NotReady" @@ -114,3 +115,13 @@ const ( func init() { SchemeBuilder.Register(&NIMBuild{}, &NIMBuildList{}) } + +// GetTolerations returns tolerations configured for the NIMBuild Pod. +func (n *NIMBuild) GetTolerations() []corev1.Toleration { + return n.Spec.Tolerations +} + +// GetNodeSelectors returns nodeselectors configured for the NIMBuild Pod. +func (n *NIMBuild) GetNodeSelectors() map[string]string { + return n.Spec.NodeSelector +} diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index b5ffad11a..aad85a893 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -774,6 +774,25 @@ func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(corev1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]corev1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMBuildSpec. @@ -2411,21 +2430,6 @@ func (in *ObjectStoreCredentials) DeepCopy() *ObjectStoreCredentials { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *Parallelism) DeepCopyInto(out *Parallelism) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Parallelism. -func (in *Parallelism) DeepCopy() *Parallelism { - if in == nil { - return nil - } - out := new(Parallelism) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PersistentVolumeClaim) DeepCopyInto(out *PersistentVolumeClaim) { *out = *in diff --git a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml index a16b8c4ae..091eae66a 100644 --- a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml +++ b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml @@ -49,24 +49,126 @@ spec: model config and weights. properties: additionalBuildParams: - description: 'Optional: Additional build params' + description: Additional build params items: type: string type: array nimCacheRef: description: 'Required: Reference to the model weights from NIMCache' type: string - parallelism: - description: 'Required: Number of GPUs to build the engine with' - type: integer - profileName: - description: 'Required: Profile name for this engine build' + nodeSelector: + additionalProperties: + type: string + description: NodeSelector is the node selector labels to schedule + the caching job. + type: object + profile: + description: Profile name for this engine build type: string + resources: + description: Resources is the resource requirements for the NIMBuild + pod. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object tags: additionalProperties: type: string - description: 'Optional: Any user-defined tags for tracking builds' + description: Any user-defined tags for tracking builds type: object + tolerations: + description: Tolerations for running the job to cache the NIM model + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array required: - nimCacheRef type: object diff --git a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml index a16b8c4ae..091eae66a 100644 --- a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml +++ b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml @@ -49,24 +49,126 @@ spec: model config and weights. properties: additionalBuildParams: - description: 'Optional: Additional build params' + description: Additional build params items: type: string type: array nimCacheRef: description: 'Required: Reference to the model weights from NIMCache' type: string - parallelism: - description: 'Required: Number of GPUs to build the engine with' - type: integer - profileName: - description: 'Required: Profile name for this engine build' + nodeSelector: + additionalProperties: + type: string + description: NodeSelector is the node selector labels to schedule + the caching job. + type: object + profile: + description: Profile name for this engine build type: string + resources: + description: Resources is the resource requirements for the NIMBuild + pod. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object tags: additionalProperties: type: string - description: 'Optional: Any user-defined tags for tracking builds' + description: Any user-defined tags for tracking builds type: object + tolerations: + description: Tolerations for running the job to cache the NIM model + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array required: - nimCacheRef type: object diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml index a16b8c4ae..091eae66a 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml @@ -49,24 +49,126 @@ spec: model config and weights. properties: additionalBuildParams: - description: 'Optional: Additional build params' + description: Additional build params items: type: string type: array nimCacheRef: description: 'Required: Reference to the model weights from NIMCache' type: string - parallelism: - description: 'Required: Number of GPUs to build the engine with' - type: integer - profileName: - description: 'Required: Profile name for this engine build' + nodeSelector: + additionalProperties: + type: string + description: NodeSelector is the node selector labels to schedule + the caching job. + type: object + profile: + description: Profile name for this engine build type: string + resources: + description: Resources is the resource requirements for the NIMBuild + pod. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object tags: additionalProperties: type: string - description: 'Optional: Any user-defined tags for tracking builds' + description: Any user-defined tags for tracking builds type: object + tolerations: + description: Tolerations for running the job to cache the NIM model + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array required: - nimCacheRef type: object diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index f19f5fb30..ea282c6e1 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -21,7 +21,6 @@ import ( "fmt" "reflect" "strings" - "time" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" @@ -36,9 +35,11 @@ import ( "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -223,6 +224,23 @@ func (r *NIMBuildReconciler) GetOrchestratorType(ctx context.Context) (k8sutil.O // SetupWithManager sets up the controller with the Manager. func (r *NIMBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { r.recorder = mgr.GetEventRecorderFor("nimbuild-controller") + + err := mgr.GetFieldIndexer().IndexField( + context.Background(), + &appsv1alpha1.NIMBuild{}, + "spec.nimCacheRef", + func(rawObj client.Object) []string { + nimBuild, ok := rawObj.(*appsv1alpha1.NIMBuild) + if !ok { + return []string{} + } + return []string{nimBuild.Spec.NIMCacheRef} + }, + ) + if err != nil { + return err + } + return ctrl.NewControllerManagedBy(mgr). For(&appsv1alpha1.NIMBuild{}). Owns(&corev1.Pod{}). @@ -238,17 +256,52 @@ func (r *NIMBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { return true } - // Handle only spec updates - return !reflect.DeepEqual(oldNIMBuild.Spec, newNIMBuild.Spec) + // Allow only status updates + // Reject updates that change anything other than status + // Spec is immutable + if !reflect.DeepEqual(oldNIMBuild.Spec, newNIMBuild.Spec) { + return false + } + // Allow if only status changed + return !reflect.DeepEqual(oldNIMBuild.Status, newNIMBuild.Status) } } // For other types we watch, reconcile them return true }, - }). + }).Watches( + &appsv1alpha1.NIMCache{}, + handler.EnqueueRequestsFromMapFunc(r.mapNIMCacheToNIMBuild), + builder.WithPredicates(predicate.ResourceVersionChangedPredicate{}), + ). Complete(r) } +func (r *NIMBuildReconciler) mapNIMCacheToNIMBuild(ctx context.Context, obj client.Object) []ctrl.Request { + nimCache, ok := obj.(*appsv1alpha1.NIMCache) + if !ok { + return []ctrl.Request{} + } + + // Get all NIMBuilds that reference this NIMCache + var nimBuilds appsv1alpha1.NIMBuildList + if err := r.List(ctx, &nimBuilds, client.MatchingFields{"spec.nimCacheRef": nimCache.GetName()}, client.InNamespace(nimCache.GetNamespace())); err != nil { + return []ctrl.Request{} + } + + // Enqueue reconciliation for each matching NIMBuild object + requests := make([]ctrl.Request, len(nimBuilds.Items)) + for i, item := range nimBuilds.Items { + requests[i] = ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: item.Name, + Namespace: item.Namespace, + }, + } + } + return requests +} + func (r *NIMBuildReconciler) cleanupNIMBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild) error { var errList []error logger := r.GetLogger() @@ -289,29 +342,38 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap nimCache := &appsv1alpha1.NIMCache{} if err := r.Get(ctx, nimCacheNamespacedName, nimCache); err != nil { logger.Error(err, "unable to fetch NIMCache for NIMBuild", "NIMCache", nimCacheNamespacedName) + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNIMCacheNotFound, metav1.ConditionTrue, "NIMCache not found", "Not able to get NIMCache for NIMBuild") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed return ctrl.Result{}, err } - if nimCache.Status.State == appsv1alpha1.NimCacheStatusReady { + switch nimCache.Status.State { + case appsv1alpha1.NimCacheStatusReady: + logger.V(4).Info("NIMCache is ready", "nimcache", nimCache.Name, "nimservice", nimBuild.Name) + case appsv1alpha1.NimCacheStatusFailed: + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNimCacheFailed, metav1.ConditionTrue, "NIMCache failed", "NIMCache is failed. NIMBuild can not progress") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + return ctrl.Result{}, nil + default: + conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionWaitForNimCache, metav1.ConditionTrue, "NIMCache not ready", "Waiting for NIMCache to be ready before building engines") + nimBuild.Status.State = appsv1alpha1.NimBuildStatusPending + return ctrl.Result{}, nil + } + + if !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted) { err := r.reconcileEngineBuild(ctx, nimBuild, nimCache) if err != nil { logger.Error(err, "reconciliation of nimbuild failed") return ctrl.Result{}, err } + } - if meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted) && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionModelManifestPodCompleted) { - err = r.reconcileLocalModelManifest(ctx, nimBuild, nimCache) - if err != nil { - logger.Error(err, "reconciliation of nimbuild failed") - return ctrl.Result{}, err - } + if meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted) && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionModelManifestPodCompleted) { + err := r.reconcileLocalModelManifest(ctx, nimBuild, nimCache) + if err != nil { + logger.Error(err, "reconciliation of nimbuild failed") + return ctrl.Result{}, err } - - } else { - conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionWaitForNimCache, metav1.ConditionTrue, "NIMCache not ready", "Waiting for NIMCache to be ready before building engines") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusPending - return ctrl.Result{RequeueAfter: time.Second * 30}, nil - } err := r.updateNIMBuildStatus(ctx, nimBuild) @@ -325,9 +387,9 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache) error { logger := r.GetLogger() buildableProfile := appsv1alpha1.NIMProfile{} - if nimBuild.Spec.ProfileName == "" { + if nimBuild.Spec.Profile == "" { buildableProfiles := getBuildableProfiles(nimCache) - if buildableProfiles != nil { + if len(buildableProfiles) > 0 { switch { case len(buildableProfiles) > 1: logger.Info("Multiple buildable profiles found", "Profiles", buildableProfiles) @@ -351,9 +413,9 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed } } else { - foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.ProfileName) + foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.Profile) if foundProfile == nil { - logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.ProfileName) + logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.Profile) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found, please select a valid profile from the NIM cache") nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed return nil @@ -475,7 +537,6 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB "sidecar.istio.io/inject": "false", } - // If no user-provided GPU resource is found, proceed with auto-assignment // Get tensorParallelism from the profile tensorParallelism, err := utils.GetTensorParallelismByProfileTags(nimProfile.Config) if err != nil { @@ -483,9 +544,32 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB return nil, err } - gpuQuantity, err := apiResource.ParseQuantity(tensorParallelism) - if err != nil { - return nil, fmt.Errorf("failed to parse tensorParallelism: %w", err) + // Ensure Resources, Requests, and Limits are initialized + if nimBuild.Spec.Resources == nil { + nimBuild.Spec.Resources = &corev1.ResourceRequirements{} + } + if nimBuild.Spec.Resources.Requests == nil { + nimBuild.Spec.Resources.Requests = corev1.ResourceList{} + } + if nimBuild.Spec.Resources.Limits == nil { + nimBuild.Spec.Resources.Limits = corev1.ResourceList{} + } + if tensorParallelism == "" { + if _, present := nimBuild.Spec.Resources.Requests["nvidia.com/gpu"]; !present { + return nil, fmt.Errorf("tensorParallelism is not set in the profile tags or resources") + } + } else { + if _, present := nimBuild.Spec.Resources.Requests["nvidia.com/gpu"]; !present { + gpuQuantity, err := apiResource.ParseQuantity(tensorParallelism) + if err != nil { + return nil, fmt.Errorf("failed to parse tensorParallelism: %w", err) + } + nimBuild.Spec.Resources.Requests["nvidia.com/gpu"] = gpuQuantity + nimBuild.Spec.Resources.Limits["nvidia.com/gpu"] = gpuQuantity + + } else { + return nil, fmt.Errorf("tensorParallelism is set in the profile tags, but nvidia.com/gpu is already set in resources") + } } if platformType == k8sutil.OpenShift { @@ -518,10 +602,9 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB }, }, }, - ImagePullSecrets: []corev1.LocalObjectReference{}, - ServiceAccountName: NIMCacheServiceAccount, - Tolerations: nimCache.GetTolerations(), - NodeSelector: nimCache.GetNodeSelectors(), + ImagePullSecrets: []corev1.LocalObjectReference{}, + Tolerations: nimBuild.GetTolerations(), + NodeSelector: nimBuild.GetNodeSelectors(), }, } @@ -573,18 +656,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB SubPath: nimCache.Spec.Storage.PVC.SubPath, }, }, - Resources: corev1.ResourceRequirements{ - Limits: map[corev1.ResourceName]apiResource.Quantity{ - "cpu": nimCache.Spec.Resources.CPU, - "memory": nimCache.Spec.Resources.Memory, - "nvidia.com/gpu": gpuQuantity, - }, - Requests: map[corev1.ResourceName]apiResource.Quantity{ - "cpu": nimCache.Spec.Resources.CPU, - "memory": nimCache.Spec.Resources.Memory, - "nvidia.com/gpu": gpuQuantity, - }, - }, + Resources: *nimBuild.Spec.Resources, TerminationMessagePath: "/dev/termination-log", TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError, Ports: []corev1.ContainerPort{{ @@ -734,7 +806,7 @@ func (r *NIMBuildReconciler) reconcileLocalModelManifest(ctx context.Context, ni } logger.V(2).Info("local manifest file", "nimbuild", nimBuild.Name, "manifest", manifest) - // Create a ConfigMap with the model manifest file for re-use + // Update a ConfigMap with the model manifest file for re-use err = r.updateManifestConfigMap(ctx, nimCache, &manifest) if err != nil { logger.Error(err, "Failed to create model manifest config map") @@ -831,10 +903,8 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap }, }, }, - ImagePullSecrets: []corev1.LocalObjectReference{}, - ServiceAccountName: NIMCacheServiceAccount, - Tolerations: nimCache.GetTolerations(), - NodeSelector: nimCache.GetNodeSelectors(), + Tolerations: nimBuild.GetTolerations(), + NodeSelector: nimBuild.GetNodeSelectors(), }, } @@ -859,16 +929,6 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap RunAsGroup: nimCache.GetGroupID(), RunAsUser: nimCache.GetUserID(), }, - Resources: corev1.ResourceRequirements{ - Limits: map[corev1.ResourceName]apiResource.Quantity{ - "cpu": nimCache.Spec.Resources.CPU, - "memory": nimCache.Spec.Resources.Memory, - }, - Requests: map[corev1.ResourceName]apiResource.Quantity{ - "cpu": nimCache.Spec.Resources.CPU, - "memory": nimCache.Spec.Resources.Memory, - }, - }, VolumeMounts: []corev1.VolumeMount{ { Name: "nim-cache-volume", @@ -883,10 +943,8 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap Name: nimCache.Spec.Source.NGC.PullSecret, }, } - // Merge env with the user provided values pod.Spec.Containers[0].Env = utils.MergeEnvVars(pod.Spec.Containers[0].Env, nimCache.Spec.Env) - return pod } From 4bcae597169f963637c76d35fabe10b89f35e4ef Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Wed, 18 Jun 2025 07:41:28 +0000 Subject: [PATCH 05/16] Updating NIMBuild CRD and NIMCache update status logic Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/nimbuild_types.go | 30 ++++++--- api/apps/v1alpha1/zz_generated.deepcopy.go | 21 +----- .../manifests/apps.nvidia.com_nimbuilds.yaml | 64 ++++++++++--------- .../crd/bases/apps.nvidia.com_nimbuilds.yaml | 64 ++++++++++--------- .../crds/apps.nvidia.com_nimbuilds.yaml | 64 ++++++++++--------- internal/controller/nimbuild_controller.go | 60 +++++++++++------ 6 files changed, 170 insertions(+), 133 deletions(-) diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go index 1487b240a..d1fe6b8af 100644 --- a/api/apps/v1alpha1/nimbuild_types.go +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -27,13 +27,11 @@ import ( // NIMBuildSpec to build optimized trtllm engines with given model config and weights. type NIMBuildSpec struct { // Required: Reference to the model weights from NIMCache - NIMCacheRef string `json:"nimCacheRef"` - // Profile name for this engine build + NIMCacheName string `json:"nimCacheName"` + // Profile is the name of the profile to be used for building the engine. Profile string `json:"profile,omitempty"` - // Any user-defined tags for tracking builds - Tags map[string]string `json:"tags,omitempty"` - // Additional build params - BuildParams []string `json:"additionalBuildParams,omitempty"` + // EngineName is the name given to the engine being built. + EngineName string `json:"engineName,omitempty"` // Resources is the resource requirements for the NIMBuild pod. Resources *corev1.ResourceRequirements `json:"resources,omitempty"` // Tolerations for running the job to cache the NIM model @@ -44,9 +42,10 @@ type NIMBuildSpec struct { // NIMBuildStatus defines the observed state of NIMBuild. type NIMBuildStatus struct { - State string `json:"state,omitempty"` - Profiles []NIMProfile `json:"profiles,omitempty"` - Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` + State string `json:"state,omitempty"` + TargetProfile NIMProfile `json:"targetProfile,omitempty"` + BuiltProfile NIMProfile `json:"builtProfile,omitempty"` + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` } // +genclient @@ -125,3 +124,16 @@ func (n *NIMBuild) GetTolerations() []corev1.Toleration { func (n *NIMBuild) GetNodeSelectors() map[string]string { return n.Spec.NodeSelector } + +// GetEngineName returns the name of the engine to be built. +func (n *NIMBuild) GetEngineName() string { + if n.Spec.EngineName != "" { + return n.Spec.EngineName + } + return n.Name +} + +// GetProfile returns the profile name for this engine build. +func (n *NIMBuild) GetProfile() string { + return n.Spec.Profile +} diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index aad85a893..e1af43644 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -762,18 +762,6 @@ func (in *NIMBuildList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { *out = *in - if in.Tags != nil { - in, out := &in.Tags, &out.Tags - *out = make(map[string]string, len(*in)) - for key, val := range *in { - (*out)[key] = val - } - } - if in.BuildParams != nil { - in, out := &in.BuildParams, &out.BuildParams - *out = make([]string, len(*in)) - copy(*out, *in) - } if in.Resources != nil { in, out := &in.Resources, &out.Resources *out = new(corev1.ResourceRequirements) @@ -808,13 +796,8 @@ func (in *NIMBuildSpec) DeepCopy() *NIMBuildSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NIMBuildStatus) DeepCopyInto(out *NIMBuildStatus) { *out = *in - if in.Profiles != nil { - in, out := &in.Profiles, &out.Profiles - *out = make([]NIMProfile, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } + in.TargetProfile.DeepCopyInto(&out.TargetProfile) + in.BuiltProfile.DeepCopyInto(&out.BuiltProfile) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) diff --git a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml index 091eae66a..e2500a1e0 100644 --- a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml +++ b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml @@ -48,12 +48,10 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - additionalBuildParams: - description: Additional build params - items: - type: string - type: array - nimCacheRef: + engineName: + description: EngineName is the name given to the engine being built. + type: string + nimCacheName: description: 'Required: Reference to the model weights from NIMCache' type: string nodeSelector: @@ -63,7 +61,8 @@ spec: the caching job. type: object profile: - description: Profile name for this engine build + description: Profile is the name of the profile to be used for building + the engine. type: string resources: description: Resources is the resource requirements for the NIMBuild @@ -125,11 +124,6 @@ spec: More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object - tags: - additionalProperties: - type: string - description: Any user-defined tags for tracking builds - type: object tolerations: description: Tolerations for running the job to cache the NIM model items: @@ -170,11 +164,25 @@ spec: type: object type: array required: - - nimCacheRef + - nimCacheName type: object status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: + builtProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object conditions: items: description: Condition contains details for one aspect of the current @@ -231,24 +239,22 @@ spec: - type type: object type: array - profiles: - items: - description: NIMProfile defines the profiles that were cached. - properties: - config: - additionalProperties: - type: string - type: object - model: - type: string - name: - type: string - release: - type: string - type: object - type: array state: type: string + targetProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object type: object type: object served: true diff --git a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml index 091eae66a..e2500a1e0 100644 --- a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml +++ b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml @@ -48,12 +48,10 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - additionalBuildParams: - description: Additional build params - items: - type: string - type: array - nimCacheRef: + engineName: + description: EngineName is the name given to the engine being built. + type: string + nimCacheName: description: 'Required: Reference to the model weights from NIMCache' type: string nodeSelector: @@ -63,7 +61,8 @@ spec: the caching job. type: object profile: - description: Profile name for this engine build + description: Profile is the name of the profile to be used for building + the engine. type: string resources: description: Resources is the resource requirements for the NIMBuild @@ -125,11 +124,6 @@ spec: More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object - tags: - additionalProperties: - type: string - description: Any user-defined tags for tracking builds - type: object tolerations: description: Tolerations for running the job to cache the NIM model items: @@ -170,11 +164,25 @@ spec: type: object type: array required: - - nimCacheRef + - nimCacheName type: object status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: + builtProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object conditions: items: description: Condition contains details for one aspect of the current @@ -231,24 +239,22 @@ spec: - type type: object type: array - profiles: - items: - description: NIMProfile defines the profiles that were cached. - properties: - config: - additionalProperties: - type: string - type: object - model: - type: string - name: - type: string - release: - type: string - type: object - type: array state: type: string + targetProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object type: object type: object served: true diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml index 091eae66a..e2500a1e0 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml @@ -48,12 +48,10 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - additionalBuildParams: - description: Additional build params - items: - type: string - type: array - nimCacheRef: + engineName: + description: EngineName is the name given to the engine being built. + type: string + nimCacheName: description: 'Required: Reference to the model weights from NIMCache' type: string nodeSelector: @@ -63,7 +61,8 @@ spec: the caching job. type: object profile: - description: Profile name for this engine build + description: Profile is the name of the profile to be used for building + the engine. type: string resources: description: Resources is the resource requirements for the NIMBuild @@ -125,11 +124,6 @@ spec: More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object - tags: - additionalProperties: - type: string - description: Any user-defined tags for tracking builds - type: object tolerations: description: Tolerations for running the job to cache the NIM model items: @@ -170,11 +164,25 @@ spec: type: object type: array required: - - nimCacheRef + - nimCacheName type: object status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: + builtProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object conditions: items: description: Condition contains details for one aspect of the current @@ -231,24 +239,22 @@ spec: - type type: object type: array - profiles: - items: - description: NIMProfile defines the profiles that were cached. - properties: - config: - additionalProperties: - type: string - type: object - model: - type: string - name: - type: string - release: - type: string - type: object - type: array state: type: string + targetProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object type: object type: object served: true diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index ea282c6e1..8e2d11619 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -228,13 +228,13 @@ func (r *NIMBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { err := mgr.GetFieldIndexer().IndexField( context.Background(), &appsv1alpha1.NIMBuild{}, - "spec.nimCacheRef", + "spec.nimCacheName", func(rawObj client.Object) []string { nimBuild, ok := rawObj.(*appsv1alpha1.NIMBuild) if !ok { return []string{} } - return []string{nimBuild.Spec.NIMCacheRef} + return []string{nimBuild.Spec.NIMCacheName} }, ) if err != nil { @@ -337,7 +337,7 @@ func (r *NIMBuildReconciler) cleanupNIMBuild(ctx context.Context, nimBuild *apps func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild) (reconcile.Result, error) { logger := r.GetLogger() - nimCacheNamespacedName := types.NamespacedName{Name: nimBuild.Spec.NIMCacheRef, Namespace: nimBuild.GetNamespace()} + nimCacheNamespacedName := types.NamespacedName{Name: nimBuild.Spec.NIMCacheName, Namespace: nimBuild.GetNamespace()} nimCache := &appsv1alpha1.NIMCache{} if err := r.Get(ctx, nimCacheNamespacedName, nimCache); err != nil { @@ -400,6 +400,7 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild logger.Info("Selected buildable profile found", "Profile", buildableProfiles[0].Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile cached in NIM Cache") buildableProfile = buildableProfiles[0] + nimBuild.Status.TargetProfile = buildableProfile default: logger.Info("No buildable profiles found, skipping engine build") conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found in NIM Cache") @@ -423,6 +424,7 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild logger.Info("Selected buildable profile found", "Profile", foundProfile.Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile found") buildableProfile = *foundProfile + nimBuild.Status.TargetProfile = buildableProfile } } @@ -635,7 +637,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB }, { Name: "NIM_CUSTOM_MODEL_NAME", - Value: nimCache.Name + "-" + nimProfile.Name, + Value: nimBuild.GetEngineName(), }, { Name: "NGC_API_KEY", @@ -822,21 +824,23 @@ func (r *NIMBuildReconciler) reconcileLocalModelManifest(ctx context.Context, ni return err } - // To Do: Explore changing the profile list to a map for faster lookups - for _, profileName := range manifest.GetProfilesList() { - for _, selectedProfile := range nimCache.Status.Profiles { - if selectedProfile.Name == profileName { - break - } + // To Do: Explore changing the profile list on NIMCache to a map for faster lookups + // Update the NIMCache status with the details of the built profile + builtProfileName := getBuiltProfileName(manifest, nimBuild) + if builtProfileName != "" { + presentOnNIMCache := isBuiltProfilePresentOnNIMCacheStatus(nimCache, builtProfileName) + // If built profile is not present on NIMCache status, add it + if !presentOnNIMCache { + tagsMap := manifest.GetProfileTags(builtProfileName) + tagsMap["target_profile"] = nimBuild.Status.TargetProfile.Name + logger.Info("Adding profile to NIMCache status", "profileName", builtProfileName) + nimCache.Status.Profiles = append(nimCache.Status.Profiles, appsv1alpha1.NIMProfile{ + Name: builtProfileName, + Model: manifest.GetProfileModel(builtProfileName), + Config: manifest.GetProfileTags(builtProfileName), + Release: manifest.GetProfileRelease(builtProfileName), + }) } - logger.Info("Adding profile to NIMCache status", "profileName", profileName) - // If the profile is not found, add it to the status - nimCache.Status.Profiles = append(nimCache.Status.Profiles, appsv1alpha1.NIMProfile{ - Name: profileName, - Model: manifest.GetProfileModel(profileName), - Config: manifest.GetProfileTags(profileName), - Release: manifest.GetProfileRelease(profileName), - }) } // Update the NIMCache status with the new profiles @@ -858,6 +862,26 @@ func (r *NIMBuildReconciler) reconcileLocalModelManifest(ctx context.Context, ni return nil } +func isBuiltProfilePresentOnNIMCacheStatus(nimCache *appsv1alpha1.NIMCache, builtProfileName string) bool { + for _, selectedProfile := range nimCache.Status.Profiles { + if selectedProfile.Name == builtProfileName { + return true + } + } + return false +} + +// getBuiltProfileName retrieves the auto generated profile name assigned for the built engine. +func getBuiltProfileName(manifest nimparser.NIMManifestInterface, nimBuild *appsv1alpha1.NIMBuild) string { + for _, profileName := range manifest.GetProfilesList() { + tagsMap := manifest.GetProfileTags(profileName) + if tagsMap["model_name"] == nimBuild.GetEngineName() { + return profileName + } + } + return "" +} + func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *appsv1alpha1.NIMBuild, platformType k8sutil.OrchestratorType) *corev1.Pod { labels := map[string]string{ From a49f0101a198dcbc86a94728bad68cf2192b375d Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Wed, 18 Jun 2025 19:15:26 +0000 Subject: [PATCH 06/16] Adding immutable logic to NIMBuild CRD Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/nimbuild_types.go | 1 + bundle/manifests/apps.nvidia.com_nimbuilds.yaml | 3 +++ config/crd/bases/apps.nvidia.com_nimbuilds.yaml | 3 +++ .../crds/apps.nvidia.com_nimbuilds.yaml | 3 +++ internal/controller/nimbuild_controller.go | 13 ++----------- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go index d1fe6b8af..7c1133bda 100644 --- a/api/apps/v1alpha1/nimbuild_types.go +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -59,6 +59,7 @@ type NIMBuild struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` + // +kubebuilder:validation:XValidation:rule="self == oldSelf", message="spec is immutable" Spec NIMBuildSpec `json:"spec,omitempty"` Status NIMBuildStatus `json:"status,omitempty"` } diff --git a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml index e2500a1e0..4c4bd540a 100644 --- a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml +++ b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml @@ -166,6 +166,9 @@ spec: required: - nimCacheName type: object + x-kubernetes-validations: + - message: spec is immutable + rule: self == oldSelf status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: diff --git a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml index e2500a1e0..4c4bd540a 100644 --- a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml +++ b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml @@ -166,6 +166,9 @@ spec: required: - nimCacheName type: object + x-kubernetes-validations: + - message: spec is immutable + rule: self == oldSelf status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml index e2500a1e0..4c4bd540a 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml @@ -166,6 +166,9 @@ spec: required: - nimCacheName type: object + x-kubernetes-validations: + - message: spec is immutable + rule: self == oldSelf status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index 8e2d11619..04f7d1789 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -19,7 +19,6 @@ package controller import ( "context" "fmt" - "reflect" "strings" "github.com/go-logr/logr" @@ -248,22 +247,14 @@ func (r *NIMBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { WithEventFilter(predicate.Funcs{ UpdateFunc: func(e event.UpdateEvent) bool { // Type assert to NIMBuild - if oldNIMBuild, ok := e.ObjectOld.(*appsv1alpha1.NIMBuild); ok { + if _, ok := e.ObjectOld.(*appsv1alpha1.NIMBuild); ok { newNIMBuild, ok := e.ObjectNew.(*appsv1alpha1.NIMBuild) if ok { // Handle case where object is marked for deletion if !newNIMBuild.ObjectMeta.DeletionTimestamp.IsZero() { return true } - - // Allow only status updates - // Reject updates that change anything other than status - // Spec is immutable - if !reflect.DeepEqual(oldNIMBuild.Spec, newNIMBuild.Spec) { - return false - } - // Allow if only status changed - return !reflect.DeepEqual(oldNIMBuild.Status, newNIMBuild.Status) + return false } } // For other types we watch, reconcile them From 515e27bafd914667664008ea45ed0774914013f8 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Wed, 18 Jun 2025 22:55:55 +0000 Subject: [PATCH 07/16] Updating the NIMBuild status to fail when reconilation fails Signed-off-by: Vishesh Tanksale --- internal/controller/nimbuild_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index 04f7d1789..0fb3813e2 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -166,7 +166,7 @@ func (r *NIMBuildReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c if err != nil { logger.Error(err, "error reconciling NIMBuild", "name", nimBuild.Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionReconcileFailed, metav1.ConditionTrue, "ReconcileFailed", err.Error()) - nimBuild.Status.State = appsv1alpha1.NimBuildStatusNotReady + nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed err := r.updateNIMBuildStatus(ctx, nimBuild) if err != nil { From bf14470172320cfdf80782c1a009648232240389 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 24 Jun 2025 07:24:04 +0000 Subject: [PATCH 08/16] Updating the NIMBuild CRD Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/nimbuild_types.go | 47 +++-- api/apps/v1alpha1/zz_generated.deepcopy.go | 68 ++++++- .../manifests/apps.nvidia.com_nimbuilds.yaml | 181 ++++++++++++++---- .../crd/bases/apps.nvidia.com_nimbuilds.yaml | 181 ++++++++++++++---- .../crds/apps.nvidia.com_nimbuilds.yaml | 181 ++++++++++++++---- internal/controller/nimbuild_controller.go | 40 ++-- 6 files changed, 550 insertions(+), 148 deletions(-) diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go index 7c1133bda..f1779f626 100644 --- a/api/apps/v1alpha1/nimbuild_types.go +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -26,18 +26,23 @@ import ( // NIMBuildSpec to build optimized trtllm engines with given model config and weights. type NIMBuildSpec struct { - // Required: Reference to the model weights from NIMCache - NIMCacheName string `json:"nimCacheName"` - // Profile is the name of the profile to be used for building the engine. - Profile string `json:"profile,omitempty"` - // EngineName is the name given to the engine being built. - EngineName string `json:"engineName,omitempty"` + // NIMCacheRef is Reference to the model weights from NIMCache + NIMCacheRef NIMCacheReference `json:"nimCacheRef"` + // ModelName is the name given to the locally built engine. + ModelName string `json:"engineName,omitempty"` // Resources is the resource requirements for the NIMBuild pod. - Resources *corev1.ResourceRequirements `json:"resources,omitempty"` + Resources *ResourceRequirements `json:"resources,omitempty"` // Tolerations for running the job to cache the NIM model Tolerations []corev1.Toleration `json:"tolerations,omitempty"` // NodeSelector is the node selector labels to schedule the caching job. NodeSelector map[string]string `json:"nodeSelector,omitempty"` + + Env []corev1.EnvVar `json:"env,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + Annotations map[string]string `json:"annotations,omitempty"` + ModelBuilder string `json:"ModelBuilder"` + // PullSecret to pull the model builder image + PullSecret string `json:"pullSecret,omitempty"` } // NIMBuildStatus defines the observed state of NIMBuild. @@ -48,6 +53,24 @@ type NIMBuildStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` } +type ResourceRequirements struct { + // Limits describes the maximum amount of compute resources allowed. + // More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + // +optional + Limits corev1.ResourceList `json:"limits,omitempty" protobuf:"bytes,1,rep,name=limits,casttype=ResourceList,castkey=ResourceName"` + // Requests describes the minimum amount of compute resources required. + // If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + // otherwise to an implementation-defined value. Requests cannot exceed Limits. + // More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + // +optional + Requests corev1.ResourceList `json:"requests,omitempty" protobuf:"bytes,2,rep,name=requests,casttype=ResourceList,castkey=ResourceName"` +} + +type NIMCacheReference struct { + Name string `json:"name"` + Profile string `json:"profile,omitempty"` +} + // +genclient // +kubebuilder:object:root=true // +kubebuilder:subresource:status @@ -126,15 +149,15 @@ func (n *NIMBuild) GetNodeSelectors() map[string]string { return n.Spec.NodeSelector } -// GetEngineName returns the name of the engine to be built. -func (n *NIMBuild) GetEngineName() string { - if n.Spec.EngineName != "" { - return n.Spec.EngineName +// GetModelName returns the model name for the engine being built. +func (n *NIMBuild) GetModelName() string { + if n.Spec.ModelName != "" { + return n.Spec.ModelName } return n.Name } // GetProfile returns the profile name for this engine build. func (n *NIMBuild) GetProfile() string { - return n.Spec.Profile + return n.Spec.NIMCacheRef.Profile } diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index e1af43644..43d450ba3 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -762,9 +762,10 @@ func (in *NIMBuildList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { *out = *in + out.NIMCacheRef = in.NIMCacheRef if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(ResourceRequirements) (*in).DeepCopyInto(*out) } if in.Tolerations != nil { @@ -781,6 +782,27 @@ func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { (*out)[key] = val } } + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]corev1.EnvVar, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Labels != nil { + in, out := &in.Labels, &out.Labels + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Annotations != nil { + in, out := &in.Annotations, &out.Annotations + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMBuildSpec. @@ -876,6 +898,21 @@ func (in *NIMCacheList) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NIMCacheReference) DeepCopyInto(out *NIMCacheReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMCacheReference. +func (in *NIMCacheReference) DeepCopy() *NIMCacheReference { + if in == nil { + return nil + } + out := new(NIMCacheReference) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NIMCacheSpec) DeepCopyInto(out *NIMCacheSpec) { *out = *in @@ -2480,6 +2517,35 @@ func (in *ProxySpec) DeepCopy() *ProxySpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceRequirements) DeepCopyInto(out *ResourceRequirements) { + *out = *in + if in.Limits != nil { + in, out := &in.Limits, &out.Limits + *out = make(corev1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + if in.Requests != nil { + in, out := &in.Requests, &out.Requests + *out = make(corev1.ResourceList, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceRequirements. +func (in *ResourceRequirements) DeepCopy() *ResourceRequirements { + if in == nil { + return nil + } + out := new(ResourceRequirements) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Resources) DeepCopyInto(out *Resources) { *out = *in diff --git a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml index 4c4bd540a..e65e2a144 100644 --- a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml +++ b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml @@ -48,57 +48,159 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - engineName: - description: EngineName is the name given to the engine being built. + ModelBuilder: type: string - nimCacheName: - description: 'Required: Reference to the model weights from NIMCache' + annotations: + additionalProperties: + type: string + type: object + engineName: + description: ModelName is the name given to the locally built engine. type: string + env: + items: + description: EnvVar represents an environment variable present in + a Container. + properties: + name: + description: Name of the environment variable. Must be a C_IDENTIFIER. + type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment variable's value. Cannot + be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema the FieldPath is + written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the specified + API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the exposed + resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret in the pod's namespace + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + labels: + additionalProperties: + type: string + type: object + nimCacheRef: + description: NIMCacheRef is Reference to the model weights from NIMCache + properties: + name: + type: string + profile: + type: string + required: + - name + type: object nodeSelector: additionalProperties: type: string description: NodeSelector is the node selector labels to schedule the caching job. type: object - profile: - description: Profile is the name of the profile to be used for building - the engine. + pullSecret: + description: PullSecret to pull the model builder image type: string resources: description: Resources is the resource requirements for the NIMBuild pod. properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - This is an alpha field and requires enabling the - DynamicResourceAllocation feature gate. - - This field is immutable. It can only be set for containers. - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - request: - description: |- - Request is the name chosen for a request in the referenced claim. - If empty, everything from the claim is made available, otherwise - only the result of this request. - type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map limits: additionalProperties: anyOf: @@ -164,7 +266,8 @@ spec: type: object type: array required: - - nimCacheName + - ModelBuilder + - nimCacheRef type: object x-kubernetes-validations: - message: spec is immutable diff --git a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml index 4c4bd540a..e65e2a144 100644 --- a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml +++ b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml @@ -48,57 +48,159 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - engineName: - description: EngineName is the name given to the engine being built. + ModelBuilder: type: string - nimCacheName: - description: 'Required: Reference to the model weights from NIMCache' + annotations: + additionalProperties: + type: string + type: object + engineName: + description: ModelName is the name given to the locally built engine. type: string + env: + items: + description: EnvVar represents an environment variable present in + a Container. + properties: + name: + description: Name of the environment variable. Must be a C_IDENTIFIER. + type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment variable's value. Cannot + be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema the FieldPath is + written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the specified + API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the exposed + resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret in the pod's namespace + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + labels: + additionalProperties: + type: string + type: object + nimCacheRef: + description: NIMCacheRef is Reference to the model weights from NIMCache + properties: + name: + type: string + profile: + type: string + required: + - name + type: object nodeSelector: additionalProperties: type: string description: NodeSelector is the node selector labels to schedule the caching job. type: object - profile: - description: Profile is the name of the profile to be used for building - the engine. + pullSecret: + description: PullSecret to pull the model builder image type: string resources: description: Resources is the resource requirements for the NIMBuild pod. properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - This is an alpha field and requires enabling the - DynamicResourceAllocation feature gate. - - This field is immutable. It can only be set for containers. - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - request: - description: |- - Request is the name chosen for a request in the referenced claim. - If empty, everything from the claim is made available, otherwise - only the result of this request. - type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map limits: additionalProperties: anyOf: @@ -164,7 +266,8 @@ spec: type: object type: array required: - - nimCacheName + - ModelBuilder + - nimCacheRef type: object x-kubernetes-validations: - message: spec is immutable diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml index 4c4bd540a..e65e2a144 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml @@ -48,57 +48,159 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - engineName: - description: EngineName is the name given to the engine being built. + ModelBuilder: type: string - nimCacheName: - description: 'Required: Reference to the model weights from NIMCache' + annotations: + additionalProperties: + type: string + type: object + engineName: + description: ModelName is the name given to the locally built engine. type: string + env: + items: + description: EnvVar represents an environment variable present in + a Container. + properties: + name: + description: Name of the environment variable. Must be a C_IDENTIFIER. + type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment variable's value. Cannot + be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema the FieldPath is + written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the specified + API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the exposed + resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret in the pod's namespace + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + labels: + additionalProperties: + type: string + type: object + nimCacheRef: + description: NIMCacheRef is Reference to the model weights from NIMCache + properties: + name: + type: string + profile: + type: string + required: + - name + type: object nodeSelector: additionalProperties: type: string description: NodeSelector is the node selector labels to schedule the caching job. type: object - profile: - description: Profile is the name of the profile to be used for building - the engine. + pullSecret: + description: PullSecret to pull the model builder image type: string resources: description: Resources is the resource requirements for the NIMBuild pod. properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - This is an alpha field and requires enabling the - DynamicResourceAllocation feature gate. - - This field is immutable. It can only be set for containers. - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - request: - description: |- - Request is the name chosen for a request in the referenced claim. - If empty, everything from the claim is made available, otherwise - only the result of this request. - type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map limits: additionalProperties: anyOf: @@ -164,7 +266,8 @@ spec: type: object type: array required: - - nimCacheName + - ModelBuilder + - nimCacheRef type: object x-kubernetes-validations: - message: spec is immutable diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index 0fb3813e2..b8125760d 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -233,7 +233,7 @@ func (r *NIMBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { if !ok { return []string{} } - return []string{nimBuild.Spec.NIMCacheName} + return []string{nimBuild.Spec.NIMCacheRef.Name} }, ) if err != nil { @@ -328,7 +328,7 @@ func (r *NIMBuildReconciler) cleanupNIMBuild(ctx context.Context, nimBuild *apps func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild) (reconcile.Result, error) { logger := r.GetLogger() - nimCacheNamespacedName := types.NamespacedName{Name: nimBuild.Spec.NIMCacheName, Namespace: nimBuild.GetNamespace()} + nimCacheNamespacedName := types.NamespacedName{Name: nimBuild.Spec.NIMCacheRef.Name, Namespace: nimBuild.GetNamespace()} nimCache := &appsv1alpha1.NIMCache{} if err := r.Get(ctx, nimCacheNamespacedName, nimCache); err != nil { @@ -378,7 +378,7 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache) error { logger := r.GetLogger() buildableProfile := appsv1alpha1.NIMProfile{} - if nimBuild.Spec.Profile == "" { + if nimBuild.Spec.NIMCacheRef.Profile == "" { buildableProfiles := getBuildableProfiles(nimCache) if len(buildableProfiles) > 0 { switch { @@ -405,9 +405,9 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed } } else { - foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.Profile) + foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.NIMCacheRef.Profile) if foundProfile == nil { - logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.Profile) + logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.NIMCacheRef.Profile) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found, please select a valid profile from the NIM cache") nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed return nil @@ -526,9 +526,16 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB "app.kubernetes.io/managed-by": "k8s-nim-operator", } + if nimBuild.GetLabels() != nil { + labels = utils.MergeMaps(labels, nimBuild.GetLabels()) + } + annotations := map[string]string{ "sidecar.istio.io/inject": "false", } + if nimBuild.GetAnnotations() != nil { + annotations = utils.MergeMaps(annotations, nimBuild.GetAnnotations()) + } // Get tensorParallelism from the profile tensorParallelism, err := utils.GetTensorParallelismByProfileTags(nimProfile.Config) @@ -539,7 +546,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB // Ensure Resources, Requests, and Limits are initialized if nimBuild.Spec.Resources == nil { - nimBuild.Spec.Resources = &corev1.ResourceRequirements{} + nimBuild.Spec.Resources = &appsv1alpha1.ResourceRequirements{} } if nimBuild.Spec.Resources.Requests == nil { nimBuild.Spec.Resources.Requests = corev1.ResourceList{} @@ -610,9 +617,8 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB pod.Spec.Containers = []corev1.Container{ { - Name: NIMBuildContainerName, - Image: nimCache.Spec.Source.NGC.ModelPuller, - EnvFrom: nimCache.Spec.Source.EnvFromSecrets(), + Name: NIMBuildContainerName, + Image: nimBuild.Spec.ModelBuilder, Env: []corev1.EnvVar{ { Name: "NIM_CACHE_PATH", @@ -628,7 +634,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB }, { Name: "NIM_CUSTOM_MODEL_NAME", - Value: nimBuild.GetEngineName(), + Value: nimBuild.GetModelName(), }, { Name: "NGC_API_KEY", @@ -649,7 +655,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB SubPath: nimCache.Spec.Storage.PVC.SubPath, }, }, - Resources: *nimBuild.Spec.Resources, + Resources: corev1.ResourceRequirements{Limits: nimBuild.Spec.Resources.Limits, Requests: nimBuild.Spec.Resources.Requests}, TerminationMessagePath: "/dev/termination-log", TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError, Ports: []corev1.ContainerPort{{ @@ -709,12 +715,12 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB } pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{ { - Name: nimCache.Spec.Source.NGC.PullSecret, + Name: nimBuild.Spec.PullSecret, }, } // Merge env with the user provided values - pod.Spec.Containers[0].Env = utils.MergeEnvVars(pod.Spec.Containers[0].Env, nimCache.Spec.Env) + pod.Spec.Containers[0].Env = utils.MergeEnvVars(pod.Spec.Containers[0].Env, nimBuild.Spec.Env) return pod, nil } @@ -866,7 +872,7 @@ func isBuiltProfilePresentOnNIMCacheStatus(nimCache *appsv1alpha1.NIMCache, buil func getBuiltProfileName(manifest nimparser.NIMManifestInterface, nimBuild *appsv1alpha1.NIMBuild) string { for _, profileName := range manifest.GetProfilesList() { tagsMap := manifest.GetProfileTags(profileName) - if tagsMap["model_name"] == nimBuild.GetEngineName() { + if tagsMap["model_name"] == nimBuild.GetModelName() { return profileName } } @@ -933,7 +939,7 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap pod.Spec.Containers = []corev1.Container{ { Name: NIMBuildManifestContainerName, - Image: nimCache.Spec.Source.NGC.ModelPuller, + Image: nimBuild.Spec.ModelBuilder, Command: getNIMBuildSidecarCommand(), SecurityContext: &corev1.SecurityContext{ AllowPrivilegeEscalation: ptr.To[bool](false), @@ -955,11 +961,9 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap } pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{ { - Name: nimCache.Spec.Source.NGC.PullSecret, + Name: nimBuild.Spec.PullSecret, }, } - // Merge env with the user provided values - pod.Spec.Containers[0].Env = utils.MergeEnvVars(pod.Spec.Containers[0].Env, nimCache.Spec.Env) return pod } From 27841174920dca647b9a7635887b553ea75bfb2e Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Tue, 24 Jun 2025 23:59:27 +0000 Subject: [PATCH 09/16] Updating the NIMBuild CRD Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/common_types.go | 14 ++++ api/apps/v1alpha1/nimbuild_types.go | 41 +++++---- api/apps/v1alpha1/zz_generated.deepcopy.go | 83 ++++++++++--------- .../manifests/apps.nvidia.com_nimbuilds.yaml | 26 ++++-- .../crd/bases/apps.nvidia.com_nimbuilds.yaml | 26 ++++-- .../crds/apps.nvidia.com_nimbuilds.yaml | 26 ++++-- internal/controller/nimbuild_controller.go | 38 +++++---- 7 files changed, 152 insertions(+), 102 deletions(-) diff --git a/api/apps/v1alpha1/common_types.go b/api/apps/v1alpha1/common_types.go index 793ce9104..5fba820e6 100644 --- a/api/apps/v1alpha1/common_types.go +++ b/api/apps/v1alpha1/common_types.go @@ -128,6 +128,20 @@ type IngressV1 struct { Spec *IngressSpec `json:"spec,omitempty"` } +// ResourceRequirements defines the resources required for a container. +type ResourceRequirements struct { + // Limits describes the maximum amount of compute resources allowed. + // More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + // +optional + Limits corev1.ResourceList `json:"limits,omitempty" protobuf:"bytes,1,rep,name=limits,casttype=ResourceList,castkey=ResourceName"` + // Requests describes the minimum amount of compute resources required. + // If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + // otherwise to an implementation-defined value. Requests cannot exceed Limits. + // More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + // +optional + Requests corev1.ResourceList `json:"requests,omitempty" protobuf:"bytes,2,rep,name=requests,casttype=ResourceList,castkey=ResourceName"` +} + func (i *IngressV1) GenerateNetworkingV1IngressSpec(name string) networkingv1.IngressSpec { if i.Spec == nil { return networkingv1.IngressSpec{} diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go index f1779f626..cbe0b83bb 100644 --- a/api/apps/v1alpha1/nimbuild_types.go +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -17,6 +17,8 @@ limitations under the License. package v1alpha1 import ( + "fmt" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -26,8 +28,8 @@ import ( // NIMBuildSpec to build optimized trtllm engines with given model config and weights. type NIMBuildSpec struct { - // NIMCacheRef is Reference to the model weights from NIMCache - NIMCacheRef NIMCacheReference `json:"nimCacheRef"` + // NIMCache is Reference to the model weights from NIMCache + NIMCache NIMCacheReference `json:"nimCacheRef"` // ModelName is the name given to the locally built engine. ModelName string `json:"engineName,omitempty"` // Resources is the resource requirements for the NIMBuild pod. @@ -37,12 +39,10 @@ type NIMBuildSpec struct { // NodeSelector is the node selector labels to schedule the caching job. NodeSelector map[string]string `json:"nodeSelector,omitempty"` - Env []corev1.EnvVar `json:"env,omitempty"` - Labels map[string]string `json:"labels,omitempty"` - Annotations map[string]string `json:"annotations,omitempty"` - ModelBuilder string `json:"ModelBuilder"` - // PullSecret to pull the model builder image - PullSecret string `json:"pullSecret,omitempty"` + Env []corev1.EnvVar `json:"env,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + Annotations map[string]string `json:"annotations,omitempty"` + Image Image `json:"image"` } // NIMBuildStatus defines the observed state of NIMBuild. @@ -53,19 +53,6 @@ type NIMBuildStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` } -type ResourceRequirements struct { - // Limits describes the maximum amount of compute resources allowed. - // More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - // +optional - Limits corev1.ResourceList `json:"limits,omitempty" protobuf:"bytes,1,rep,name=limits,casttype=ResourceList,castkey=ResourceName"` - // Requests describes the minimum amount of compute resources required. - // If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - // otherwise to an implementation-defined value. Requests cannot exceed Limits. - // More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - // +optional - Requests corev1.ResourceList `json:"requests,omitempty" protobuf:"bytes,2,rep,name=requests,casttype=ResourceList,castkey=ResourceName"` -} - type NIMCacheReference struct { Name string `json:"name"` Profile string `json:"profile,omitempty"` @@ -159,5 +146,15 @@ func (n *NIMBuild) GetModelName() string { // GetProfile returns the profile name for this engine build. func (n *NIMBuild) GetProfile() string { - return n.Spec.NIMCacheRef.Profile + return n.Spec.NIMCache.Profile +} + +// GetImage returns the image to be used for building the NIM engine. +func (n *NIMBuild) GetImage() string { + return fmt.Sprintf("%s:%s", n.Spec.Image.Repository, n.Spec.Image.Tag) +} + +// GetImagePullSecrets returns the image pull secrets for the NIM engine build. +func (n *NIMBuild) GetImagePullSecrets() []string { + return n.Spec.Image.PullSecrets } diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index 43d450ba3..7dce6c93e 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -22,8 +22,8 @@ package v1alpha1 import ( "k8s.io/api/autoscaling/v2" - corev1 "k8s.io/api/core/v1" - "k8s.io/api/networking/v1" + "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -482,7 +482,7 @@ func (in *IngressPath) DeepCopyInto(out *IngressPath) { *out = *in if in.PathType != nil { in, out := &in.PathType, &out.PathType - *out = new(v1.PathType) + *out = new(networkingv1.PathType) **out = **in } } @@ -602,7 +602,7 @@ func (in *ModelDownloadJobsConfig) DeepCopyInto(out *ModelDownloadJobsConfig) { } if in.SecurityContext != nil { in, out := &in.SecurityContext, &out.SecurityContext - *out = new(corev1.PodSecurityContext) + *out = new(v1.PodSecurityContext) (*in).DeepCopyInto(*out) } } @@ -762,7 +762,7 @@ func (in *NIMBuildList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { *out = *in - out.NIMCacheRef = in.NIMCacheRef + out.NIMCache = in.NIMCache if in.Resources != nil { in, out := &in.Resources, &out.Resources *out = new(ResourceRequirements) @@ -770,7 +770,7 @@ func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -784,7 +784,7 @@ func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { } if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -803,6 +803,7 @@ func (in *NIMBuildSpec) DeepCopyInto(out *NIMBuildSpec) { (*out)[key] = val } } + in.Image.DeepCopyInto(&out.Image) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMBuildSpec. @@ -921,7 +922,7 @@ func (in *NIMCacheSpec) DeepCopyInto(out *NIMCacheSpec) { in.Resources.DeepCopyInto(&out.Resources) if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -950,7 +951,7 @@ func (in *NIMCacheSpec) DeepCopyInto(out *NIMCacheSpec) { } if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -1285,7 +1286,7 @@ func (in *NIMServiceSpec) DeepCopyInto(out *NIMServiceSpec) { } if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -1314,19 +1315,19 @@ func (in *NIMServiceSpec) DeepCopyInto(out *NIMServiceSpec) { } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.PodAffinity != nil { in, out := &in.PodAffinity, &out.PodAffinity - *out = new(corev1.PodAffinity) + *out = new(v1.PodAffinity) (*in).DeepCopyInto(*out) } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } if in.DRAResources != nil { @@ -1540,7 +1541,7 @@ func (in *NemoCustomizerSpec) DeepCopyInto(out *NemoCustomizerSpec) { } if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -1568,19 +1569,19 @@ func (in *NemoCustomizerSpec) DeepCopyInto(out *NemoCustomizerSpec) { } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.PodAffinity != nil { in, out := &in.PodAffinity, &out.PodAffinity - *out = new(corev1.PodAffinity) + *out = new(v1.PodAffinity) (*in).DeepCopyInto(*out) } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } in.Expose.DeepCopyInto(&out.Expose) @@ -1748,7 +1749,7 @@ func (in *NemoDatastoreSpec) DeepCopyInto(out *NemoDatastoreSpec) { } if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -1776,19 +1777,19 @@ func (in *NemoDatastoreSpec) DeepCopyInto(out *NemoDatastoreSpec) { } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.PodAffinity != nil { in, out := &in.PodAffinity, &out.PodAffinity - *out = new(corev1.PodAffinity) + *out = new(v1.PodAffinity) (*in).DeepCopyInto(*out) } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } in.Expose.DeepCopyInto(&out.Expose) @@ -1940,7 +1941,7 @@ func (in *NemoEntitystoreSpec) DeepCopyInto(out *NemoEntitystoreSpec) { } if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -1968,19 +1969,19 @@ func (in *NemoEntitystoreSpec) DeepCopyInto(out *NemoEntitystoreSpec) { } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.PodAffinity != nil { in, out := &in.PodAffinity, &out.PodAffinity - *out = new(corev1.PodAffinity) + *out = new(v1.PodAffinity) (*in).DeepCopyInto(*out) } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } in.Expose.DeepCopyInto(&out.Expose) @@ -2111,7 +2112,7 @@ func (in *NemoEvaluatorSpec) DeepCopyInto(out *NemoEvaluatorSpec) { } if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -2139,19 +2140,19 @@ func (in *NemoEvaluatorSpec) DeepCopyInto(out *NemoEvaluatorSpec) { } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.PodAffinity != nil { in, out := &in.PodAffinity, &out.PodAffinity - *out = new(corev1.PodAffinity) + *out = new(v1.PodAffinity) (*in).DeepCopyInto(*out) } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } in.Expose.DeepCopyInto(&out.Expose) @@ -2292,7 +2293,7 @@ func (in *NemoGuardrailSpec) DeepCopyInto(out *NemoGuardrailSpec) { } if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -2326,19 +2327,19 @@ func (in *NemoGuardrailSpec) DeepCopyInto(out *NemoGuardrailSpec) { } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.PodAffinity != nil { in, out := &in.PodAffinity, &out.PodAffinity - *out = new(corev1.PodAffinity) + *out = new(v1.PodAffinity) (*in).DeepCopyInto(*out) } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } in.Expose.DeepCopyInto(&out.Expose) @@ -2487,7 +2488,7 @@ func (in *Probe) DeepCopyInto(out *Probe) { } if in.Probe != nil { in, out := &in.Probe, &out.Probe - *out = new(corev1.Probe) + *out = new(v1.Probe) (*in).DeepCopyInto(*out) } } @@ -2522,14 +2523,14 @@ func (in *ResourceRequirements) DeepCopyInto(out *ResourceRequirements) { *out = *in if in.Limits != nil { in, out := &in.Limits, &out.Limits - *out = make(corev1.ResourceList, len(*in)) + *out = make(v1.ResourceList, len(*in)) for key, val := range *in { (*out)[key] = val.DeepCopy() } } if in.Requests != nil { in, out := &in.Requests, &out.Requests - *out = make(corev1.ResourceList, len(*in)) + *out = make(v1.ResourceList, len(*in)) for key, val := range *in { (*out)[key] = val.DeepCopy() } @@ -2686,7 +2687,7 @@ func (in *TrainingConfig) DeepCopyInto(out *TrainingConfig) { out.WorkspacePVC = in.WorkspacePVC if in.Env != nil { in, out := &in.Env, &out.Env - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -2704,7 +2705,7 @@ func (in *TrainingConfig) DeepCopyInto(out *TrainingConfig) { } if in.NetworkConfig != nil { in, out := &in.NetworkConfig, &out.NetworkConfig - *out = make([]corev1.EnvVar, len(*in)) + *out = make([]v1.EnvVar, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -2718,14 +2719,14 @@ func (in *TrainingConfig) DeepCopyInto(out *TrainingConfig) { } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations - *out = make([]corev1.Toleration, len(*in)) + *out = make([]v1.Toleration, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.PodAffinity != nil { in, out := &in.PodAffinity, &out.PodAffinity - *out = new(corev1.PodAffinity) + *out = new(v1.PodAffinity) (*in).DeepCopyInto(*out) } if in.SharedMemorySizeLimit != nil { @@ -2735,7 +2736,7 @@ func (in *TrainingConfig) DeepCopyInto(out *TrainingConfig) { } if in.Resources != nil { in, out := &in.Resources, &out.Resources - *out = new(corev1.ResourceRequirements) + *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } } diff --git a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml index e65e2a144..6053589b6 100644 --- a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml +++ b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml @@ -48,8 +48,6 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - ModelBuilder: - type: string annotations: additionalProperties: type: string @@ -174,12 +172,29 @@ spec: - name type: object type: array + image: + description: Image defines image attributes. + properties: + pullPolicy: + type: string + pullSecrets: + items: + type: string + type: array + repository: + type: string + tag: + type: string + required: + - repository + - tag + type: object labels: additionalProperties: type: string type: object nimCacheRef: - description: NIMCacheRef is Reference to the model weights from NIMCache + description: NIMCache is Reference to the model weights from NIMCache properties: name: type: string @@ -194,9 +209,6 @@ spec: description: NodeSelector is the node selector labels to schedule the caching job. type: object - pullSecret: - description: PullSecret to pull the model builder image - type: string resources: description: Resources is the resource requirements for the NIMBuild pod. @@ -266,7 +278,7 @@ spec: type: object type: array required: - - ModelBuilder + - image - nimCacheRef type: object x-kubernetes-validations: diff --git a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml index e65e2a144..6053589b6 100644 --- a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml +++ b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml @@ -48,8 +48,6 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - ModelBuilder: - type: string annotations: additionalProperties: type: string @@ -174,12 +172,29 @@ spec: - name type: object type: array + image: + description: Image defines image attributes. + properties: + pullPolicy: + type: string + pullSecrets: + items: + type: string + type: array + repository: + type: string + tag: + type: string + required: + - repository + - tag + type: object labels: additionalProperties: type: string type: object nimCacheRef: - description: NIMCacheRef is Reference to the model weights from NIMCache + description: NIMCache is Reference to the model weights from NIMCache properties: name: type: string @@ -194,9 +209,6 @@ spec: description: NodeSelector is the node selector labels to schedule the caching job. type: object - pullSecret: - description: PullSecret to pull the model builder image - type: string resources: description: Resources is the resource requirements for the NIMBuild pod. @@ -266,7 +278,7 @@ spec: type: object type: array required: - - ModelBuilder + - image - nimCacheRef type: object x-kubernetes-validations: diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml index e65e2a144..6053589b6 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml @@ -48,8 +48,6 @@ spec: description: NIMBuildSpec to build optimized trtllm engines with given model config and weights. properties: - ModelBuilder: - type: string annotations: additionalProperties: type: string @@ -174,12 +172,29 @@ spec: - name type: object type: array + image: + description: Image defines image attributes. + properties: + pullPolicy: + type: string + pullSecrets: + items: + type: string + type: array + repository: + type: string + tag: + type: string + required: + - repository + - tag + type: object labels: additionalProperties: type: string type: object nimCacheRef: - description: NIMCacheRef is Reference to the model weights from NIMCache + description: NIMCache is Reference to the model weights from NIMCache properties: name: type: string @@ -194,9 +209,6 @@ spec: description: NodeSelector is the node selector labels to schedule the caching job. type: object - pullSecret: - description: PullSecret to pull the model builder image - type: string resources: description: Resources is the resource requirements for the NIMBuild pod. @@ -266,7 +278,7 @@ spec: type: object type: array required: - - ModelBuilder + - image - nimCacheRef type: object x-kubernetes-validations: diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index b8125760d..95b1924dd 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -233,7 +233,7 @@ func (r *NIMBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { if !ok { return []string{} } - return []string{nimBuild.Spec.NIMCacheRef.Name} + return []string{nimBuild.Spec.NIMCache.Name} }, ) if err != nil { @@ -328,7 +328,7 @@ func (r *NIMBuildReconciler) cleanupNIMBuild(ctx context.Context, nimBuild *apps func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild) (reconcile.Result, error) { logger := r.GetLogger() - nimCacheNamespacedName := types.NamespacedName{Name: nimBuild.Spec.NIMCacheRef.Name, Namespace: nimBuild.GetNamespace()} + nimCacheNamespacedName := types.NamespacedName{Name: nimBuild.Spec.NIMCache.Name, Namespace: nimBuild.GetNamespace()} nimCache := &appsv1alpha1.NIMCache{} if err := r.Get(ctx, nimCacheNamespacedName, nimCache); err != nil { @@ -378,7 +378,7 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache) error { logger := r.GetLogger() buildableProfile := appsv1alpha1.NIMProfile{} - if nimBuild.Spec.NIMCacheRef.Profile == "" { + if nimBuild.Spec.NIMCache.Profile == "" { buildableProfiles := getBuildableProfiles(nimCache) if len(buildableProfiles) > 0 { switch { @@ -405,9 +405,9 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed } } else { - foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.NIMCacheRef.Profile) + foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.NIMCache.Profile) if foundProfile == nil { - logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.NIMCacheRef.Profile) + logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.NIMCache.Profile) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found, please select a valid profile from the NIM cache") nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed return nil @@ -576,6 +576,11 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB annotations["openshift.io/scc"] = "nonroot" } + var imagePullSecrets []corev1.LocalObjectReference + for _, name := range nimBuild.Spec.Image.PullSecrets { + imagePullSecrets = append(imagePullSecrets, corev1.LocalObjectReference{Name: name}) + } + pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: nimBuild.Name + "-engine-build-pod", @@ -602,7 +607,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB }, }, }, - ImagePullSecrets: []corev1.LocalObjectReference{}, + ImagePullSecrets: imagePullSecrets, Tolerations: nimBuild.GetTolerations(), NodeSelector: nimBuild.GetNodeSelectors(), }, @@ -618,7 +623,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB pod.Spec.Containers = []corev1.Container{ { Name: NIMBuildContainerName, - Image: nimBuild.Spec.ModelBuilder, + Image: nimBuild.GetImage(), Env: []corev1.EnvVar{ { Name: "NIM_CACHE_PATH", @@ -713,11 +718,6 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB }, }, } - pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{ - { - Name: nimBuild.Spec.PullSecret, - }, - } // Merge env with the user provided values pod.Spec.Containers[0].Env = utils.MergeEnvVars(pod.Spec.Containers[0].Env, nimBuild.Spec.Env) @@ -898,6 +898,11 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap } pvcName := shared.GetPVCName(nimCache, nimCache.Spec.Storage.PVC) + var imagePullSecrets []corev1.LocalObjectReference + for _, name := range nimBuild.Spec.Image.PullSecrets { + imagePullSecrets = append(imagePullSecrets, corev1.LocalObjectReference{Name: name}) + } + pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: nimBuild.Name + "-local-manifest-pod", @@ -906,6 +911,7 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap Annotations: annotations, }, Spec: corev1.PodSpec{ + ImagePullSecrets: imagePullSecrets, RuntimeClassName: nimCache.GetRuntimeClassName(), SecurityContext: &corev1.PodSecurityContext{ RunAsUser: nimCache.GetUserID(), @@ -939,7 +945,7 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap pod.Spec.Containers = []corev1.Container{ { Name: NIMBuildManifestContainerName, - Image: nimBuild.Spec.ModelBuilder, + Image: nimBuild.GetImage(), Command: getNIMBuildSidecarCommand(), SecurityContext: &corev1.SecurityContext{ AllowPrivilegeEscalation: ptr.To[bool](false), @@ -959,11 +965,7 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap }, }, } - pod.Spec.ImagePullSecrets = []corev1.LocalObjectReference{ - { - Name: nimBuild.Spec.PullSecret, - }, - } + return pod } From f2fc179661f8e230e9aa644707b92abc51bc4bff Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Thu, 26 Jun 2025 06:51:38 +0000 Subject: [PATCH 10/16] Addressing Review comments Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/nimbuild_types.go | 12 +++++++++++- internal/controller/nimbuild_controller.go | 10 +++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go index cbe0b83bb..ffed6a7a8 100644 --- a/api/apps/v1alpha1/nimbuild_types.go +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -29,7 +29,7 @@ import ( // NIMBuildSpec to build optimized trtllm engines with given model config and weights. type NIMBuildSpec struct { // NIMCache is Reference to the model weights from NIMCache - NIMCache NIMCacheReference `json:"nimCacheRef"` + NIMCache NIMCacheReference `json:"nimCache"` // ModelName is the name given to the locally built engine. ModelName string `json:"engineName,omitempty"` // Resources is the resource requirements for the NIMBuild pod. @@ -158,3 +158,13 @@ func (n *NIMBuild) GetImage() string { func (n *NIMBuild) GetImagePullSecrets() []string { return n.Spec.Image.PullSecrets } + +// GetEngineBuildPodName returns the name of the pod that will be created to build the NIM engine. +func (n *NIMBuild) GetEngineBuildPodName() string { + return fmt.Sprintf("%s-engine-build-pod", n.Name) +} + +// GetLocalManifestReaderPodName returns the name of the pod that will be created to read the local manifest. +func (n *NIMBuild) GetLocalManifestReaderPodName() string { + return fmt.Sprintf("%s-local-manifest-pod", n.Name) +} diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index 95b1924dd..1a885f3e5 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -227,7 +227,7 @@ func (r *NIMBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { err := mgr.GetFieldIndexer().IndexField( context.Background(), &appsv1alpha1.NIMBuild{}, - "spec.nimCacheName", + "spec.nimCache.name", func(rawObj client.Object) []string { nimBuild, ok := rawObj.(*appsv1alpha1.NIMBuild) if !ok { @@ -300,7 +300,7 @@ func (r *NIMBuildReconciler) cleanupNIMBuild(ctx context.Context, nimBuild *apps // All owned objects are garbage collected // Fetch the pod - podName := types.NamespacedName{Name: nimBuild.Name + "-engine-build-pod", Namespace: nimBuild.Namespace} + podName := types.NamespacedName{Name: nimBuild.GetEngineBuildPodName(), Namespace: nimBuild.Namespace} pod := &corev1.Pod{} if err := r.Get(ctx, podName, pod); err != nil { if errors.IsNotFound(err) { @@ -420,7 +420,7 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild } pod := &corev1.Pod{} - podName := types.NamespacedName{Name: nimBuild.Name + "-engine-build-pod", Namespace: nimBuild.GetNamespace()} + podName := types.NamespacedName{Name: nimBuild.GetEngineBuildPodName(), Namespace: nimBuild.GetNamespace()} err := r.Get(ctx, podName, pod) if err != nil && client.IgnoreNotFound(err) != nil { return err @@ -583,7 +583,7 @@ func (r *NIMBuildReconciler) constructEngineBuildPod(nimBuild *appsv1alpha1.NIMB pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ - Name: nimBuild.Name + "-engine-build-pod", + Name: nimBuild.GetEngineBuildPodName(), Namespace: nimBuild.Namespace, Labels: labels, Annotations: annotations, @@ -905,7 +905,7 @@ func constructLocalManifestPodSpec(nimCache *appsv1alpha1.NIMCache, nimBuild *ap pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ - Name: nimBuild.Name + "-local-manifest-pod", + Name: nimBuild.GetLocalManifestReaderPodName(), Namespace: nimBuild.Namespace, Labels: labels, Annotations: annotations, From 9bc068b5d4bf720b741b8ebca947e52966cab732 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Thu, 26 Jun 2025 16:57:10 +0000 Subject: [PATCH 11/16] Addressing Review comments Signed-off-by: Vishesh Tanksale --- config/crd/bases/apps.nvidia.com_nimbuilds.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml index 6053589b6..4bd816710 100644 --- a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml +++ b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml @@ -193,7 +193,7 @@ spec: additionalProperties: type: string type: object - nimCacheRef: + nimCache: description: NIMCache is Reference to the model weights from NIMCache properties: name: @@ -279,7 +279,7 @@ spec: type: array required: - image - - nimCacheRef + - nimCache type: object x-kubernetes-validations: - message: spec is immutable From 389a41396828a570005b3d8c414f3a36fdf7d01d Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Thu, 26 Jun 2025 17:41:03 +0000 Subject: [PATCH 12/16] Addressing Review comments Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/nimbuild_types.go | 6 +++--- internal/controller/nimbuild_controller.go | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/api/apps/v1alpha1/nimbuild_types.go b/api/apps/v1alpha1/nimbuild_types.go index ffed6a7a8..ff976d47c 100644 --- a/api/apps/v1alpha1/nimbuild_types.go +++ b/api/apps/v1alpha1/nimbuild_types.go @@ -31,7 +31,7 @@ type NIMBuildSpec struct { // NIMCache is Reference to the model weights from NIMCache NIMCache NIMCacheReference `json:"nimCache"` // ModelName is the name given to the locally built engine. - ModelName string `json:"engineName,omitempty"` + ModelName string `json:"modelName,omitempty"` // Resources is the resource requirements for the NIMBuild pod. Resources *ResourceRequirements `json:"resources,omitempty"` // Tolerations for running the job to cache the NIM model @@ -48,8 +48,8 @@ type NIMBuildSpec struct { // NIMBuildStatus defines the observed state of NIMBuild. type NIMBuildStatus struct { State string `json:"state,omitempty"` - TargetProfile NIMProfile `json:"targetProfile,omitempty"` - BuiltProfile NIMProfile `json:"builtProfile,omitempty"` + InputProfile NIMProfile `json:"inputProfile,omitempty"` + OutputProfile NIMProfile `json:"outputProfile,omitempty"` Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"` } diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index 1a885f3e5..b8cda64da 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -391,7 +391,7 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild logger.Info("Selected buildable profile found", "Profile", buildableProfiles[0].Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile cached in NIM Cache") buildableProfile = buildableProfiles[0] - nimBuild.Status.TargetProfile = buildableProfile + nimBuild.Status.InputProfile = buildableProfile default: logger.Info("No buildable profiles found, skipping engine build") conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found in NIM Cache") @@ -415,7 +415,7 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild logger.Info("Selected buildable profile found", "Profile", foundProfile.Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile found") buildableProfile = *foundProfile - nimBuild.Status.TargetProfile = buildableProfile + nimBuild.Status.InputProfile = buildableProfile } } @@ -829,7 +829,7 @@ func (r *NIMBuildReconciler) reconcileLocalModelManifest(ctx context.Context, ni // If built profile is not present on NIMCache status, add it if !presentOnNIMCache { tagsMap := manifest.GetProfileTags(builtProfileName) - tagsMap["target_profile"] = nimBuild.Status.TargetProfile.Name + tagsMap["input_profile"] = nimBuild.Status.InputProfile.Name logger.Info("Adding profile to NIMCache status", "profileName", builtProfileName) nimCache.Status.Profiles = append(nimCache.Status.Profiles, appsv1alpha1.NIMProfile{ Name: builtProfileName, From d571d3ffb247e29573b6d68aa5af9175f0fcb0c7 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Thu, 26 Jun 2025 17:44:04 +0000 Subject: [PATCH 13/16] Addressing Review comments Signed-off-by: Vishesh Tanksale --- api/apps/v1alpha1/zz_generated.deepcopy.go | 4 +- .../manifests/apps.nvidia.com_nimbuilds.yaml | 44 +++++++++---------- .../crd/bases/apps.nvidia.com_nimbuilds.yaml | 40 ++++++++--------- .../crds/apps.nvidia.com_nimbuilds.yaml | 44 +++++++++---------- 4 files changed, 66 insertions(+), 66 deletions(-) diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index 7dce6c93e..9d42c7392 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -819,8 +819,8 @@ func (in *NIMBuildSpec) DeepCopy() *NIMBuildSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NIMBuildStatus) DeepCopyInto(out *NIMBuildStatus) { *out = *in - in.TargetProfile.DeepCopyInto(&out.TargetProfile) - in.BuiltProfile.DeepCopyInto(&out.BuiltProfile) + in.InputProfile.DeepCopyInto(&out.InputProfile) + in.OutputProfile.DeepCopyInto(&out.OutputProfile) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) diff --git a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml index 6053589b6..87baeada4 100644 --- a/bundle/manifests/apps.nvidia.com_nimbuilds.yaml +++ b/bundle/manifests/apps.nvidia.com_nimbuilds.yaml @@ -52,9 +52,6 @@ spec: additionalProperties: type: string type: object - engineName: - description: ModelName is the name given to the locally built engine. - type: string env: items: description: EnvVar represents an environment variable present in @@ -193,7 +190,10 @@ spec: additionalProperties: type: string type: object - nimCacheRef: + modelName: + description: ModelName is the name given to the locally built engine. + type: string + nimCache: description: NIMCache is Reference to the model weights from NIMCache properties: name: @@ -279,7 +279,7 @@ spec: type: array required: - image - - nimCacheRef + - nimCache type: object x-kubernetes-validations: - message: spec is immutable @@ -287,20 +287,6 @@ spec: status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: - builtProfile: - description: NIMProfile defines the profiles that were cached. - properties: - config: - additionalProperties: - type: string - type: object - model: - type: string - name: - type: string - release: - type: string - type: object conditions: items: description: Condition contains details for one aspect of the current @@ -357,9 +343,7 @@ spec: - type type: object type: array - state: - type: string - targetProfile: + inputProfile: description: NIMProfile defines the profiles that were cached. properties: config: @@ -373,6 +357,22 @@ spec: release: type: string type: object + outputProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object + state: + type: string type: object type: object served: true diff --git a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml index 4bd816710..87baeada4 100644 --- a/config/crd/bases/apps.nvidia.com_nimbuilds.yaml +++ b/config/crd/bases/apps.nvidia.com_nimbuilds.yaml @@ -52,9 +52,6 @@ spec: additionalProperties: type: string type: object - engineName: - description: ModelName is the name given to the locally built engine. - type: string env: items: description: EnvVar represents an environment variable present in @@ -193,6 +190,9 @@ spec: additionalProperties: type: string type: object + modelName: + description: ModelName is the name given to the locally built engine. + type: string nimCache: description: NIMCache is Reference to the model weights from NIMCache properties: @@ -287,20 +287,6 @@ spec: status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: - builtProfile: - description: NIMProfile defines the profiles that were cached. - properties: - config: - additionalProperties: - type: string - type: object - model: - type: string - name: - type: string - release: - type: string - type: object conditions: items: description: Condition contains details for one aspect of the current @@ -357,9 +343,7 @@ spec: - type type: object type: array - state: - type: string - targetProfile: + inputProfile: description: NIMProfile defines the profiles that were cached. properties: config: @@ -373,6 +357,22 @@ spec: release: type: string type: object + outputProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object + state: + type: string type: object type: object served: true diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml index 6053589b6..87baeada4 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimbuilds.yaml @@ -52,9 +52,6 @@ spec: additionalProperties: type: string type: object - engineName: - description: ModelName is the name given to the locally built engine. - type: string env: items: description: EnvVar represents an environment variable present in @@ -193,7 +190,10 @@ spec: additionalProperties: type: string type: object - nimCacheRef: + modelName: + description: ModelName is the name given to the locally built engine. + type: string + nimCache: description: NIMCache is Reference to the model weights from NIMCache properties: name: @@ -279,7 +279,7 @@ spec: type: array required: - image - - nimCacheRef + - nimCache type: object x-kubernetes-validations: - message: spec is immutable @@ -287,20 +287,6 @@ spec: status: description: NIMBuildStatus defines the observed state of NIMBuild. properties: - builtProfile: - description: NIMProfile defines the profiles that were cached. - properties: - config: - additionalProperties: - type: string - type: object - model: - type: string - name: - type: string - release: - type: string - type: object conditions: items: description: Condition contains details for one aspect of the current @@ -357,9 +343,7 @@ spec: - type type: object type: array - state: - type: string - targetProfile: + inputProfile: description: NIMProfile defines the profiles that were cached. properties: config: @@ -373,6 +357,22 @@ spec: release: type: string type: object + outputProfile: + description: NIMProfile defines the profiles that were cached. + properties: + config: + additionalProperties: + type: string + type: object + model: + type: string + name: + type: string + release: + type: string + type: object + state: + type: string type: object type: object served: true From ca9ca78c6fc9699b0701610bf57ca9e8aba0aa5b Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Fri, 27 Jun 2025 11:04:40 +0000 Subject: [PATCH 14/16] Addressing review comments Signed-off-by: Vishesh Tanksale --- internal/controller/nimbuild_controller.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index b8cda64da..c61c4bf0f 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -118,15 +118,11 @@ func (r *NIMBuildReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, client.IgnoreNotFound(err) } logger.Info("Reconciling", "NIMBuild", nimBuild.Name) - previousStatusState := nimBuild.Status.State defer func() { if err != nil { r.GetEventRecorder().Eventf(nimBuild, corev1.EventTypeWarning, "ReconcileFailed", "NIMBuild %s reconcile failed, msg: %s", nimBuild.Name, err.Error()) - } else if previousStatusState != nimBuild.Status.State { - r.GetEventRecorder().Eventf(nimBuild, corev1.EventTypeNormal, nimBuild.Status.State, - "NIMBuild %s reconcile success, new state: %s", nimBuild.Name, nimBuild.Status.State) } }() // Check if the instance is marked for deletion @@ -151,8 +147,8 @@ func (r *NIMBuildReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c if err := r.Update(ctx, nimBuild); err != nil { return ctrl.Result{}, err } - return ctrl.Result{}, nil } + return ctrl.Result{}, nil } // Fetch container orchestrator type @@ -166,7 +162,7 @@ func (r *NIMBuildReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c if err != nil { logger.Error(err, "error reconciling NIMBuild", "name", nimBuild.Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionReconcileFailed, metav1.ConditionTrue, "ReconcileFailed", err.Error()) - nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + nimBuild.Status.State = appsv1alpha1.NimBuildStatusPending err := r.updateNIMBuildStatus(ctx, nimBuild) if err != nil { @@ -335,7 +331,7 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap logger.Error(err, "unable to fetch NIMCache for NIMBuild", "NIMCache", nimCacheNamespacedName) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNIMCacheNotFound, metav1.ConditionTrue, "NIMCache not found", "Not able to get NIMCache for NIMBuild") nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed - return ctrl.Result{}, err + return ctrl.Result{}, nil } switch nimCache.Status.State { From 683906dc67b63c5456bb1ebc484e8aebce071004 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Fri, 27 Jun 2025 11:24:23 +0000 Subject: [PATCH 15/16] Addressing review comments Signed-off-by: Vishesh Tanksale --- internal/controller/nimbuild_controller.go | 41 ++++++++++------------ 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index c61c4bf0f..87dbe36e6 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -164,9 +164,9 @@ func (r *NIMBuildReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionReconcileFailed, metav1.ConditionTrue, "ReconcileFailed", err.Error()) nimBuild.Status.State = appsv1alpha1.NimBuildStatusPending - err := r.updateNIMBuildStatus(ctx, nimBuild) + err := r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusPending) if err != nil { - logger.Error(err, "Failed to update NIMBuild status", "NIMBuild", nimBuild.Name) + logger.Error(err, "Failed to update NIMBuild state", "NIMBuild", nimBuild.Name) return ctrl.Result{}, err } return result, err @@ -330,8 +330,7 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap if err := r.Get(ctx, nimCacheNamespacedName, nimCache); err != nil { logger.Error(err, "unable to fetch NIMCache for NIMBuild", "NIMCache", nimCacheNamespacedName) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNIMCacheNotFound, metav1.ConditionTrue, "NIMCache not found", "Not able to get NIMCache for NIMBuild") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed - return ctrl.Result{}, nil + return ctrl.Result{}, r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusFailed) } switch nimCache.Status.State { @@ -339,12 +338,10 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap logger.V(4).Info("NIMCache is ready", "nimcache", nimCache.Name, "nimservice", nimBuild.Name) case appsv1alpha1.NimCacheStatusFailed: conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNimCacheFailed, metav1.ConditionTrue, "NIMCache failed", "NIMCache is failed. NIMBuild can not progress") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed - return ctrl.Result{}, nil + return ctrl.Result{}, r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusFailed) default: conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionWaitForNimCache, metav1.ConditionTrue, "NIMCache not ready", "Waiting for NIMCache to be ready before building engines") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusPending - return ctrl.Result{}, nil + return ctrl.Result{}, r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusFailed) } if !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted) { @@ -363,12 +360,17 @@ func (r *NIMBuildReconciler) reconcileNIMBuild(ctx context.Context, nimBuild *ap } } + return ctrl.Result{}, nil +} +func (r *NIMBuildReconciler) updateNIMBuildState(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, state string) error { + logger := r.GetLogger() + nimBuild.Status.State = state err := r.updateNIMBuildStatus(ctx, nimBuild) if err != nil { logger.Error(err, "Failed to update NIMBuild status", "NIMBuild", nimBuild.Name) - return ctrl.Result{}, err + return err } - return ctrl.Result{}, nil + return nil } func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache) error { @@ -381,8 +383,7 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild case len(buildableProfiles) > 1: logger.Info("Multiple buildable profiles found", "Profiles", buildableProfiles) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionMultipleBuildableProfilesFound, metav1.ConditionTrue, "MultipleBuildableProfilesFound", "Multiple buildable profiles found in NIM Cache, please select one profile to build") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed - return nil + return r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusFailed) case len(buildableProfiles) == 1: logger.Info("Selected buildable profile found", "Profile", buildableProfiles[0].Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile cached in NIM Cache") @@ -391,22 +392,20 @@ func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild default: logger.Info("No buildable profiles found, skipping engine build") conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found in NIM Cache") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed - return nil + return r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusFailed) } } else { logger.Info("No buildable profiles found, skipping engine build") conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found for NIM Cache") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + return r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusFailed) } } else { foundProfile := getBuildableProfileByName(nimCache, nimBuild.Spec.NIMCache.Profile) if foundProfile == nil { logger.Info("No buildable profiles found", "ProfileName", nimBuild.Spec.NIMCache.Profile) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionNoBuildableProfilesFound, metav1.ConditionTrue, "NoBuildableProfilesFound", "No buildable profiles found, please select a valid profile from the NIM cache") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed - return nil + return r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusFailed) } else { logger.Info("Selected buildable profile found", "Profile", foundProfile.Name) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionSingleBuildableProfilesFound, metav1.ConditionTrue, "BuildableProfileFound", "Single buildable profile found") @@ -485,12 +484,12 @@ func (r *NIMBuildReconciler) reconcileEngineBuildPodStatus(ctx context.Context, case pod.Status.Phase == corev1.PodFailed && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted): logger.Info("Failed to cache NIM, build pod failed", "pod", pod) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCreated, metav1.ConditionFalse, "PodFailed", "The pod to build engine has failed") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusFailed + return r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusFailed) case !isPodReady(pod) && !meta.IsStatusConditionTrue(nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodCompleted): logger.Info("Caching NIM is in progress, build engine pod running", "job", podName) conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionEngineBuildPodPending, metav1.ConditionTrue, "PodRunning", "The Pod to build engine is in progress") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusInProgress + return r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusInProgress) } @@ -850,9 +849,7 @@ func (r *NIMBuildReconciler) reconcileLocalModelManifest(ctx context.Context, ni } // Update the NIMBuild status conditions.UpdateCondition(&nimBuild.Status.Conditions, appsv1alpha1.NimBuildConditionModelManifestPodCompleted, metav1.ConditionTrue, "PodCompleted", "The Pod to read local model manifest is completed") - nimBuild.Status.State = appsv1alpha1.NimBuildStatusReady - - return nil + return r.updateNIMBuildState(ctx, nimBuild, appsv1alpha1.NimBuildStatusReady) } func isBuiltProfilePresentOnNIMCacheStatus(nimCache *appsv1alpha1.NIMCache, builtProfileName string) bool { From 06bb936ba403ac4752f7628d82082b71d2ba1847 Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Fri, 27 Jun 2025 19:39:33 +0000 Subject: [PATCH 16/16] Fixing Kubebuilder PROJECT file Signed-off-by: Vishesh Tanksale --- PROJECT | 8 ++++++ config/crd/kustomization.yaml | 1 + config/rbac/kustomization.yaml | 4 ++- config/rbac/nimbuild_admin_role.yaml | 27 ++++++++++++++++++ config/rbac/nimbuild_editor_role.yaml | 33 ++++++++++++++++++++++ config/rbac/nimbuild_viewer_role.yaml | 29 +++++++++++++++++++ config/samples/kustomization.yaml | 2 ++ internal/controller/nimbuild_controller.go | 2 +- 8 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 config/rbac/nimbuild_admin_role.yaml create mode 100644 config/rbac/nimbuild_editor_role.yaml create mode 100644 config/rbac/nimbuild_viewer_role.yaml diff --git a/PROJECT b/PROJECT index affc699ab..d11d27ece 100644 --- a/PROJECT +++ b/PROJECT @@ -80,4 +80,12 @@ resources: kind: NemoCustomizer path: github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1 version: v1alpha1 +- api: + crdVersion: v1 + namespaced: true + domain: nvidia.com + group: apps + kind: NIMBuild + path: github.com/NVIDIA/k8s-nim-operator/api/v1aplha1 + version: v1aplha1 version: "3" diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index 9201741b1..9e65c2c47 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -10,6 +10,7 @@ resources: - bases/apps.nvidia.com_nemoevaluators.yaml - bases/apps.nvidia.com_nemodatastores.yaml - bases/apps.nvidia.com_nemoentitystores.yaml +- bases/apps.nvidia.com_nimbuilds.yaml # +kubebuilder:scaffold:crdkustomizeresource patches: diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 2eeff1338..b5598480d 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -29,4 +29,6 @@ resources: - nimcache_viewer_role.yaml - nimservice_editor_role.yaml - nimservice_viewer_role.yaml - +- nimbuild_admin_role.yaml +- nimbuild_editor_role.yaml +- nimbuild_viewer_role.yaml diff --git a/config/rbac/nimbuild_admin_role.yaml b/config/rbac/nimbuild_admin_role.yaml new file mode 100644 index 000000000..306abebde --- /dev/null +++ b/config/rbac/nimbuild_admin_role.yaml @@ -0,0 +1,27 @@ +# This rule is not used by the project k8s-nim-operator itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants full permissions ('*') over apps.nvidia.com. +# This role is intended for users authorized to modify roles and bindings within the cluster, +# enabling them to delegate specific permissions to other users or groups as needed. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: k8s-nim-operator + app.kubernetes.io/managed-by: kustomize + name: nimbuild-admin-role +rules: +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds + verbs: + - '*' +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds/status + verbs: + - get diff --git a/config/rbac/nimbuild_editor_role.yaml b/config/rbac/nimbuild_editor_role.yaml new file mode 100644 index 000000000..95739d81d --- /dev/null +++ b/config/rbac/nimbuild_editor_role.yaml @@ -0,0 +1,33 @@ +# This rule is not used by the project k8s-nim-operator itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants permissions to create, update, and delete resources within the apps.nvidia.com. +# This role is intended for users who need to manage these resources +# but should not control RBAC or manage permissions for others. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: k8s-nim-operator + app.kubernetes.io/managed-by: kustomize + name: nimbuild-editor-role +rules: +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds/status + verbs: + - get diff --git a/config/rbac/nimbuild_viewer_role.yaml b/config/rbac/nimbuild_viewer_role.yaml new file mode 100644 index 000000000..c42209fa3 --- /dev/null +++ b/config/rbac/nimbuild_viewer_role.yaml @@ -0,0 +1,29 @@ +# This rule is not used by the project k8s-nim-operator itself. +# It is provided to allow the cluster admin to help manage permissions for users. +# +# Grants read-only access to apps.nvidia.com resources. +# This role is intended for users who need visibility into these resources +# without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: k8s-nim-operator + app.kubernetes.io/managed-by: kustomize + name: nimbuild-viewer-role +rules: +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds + verbs: + - get + - list + - watch +- apiGroups: + - apps.nvidia.com + resources: + - nimbuilds/status + verbs: + - get diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml index 629c23e87..0201e5444 100644 --- a/config/samples/kustomization.yaml +++ b/config/samples/kustomization.yaml @@ -2,9 +2,11 @@ resources: - nim/llm/nimservice.yaml - nim/llm/nimcache-llm.yaml +- nim/llm/nimbuild.yaml - nemo/latest/apps_v1alpha1_nemocustomizer.yaml - nemo/latest/apps_v1alpha1_nemoguardrails.yaml - nemo/latest/apps_v1alpha1_nemoevaluator.yaml - nemo/latest/apps_v1alpha1_nemodatastore.yaml - nemo/latest/apps_v1alpha1_nemoentitystore.yaml +- apps_v1aplha1_nimbuild.yaml # +kubebuilder:scaffold:manifestskustomizesamples diff --git a/internal/controller/nimbuild_controller.go b/internal/controller/nimbuild_controller.go index 87dbe36e6..a7e07d7b4 100644 --- a/internal/controller/nimbuild_controller.go +++ b/internal/controller/nimbuild_controller.go @@ -375,7 +375,7 @@ func (r *NIMBuildReconciler) updateNIMBuildState(ctx context.Context, nimBuild * func (r *NIMBuildReconciler) reconcileEngineBuild(ctx context.Context, nimBuild *appsv1alpha1.NIMBuild, nimCache *appsv1alpha1.NIMCache) error { logger := r.GetLogger() - buildableProfile := appsv1alpha1.NIMProfile{} + var buildableProfile appsv1alpha1.NIMProfile if nimBuild.Spec.NIMCache.Profile == "" { buildableProfiles := getBuildableProfiles(nimCache) if len(buildableProfiles) > 0 {