Skip to content

Commit 38208cb

Browse files
committed
Allow the size of emptyDir volumes to be specified for NIMs and the NeMo Customizer
Signed-off-by: Sheng Lin <shelin@nvidia.com>
1 parent b122d4b commit 38208cb

4 files changed

Lines changed: 19 additions & 5 deletions

File tree

api/apps/v1alpha1/nemo_customizer_types.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@ import (
3131
corev1 "k8s.io/api/core/v1"
3232
networkingv1 "k8s.io/api/networking/v1"
3333
rbacv1 "k8s.io/api/rbac/v1"
34+
"k8s.io/apimachinery/pkg/api/resource"
3435
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3536
"k8s.io/apimachinery/pkg/util/intstr"
3637
"k8s.io/utils/ptr"
@@ -149,6 +150,8 @@ type TrainingConfig struct {
149150
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
150151
// PodAffinity for the training jobs
151152
PodAffinity *corev1.PodAffinity `json:"podAffinity,omitempty"`
153+
// SharedMemorySizeLimit sets the maximum size of the memory-backed emptyDir volume ("dshm") used by the training jobs for fast model runtime I/O. When unset, no sizeLimit is written to the volume definition.
154+
SharedMemorySizeLimit *resource.Quantity `json:"sharedMemorySizeLimit,omitempty"`
152155
// Resources for the training jobs
153156
Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
154157
}

api/apps/v1alpha1/nimservice_types.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
corev1 "k8s.io/api/core/v1"
3030
networkingv1 "k8s.io/api/networking/v1"
3131
rbacv1 "k8s.io/api/rbac/v1"
32+
"k8s.io/apimachinery/pkg/api/resource"
3233
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3334
"k8s.io/apimachinery/pkg/util/intstr"
3435
"k8s.io/utils/ptr"
@@ -134,6 +135,8 @@ type NIMServiceList struct {
134135
// NIMServiceStorage defines the attributes of various storage targets used to store the model
135136
type NIMServiceStorage struct {
136137
NIMCache NIMCacheVolSpec `json:"nimCache,omitempty"`
138+
// SharedMemorySizeLimit sets the maximum size of the memory-backed emptyDir volume ("dshm") used by NIMs for fast model runtime I/O. When unset, the volume is created without a sizeLimit.
139+
SharedMemorySizeLimit *resource.Quantity `json:"sharedMemorySizeLimit,omitempty"`
137140
// PersistentVolumeClaim is the pvc volume used for caching NIM
138141
PVC PersistentVolumeClaim `json:"pvc,omitempty"`
139142
// HostPath is the host path volume for caching NIM
@@ -448,7 +451,8 @@ func (n *NIMService) GetVolumes(modelPVC PersistentVolumeClaim) []corev1.Volume
448451
Name: "dshm",
449452
VolumeSource: corev1.VolumeSource{
450453
EmptyDir: &corev1.EmptyDirVolumeSource{
451-
Medium: corev1.StorageMediumMemory,
454+
Medium: corev1.StorageMediumMemory,
455+
SizeLimit: n.Spec.Storage.SharedMemorySizeLimit,
452456
},
453457
},
454458
},

internal/controller/nemocustomizer_controller.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,12 @@ func (r *NemoCustomizerReconciler) addTrainingConfig(ctx context.Context, cfg ma
626626
// Add PVC configuration
627627
r.addWorkspacePVCConfig(ctx, trainingCfg, n)
628628

629+
emptyDir := map[string]interface{}{
630+
"medium": "Memory",
631+
}
632+
if n.Spec.Training.SharedMemorySizeLimit != nil {
633+
emptyDir["sizeLimit"] = n.Spec.Training.SharedMemorySizeLimit.String()
634+
}
629635
trainingCfg["volumes"] = []map[string]interface{}{
630636
{
631637
"name": "models",
@@ -635,10 +641,8 @@ func (r *NemoCustomizerReconciler) addTrainingConfig(ctx context.Context, cfg ma
635641
},
636642
},
637643
{
638-
"name": "dshm",
639-
"emptyDir": map[string]interface{}{
640-
"medium": "Memory",
641-
},
644+
"name": "dshm",
645+
"emptyDir": emptyDir,
642646
},
643647
}
644648

manifests/deployment.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ spec:
121121
{{- if .EmptyDir }}
122122
emptyDir:
123123
medium: {{ .EmptyDir.Medium }}
124+
{{- if .EmptyDir.SizeLimit }}
125+
sizeLimit: {{ .EmptyDir.SizeLimit }}
126+
{{- end }}
124127
{{- end }}
125128
{{- if .PersistentVolumeClaim }}
126129
persistentVolumeClaim:

0 commit comments

Comments
 (0)