Skip to content

Commit 37f2aa1

Browse files
authored
Pod name validation (#105)
- feat: add pod name validation to PodGangSet admission webhook - Implement validation for pod name length constraints (45 chars max) - Add separate validation for PodClique templates not in scaling groups - Ensure total pod names stay under 64 characters for annotation support - Add comprehensive test coverage for validation scenarios - Handle nil pointer cases and improve error messages Signed-off-by: Ron Kahn <rkahn@nvidia.com> --------- Signed-off-by: Ron Kahn <rkahn@nvidia.com>
1 parent 0a637fd commit 37f2aa1

File tree

2 files changed

+612
-13
lines changed

2 files changed

+612
-13
lines changed

operator/internal/webhook/admission/pgs/validation/podgangset.go

Lines changed: 106 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package validation
1818

1919
import (
20+
"fmt"
2021
"reflect"
2122
"slices"
2223
"strings"
@@ -33,6 +34,10 @@ import (
3334
"k8s.io/apimachinery/pkg/util/validation/field"
3435
)
3536

37+
const (
38+
maxCombinedResourceNameLength = 45
39+
)
40+
3641
var allowedStartupTypes = sets.New(grovecorev1alpha1.CliqueStartupTypeInOrder, grovecorev1alpha1.CliqueStartupTypeAnyOrder, grovecorev1alpha1.CliqueStartupTypeExplicit)
3742

3843
type pgsValidator struct {
@@ -53,7 +58,8 @@ func newPGSValidator(pgs *grovecorev1alpha1.PodGangSet, operation admissionv1.Op
5358
func (v *pgsValidator) validate() ([]string, error) {
5459
allErrs := field.ErrorList{}
5560

56-
allErrs = append(allErrs, apivalidation.ValidateObjectMeta(&v.pgs.ObjectMeta, true, apivalidation.NameIsDNSSubdomain, field.NewPath("metadata"))...)
61+
allErrs = append(allErrs, apivalidation.ValidateObjectMeta(&v.pgs.ObjectMeta, true,
62+
apivalidation.NameIsDNSSubdomain, field.NewPath("metadata"))...)
5763
fldPath := field.NewPath("spec")
5864
warnings, errs := v.validatePodGangSetSpec(fldPath)
5965
if len(errs) != 0 {
@@ -101,19 +107,22 @@ func (v *pgsValidator) validatePodCliqueTemplates(fldPath *field.Path) ([]string
101107
allErrs = append(allErrs, field.Required(fldPath, "at least one PodClique must be defined"))
102108
}
103109

110+
// Get all clique names that belong to scaling groups
111+
scalingGroupCliqueNames := v.getScalingGroupCliqueNames()
112+
104113
cliqueNames := make([]string, 0, len(cliqueTemplateSpecs))
105114
cliqueRoles := make([]string, 0, len(cliqueTemplateSpecs))
106115
schedulerNames := make([]string, 0, len(cliqueTemplateSpecs))
107116
for _, cliqueTemplateSpec := range cliqueTemplateSpecs {
108-
cliqueNames = append(cliqueNames, cliqueTemplateSpec.Name)
109-
cliqueRoles = append(cliqueRoles, cliqueTemplateSpec.Spec.RoleName)
110-
warns, errs := v.validatePodCliqueTemplateSpec(cliqueTemplateSpec, fldPath)
117+
warns, errs := v.validatePodCliqueTemplateSpec(cliqueTemplateSpec, fldPath, scalingGroupCliqueNames)
111118
if len(errs) != 0 {
112119
allErrs = append(allErrs, errs...)
113120
}
114121
if len(warns) != 0 {
115122
warnings = append(warnings, warns...)
116123
}
124+
cliqueNames = append(cliqueNames, cliqueTemplateSpec.Name)
125+
cliqueRoles = append(cliqueRoles, cliqueTemplateSpec.Spec.RoleName)
117126
schedulerNames = append(schedulerNames, cliqueTemplateSpec.Spec.PodSpec.SchedulerName)
118127
}
119128

@@ -137,6 +146,31 @@ func (v *pgsValidator) validatePodCliqueTemplates(fldPath *field.Path) ([]string
137146
return warnings, allErrs
138147
}
139148

149+
func (v *pgsValidator) validatePodCliqueNameConstrains(fldPath *field.Path, cliqueTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, scalingGroupCliqueNames sets.Set[string]) field.ErrorList {
150+
allErrs := field.ErrorList{}
151+
if err := apivalidation.NameIsDNSSubdomain(cliqueTemplateSpec.Name, false); err != nil {
152+
allErrs = append(allErrs, field.Invalid(fldPath.Child("name"), cliqueTemplateSpec.Name,
153+
"invalid PodCliqueTemplateSpec name, must be a valid DNS subdomain"))
154+
}
155+
156+
// Only validate pod name constraints for PodCliques that are NOT part of any scaling group
157+
// any pod clique that is part of scaling groups will be checked as part of scaling group pod name constraints.
158+
if !scalingGroupCliqueNames.Has(cliqueTemplateSpec.Name) {
159+
allErrs = append(allErrs, validateStandalonePodClique(fldPath, v, cliqueTemplateSpec)...)
160+
}
161+
return allErrs
162+
}
163+
164+
func validateStandalonePodClique(fldPath *field.Path, v *pgsValidator, cliqueTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec) field.ErrorList {
165+
allErrs := field.ErrorList{}
166+
if err := validatePodNameConstraints(v.pgs.Name, "", cliqueTemplateSpec.Name); err != nil {
167+
// add error to each of filed paths that compose the podName in case of a PodCliqueTemplateSpec
168+
allErrs = append(allErrs, field.Invalid(fldPath.Child("name"), cliqueTemplateSpec.Name, err.Error()))
169+
allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), v.pgs.Name, err.Error()))
170+
}
171+
return allErrs
172+
}
173+
140174
func (v *pgsValidator) validatePodGangSchedulingPolicyConfig(schedulingPolicyConfig *grovecorev1alpha1.SchedulingPolicyConfig, fldPath *field.Path) field.ErrorList {
141175
allErrs := field.ErrorList{}
142176
if schedulingPolicyConfig == nil {
@@ -156,12 +190,18 @@ func (v *pgsValidator) validatePodCliqueScalingGroupConfigs(fldPath *field.Path)
156190
})
157191
pclqScalingGroupNames := make([]string, 0, len(v.pgs.Spec.Template.PodCliqueScalingGroupConfigs))
158192
var cliqueNamesAcrossAllScalingGroups []string
193+
groupNameFiledPath := fldPath.Child("name")
159194

160195
for _, scalingGroupConfig := range v.pgs.Spec.Template.PodCliqueScalingGroupConfigs {
196+
if err := apivalidation.NameIsDNSSubdomain(scalingGroupConfig.Name, false); err != nil {
197+
allErrs = append(allErrs, field.Invalid(groupNameFiledPath, scalingGroupConfig.Name,
198+
"invalid PodCliqueScalingGroupConfig name, must be a valid DNS subdomain"))
199+
}
161200
pclqScalingGroupNames = append(pclqScalingGroupNames, scalingGroupConfig.Name)
162201
cliqueNamesAcrossAllScalingGroups = append(cliqueNamesAcrossAllScalingGroups, scalingGroupConfig.CliqueNames...)
163202
// validate that scaling groups only contains clique names that are defined in the PodGangSet.
164-
allErrs = append(allErrs, validateScalingGroupPodCliqueNames(allPodGangSetCliqueNames, scalingGroupConfig.CliqueNames, fldPath.Child("cliqueNames"))...)
203+
allErrs = append(allErrs, v.validateScalingGroupPodCliqueNames(scalingGroupConfig.Name, allPodGangSetCliqueNames,
204+
scalingGroupConfig.CliqueNames, fldPath.Child("cliqueNames"), groupNameFiledPath)...)
165205
}
166206

167207
// validate that the scaling group names are unique
@@ -195,7 +235,8 @@ func (v *pgsValidator) validateTerminationDelay(fldPath *field.Path) field.Error
195235
return allErrs
196236
}
197237

198-
func (v *pgsValidator) validatePodCliqueTemplateSpec(cliqueTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, fldPath *field.Path) ([]string, field.ErrorList) {
238+
func (v *pgsValidator) validatePodCliqueTemplateSpec(cliqueTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec,
239+
fldPath *field.Path, scalingGroupCliqueNames sets.Set[string]) ([]string, field.ErrorList) {
199240
allErrs := field.ErrorList{}
200241

201242
allErrs = append(allErrs, validateNonEmptyStringField(cliqueTemplateSpec.Name, fldPath.Child("name"))...)
@@ -206,6 +247,7 @@ func (v *pgsValidator) validatePodCliqueTemplateSpec(cliqueTemplateSpec *groveco
206247
if len(errs) != 0 {
207248
allErrs = append(allErrs, errs...)
208249
}
250+
allErrs = append(allErrs, v.validatePodCliqueNameConstrains(fldPath, cliqueTemplateSpec, scalingGroupCliqueNames)...)
209251

210252
return warnings, allErrs
211253
}
@@ -280,15 +322,33 @@ func (v *pgsValidator) checkNetworkPackGroupConfigsForPartialPCSGInclusions(fldP
280322
return allErrs
281323
}
282324

325+
// getScalingGroupCliqueNames returns a set of all clique names that belong to scaling groups
326+
func (v *pgsValidator) getScalingGroupCliqueNames() sets.Set[string] {
327+
scalingGroupCliqueNames := sets.New[string]()
328+
for _, scalingGroupConfig := range v.pgs.Spec.Template.PodCliqueScalingGroupConfigs {
329+
scalingGroupCliqueNames.Insert(scalingGroupConfig.CliqueNames...)
330+
}
331+
return scalingGroupCliqueNames
332+
}
333+
283334
// checks if the PodClique names specified in PodCliqueScalingGroupConfig refer to a defined clique in the PodGangSet.
284-
func validateScalingGroupPodCliqueNames(allPclqNames, pclqNameInScalingGrp []string, fldPath *field.Path) field.ErrorList {
335+
func (v *pgsValidator) validateScalingGroupPodCliqueNames(pcsgName string, allPclqNames, pclqNameInScalingGrp []string, fldPath, pcsgNameFieldPath *field.Path) field.ErrorList {
285336
allErrs := field.ErrorList{}
286337

287338
_, unidentifiedPclqNames := lo.Difference(allPclqNames, lo.Uniq(pclqNameInScalingGrp))
288339
if len(unidentifiedPclqNames) > 0 {
289340
allErrs = append(allErrs, field.Invalid(fldPath, strings.Join(unidentifiedPclqNames, ","), "unidentified PodClique names found"))
290341
}
291342

343+
// validate scaling group PodClique pods names are valid.
344+
for _, pclqName := range pclqNameInScalingGrp {
345+
if err := validatePodNameConstraints(v.pgs.Name, pcsgName, pclqName); err != nil {
346+
// add error to each of filed paths that compose the podName
347+
allErrs = append(allErrs, field.Invalid(fldPath.Child("name"), pclqName, err.Error()))
348+
allErrs = append(allErrs, field.Invalid(pcsgNameFieldPath, pclqName, err.Error()))
349+
allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), v.pgs.Name, err.Error()))
350+
}
351+
}
292352
return allErrs
293353
}
294354

@@ -302,12 +362,14 @@ func (v *pgsValidator) validatePodCliqueSpec(name string, cliqueSpec grovecorev1
302362
// Ideally this should never happen, the defaulting webhook will always set the default value for minAvailable.
303363
if cliqueSpec.MinAvailable == nil {
304364
allErrs = append(allErrs, field.Required(fldPath.Child("minAvailable"), "field is required"))
305-
}
306-
if *cliqueSpec.MinAvailable <= 0 {
307-
allErrs = append(allErrs, field.Invalid(fldPath.Child("minAvailable"), *cliqueSpec.MinAvailable, "must be greater than 0"))
308-
}
309-
if *cliqueSpec.MinAvailable > cliqueSpec.Replicas {
310-
allErrs = append(allErrs, field.Invalid(fldPath.Child("minAvailable"), *cliqueSpec.MinAvailable, "minAvailable must not be greater than replicas"))
365+
} else {
366+
// prevent nil pointer dereference, no point checking the value if it is nil
367+
if *cliqueSpec.MinAvailable <= 0 {
368+
allErrs = append(allErrs, field.Invalid(fldPath.Child("minAvailable"), *cliqueSpec.MinAvailable, "must be greater than 0"))
369+
}
370+
if *cliqueSpec.MinAvailable > cliqueSpec.Replicas {
371+
allErrs = append(allErrs, field.Invalid(fldPath.Child("minAvailable"), *cliqueSpec.MinAvailable, "minAvailable must not be greater than replicas"))
372+
}
311373
}
312374

313375
if v.isStartupTypeExplicit() && len(cliqueSpec.StartsAfter) > 0 {
@@ -469,3 +531,34 @@ func clearContainerImages(containers []corev1.Container) {
469531
containers[i].Image = ""
470532
}
471533
}
534+
535+
// validatePodNameConstraints validates Grove pod name component constraints.
536+
// This function validates the constraints for component names that will be used
537+
// to construct pod names.
538+
//
539+
// Pod names that belong to a PCSG follow the format:
540+
// <pgs-name>-<pgs-index>-<pcsg-name>-<pcsg-index>-<pclq-name>-<random>
541+
//
542+
// Pod names that do not belong to a PCSG follow the format:
543+
// <pgs-name>-<pgs-index>-<pclq-name>-<random>
544+
//
545+
// Constraints:
546+
// - Random string + hyphens: 10 chars for PCSG pods, 8 chars for non-PCSG pods
547+
// - Max sum of all resource name characters: 45 chars
548+
func validatePodNameConstraints(pgsName, pcsgName, pclqName string) error {
549+
// Check resource name constraints
550+
resourceNameLength := len(pgsName) + len(pclqName)
551+
if pcsgName != "" {
552+
resourceNameLength += len(pcsgName)
553+
}
554+
555+
if resourceNameLength > maxCombinedResourceNameLength {
556+
if pcsgName != "" {
557+
return fmt.Errorf("combined resource name length %d exceeds 45-character limit required for pod naming. Consider shortening: PodGangSet '%s', PodCliqueScalingGroup '%s', or PodClique '%s'",
558+
resourceNameLength, pgsName, pcsgName, pclqName)
559+
}
560+
return fmt.Errorf("combined resource name length %d exceeds 45-character limit required for pod naming. Consider shortening: PodGangSet '%s' or PodClique '%s'",
561+
resourceNameLength, pgsName, pclqName)
562+
}
563+
return nil
564+
}

0 commit comments

Comments
 (0)