Skip to content

Commit bc54e66

Browse files
committed
feat: add pod name validation to PodGangSet admission webhook
- Implement validation for pod name length constraints (45 chars max) - Add separate validation for PodClique templates not in scaling groups - Ensure total pod names stay under 64 characters for annotation support - Add comprehensive test coverage for validation scenarios - Handle nil pointer cases and improve error messages Signed-off-by: Ron Kahn <rkahn@nvidia.com>
1 parent b1be700 commit bc54e66

File tree

3 files changed

+663
-10
lines changed

3 files changed

+663
-10
lines changed

CLAUDE.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Claude Rules
2+
3+
## Security Guidelines
4+
- ONLY assist with defensive security tasks
5+
- REFUSE to create, modify, or improve code that may be used maliciously
6+
- ALLOW security analysis, detection rules, vulnerability explanations, defensive tools, and security documentation
7+
8+
## Git Commits
9+
- Always sign commits with `-s` flag
10+
- Use conventional commit format
11+
- Dont add `claude:` prefix to commit messages
12+
- Dont co-author commits with `claude:` prefix
13+
- NEVER add "🤖 Generated with [Claude Code](https://claude.ai/code)" to commit messages
14+
- NEVER add "Co-Authored-By: Claude <noreply@anthropic.com>" to commit messages
15+
- Keep commit messages under 72 characters for first line
16+
- Always check git status and diff before committing
17+
- Add all relevant files to staging area before commit
18+
19+
## Testing Requirements
20+
- MUST run tests after making code changes
21+
- Use `go test ./...` for full test suite
22+
- Focus on running tests that cover the modified code
23+
- Prefer running the smallest set of tests that validate the change
24+
- Use `go test -run <TestName>` to run specific tests
25+
- Use `go test -cover` to check test coverage
26+
- NEVER assume tests pass without running them
27+
- Fix any failing tests before committing
28+
29+
## Code Standards
30+
- Follow existing Go code patterns and conventions
31+
- Prefer editing existing files over creating new ones
32+
- NEVER proactively create documentation files unless explicitly requested
33+
- Add comments only when explicitly requested by user
34+
- Use existing imports and libraries - check package.json/go.mod first
35+
36+
## Grove Operator Specific Rules
37+
- Validate pod naming constraints with 45-character limit for resource names
38+
- Only validate PodClique templates separately if they don't belong to scaling groups
39+
- Ensure total pod names stay under 64 characters for annotation support
40+
- Test both PodClique template and scaling group validation scenarios
41+
42+
## Task Management
43+
- Use TodoWrite tool for complex multi-step tasks (3+ steps)
44+
- Mark tasks as in_progress before starting work
45+
- Mark tasks as completed immediately after finishing
46+
- Only have ONE task in_progress at a time
47+
- Create specific, actionable todo items
48+
49+
## Error Handling
50+
- Check for compilation errors after code changes
51+
- Validate that all imports are available in the codebase
52+
- Use proper error field paths in validation functions
53+
- Handle both individual and aggregate errors appropriately
54+
55+
## Performance Guidelines
56+
- Move expensive operations outside loops when possible
57+
- Cache results of helper methods like `getScalingGroupCliqueNames()`
58+
- Use efficient data structures (sets.Set for lookups)
59+
60+
## Response Style
61+
- Be concise and direct
62+
- Keep responses under 4 lines unless detail is requested
63+
- Avoid unnecessary preamble or explanations
64+
- Use tools to show results rather than describing what will be done

operator/internal/webhook/admission/pgs/validation/podgangset.go

Lines changed: 94 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@
1717
package validation
1818

1919
import (
20+
"fmt"
2021
"reflect"
2122
"slices"
2223
"strings"
2324

2425
grovecorev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
2526
"github.com/NVIDIA/grove/operator/internal/utils"
26-
2727
"github.com/samber/lo"
2828
admissionv1 "k8s.io/api/admission/v1"
2929
corev1 "k8s.io/api/core/v1"
@@ -33,6 +33,10 @@ import (
3333
"k8s.io/apimachinery/pkg/util/validation/field"
3434
)
3535

36+
const (
37+
maxCombinedResourceNameLength = 45
38+
)
39+
3640
var allowedStartupTypes = sets.New(grovecorev1alpha1.CliqueStartupTypeInOrder, grovecorev1alpha1.CliqueStartupTypeAnyOrder, grovecorev1alpha1.CliqueStartupTypeExplicit)
3741

3842
type pgsValidator struct {
@@ -53,7 +57,8 @@ func newPGSValidator(pgs *grovecorev1alpha1.PodGangSet, operation admissionv1.Op
5357
func (v *pgsValidator) validate() ([]string, error) {
5458
allErrs := field.ErrorList{}
5559

56-
allErrs = append(allErrs, apivalidation.ValidateObjectMeta(&v.pgs.ObjectMeta, true, apivalidation.NameIsDNSSubdomain, field.NewPath("metadata"))...)
60+
allErrs = append(allErrs, apivalidation.ValidateObjectMeta(&v.pgs.ObjectMeta, true,
61+
apivalidation.NameIsDNSSubdomain, field.NewPath("metadata"))...)
5762
fldPath := field.NewPath("spec")
5863
warnings, errs := v.validatePodGangSetSpec(fldPath)
5964
if len(errs) != 0 {
@@ -101,10 +106,28 @@ func (v *pgsValidator) validatePodCliqueTemplates(fldPath *field.Path) ([]string
101106
allErrs = append(allErrs, field.Required(fldPath, "at least one PodClique must be defined"))
102107
}
103108

109+
// Get all clique names that belong to scaling groups
110+
scalingGroupCliqueNames := v.getScalingGroupCliqueNames()
111+
104112
cliqueNames := make([]string, 0, len(cliqueTemplateSpecs))
105113
cliqueRoles := make([]string, 0, len(cliqueTemplateSpecs))
106114
schedulerNames := make([]string, 0, len(cliqueTemplateSpecs))
107115
for _, cliqueTemplateSpec := range cliqueTemplateSpecs {
116+
if err := apivalidation.NameIsDNSSubdomain(cliqueTemplateSpec.Name, false); err != nil {
117+
allErrs = append(allErrs, field.Invalid(fldPath.Child("name"), cliqueTemplateSpec.Name,
118+
"invalid PodCliqueTemplateSpec name, must be a valid DNS subdomain"))
119+
}
120+
121+
// Only validate pod name constraints for PodCliques that are NOT part of any scaling group
122+
// any pod clique that is part of scaling groups will be checked as part of scaling group pod name constraints.
123+
if !scalingGroupCliqueNames.Has(cliqueTemplateSpec.Name) {
124+
if err := validatePodNameConstraints(v.pgs.Name, "", cliqueTemplateSpec.Name); err != nil {
125+
// add error to each of filed paths that compose the podName in case of a PodCliqueTemplateSpec
126+
allErrs = append(allErrs, field.Invalid(fldPath.Child("name"), cliqueTemplateSpec.Name, err.Error()))
127+
allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), v.pgs.Name, err.Error()))
128+
}
129+
}
130+
108131
cliqueNames = append(cliqueNames, cliqueTemplateSpec.Name)
109132
cliqueRoles = append(cliqueRoles, cliqueTemplateSpec.Spec.RoleName)
110133
warns, errs := v.validatePodCliqueTemplateSpec(cliqueTemplateSpec, fldPath)
@@ -156,12 +179,19 @@ func (v *pgsValidator) validatePodCliqueScalingGroupConfigs(fldPath *field.Path)
156179
})
157180
pclqScalingGroupNames := make([]string, 0, len(v.pgs.Spec.Template.PodCliqueScalingGroupConfigs))
158181
var cliqueNamesAcrossAllScalingGroups []string
182+
groupNameFiledPath := fldPath.Child("name")
159183

160184
for _, scalingGroupConfig := range v.pgs.Spec.Template.PodCliqueScalingGroupConfigs {
185+
if err := apivalidation.NameIsDNSSubdomain(scalingGroupConfig.Name, false); err != nil {
186+
allErrs = append(allErrs, field.Invalid(groupNameFiledPath, scalingGroupConfig.Name,
187+
"invalid PodCliqueScalingGroupConfig name, must be a valid DNS subdomain"))
188+
}
161189
pclqScalingGroupNames = append(pclqScalingGroupNames, scalingGroupConfig.Name)
162190
cliqueNamesAcrossAllScalingGroups = append(cliqueNamesAcrossAllScalingGroups, scalingGroupConfig.CliqueNames...)
163191
// validate that scaling groups only contains clique names that are defined in the PodGangSet.
164-
allErrs = append(allErrs, validateScalingGroupPodCliqueNames(allPodGangSetCliqueNames, scalingGroupConfig.CliqueNames, fldPath.Child("cliqueNames"))...)
192+
allErrs = append(allErrs, v.validateScalingGroupPodCliqueNames(scalingGroupConfig.Name, allPodGangSetCliqueNames,
193+
scalingGroupConfig.CliqueNames, fldPath.Child("cliqueNames"), groupNameFiledPath)...)
194+
165195
}
166196

167197
// validate that the scaling group names are unique
@@ -280,15 +310,36 @@ func (v *pgsValidator) checkNetworkPackGroupConfigsForPartialPCSGInclusions(fldP
280310
return allErrs
281311
}
282312

313+
// getScalingGroupCliqueNames returns a set of all clique names that belong to scaling groups
314+
func (v *pgsValidator) getScalingGroupCliqueNames() sets.Set[string] {
315+
scalingGroupCliqueNames := sets.New[string]()
316+
for _, scalingGroupConfig := range v.pgs.Spec.Template.PodCliqueScalingGroupConfigs {
317+
for _, cliqueName := range scalingGroupConfig.CliqueNames {
318+
scalingGroupCliqueNames.Insert(cliqueName)
319+
}
320+
}
321+
return scalingGroupCliqueNames
322+
323+
}
324+
283325
// checks if the PodClique names specified in PodCliqueScalingGroupConfig refer to a defined clique in the PodGangSet.
284-
func validateScalingGroupPodCliqueNames(allPclqNames, pclqNameInScalingGrp []string, fldPath *field.Path) field.ErrorList {
326+
func (v *pgsValidator) validateScalingGroupPodCliqueNames(pcsgName string, allPclqNames, pclqNameInScalingGrp []string, fldPath, pcsgNameFieldPath *field.Path) field.ErrorList {
285327
allErrs := field.ErrorList{}
286328

287329
_, unidentifiedPclqNames := lo.Difference(allPclqNames, lo.Uniq(pclqNameInScalingGrp))
288330
if len(unidentifiedPclqNames) > 0 {
289331
allErrs = append(allErrs, field.Invalid(fldPath, strings.Join(unidentifiedPclqNames, ","), "unidentified PodClique names found"))
290332
}
291333

334+
// validate scaling group PodClique pods names are valid.
335+
for _, pclqName := range pclqNameInScalingGrp {
336+
if err := validatePodNameConstraints(v.pgs.Name, pcsgName, pclqName); err != nil {
337+
// add error to each of filed paths that compose the podName
338+
allErrs = append(allErrs, field.Invalid(fldPath.Child("name"), pclqName, err.Error()))
339+
allErrs = append(allErrs, field.Invalid(pcsgNameFieldPath, pclqName, err.Error()))
340+
allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), v.pgs.Name, err.Error()))
341+
}
342+
}
292343
return allErrs
293344
}
294345

@@ -302,12 +353,14 @@ func (v *pgsValidator) validatePodCliqueSpec(name string, cliqueSpec grovecorev1
302353
// Ideally this should never happen, the defaulting webhook will always set the default value for minAvailable.
303354
if cliqueSpec.MinAvailable == nil {
304355
allErrs = append(allErrs, field.Required(fldPath.Child("minAvailable"), "field is required"))
305-
}
306-
if *cliqueSpec.MinAvailable <= 0 {
307-
allErrs = append(allErrs, field.Invalid(fldPath.Child("minAvailable"), *cliqueSpec.MinAvailable, "must be greater than 0"))
308-
}
309-
if *cliqueSpec.MinAvailable > cliqueSpec.Replicas {
310-
allErrs = append(allErrs, field.Invalid(fldPath.Child("minAvailable"), *cliqueSpec.MinAvailable, "minAvailable must not be greater than replicas"))
356+
} else {
357+
// prevent nil pointer dereference, no point checking the value if it is nil
358+
if *cliqueSpec.MinAvailable <= 0 {
359+
allErrs = append(allErrs, field.Invalid(fldPath.Child("minAvailable"), *cliqueSpec.MinAvailable, "must be greater than 0"))
360+
}
361+
if *cliqueSpec.MinAvailable > cliqueSpec.Replicas {
362+
allErrs = append(allErrs, field.Invalid(fldPath.Child("minAvailable"), *cliqueSpec.MinAvailable, "minAvailable must not be greater than replicas"))
363+
}
311364
}
312365

313366
if v.isStartupTypeExplicit() && len(cliqueSpec.StartsAfter) > 0 {
@@ -469,3 +522,34 @@ func clearContainerImages(containers []corev1.Container) {
469522
containers[i].Image = ""
470523
}
471524
}
525+
526+
// validatePodNameConstraints validates Grove pod name component constraints.
527+
// This function validates the constraints for component names that will be used
528+
// to construct pod names.
529+
//
530+
// Pod names that belong to a PCSG follow the format:
531+
// <pgs-name>-<pgs-index>-<pcsg-name>-<pcsg-index>-<pclq-name>-<random>
532+
//
533+
// Pod names that do not belong to a PCSG follow the format:
534+
// <pgs-name>-<pgs-index>-<pclq-name>-<random>
535+
//
536+
// Constraints:
537+
// - Random string + hyphens: 10 chars for PCSG pods, 8 chars for non-PCSG pods
538+
// - Max sum of all resource name characters: 45 chars
539+
func validatePodNameConstraints(pgsName, pcsgName, pclqName string) error {
540+
// Check resource name constraints
541+
resourceNameLength := len(pgsName) + len(pclqName)
542+
if pcsgName != "" {
543+
resourceNameLength += len(pcsgName)
544+
}
545+
546+
if resourceNameLength > maxCombinedResourceNameLength {
547+
if pcsgName != "" {
548+
return fmt.Errorf("combined resource name length %d exceeds 45-character limit required for pod naming. Consider shortening: PodGangSet '%s', PodCliqueScalingGroup '%s', or PodClique '%s'",
549+
resourceNameLength, pgsName, pcsgName, pclqName)
550+
}
551+
return fmt.Errorf("combined resource name length %d exceeds 45-character limit required for pod naming. Consider shortening: PodGangSet '%s' or PodClique '%s'",
552+
resourceNameLength, pgsName, pclqName)
553+
}
554+
return nil
555+
}

0 commit comments

Comments
 (0)