Skip to content

Commit fd43299

Browse files
committed
feat: add support for workload-gate and workload-selector
Mainly for use with Skyhook and skyhook-customizations to allow proper scheduling around workloads in day-2 operations
1 parent 0463e2d commit fd43299

File tree

13 files changed

+837
-2
lines changed

13 files changed

+837
-2
lines changed

docs/user/cli-reference.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,8 @@ eidos bundle [flags]
678678
| `--system-node-toleration` | | string[] | Toleration for system components (format: key=value:effect, repeatable) |
679679
| `--accelerated-node-selector` | | string[] | Node selector for accelerated/GPU nodes (format: key=value, repeatable) |
680680
| `--accelerated-node-toleration` | | string[] | Toleration for accelerated/GPU nodes (format: key=value:effect, repeatable) |
681+
| `--workload-gate` | | string | Taint used by skyhook-operator's runtime-required feature (format: key=value:effect or key:effect). This is a day 2 option for cluster scaling operations. |
682+
| `--workload-selector` | | string[] | Label selector for skyhook-customizations to prevent eviction of running training jobs (format: key=value, repeatable). Recommended when skyhook-customizations is enabled with training intent; a warning is emitted if it is not set. |
681683

682684
**Behavior:**
683685
- All components from the recipe are bundled automatically
@@ -768,6 +770,12 @@ eidos bundle -r recipe.yaml \
768770
--accelerated-node-toleration nvidia.com/gpu=present:NoSchedule \
769771
-o ./bundles
770772
773+
# Day 2 options: workload-gate and workload-selector for skyhook
774+
eidos bundle -r recipe.yaml \
775+
--workload-gate skyhook.io/runtime-required=true:NoSchedule \
776+
--workload-selector workload-type=training \
777+
-o ./bundles
778+
771779
# Generate ArgoCD Application manifests for GitOps
772780
eidos bundle -r recipe.yaml --deployer argocd -o ./bundles
773781
@@ -816,6 +824,48 @@ bundles/
816824
└── README.md # ArgoCD deployment guide
817825
```
818826
827+
**Day 2 Options:**
828+
829+
The `--workload-gate` and `--workload-selector` flags are day 2 operational options for cluster scaling operations:
830+
831+
- **`--workload-gate`**: Specifies a taint for skyhook-operator's runtime required feature. This ensures nodes are properly configured before workloads can schedule on them during cluster scaling. The taint is configured in the skyhook-operator Helm values file at `controllerManager.manager.env.runtimeRequiredTaint`. For more information about runtime required, see the [skyhook documentation](https://github.com/NVIDIA/skyhook/blob/main/docs/runtime_required.md).
832+
833+
- **`--workload-selector`**: Specifies a label selector for skyhook-customizations to prevent skyhook from evicting running training jobs. This is critical for training workloads where job eviction would cause significant disruption. The selector is set in the Skyhook CR manifest (tuning.yaml) in the `spec.workloadSelector.matchLabels` field.
834+
835+
**Validation Warnings:**
836+
837+
When generating bundles with skyhook-customizations enabled, validation warnings are displayed for missing configuration:
838+
839+
1. **Workload Selector Warning**: When skyhook-customizations is enabled with training intent, if `--workload-selector` is not set, a warning will be displayed:
840+
841+
```
842+
Warning: skyhook-customizations is enabled with training intent but --workload-selector is not set.
843+
This may cause skyhook to evict running training jobs. Consider setting --workload-selector to prevent eviction.
844+
```
845+
846+
2. **Accelerated Selector Warning**: When skyhook-customizations is enabled with training or inference intent, if `--accelerated-node-selector` is not set, a warning will be displayed:
847+
848+
```
849+
Warning: skyhook-customizations is enabled with {training|inference} intent but --accelerated-node-selector is not set.
850+
Without this selector, the customization will run on all nodes. Consider setting --accelerated-node-selector to target specific nodes.
851+
```
852+
853+
**Examples:**
854+
```shell
855+
# Generate bundle with day 2 options for training workloads
856+
eidos bundle -r recipe.yaml \
857+
--workload-gate skyhook.io/runtime-required=true:NoSchedule \
858+
--workload-selector workload-type=training \
859+
--workload-selector intent=training \
860+
--accelerated-node-selector accelerator=nvidia-h100 \
861+
-o ./bundles
862+
863+
# Generate bundle for inference workloads with accelerated selector
864+
eidos bundle -r recipe.yaml \
865+
--accelerated-node-selector accelerator=nvidia-h100 \
866+
-o ./bundles
867+
```
868+
819869
ArgoCD Applications use multi-source to:
820870
1. Pull Helm charts from upstream repositories
821871
2. Apply values.yaml from your GitOps repository

pkg/bundler/bundler.go

Lines changed: 126 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ type DefaultBundler struct {
5050
// AllowLists defines which criteria values are permitted for bundle requests.
5151
// When set, the bundler validates that the recipe's criteria are within the allowed values.
5252
AllowLists *recipe.AllowLists
53+
54+
// warnings stores warning messages to be added to deployment notes.
55+
warnings []string
5356
}
5457

5558
// Option defines a functional option for configuring DefaultBundler.
@@ -160,6 +163,12 @@ func (b *DefaultBundler) Make(ctx context.Context, input recipe.RecipeInput, dir
160163
"failed to extract component values", err)
161164
}
162165

166+
// Validate workload-selector for skyhook-customizations with training intent
167+
b.validateWorkloadSelector(recipeResult)
168+
169+
// Validate accelerated selector for skyhook-customizations with training/inference intent
170+
b.validateAcceleratedSelector(recipeResult)
171+
163172
// Route based on deployer
164173
deployer := b.Config.Deployer()
165174
if deployer == config.DeployerArgoCD {
@@ -226,9 +235,14 @@ func (b *DefaultBundler) makeHelmBundle(ctx context.Context, recipeResult *recip
226235
resultOutput.Results = append(resultOutput.Results, helmResult)
227236

228237
// Populate deployment info from generator output
238+
notes := make([]string, 0)
239+
if len(b.warnings) > 0 {
240+
notes = append(notes, b.warnings...)
241+
}
229242
resultOutput.Deployment = &result.DeploymentInfo{
230243
Type: "Helm per-component bundle",
231244
Steps: output.DeploymentSteps,
245+
Notes: notes,
232246
}
233247

234248
slog.Debug("helm bundle generation complete",
@@ -284,10 +298,17 @@ func (b *DefaultBundler) makeArgoCD(ctx context.Context, recipeResult *recipe.Re
284298
resultOutput.Results = append(resultOutput.Results, argocdResult)
285299

286300
// Populate deployment info from generator output
301+
notes := make([]string, 0)
302+
if len(output.DeploymentNotes) > 0 {
303+
notes = append(notes, output.DeploymentNotes...)
304+
}
305+
if len(b.warnings) > 0 {
306+
notes = append(notes, b.warnings...)
307+
}
287308
resultOutput.Deployment = &result.DeploymentInfo{
288309
Type: "ArgoCD applications",
289310
Steps: output.DeploymentSteps,
290-
Notes: output.DeploymentNotes,
311+
Notes: notes,
291312
}
292313

293314
slog.Debug("argocd applications generation complete",
@@ -329,7 +350,7 @@ func (b *DefaultBundler) extractComponentValues(ctx context.Context, recipeResul
329350
}
330351
}
331352

332-
// Apply node selectors and tolerations based on component type
353+
// Apply node selectors, tolerations, workload selector, and taints based on component type
333354
b.applyNodeSchedulingOverrides(ref.Name, values)
334355

335356
componentValues[ref.Name] = values
@@ -433,6 +454,109 @@ func (b *DefaultBundler) applyNodeSchedulingOverrides(componentName string, valu
433454
component.ApplyTolerationsOverrides(values, tolerations, paths...)
434455
}
435456
}
457+
458+
// Apply workload selector (for components like skyhook-customizations)
459+
if workloadSelector := b.Config.WorkloadSelector(); len(workloadSelector) > 0 {
460+
if paths := comp.GetAcceleratedWorkloadSelectorPaths(); len(paths) > 0 {
461+
component.ApplyNodeSelectorOverrides(values, workloadSelector, paths...)
462+
}
463+
}
464+
465+
// Apply workload-gate taint (as string format for skyhook-operator)
466+
if taint := b.Config.WorkloadGateTaint(); taint != nil {
467+
if paths := comp.GetAcceleratedTaintStrPaths(); len(paths) > 0 {
468+
taintStr := taint.ToString()
469+
overrides := make(map[string]string, len(paths))
470+
for _, path := range paths {
471+
overrides[path] = taintStr
472+
}
473+
if err := component.ApplyMapOverrides(values, overrides); err != nil {
474+
slog.Warn("failed to apply workload-gate taint",
475+
"component", componentName,
476+
"error", err,
477+
)
478+
}
479+
}
480+
}
481+
}
482+
483+
// validateWorkloadSelector validates that workload-selector is set when skyhook-customizations
484+
// is present with training intent.
485+
func (b *DefaultBundler) validateWorkloadSelector(recipeResult *recipe.RecipeResult) {
486+
if b.Config == nil {
487+
return
488+
}
489+
490+
// Check if skyhook-customizations component exists
491+
hasSkyhookCustomizations := false
492+
for _, ref := range recipeResult.ComponentRefs {
493+
if ref.Name == "skyhook-customizations" {
494+
hasSkyhookCustomizations = true
495+
break
496+
}
497+
}
498+
499+
if !hasSkyhookCustomizations {
500+
return
501+
}
502+
503+
// Check if intent is training
504+
if recipeResult.Criteria == nil || recipeResult.Criteria.Intent != recipe.CriteriaIntentTraining {
505+
return
506+
}
507+
508+
// Check if workload-selector is not set
509+
selector := b.Config.WorkloadSelector()
510+
if len(selector) == 0 {
511+
slog.Warn("skyhook-customizations is enabled with training intent but --workload-selector is not set",
512+
"component", "skyhook-customizations",
513+
"intent", "training",
514+
)
515+
// Store warning to be added to deployment notes
516+
b.warnings = append(b.warnings, "Warning: skyhook-customizations is enabled with training intent but --workload-selector is not set. This may cause skyhook to evict running training jobs. Consider setting --workload-selector to prevent eviction.")
517+
}
518+
}
519+
520+
// validateAcceleratedSelector validates that accelerated-node-selector is set when skyhook-customizations
521+
// is present with training or inference intent.
522+
func (b *DefaultBundler) validateAcceleratedSelector(recipeResult *recipe.RecipeResult) {
523+
if b.Config == nil {
524+
return
525+
}
526+
527+
// Check if skyhook-customizations component exists
528+
hasSkyhookCustomizations := false
529+
for _, ref := range recipeResult.ComponentRefs {
530+
if ref.Name == "skyhook-customizations" {
531+
hasSkyhookCustomizations = true
532+
break
533+
}
534+
}
535+
536+
if !hasSkyhookCustomizations {
537+
return
538+
}
539+
540+
// Check if intent is training or inference
541+
if recipeResult.Criteria == nil {
542+
return
543+
}
544+
intent := recipeResult.Criteria.Intent
545+
if intent != recipe.CriteriaIntentTraining && intent != recipe.CriteriaIntentInference {
546+
return
547+
}
548+
549+
// Check if accelerated-node-selector is not set
550+
selector := b.Config.AcceleratedNodeSelector()
551+
if len(selector) == 0 {
552+
slog.Warn("skyhook-customizations is enabled with training/inference intent but --accelerated-node-selector is not set",
553+
"component", "skyhook-customizations",
554+
"intent", intent,
555+
)
556+
// Store warning to be added to deployment notes
557+
warningMsg := fmt.Sprintf("Warning: skyhook-customizations is enabled with %s intent but --accelerated-node-selector is not set. Without this selector, the customization will run on all nodes. Consider setting --accelerated-node-selector to target specific nodes.", intent)
558+
b.warnings = append(b.warnings, warningMsg)
559+
}
436560
}
437561

438562
// writeRecipeFile serializes the recipe to the bundle directory.

0 commit comments

Comments
 (0)