Skip to content

Commit 680571c

Browse files
dirangedclaude
andauthored
refactor(config): rename startupTaintKeys to knownStartupTaintKeys (#19)
## Summary - Renames `startupTaintKeys` to `knownStartupTaintKeys` across all config, Go code, tests, Helm values, and docs - Adds detailed documentation explaining that Vigil does NOT remove these taints — they are managed by their respective controllers (CSI drivers, CNI plugins, etc.) - The setting is used solely for DaemonSet discovery: stripped during scheduling predicate evaluation so DaemonSets that don't tolerate temporary taints aren't incorrectly excluded **Breaking change:** Config key renamed from `startupTaintKeys` to `knownStartupTaintKeys` ## Test plan - [x] All unit tests pass - [x] Code compiles (including stress tests) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a1e908e commit 680571c

10 files changed

Lines changed: 59 additions & 23 deletions

File tree

charts/vigil-controller/values.yaml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,21 @@ skipConfig: false
154154
config:
155155
taintKey: "node.nextdoor.com/initializing"
156156
taintEffect: "NoSchedule"
157-
startupTaintKeys:
157+
# -- All known temporary startup taint keys used in the cluster.
158+
# Vigil only removes the taint specified by taintKey. However, new nodes often
159+
# have additional temporary taints set by other controllers (CSI drivers, CNI
160+
# plugins, etc.). Those controllers remove their own taints — Vigil does not
161+
# touch them.
162+
#
163+
# This list is used solely for DaemonSet discovery: when determining which
164+
# DaemonSets should run on a node in steady state, Vigil strips these taints
165+
# before evaluating scheduling predicates. Without this, DaemonSets that don't
166+
# tolerate these temporary taints would be incorrectly excluded from Vigil's
167+
# expected set, causing it to remove its taint too early.
168+
#
169+
# Must include taintKey itself. Add any other startup taints used by your
170+
# infrastructure (CSI drivers, CNI plugins, GPU initializers, etc.).
171+
knownStartupTaintKeys:
158172
- "node.nextdoor.com/initializing"
159173
- "cni.istio.io/not-ready"
160174
- "ebs.csi.aws.com/agent-not-ready"

config.example.yaml

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,20 @@ taintKey: "node.nextdoor.com/initializing"
77
# The taint effect to match
88
taintEffect: "NoSchedule"
99

10-
# ALL startup taint keys present in this cluster.
11-
# These are stripped from the node's taint list when evaluating DaemonSet
12-
# tolerations, so that discovery answers "will this DS run in steady state?"
13-
# rather than "can this DS tolerate the node right now?"
14-
startupTaintKeys:
10+
# All known temporary startup taint keys used in the cluster.
11+
#
12+
# Vigil only removes the taint specified by taintKey above. However, new nodes
13+
# often have additional temporary taints set by other controllers (CSI drivers,
14+
# CNI plugins, etc.). Those taints are removed by their respective controllers,
15+
# not by Vigil.
16+
#
17+
# This list is used only for DaemonSet discovery: Vigil strips these taints
18+
# before evaluating which DaemonSets should run on a node in steady state.
19+
# Without this, DaemonSets that don't tolerate these temporary taints would be
20+
# incorrectly excluded from the expected set.
21+
#
22+
# Must include taintKey itself plus any other startup taints in the cluster.
23+
knownStartupTaintKeys:
1524
- "node.nextdoor.com/initializing"
1625
- "cni.istio.io/not-ready"
1726
- "ebs.csi.aws.com/agent-not-ready"

internal/controller/integration_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ func testConfig() *config.Config {
5757
return &config.Config{
5858
TaintKey: "node.example.com/initializing",
5959
TaintEffect: "NoSchedule",
60-
StartupTaintKeys: []string{
60+
KnownStartupTaintKeys: []string{
6161
"node.example.com/initializing",
6262
},
6363
TimeoutSeconds: 120,

internal/discovery/discovery.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ func (d *DaemonSetDiscovery) ExpectedDaemonSets(ctx context.Context, node *corev
4848
}
4949

5050
// Compute steady-state taints: node taints minus all configured startup taint keys.
51-
steadyStateTaints := steadyStateTaints(node.Spec.Taints, d.config.StartupTaintKeys)
51+
steadyStateTaints := steadyStateTaints(node.Spec.Taints, d.config.KnownStartupTaintKeys)
5252

5353
// Build exclusion lookup.
5454
excludeByName := buildNameExclusionSet(d.config.ExcludeDaemonSets.ByName)

internal/discovery/discovery_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ func defaultConfig() *config.Config {
2828
return &config.Config{
2929
TaintKey: "node.example.com/initializing",
3030
TaintEffect: "NoSchedule",
31-
StartupTaintKeys: []string{
31+
KnownStartupTaintKeys: []string{
3232
"node.example.com/initializing",
3333
"cni.example.io/not-ready",
3434
},
@@ -301,7 +301,7 @@ func TestExpectedDaemonSets_NoDaemonSets(t *testing.T) {
301301
func TestExpectedDaemonSets_NoStartupTaintsConfigured(t *testing.T) {
302302
scheme := newScheme()
303303
cfg := defaultConfig()
304-
cfg.StartupTaintKeys = nil // No startup taints to strip.
304+
cfg.KnownStartupTaintKeys = nil // No startup taints to strip.
305305

306306
// Node has a taint that the DS doesn't tolerate.
307307
node := newNode(nil, []corev1.Taint{

pkg/config/config.go

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,21 @@ type Config struct {
1515
// TaintEffect is the taint effect to match.
1616
TaintEffect string `mapstructure:"taintEffect"`
1717

18-
// StartupTaintKeys is the list of ALL startup taint keys present in this cluster.
19-
// These are stripped from the node's taint list when evaluating DaemonSet
20-
// tolerations, so that discovery answers "will this DS run in steady state?"
21-
StartupTaintKeys []string `mapstructure:"startupTaintKeys"`
18+
// KnownStartupTaintKeys lists ALL temporary startup taint keys used in the cluster.
19+
//
20+
// Vigil only removes the taint specified by TaintKey. However, new nodes often
21+
// have additional temporary taints set by other controllers (e.g., CSI drivers,
22+
// CNI plugins). These taints are removed by their respective controllers — not
23+
// by Vigil.
24+
//
25+
// This list is used solely for DaemonSet discovery: when determining which
26+
// DaemonSets should run on a node in steady state, Vigil strips these taints
27+
// before evaluating scheduling predicates. Without this, DaemonSets that don't
28+
// tolerate these temporary taints would be incorrectly excluded from the
29+
// expected set.
30+
//
31+
// Must include TaintKey itself plus any other startup taints in the cluster.
32+
KnownStartupTaintKeys []string `mapstructure:"knownStartupTaintKeys"`
2233

2334
// TimeoutSeconds is the maximum time to wait before removing the taint anyway.
2435
TimeoutSeconds int `mapstructure:"timeoutSeconds"`
@@ -67,7 +78,7 @@ func NewDefault() *Config {
6778
return &Config{
6879
TaintKey: "node.nextdoor.com/initializing",
6980
TaintEffect: "NoSchedule",
70-
StartupTaintKeys: []string{
81+
KnownStartupTaintKeys: []string{
7182
"node.nextdoor.com/initializing",
7283
},
7384
TimeoutSeconds: 120,

pkg/config/config_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ func TestNewDefault(t *testing.T) {
1414
assert.Equal(t, "node.nextdoor.com/initializing", cfg.TaintKey)
1515
assert.Equal(t, "NoSchedule", cfg.TaintEffect)
1616
assert.Equal(t, 120, cfg.TimeoutSeconds)
17-
assert.Len(t, cfg.StartupTaintKeys, 1)
17+
assert.Len(t, cfg.KnownStartupTaintKeys, 1)
1818
}
1919

2020
func TestLoad_FileNotFound(t *testing.T) {
@@ -30,7 +30,7 @@ func TestLoad_ValidConfig(t *testing.T) {
3030
taintKey: "custom.taint/key"
3131
taintEffect: "NoExecute"
3232
timeoutSeconds: 60
33-
startupTaintKeys:
33+
knownStartupTaintKeys:
3434
- "custom.taint/key"
3535
- "cni.istio.io/not-ready"
3636
excludeDaemonSets:
@@ -45,7 +45,7 @@ excludeDaemonSets:
4545
assert.Equal(t, "custom.taint/key", cfg.TaintKey)
4646
assert.Equal(t, "NoExecute", cfg.TaintEffect)
4747
assert.Equal(t, 60, cfg.TimeoutSeconds)
48-
assert.Len(t, cfg.StartupTaintKeys, 2)
48+
assert.Len(t, cfg.KnownStartupTaintKeys, 2)
4949
assert.Len(t, cfg.ExcludeDaemonSets.ByName, 1)
5050
assert.Equal(t, "slow-ds", cfg.ExcludeDaemonSets.ByName[0].Name)
5151
}

test/stress/stress_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func TestStress(t *testing.T) {
103103
cfg := &config.Config{
104104
TaintKey: taintKey,
105105
TaintEffect: "NoSchedule",
106-
StartupTaintKeys: []string{
106+
KnownStartupTaintKeys: []string{
107107
taintKey,
108108
"node.kubernetes.io/not-ready", // envtest adds this (no kubelet)
109109
},

website/content/en/docs/reference/configuration.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ taintKey: "node.example.com/initializing"
1515
# The taint effect to match
1616
taintEffect: "NoSchedule"
1717

18-
# ALL startup taint keys present in this cluster
19-
startupTaintKeys:
18+
# All known temporary startup taint keys in the cluster.
19+
# Used for DaemonSet discovery only — Vigil does not remove these taints.
20+
# Their respective controllers (CSI drivers, CNI plugins) handle removal.
21+
knownStartupTaintKeys:
2022
- "node.example.com/initializing"
2123
- "cni.istio.io/not-ready"
2224
- "ebs.csi.aws.com/agent-not-ready"
@@ -48,6 +50,6 @@ excludeDaemonSets:
4850
|-------|------|---------|-------------|
4951
| `taintKey` | string | `node.nextdoor.com/initializing` | The taint key to watch and remove |
5052
| `taintEffect` | string | `NoSchedule` | The taint effect to match |
51-
| `startupTaintKeys` | []string | `[taintKey]` | All startup taint keys in the cluster |
53+
| `knownStartupTaintKeys` | []string | `[taintKey]` | All temporary startup taint keys in the cluster (used for discovery only — Vigil does not remove these) |
5254
| `timeoutSeconds` | int | `120` | Max wait time before forced taint removal |
5355
| `excludeDaemonSets.byName` | []object | `[]` | DaemonSets to exclude by namespace/name |

website/content/en/docs/reference/helm-chart.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ helm install vigil vigil/vigil-controller \
2626
| `config.taintKey` | `node.nextdoor.com/initializing` | Taint key to watch |
2727
| `config.taintEffect` | `NoSchedule` | Taint effect |
2828
| `config.timeoutSeconds` | `120` | Timeout before forced removal |
29-
| `config.startupTaintKeys` | See values.yaml | All startup taint keys |
29+
| `config.knownStartupTaintKeys` | See values.yaml | Temporary startup taints to ignore during discovery |
3030
| `serviceMonitor.enabled` | `false` | Create ServiceMonitor |
3131
| `resources.requests.cpu` | `100m` | CPU request |
3232
| `resources.requests.memory` | `64Mi` | Memory request |

0 commit comments

Comments
 (0)