Skip to content

Commit 890f970

Browse files
committed
docs: document NoExecute taint risks and add admission warning
restructure docs update taint key naming instructions
1 parent 354b151 commit 890f970

File tree

7 files changed

+303
-90
lines changed

7 files changed

+303
-90
lines changed

api/v1alpha1/nodereadinessrule_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ type NodeReadinessRuleSpec struct {
6969
// taint defines the specific Taint (Key, Value, and Effect) to be managed
7070
// on Nodes that meet the defined condition criteria.
7171
//
72+
// Supported effects: NoSchedule, PreferNoSchedule, NoExecute.
73+
// Caution: NoExecute evicts existing pods and can cause significant disruption
74+
// when combined with continuous enforcement mode. Prefer NoSchedule for most use cases.
75+
//
7276
// +required
7377
Taint corev1.Taint `json:"taint,omitempty,omitzero"`
7478

config/crd/bases/readiness.node.x-k8s.io_nodereadinessrules.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ spec:
157157
description: |-
158158
taint defines the specific Taint (Key, Value, and Effect) to be managed
159159
on Nodes that meet the defined condition criteria.
160+
161+
Supported effects: NoSchedule, PreferNoSchedule, NoExecute.
162+
Caution: NoExecute evicts existing pods and can cause significant disruption
163+
when combined with continuous enforcement mode. Prefer NoSchedule for most use cases.
160164
properties:
161165
effect:
162166
description: |-

docs/book/src/SUMMARY.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
- [Core Concepts](./user-guide/concepts.md)
88
- [Installation](./user-guide/installation.md)
9-
<!-- - [Creating Rules](./user-guide/creating-rules.md) -->
9+
- [Getting Started](./user-guide/getting-started.md)
1010
<!-- - [Quickstart](./user-guide/quickstart.md) -- [TODO] How to run the Calico example in KIND cluster -->
1111

1212
# Examples

docs/getting-started.md renamed to docs/book/src/user-guide/getting-started.md

Lines changed: 72 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11

22
## Getting Started
33

4+
This guide covers creating and configuring `NodeReadinessRule` resources.
5+
6+
> **Prerequisites**: Node Readiness Controller must be installed. See [Installation](./installation.md).
7+
48
### API Spec
59

610
#### Example: Storage Readiness Rule (Bootstrap-only)
@@ -43,82 +47,19 @@ spec:
4347
| `nodeSelector` | Label selector to target specific nodes | No |
4448
| `dryRun` | Preview changes without applying them | No |
4549

46-
### Deployment
47-
48-
#### Option 1: Install official release images
49-
50-
Node-Readiness Controller offers two variants of the container image to support different cluster architectures.
51-
52-
Released container images are available for:
53-
* **x86_64** (AMD64)
54-
* **Arm64** (AArch64)
55-
56-
The controller image is available in the Kubernetes staging registry:
57-
58-
```sh
59-
REPO="us-central1-docker.pkg.dev/k8s-staging-images/node-readiness-controller/node-readiness-controller"
60-
61-
TAG=$(skopeo list-tags docker://$REPO | jq .Tags[-1] | tr -d '"')
62-
63-
docker pull $REPO:$TAG
64-
```
65-
66-
#### Option 2: Deploy Using Make Commands
67-
68-
**Build and push your image to the location specified by `IMG_PREFIX`:`IMG_TAG` :**
69-
70-
```sh
71-
make docker-build docker-push IMG_PREFIX=<some-registry>/nrr-controller IMG_TAG=tag
72-
```
73-
74-
```sh
75-
# Install the CRDs
76-
make install
77-
78-
# Deploy the controller
79-
make deploy IMG_PREFIX=<some-registry>/nrr-controller IMG_TAG=tag
80-
81-
# Create sample rules
82-
kubectl apply -k examples/network-readiness-rule.yaml
83-
```
84-
85-
#### Option 3: Deploy Using Kustomize Directly
86-
87-
```sh
88-
# Install CRDs
89-
kubectl apply -k config/crd
90-
91-
# Deploy controller and RBAC
92-
kubectl apply -k config/default
93-
94-
# Create sample rules
95-
kubectl apply -f examples/network-readiness-rule.yaml
96-
```
97-
98-
### Uninstallation
99-
100-
> **Important**: Follow this order to avoid stuck resources due to finalizers.
101-
102-
The controller adds a finalizer (`readiness.node.x-k8s.io/cleanup-taints`) to each `NodeReadinessRule` to ensure node taints are cleaned up before the rule is deleted. This means you must delete CRs **while the controller is still running**.
103-
104-
```sh
105-
# 1. Delete all rule instances first (while controller is running)
106-
kubectl delete nodereadinessrules --all
107-
108-
# 2. Delete the controller
109-
make undeploy
110-
111-
# 3. Delete the CRDs
112-
make uninstall
113-
```
114-
115-
#### Recovering from Stuck Resources
50+
### Enforcement Modes
11651

117-
If you deleted the controller before removing the CRs, the finalizer will block CR deletion. To recover, manually remove the finalizer:
52+
#### Bootstrap-only Mode
53+
- Removes bootstrap taint when conditions are first satisfied
54+
- Marks completion with node annotation
55+
- Stops monitoring after successful removal (fail-safe)
56+
- Ideal for one-time setup conditions (storage, or installing node daemons, e.g., a security agent or kernel-module update)
11857

119-
```sh
120-
kubectl patch nodereadinessrule <rule-name> -p '{"metadata":{"finalizers":[]}}' --type=merge
121-
```
58+
#### Continuous Mode
59+
- Continuously monitors conditions
60+
- Adds taint when any condition becomes unsatisfied
61+
- Removes taint when all conditions become satisfied
62+
- Ideal for ongoing health monitoring (network connectivity, resource availability)
12263

12364
## Operations
12465

@@ -162,19 +103,65 @@ Check dry run results:
162103
kubectl get nodereadinessrule <rule-name> -o jsonpath='{.status.dryRunResults}'
163104
```
164105

165-
### Enforcement Modes
106+
### Rule Validation and Constraints
166107

167-
#### Bootstrap-only Mode
168-
- Removes bootstrap taint when conditions are first satisfied
169-
- Marks completion with node annotation
170-
- Stops monitoring after successful removal (fail-safe)
171-
- Ideal for one-time setup conditions (storage, installing node daemons e.g: security agent or kernel-module update)
108+
#### NoExecute Taint Effect Warning
109+
110+
**`NoExecute` with `continuous` enforcement mode will evict existing workloads when conditions fail.**
111+
112+
If a critical component becomes temporarily unavailable (e.g., CNI daemon restart), all pods without matching tolerations are immediately evicted from the node. Use `NoSchedule` to prevent new scheduling without disrupting running workloads.
113+
114+
The admission webhook returns a warning whenever a rule uses the `NoExecute` taint effect, in either enforcement mode.
115+
116+
See [Kubernetes taints documentation](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) for taint behavior details.
117+
118+
#### Avoiding Taint Key Conflicts
119+
120+
The admission webhook prevents multiple rules from using the same `taint.key` and `taint.effect` on overlapping node selectors.
121+
122+
**Example conflict:**
123+
```yaml
124+
# Rule 1
125+
spec:
126+
nodeSelector:
127+
matchLabels:
128+
node-role.kubernetes.io/worker: ""
129+
taint:
130+
key: "readiness.k8s.io/network"
131+
effect: "NoSchedule"
132+
133+
# Rule 2 - This will be REJECTED
134+
spec:
135+
nodeSelector:
136+
matchLabels:
137+
node-role.kubernetes.io/worker: ""
138+
taint:
139+
key: "readiness.k8s.io/network" # Same key + effect = conflict
140+
effect: "NoSchedule"
141+
```
142+
143+
Use unique, descriptive taint keys for different readiness checks.
144+
145+
#### Taint Key Naming
146+
147+
Follow [Kubernetes naming conventions](https://kubernetes.io/docs/concepts/overview/working-with-objects/names/).
148+
149+
Taint keys must have the `readiness.k8s.io/` prefix to clearly identify readiness-related taints and avoid conflicts with other controllers.
150+
151+
**Valid:**
152+
```yaml
153+
taint:
154+
key: "readiness.k8s.io/NetworkReady"
155+
key: "readiness.k8s.io/StorageReady"
156+
```
157+
158+
**Invalid:**
159+
```yaml
160+
taint:
161+
key: "network-ready" # Missing prefix
162+
key: "node.kubernetes.io/ready" # Wrong prefix
163+
```
172164

173-
#### Continuous Mode
174-
- Continuously monitors conditions
175-
- Adds taint when any condition becomes unsatisfied
176-
- Removes taint when all conditions become satisfied
177-
- Ideal for ongoing health monitoring (network connectivity, resource availability)
178165

179166
## Configuration
180167

docs/book/src/user-guide/installation.md

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,30 @@ If it gets evicted during resource pressure, nodes can't transition to Ready sta
3434

3535
This is the priority class used by other critical cluster components (e.g., core-dns).
3636

37-
**Images**: The official releases use multi-arch images (AMD64, Arm64).
37+
**Images**: The official releases use multi-arch images (AMD64, Arm64) available in the Kubernetes staging registry:
3838

39-
### Option 2: Deploy Using Kustomize
39+
```sh
40+
REPO="us-central1-docker.pkg.dev/k8s-staging-images/node-readiness-controller/node-readiness-controller"
41+
TAG=$(skopeo list-tags docker://$REPO | jq .Tags[-1] | tr -d '"')
42+
docker pull $REPO:$TAG
43+
```
44+
45+
### Option 2: Deploy Using Make Commands
46+
47+
**Build and push your image to the location specified by `IMG_PREFIX`:`IMG_TAG`:**
48+
49+
```sh
50+
# Build and push your image
51+
make docker-build docker-push IMG_PREFIX=<some-registry>/nrr-controller IMG_TAG=tag
52+
53+
# Install the CRDs
54+
make install
55+
56+
# Deploy the controller
57+
make deploy IMG_PREFIX=<some-registry>/nrr-controller IMG_TAG=tag
58+
```
59+
60+
### Option 3: Deploy Using Kustomize
4061

4162
If you have cloned the repository and want to deploy from source, you can use Kustomize.
4263

internal/webhook/nodereadinessgaterule_webhook.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222

23+
corev1 "k8s.io/api/core/v1"
2324
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2425
"k8s.io/apimachinery/pkg/runtime"
2526
"k8s.io/apimachinery/pkg/util/validation/field"
@@ -165,6 +166,35 @@ func (w *NodeReadinessRuleWebhook) nodSelectorsOverlap(selector1, selector2 meta
165166
return sel1.String() == sel2.String()
166167
}
167168

169+
// generateNoExecuteWarnings generates admission warnings for NoExecute taint usage.
170+
// NoExecute taints cause immediate pod eviction, which can be disruptive when
171+
// used with continuous enforcement mode.
172+
func (w *NodeReadinessRuleWebhook) generateNoExecuteWarnings(spec readinessv1alpha1.NodeReadinessRuleSpec) admission.Warnings {
173+
var warnings admission.Warnings
174+
175+
if spec.Taint.Effect != corev1.TaintEffectNoExecute {
176+
return warnings
177+
}
178+
179+
// NoExecute with continuous mode is particularly risky
180+
if spec.EnforcementMode == readinessv1alpha1.EnforcementModeContinuous {
181+
warnings = append(warnings,
182+
"CAUTION: Using NoExecute taint effect with continuous enforcement mode. "+
183+
"This configuration will evict existing pods when conditions fail, which may cause "+
184+
"significant workload disruption if conditions are unstable. Consider using NoSchedule "+
185+
"effect instead, or bootstrap-only enforcement mode. "+
186+
"See: https://node-readiness-controller.sigs.k8s.io/user-guide/getting-started.html")
187+
} else {
188+
// NoExecute with bootstrap-only is less risky but still worth noting
189+
warnings = append(warnings,
190+
"NOTE: Using NoExecute taint effect. This will evict existing pods that do not "+
191+
"tolerate this taint when applied. Ensure critical system pods have appropriate tolerations. "+
192+
"See: https://node-readiness-controller.sigs.k8s.io/user-guide/getting-started.html")
193+
}
194+
195+
return warnings
196+
}
197+
168198
// SetupWithManager sets up the webhook with the manager.
169199
func (w *NodeReadinessRuleWebhook) SetupWithManager(mgr ctrl.Manager) error {
170200
return ctrl.NewWebhookManagedBy(mgr).
@@ -185,7 +215,10 @@ func (w *NodeReadinessRuleWebhook) ValidateCreate(ctx context.Context, obj runti
185215
if allErrs := w.validateNodeReadinessRule(ctx, rule, false); len(allErrs) > 0 {
186216
return nil, fmt.Errorf("validation failed: %v", allErrs)
187217
}
188-
return nil, nil
218+
219+
// Generate warnings for NoExecute taint usage
220+
warnings := w.generateNoExecuteWarnings(rule.Spec)
221+
return warnings, nil
189222
}
190223

191224
func (w *NodeReadinessRuleWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
@@ -197,7 +230,10 @@ func (w *NodeReadinessRuleWebhook) ValidateUpdate(ctx context.Context, oldObj, n
197230
if allErrs := w.validateNodeReadinessRule(ctx, rule, true); len(allErrs) > 0 {
198231
return nil, fmt.Errorf("validation failed: %v", allErrs)
199232
}
200-
return nil, nil
233+
234+
// Generate warnings for NoExecute taint usage
235+
warnings := w.generateNoExecuteWarnings(rule.Spec)
236+
return warnings, nil
201237
}
202238

203239
func (w *NodeReadinessRuleWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {

0 commit comments

Comments
 (0)