Skip to content

Commit 171a18e

Browse files
committed
docs: document NoExecute taint risks and add admission warning
restructure docs; update taint key naming instructions; update registry url; simplify installation methods; refactor based on PR feedback
1 parent 354b151 commit 171a18e

File tree

7 files changed

+303
-143
lines changed

7 files changed

+303
-143
lines changed

api/v1alpha1/nodereadinessrule_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ type NodeReadinessRuleSpec struct {
6969
// taint defines the specific Taint (Key, Value, and Effect) to be managed
7070
// on Nodes that meet the defined condition criteria.
7171
//
72+
// Supported effects: NoSchedule, PreferNoSchedule, NoExecute.
73+
// Caution: NoExecute evicts existing pods and can cause significant disruption
74+
// when combined with continuous enforcement mode. Prefer NoSchedule for most use cases.
75+
//
7276
// +required
7377
Taint corev1.Taint `json:"taint,omitempty,omitzero"`
7478

config/crd/bases/readiness.node.x-k8s.io_nodereadinessrules.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ spec:
157157
description: |-
158158
taint defines the specific Taint (Key, Value, and Effect) to be managed
159159
on Nodes that meet the defined condition criteria.
160+
161+
Supported effects: NoSchedule, PreferNoSchedule, NoExecute.
162+
Caution: NoExecute evicts existing pods and can cause significant disruption
163+
when combined with continuous enforcement mode. Prefer NoSchedule for most use cases.
160164
properties:
161165
effect:
162166
description: |-

docs/book/src/SUMMARY.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
- [Core Concepts](./user-guide/concepts.md)
88
- [Installation](./user-guide/installation.md)
9-
<!-- - [Creating Rules](./user-guide/creating-rules.md) -->
9+
- [Getting Started](./user-guide/getting-started.md)
1010
<!-- - [Quickstart](./user-guide/quickstart.md) -- [TODO] How to run the Calico example in KIND cluster -->
1111

1212
# Examples
Lines changed: 87 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11

22
## Getting Started
33

4+
This guide covers creating and configuring `NodeReadinessRule` resources.
5+
6+
> **Prerequisites**: Node Readiness Controller must be installed. See [Installation](./installation.md).
7+
48
### API Spec
59

610
#### Example: Storage Readiness Rule (Bootstrap-only)
@@ -11,17 +15,18 @@ This rule ensures nodes have working storage before removing the storage readine
1115
apiVersion: readiness.node.x-k8s.io/v1alpha1
1216
kind: NodeReadinessRule
1317
metadata:
14-
name: storage-readiness-rule
18+
name: nfs-storage-readiness-rule
1519
spec:
16-
conditions:
17-
- type: "storage.kubernetes.io/CSIReady"
18-
requiredStatus: "True"
19-
- type: "storage.kubernetes.io/VolumePluginReady"
20-
requiredStatus: "True"
20+
conditions:
21+
- type: "csi.example.net/NodePluginRegistered"
22+
requiredStatus: "True"
23+
- type: "csi.example.net/BackendReachable"
24+
requiredStatus: "True"
25+
- type: "DiskPressure"
26+
requiredStatus: "False"
2127
taint:
22-
key: "readiness.k8s.io/StorageReady"
28+
key: "readiness.k8s.io/vendor.com/nfs-unhealthy"
2329
effect: "NoSchedule"
24-
value: "pending"
2530
enforcementMode: "bootstrap-only"
2631
nodeSelector:
2732
matchLabels:
@@ -43,82 +48,19 @@ spec:
4348
| `nodeSelector` | Label selector to target specific nodes | No |
4449
| `dryRun` | Preview changes without applying them | No |
4550

46-
### Deployment
47-
48-
#### Option 1: Install official release images
49-
50-
Node-Readiness Controller offers two variants of the container image to support different cluster architectures.
51-
52-
Released container images are available for:
53-
* **x86_64** (AMD64)
54-
* **Arm64** (AArch64)
55-
56-
The controller image is available in the Kubernetes staging registry:
57-
58-
```sh
59-
REPO="us-central1-docker.pkg.dev/k8s-staging-images/node-readiness-controller/node-readiness-controller"
60-
61-
TAG=$(skopeo list-tags docker://$REPO | jq .Tags[-1] | tr -d '"')
62-
63-
docker pull $REPO:$TAG
64-
```
65-
66-
#### Option 2: Deploy Using Make Commands
67-
68-
**Build and push your image to the location specified by `IMG_PREFIX`:`IMG_TAG` :**
69-
70-
```sh
71-
make docker-build docker-push IMG_PREFIX=<some-registry>/nrr-controller IMG_TAG=tag
72-
```
73-
74-
```sh
75-
# Install the CRDs
76-
make install
77-
78-
# Deploy the controller
79-
make deploy IMG_PREFIX=<some-registry>/nrr-controller IMG_TAG=tag
80-
81-
# Create sample rules
82-
kubectl apply -k examples/network-readiness-rule.yaml
83-
```
84-
85-
#### Option 3: Deploy Using Kustomize Directly
86-
87-
```sh
88-
# Install CRDs
89-
kubectl apply -k config/crd
90-
91-
# Deploy controller and RBAC
92-
kubectl apply -k config/default
93-
94-
# Create sample rules
95-
kubectl apply -f examples/network-readiness-rule.yaml
96-
```
97-
98-
### Uninstallation
99-
100-
> **Important**: Follow this order to avoid stuck resources due to finalizers.
101-
102-
The controller adds a finalizer (`readiness.node.x-k8s.io/cleanup-taints`) to each `NodeReadinessRule` to ensure node taints are cleaned up before the rule is deleted. This means you must delete CRs **while the controller is still running**.
103-
104-
```sh
105-
# 1. Delete all rule instances first (while controller is running)
106-
kubectl delete nodereadinessrules --all
107-
108-
# 2. Delete the controller
109-
make undeploy
110-
111-
# 3. Delete the CRDs
112-
make uninstall
113-
```
114-
115-
#### Recovering from Stuck Resources
51+
### Enforcement Modes
11652

117-
If you deleted the controller before removing the CRs, the finalizer will block CR deletion. To recover, manually remove the finalizer:
53+
#### Bootstrap-only Mode
54+
- Removes bootstrap taint when conditions are first satisfied
55+
- Marks completion with node annotation
56+
- Stops monitoring after successful removal (fail-safe)
57+
- Ideal for one-time setup conditions (installing node daemons, e.g., a security agent, or applying a kernel-module update)
11858

119-
```sh
120-
kubectl patch nodereadinessrule <rule-name> -p '{"metadata":{"finalizers":[]}}' --type=merge
121-
```
59+
#### Continuous Mode
60+
- Continuously monitors conditions
61+
- Adds taint when any condition becomes unsatisfied
62+
- Removes taint when all conditions become satisfied
63+
- Ideal for ongoing health monitoring (network connectivity, resource availability)
12264

12365
## Operations
12466

@@ -151,7 +93,7 @@ Test rules safely before applying:
15193
spec:
15294
dryRun: true # Enable dry run mode
15395
conditions:
154-
- type: "storage.kubernetes.io/CSIReady"
96+
- type: "csi.example.net/NodePluginRegistered"
15597
requiredStatus: "True"
15698
# ... rest of spec
15799
```
@@ -162,28 +104,73 @@ Check dry run results:
162104
kubectl get nodereadinessrule <rule-name> -o jsonpath='{.status.dryRunResults}'
163105
```
164106

165-
### Enforcement Modes
107+
### Rule Validation and Constraints
166108

167-
#### Bootstrap-only Mode
168-
- Removes bootstrap taint when conditions are first satisfied
169-
- Marks completion with node annotation
170-
- Stops monitoring after successful removal (fail-safe)
171-
- Ideal for one-time setup conditions (storage, installing node daemons e.g: security agent or kernel-module update)
109+
#### NoExecute Taint Effect Warning
172110

173-
#### Continuous Mode
174-
- Continuously monitors conditions
175-
- Adds taint when any condition becomes unsatisfied
176-
- Removes taint when all conditions become satisfied
177-
- Ideal for ongoing health monitoring (network connectivity, resource availability)
111+
**`NoExecute` with `continuous` enforcement mode will evict existing workloads when conditions fail.**
178112

179-
## Configuration
113+
If a critical component becomes temporarily unavailable (e.g., CNI daemon restart), all pods without matching tolerations are immediately evicted from the node. Use `NoSchedule` to prevent new scheduling without disrupting running workloads.
114+
115+
The admission webhook warns when using `NoExecute`.
116+
117+
See [Kubernetes taints documentation](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) for taint behavior details.
118+
119+
#### Avoiding Taint Key Conflicts
120+
121+
The admission webhook prevents multiple rules from using the same `taint.key` and `taint.effect` on overlapping node selectors.
122+
123+
**Example conflict:**
124+
```yaml
125+
# Rule 1
126+
spec:
127+
conditions:
128+
- type: "device.gpu-vendor.net/DevicePluginRegistered"
129+
requiredStatus: "True"
130+
nodeSelector:
131+
matchLabels:
132+
feature.node.kubernetes.io/pci-10de.present: "true"
133+
taint:
134+
key: "readiness.k8s.io/vendor.com/gpu-not-ready"
135+
effect: "NoSchedule"
136+
137+
# Rule 2 - This will be REJECTED
138+
spec:
139+
conditions:
140+
- type: "cniplugin.example.net/rdma/NetworkReady"
141+
requiredStatus: "True"
142+
nodeSelector:
143+
matchLabels:
144+
feature.node.kubernetes.io/pci-10de.present: "true"
145+
taint:
146+
key: "readiness.k8s.io/vendor.com/gpu-not-ready" # Same (taint-key + effect) but different conditions = conflict
147+
effect: "NoSchedule"
148+
```
149+
150+
Use unique, descriptive taint keys for different readiness checks.
151+
152+
#### Taint Key Naming
153+
154+
Follow [Kubernetes naming conventions](https://kubernetes.io/docs/concepts/overview/working-with-objects/names/).
155+
156+
Taint keys must have the `readiness.k8s.io/` prefix to clearly identify readiness-related taints and avoid conflicts with other controllers.
180157

181-
### Security
158+
**Valid:**
159+
```yaml
160+
taint:
161+
key: "readiness.k8s.io/vendor.com/network-not-ready"
162+
key: "readiness.k8s.io/vendor.com/gpu-not-ready"
163+
```
182164

183-
The controller requires the following RBAC permissions:
184-
- **Nodes**: `get`, `list`, `watch`, `patch`, `update` (for taint management)
185-
- **NodeReadinessRules**: Full CRUD access
186-
- **Events**: `create` (for status reporting)
165+
**Invalid:**
166+
```yaml
167+
taint:
168+
key: "network-ready" # Missing prefix
169+
key: "node.kubernetes.io/ready" # Wrong prefix
170+
```
171+
172+
173+
## Configuration
187174

188175
### Performance and Scalability
189176

@@ -211,38 +198,3 @@ conditions:
211198
- type: "readiness.k8s.io/mycompany.example.com/CacheWarmed"
212199
requiredStatus: "True"
213200
```
214-
215-
#### With Cluster Autoscaler
216-
NodeReadinessController work well with cluster autoscaling:
217-
- New nodes start with restrictive taints
218-
- Controller removes taints once conditions are satisfied
219-
- Autoscaler can safely scale knowing nodes are truly ready
220-
221-
## Development
222-
223-
### Building from Source
224-
225-
```sh
226-
# Clone the repository
227-
git clone https://sigs.k8s.io/node-readiness-controller.git
228-
cd node-readiness-controller
229-
230-
# Run tests
231-
make test
232-
233-
# Build binary
234-
make build
235-
236-
# Generate manifests
237-
make manifests
238-
```
239-
240-
### Running Locally
241-
242-
```sh
243-
# Install CRDs
244-
make install
245-
246-
# Run against cluster (requires KUBECONFIG)
247-
make run
248-
```

docs/book/src/user-guide/installation.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ Follow this guide to install the Node Readiness Controller in your Kubernetes cl
66

77
### Option 1: Install Official Release (Recommended)
88

9-
The easiest way to get started is by applying the official release manifests.
10-
119
First, to install the CRDs, apply the `crds.yaml` manifest:
1210

1311
```sh
@@ -34,11 +32,16 @@ If it gets evicted during resource pressure, nodes can't transition to Ready sta
3432

3533
This is the priority class used by other critical cluster components (eg: core-dns).
3634

37-
**Images**: The official releases use multi-arch images (AMD64, Arm64).
35+
#### Images
3836

39-
### Option 2: Deploy Using Kustomize
37+
The official releases use multi-arch images (AMD64, Arm64) and are available at `registry.k8s.io/node-readiness-controller/node-readiness-controller`.
4038

41-
If you have cloned the repository and want to deploy from source, you can use Kustomize.
39+
```sh
40+
REPO="registry.k8s.io/node-readiness-controller/node-readiness-controller"
41+
TAG=$(skopeo list-tags docker://$REPO | jq -r '.Tags[-1]')
42+
docker pull $REPO:$TAG
43+
```
44+
### Option 2: Deploy Using Kustomize
4245

4346
```sh
4447
# 1. Install Custom Resource Definitions (CRDs)

internal/webhook/nodereadinessgaterule_webhook.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222

23+
corev1 "k8s.io/api/core/v1"
2324
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2425
"k8s.io/apimachinery/pkg/runtime"
2526
"k8s.io/apimachinery/pkg/util/validation/field"
@@ -165,6 +166,35 @@ func (w *NodeReadinessRuleWebhook) nodSelectorsOverlap(selector1, selector2 meta
165166
return sel1.String() == sel2.String()
166167
}
167168

169+
// generateNoExecuteWarnings generates admission warnings for NoExecute taint usage.
170+
// NoExecute taints cause immediate pod eviction, which can be disruptive when
171+
// used with continuous enforcement mode.
172+
func (w *NodeReadinessRuleWebhook) generateNoExecuteWarnings(spec readinessv1alpha1.NodeReadinessRuleSpec) admission.Warnings {
173+
var warnings admission.Warnings
174+
175+
if spec.Taint.Effect != corev1.TaintEffectNoExecute {
176+
return warnings
177+
}
178+
179+
// NoExecute with continuous mode is particularly risky
180+
if spec.EnforcementMode == readinessv1alpha1.EnforcementModeContinuous {
181+
warnings = append(warnings,
182+
"CAUTION: Using NoExecute taint effect with continuous enforcement mode. "+
183+
"This configuration will evict existing pods when conditions fail, which may cause "+
184+
"significant workload disruption if conditions are unstable. Consider using NoSchedule "+
185+
"effect instead, or bootstrap-only enforcement mode. "+
186+
"See: https://node-readiness-controller.sigs.k8s.io/user-guide/getting-started.html")
187+
} else {
188+
// NoExecute with bootstrap-only is less risky but still worth noting
189+
warnings = append(warnings,
190+
"NOTE: Using NoExecute taint effect. This will evict existing pods that do not "+
191+
"tolerate this taint when applied. Ensure critical system pods have appropriate tolerations. "+
192+
"See: https://node-readiness-controller.sigs.k8s.io/user-guide/getting-started.html")
193+
}
194+
195+
return warnings
196+
}
197+
168198
// SetupWithManager sets up the webhook with the manager.
169199
func (w *NodeReadinessRuleWebhook) SetupWithManager(mgr ctrl.Manager) error {
170200
return ctrl.NewWebhookManagedBy(mgr).
@@ -185,7 +215,10 @@ func (w *NodeReadinessRuleWebhook) ValidateCreate(ctx context.Context, obj runti
185215
if allErrs := w.validateNodeReadinessRule(ctx, rule, false); len(allErrs) > 0 {
186216
return nil, fmt.Errorf("validation failed: %v", allErrs)
187217
}
188-
return nil, nil
218+
219+
// Generate warnings for NoExecute taint usage
220+
warnings := w.generateNoExecuteWarnings(rule.Spec)
221+
return warnings, nil
189222
}
190223

191224
func (w *NodeReadinessRuleWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
@@ -197,7 +230,10 @@ func (w *NodeReadinessRuleWebhook) ValidateUpdate(ctx context.Context, oldObj, n
197230
if allErrs := w.validateNodeReadinessRule(ctx, rule, true); len(allErrs) > 0 {
198231
return nil, fmt.Errorf("validation failed: %v", allErrs)
199232
}
200-
return nil, nil
233+
234+
// Generate warnings for NoExecute taint usage
235+
warnings := w.generateNoExecuteWarnings(rule.Spec)
236+
return warnings, nil
201237
}
202238

203239
func (w *NodeReadinessRuleWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {

0 commit comments

Comments
 (0)