Skip to content

Commit ed8a285

Browse files
committed
docs: document NoExecute taint risks and add admission warning
restructure docs update taint key naming instructions update registry url, simplify installation methods refactor based on PR feedback fix repo url minor updates
1 parent 354b151 commit ed8a285

File tree

7 files changed

+303
-143
lines changed

7 files changed

+303
-143
lines changed

api/v1alpha1/nodereadinessrule_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ type NodeReadinessRuleSpec struct {
6969
// taint defines the specific Taint (Key, Value, and Effect) to be managed
7070
// on Nodes that meet the defined condition criteria.
7171
//
72+
// Supported effects: NoSchedule, PreferNoSchedule, NoExecute.
73+
// Caution: NoExecute evicts existing pods and can cause significant disruption
74+
// when combined with continuous enforcement mode. Prefer NoSchedule for most use cases.
75+
//
7276
// +required
7377
Taint corev1.Taint `json:"taint,omitempty,omitzero"`
7478

config/crd/bases/readiness.node.x-k8s.io_nodereadinessrules.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ spec:
157157
description: |-
158158
taint defines the specific Taint (Key, Value, and Effect) to be managed
159159
on Nodes that meet the defined condition criteria.
160+
161+
Supported effects: NoSchedule, PreferNoSchedule, NoExecute.
162+
Caution: NoExecute evicts existing pods and can cause significant disruption
163+
when combined with continuous enforcement mode. Prefer NoSchedule for most use cases.
160164
properties:
161165
effect:
162166
description: |-

docs/book/src/SUMMARY.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
- [Core Concepts](./user-guide/concepts.md)
88
- [Installation](./user-guide/installation.md)
9-
<!-- - [Creating Rules](./user-guide/creating-rules.md) -->
9+
- [Getting Started](./user-guide/getting-started.md)
1010
<!-- - [Quickstart](./user-guide/quickstart.md) -- [TODO] How to run the Calico example in KIND cluster -->
1111

1212
# Examples
Lines changed: 87 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11

22
## Getting Started
33

4+
This guide covers creating and configuring `NodeReadinessRule` resources.
5+
6+
> **Prerequisites**: Node Readiness Controller must be installed. See [Installation](./installation.md).
7+
48
### API Spec
59

610
#### Example: Storage Readiness Rule (Bootstrap-only)
@@ -11,17 +15,18 @@ This rule ensures nodes have working storage before removing the storage readine
1115
apiVersion: readiness.node.x-k8s.io/v1alpha1
1216
kind: NodeReadinessRule
1317
metadata:
14-
name: storage-readiness-rule
18+
name: nfs-storage-readiness-rule
1519
spec:
16-
conditions:
17-
- type: "storage.kubernetes.io/CSIReady"
18-
requiredStatus: "True"
19-
- type: "storage.kubernetes.io/VolumePluginReady"
20-
requiredStatus: "True"
20+
conditions:
21+
- type: "csi.example.net/NodePluginRegistered"
22+
requiredStatus: "True"
23+
- type: "csi.example.net/BackendReachable"
24+
requiredStatus: "True"
25+
- type: "DiskPressure"
26+
requiredStatus: "False"
2127
taint:
22-
key: "readiness.k8s.io/StorageReady"
28+
key: "readiness.k8s.io/vendor.com/nfs-unhealthy"
2329
effect: "NoSchedule"
24-
value: "pending"
2530
enforcementMode: "bootstrap-only"
2631
nodeSelector:
2732
matchLabels:
@@ -43,82 +48,19 @@ spec:
4348
| `nodeSelector` | Label selector to target specific nodes | No |
4449
| `dryRun` | Preview changes without applying them | No |
4550

46-
### Deployment
47-
48-
#### Option 1: Install official release images
49-
50-
Node-Readiness Controller offers two variants of the container image to support different cluster architectures.
51-
52-
Released container images are available for:
53-
* **x86_64** (AMD64)
54-
* **Arm64** (AArch64)
55-
56-
The controller image is available in the Kubernetes staging registry:
57-
58-
```sh
59-
REPO="us-central1-docker.pkg.dev/k8s-staging-images/node-readiness-controller/node-readiness-controller"
60-
61-
TAG=$(skopeo list-tags docker://$REPO | jq .Tags[-1] | tr -d '"')
62-
63-
docker pull $REPO:$TAG
64-
```
65-
66-
#### Option 2: Deploy Using Make Commands
67-
68-
**Build and push your image to the location specified by `IMG_PREFIX`:`IMG_TAG` :**
69-
70-
```sh
71-
make docker-build docker-push IMG_PREFIX=<some-registry>/nrr-controller IMG_TAG=tag
72-
```
73-
74-
```sh
75-
# Install the CRDs
76-
make install
77-
78-
# Deploy the controller
79-
make deploy IMG_PREFIX=<some-registry>/nrr-controller IMG_TAG=tag
80-
81-
# Create sample rules
82-
kubectl apply -k examples/network-readiness-rule.yaml
83-
```
84-
85-
#### Option 3: Deploy Using Kustomize Directly
86-
87-
```sh
88-
# Install CRDs
89-
kubectl apply -k config/crd
90-
91-
# Deploy controller and RBAC
92-
kubectl apply -k config/default
93-
94-
# Create sample rules
95-
kubectl apply -f examples/network-readiness-rule.yaml
96-
```
97-
98-
### Uninstallation
99-
100-
> **Important**: Follow this order to avoid stuck resources due to finalizers.
101-
102-
The controller adds a finalizer (`readiness.node.x-k8s.io/cleanup-taints`) to each `NodeReadinessRule` to ensure node taints are cleaned up before the rule is deleted. This means you must delete CRs **while the controller is still running**.
103-
104-
```sh
105-
# 1. Delete all rule instances first (while controller is running)
106-
kubectl delete nodereadinessrules --all
107-
108-
# 2. Delete the controller
109-
make undeploy
110-
111-
# 3. Delete the CRDs
112-
make uninstall
113-
```
114-
115-
#### Recovering from Stuck Resources
51+
### Enforcement Modes
11652

117-
If you deleted the controller before removing the CRs, the finalizer will block CR deletion. To recover, manually remove the finalizer:
53+
#### Bootstrap-only Mode
54+
- Removes bootstrap taint when conditions are first satisfied
55+
- Marks completion with node annotation
56+
- Stops monitoring after successful removal (fail-safe)
57+
- Ideal for one-time setup conditions (installing node daemons, e.g., a security agent or a kernel-module update)
11858

119-
```sh
120-
kubectl patch nodereadinessrule <rule-name> -p '{"metadata":{"finalizers":[]}}' --type=merge
121-
```
59+
#### Continuous Mode
60+
- Continuously monitors conditions
61+
- Adds taint when any condition becomes unsatisfied
62+
- Removes taint when all conditions become satisfied
63+
- Ideal for ongoing health monitoring (network connectivity, resource availability)
12264

12365
## Operations
12466

@@ -151,7 +93,7 @@ Test rules safely before applying:
15193
spec:
15294
dryRun: true # Enable dry run mode
15395
conditions:
154-
- type: "storage.kubernetes.io/CSIReady"
96+
- type: "csi.example.net/NodePluginRegistered"
15597
requiredStatus: "True"
15698
# ... rest of spec
15799
```
@@ -162,28 +104,73 @@ Check dry run results:
162104
kubectl get nodereadinessrule <rule-name> -o jsonpath='{.status.dryRunResults}'
163105
```
164106

165-
### Enforcement Modes
107+
### Rule Validation and Constraints
166108

167-
#### Bootstrap-only Mode
168-
- Removes bootstrap taint when conditions are first satisfied
169-
- Marks completion with node annotation
170-
- Stops monitoring after successful removal (fail-safe)
171-
- Ideal for one-time setup conditions (storage, installing node daemons e.g: security agent or kernel-module update)
109+
#### NoExecute Taint Effect Warning
172110

173-
#### Continuous Mode
174-
- Continuously monitors conditions
175-
- Adds taint when any condition becomes unsatisfied
176-
- Removes taint when all conditions become satisfied
177-
- Ideal for ongoing health monitoring (network connectivity, resource availability)
111+
**`NoExecute` with `continuous` enforcement mode will evict existing workloads when conditions fail.**
178112

179-
## Configuration
113+
If a rule is configured with a `NoExecute` taint and a readiness condition on the node fails, even temporarily (e.g., the component restarted), all pods without matching tolerations are immediately evicted from the node. Use `NoSchedule` to prevent new scheduling without disrupting running workloads.
114+
115+
The admission webhook returns a warning when a rule is created or updated with the `NoExecute` taint effect.
116+
117+
See [Kubernetes taints documentation](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) for taint behavior details.
118+
119+
#### Avoiding Taint Key Conflicts
120+
121+
The admission webhook prevents multiple rules from using the same `taint.key` and `taint.effect` on overlapping node selectors.
122+
123+
**Example conflict:**
124+
```yaml
125+
# Rule 1
126+
spec:
127+
conditions:
128+
- type: "device.gpu-vendor.net/DevicePluginRegistered"
129+
requiredStatus: "True"
130+
nodeSelector:
131+
matchLabels:
132+
feature.node.kubernetes.io/pci-10de.present: "true"
133+
taint:
134+
key: "readiness.k8s.io/vendor.com/gpu-not-ready"
135+
effect: "NoSchedule"
136+
137+
# Rule 2 - This will be REJECTED
138+
spec:
139+
conditions:
140+
- type: "cniplugin.example.net/rdma/NetworkReady"
141+
requiredStatus: "True"
142+
nodeSelector:
143+
matchLabels:
144+
feature.node.kubernetes.io/pci-10de.present: "true"
145+
taint:
146+
key: "readiness.k8s.io/vendor.com/gpu-not-ready" # Same (taint-key + effect) but different conditions = conflict
147+
effect: "NoSchedule"
148+
```
149+
150+
Use unique, descriptive taint keys for different readiness checks.
151+
152+
#### Taint Key Naming
153+
154+
Follow [Kubernetes naming conventions](https://kubernetes.io/docs/concepts/overview/working-with-objects/names/).
155+
156+
Taint keys must have the `readiness.k8s.io/` prefix to clearly identify readiness-related taints and avoid conflicts with other controllers.
180157

181-
### Security
158+
**Valid:**
159+
```yaml
160+
taint:
161+
key: "readiness.k8s.io/vendor.com/network-not-ready"
162+
key: "readiness.k8s.io/vendor.com/gpu-not-ready"
163+
```
182164

183-
The controller requires the following RBAC permissions:
184-
- **Nodes**: `get`, `list`, `watch`, `patch`, `update` (for taint management)
185-
- **NodeReadinessRules**: Full CRUD access
186-
- **Events**: `create` (for status reporting)
165+
**Invalid:**
166+
```yaml
167+
taint:
168+
key: "network-ready" # Missing prefix
169+
key: "node.kubernetes.io/ready" # Wrong prefix
170+
```
171+
172+
173+
## Configuration
187174

188175
### Performance and Scalability
189176

@@ -211,38 +198,3 @@ conditions:
211198
- type: "readiness.k8s.io/mycompany.example.com/CacheWarmed"
212199
requiredStatus: "True"
213200
```
214-
215-
#### With Cluster Autoscaler
216-
NodeReadinessController work well with cluster autoscaling:
217-
- New nodes start with restrictive taints
218-
- Controller removes taints once conditions are satisfied
219-
- Autoscaler can safely scale knowing nodes are truly ready
220-
221-
## Development
222-
223-
### Building from Source
224-
225-
```sh
226-
# Clone the repository
227-
git clone https://sigs.k8s.io/node-readiness-controller.git
228-
cd node-readiness-controller
229-
230-
# Run tests
231-
make test
232-
233-
# Build binary
234-
make build
235-
236-
# Generate manifests
237-
make manifests
238-
```
239-
240-
### Running Locally
241-
242-
```sh
243-
# Install CRDs
244-
make install
245-
246-
# Run against cluster (requires KUBECONFIG)
247-
make run
248-
```

docs/book/src/user-guide/installation.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ Follow this guide to install the Node Readiness Controller in your Kubernetes cl
66

77
### Option 1: Install Official Release (Recommended)
88

9-
The easiest way to get started is by applying the official release manifests.
10-
119
First, to install the CRDs, apply the `crds.yaml` manifest:
1210

1311
```sh
@@ -34,11 +32,16 @@ If it gets evicted during resource pressure, nodes can't transition to Ready sta
3432

3533
This is the priority class used by other critical cluster components (e.g., CoreDNS).
3634

37-
**Images**: The official releases use multi-arch images (AMD64, Arm64).
35+
#### Images
3836

39-
### Option 2: Deploy Using Kustomize
37+
The official releases use multi-arch images (AMD64, Arm64) and are available at `registry.k8s.io/node-readiness-controller/node-readiness-controller`.
4038

41-
If you have cloned the repository and want to deploy from source, you can use Kustomize.
39+
```sh
40+
REPO="registry.k8s.io/node-readiness-controller/node-readiness-controller"
41+
TAG=$(skopeo list-tags docker://$REPO | jq .'Tags[-1]' | tr -d '"')
42+
docker pull $REPO:$TAG
43+
```
44+
### Option 2: Deploy Using Kustomize
4245

4346
```sh
4447
# 1. Install Custom Resource Definitions (CRDs)

internal/webhook/nodereadinessgaterule_webhook.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222

23+
corev1 "k8s.io/api/core/v1"
2324
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2425
"k8s.io/apimachinery/pkg/runtime"
2526
"k8s.io/apimachinery/pkg/util/validation/field"
@@ -165,6 +166,35 @@ func (w *NodeReadinessRuleWebhook) nodSelectorsOverlap(selector1, selector2 meta
165166
return sel1.String() == sel2.String()
166167
}
167168

169+
// generateNoExecuteWarnings generates admission warnings for NoExecute taint usage.
170+
// NoExecute taints cause immediate pod eviction, which can be disruptive when
171+
// used with continuous enforcement mode.
172+
func (w *NodeReadinessRuleWebhook) generateNoExecuteWarnings(spec readinessv1alpha1.NodeReadinessRuleSpec) admission.Warnings {
173+
var warnings admission.Warnings
174+
175+
if spec.Taint.Effect != corev1.TaintEffectNoExecute {
176+
return warnings
177+
}
178+
179+
// NoExecute with continuous mode is particularly risky
180+
if spec.EnforcementMode == readinessv1alpha1.EnforcementModeContinuous {
181+
warnings = append(warnings,
182+
"CAUTION: Using NoExecute taint effect with continuous enforcement mode. "+
183+
"This configuration will evict existing pods when conditions fail, which may cause "+
184+
"workload disruption if conditions are unstable. Consider using NoSchedule "+
185+
"effect instead, or bootstrap-only enforcement mode. "+
186+
"See: https://node-readiness-controller.sigs.k8s.io/user-guide/getting-started.html")
187+
} else {
188+
// NoExecute with bootstrap-only is less risky but still worth noting
189+
warnings = append(warnings,
190+
"NOTE: Using NoExecute taint effect. This will evict existing pods that do not "+
191+
"tolerate this taint when applied. Ensure critical system pods have appropriate tolerations. "+
192+
"See: https://node-readiness-controller.sigs.k8s.io/user-guide/getting-started.html")
193+
}
194+
195+
return warnings
196+
}
197+
168198
// SetupWithManager sets up the webhook with the manager.
169199
func (w *NodeReadinessRuleWebhook) SetupWithManager(mgr ctrl.Manager) error {
170200
return ctrl.NewWebhookManagedBy(mgr).
@@ -185,7 +215,10 @@ func (w *NodeReadinessRuleWebhook) ValidateCreate(ctx context.Context, obj runti
185215
if allErrs := w.validateNodeReadinessRule(ctx, rule, false); len(allErrs) > 0 {
186216
return nil, fmt.Errorf("validation failed: %v", allErrs)
187217
}
188-
return nil, nil
218+
219+
// Generate warnings for NoExecute taint usage
220+
warnings := w.generateNoExecuteWarnings(rule.Spec)
221+
return warnings, nil
189222
}
190223

191224
func (w *NodeReadinessRuleWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
@@ -197,7 +230,10 @@ func (w *NodeReadinessRuleWebhook) ValidateUpdate(ctx context.Context, oldObj, n
197230
if allErrs := w.validateNodeReadinessRule(ctx, rule, true); len(allErrs) > 0 {
198231
return nil, fmt.Errorf("validation failed: %v", allErrs)
199232
}
200-
return nil, nil
233+
234+
// Generate warnings for NoExecute taint usage
235+
warnings := w.generateNoExecuteWarnings(rule.Spec)
236+
return warnings, nil
201237
}
202238

203239
func (w *NodeReadinessRuleWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {

0 commit comments

Comments
 (0)