
Commit 0e03074

Implement MaxParkedNodes, arbitrary node labels, and parking SafetyCheck (#384)
* Implement MaxParkedNodes feature to limit number of nodes parked at one time
* Add way to add arbitrary labels to parked nodes and pods
* Add SafetyCheck feature to make sure we don't force evict unlabeled pods
* update helm chart to default to latest app version
1 parent 44857bb commit 0e03074

35 files changed: +3585 −193 lines

Makefile

Lines changed: 9 additions & 5 deletions
@@ -2,7 +2,7 @@
 
 NAME ?= adobe/k8s-shredder
 K8S_SHREDDER_VERSION ?= "dev"
-KINDNODE_VERSION ?= "v1.31.9"
+KINDNODE_VERSION ?= "v1.34.0"
 COMMIT ?= $(shell git rev-parse --short HEAD)
 TEST_CLUSTERNAME ?= "k8s-shredder-test-cluster"
 TEST_CLUSTERNAME_KARPENTER ?= "k8s-shredder-test-cluster-karpenter"
@@ -106,6 +106,10 @@ build: check-license lint vet security unit-test ## Builds the local Docker cont
 	@CGO_ENABLED=0 GOOS=linux go build \
 		-ldflags="-s -w -X github.com/adobe/k8s-shredder/cmd.buildVersion=${K8S_SHREDDER_VERSION}-${COMMIT} -X github.com/adobe/k8s-shredder/cmd.gitSHA=${COMMIT} -X github.com/adobe/k8s-shredder/cmd.buildTime=$(date)" \
 		-o k8s-shredder
+	@CGO_ENABLED=0 go build \
+		-ldflags="-s -w" \
+		-o park-node \
+		./cmd/park-node
 	@DOCKER_BUILDKIT=1 docker build -t ${NAME}:${K8S_SHREDDER_VERSION} .
 
 # TEST
@@ -149,13 +153,13 @@ e2e-tests: ## Run e2e tests for k8s-shredder deployed in a local kind cluster
 	@echo "Run e2e tests for k8s-shredder..."
 	@if [ -f "${PWD}/${KUBECONFIG_KARPENTER}" ]; then \
 		echo "Using Karpenter test cluster configuration..."; \
-		KUBECONFIG=${PWD}/${KUBECONFIG_KARPENTER} go test internal/testing/e2e_test.go -v; \
+		PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_KARPENTER} go test internal/testing/e2e_test.go -v; \
 	elif [ -f "${PWD}/${KUBECONFIG_NODE_LABELS}" ]; then \
 		echo "Using node labels test cluster configuration..."; \
-		KUBECONFIG=${PWD}/${KUBECONFIG_NODE_LABELS} go test internal/testing/e2e_test.go -v; \
+		PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_NODE_LABELS} go test internal/testing/e2e_test.go -v; \
 	else \
 		echo "Using default test cluster configuration..."; \
-		KUBECONFIG=${PWD}/${KUBECONFIG_LOCALTEST} go test internal/testing/e2e_test.go -v; \
+		PROJECT_ROOT=${PWD} KUBECONFIG=${PWD}/${KUBECONFIG_LOCALTEST} go test internal/testing/e2e_test.go -v; \
 	fi
 
 # DEMO targets
@@ -187,5 +191,5 @@ clean: ## Clean up local testing environment
 	@kind delete cluster --name="${TEST_CLUSTERNAME_KARPENTER}" ## > /dev/null 2>&1 || true
 	@kind delete cluster --name="${TEST_CLUSTERNAME_NODE_LABELS}" ## > /dev/null 2>&1 || true
 	@echo "Removing all generated files and directories"
-	@rm -rf dist/ k8s-shredder kubeconfig ${KUBECONFIG_LOCALTEST} ${KUBECONFIG_KARPENTER} ${KUBECONFIG_NODE_LABELS}
+	@rm -rf dist/ k8s-shredder park-node kubeconfig ${KUBECONFIG_LOCALTEST} ${KUBECONFIG_KARPENTER} ${KUBECONFIG_NODE_LABELS}
 	@echo "Done!"

README.md

Lines changed: 93 additions & 3 deletions
@@ -18,7 +18,7 @@ rabbitmq, redis, etc) may be sensitive to rescheduling, or application developer
 
 You can find more about node parking [here](docs/node-parking.md).
 
-## Advantages of parking and shredding nodes
+## Advantages of parking nodes and shredding pods
 
 - allow teams running stateful apps to move their workloads off of parked nodes at their will, independent of node lifecycle
 - optimizes cloud costs by dynamically purging unschedulable workers nodes (parked nodes).
@@ -52,12 +52,14 @@ The following options can be used to customize the k8s-shredder controller:
 | AllowEvictionLabel | "shredder.ethos.adobe.net/allow-eviction" | Label used for skipping evicting pods that have explicitly set this label on false |
 | ToBeDeletedTaint | "ToBeDeletedByClusterAutoscaler" | Node taint used for skipping a subset of parked nodes that are already handled by cluster-autoscaler |
 | ArgoRolloutsAPIVersion | "v1alpha1" | API version from `argoproj.io` API group to be used while handling Argo Rollouts objects |
-
 | EnableKarpenterDriftDetection | false | Controls whether to scan for drifted Karpenter NodeClaims and automatically label their nodes |
 | ParkedByLabel | "shredder.ethos.adobe.net/parked-by" | Label used to identify which component parked the node |
 | ParkedNodeTaint | "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule" | Taint to apply to parked nodes in format key=value:effect |
 | EnableNodeLabelDetection | false | Controls whether to scan for nodes with specific labels and automatically park them |
 | NodeLabelsToDetect | [] | List of node labels to detect. Supports both key-only and key=value formats |
+| MaxParkedNodes | 0 | Maximum number of nodes that can be parked simultaneously. Set to 0 (default) for no limit. |
+| ExtraParkingLabels | {} | (Optional) Map of extra labels to apply to nodes and pods during parking. Example: `{ "example.com/owner": "infrastructure" }` |
+| EvictionSafetyCheck | true | Controls whether to perform safety checks before force eviction. If true, nodes will be unparked if pods don't have required parking labels. |
 
 ### How it works
 
@@ -81,6 +83,7 @@ k8s-shredder includes an optional feature for automatic detection of drifted Kar
 - `UpgradeStatusLabel` (set to "parked")
 - `ExpiresOnLabel` (set to current time + `ParkedNodeTTL`)
 - `ParkedByLabel` (set to "k8s-shredder")
+- Any labels specified in `ExtraParkingLabels`
 - **Cordoning** the nodes to prevent new pod scheduling
 - **Tainting** the nodes with the configured `ParkedNodeTaint`
 
@@ -98,15 +101,102 @@ k8s-shredder includes optional automatic detection of nodes with specific labels
 - `UpgradeStatusLabel` (set to "parked")
 - `ExpiresOnLabel` (set to current time + `ParkedNodeTTL`)
 - `ParkedByLabel` (set to "k8s-shredder")
+- Any labels specified in `ExtraParkingLabels`
 - **Cordoning** the nodes to prevent new pod scheduling
 - **Tainting** the nodes with the configured `ParkedNodeTaint`
 
 This integration allows k8s-shredder to automatically handle node lifecycle management based on custom labeling strategies, enabling teams to mark nodes for parking using their own operational workflows and labels. For example, this can be used in conjunction with [AKS cluster upgrades](https://learn.microsoft.com/en-us/azure/aks/upgrade-cluster#set-new-cordon-behavior).
 
+#### Parking Limits with MaxParkedNodes
+
+k8s-shredder supports limiting the maximum number of nodes that can be parked simultaneously using the `MaxParkedNodes` configuration option. This feature helps prevent overwhelming the cluster with too many parked nodes at once, which could impact application availability.
+
+When `MaxParkedNodes` is set to a positive integer:
+
+1. **Before parking nodes**: k8s-shredder counts the number of currently parked nodes
+2. **Calculate available slots**: `availableSlots = MaxParkedNodes - currentlyParked`
+3. **Limit parking**: If the number of eligible nodes exceeds available slots, only the first `availableSlots` nodes are parked
+4. **Skip if full**: If no slots are available (currentlyParked >= MaxParkedNodes), parking is skipped for that eviction interval
+
+**Examples:**
+- `MaxParkedNodes: 0` (default): No limit, all eligible nodes are parked
+- `MaxParkedNodes: 5`: Maximum 5 nodes can be parked at any time
+- `MaxParkedNodes: -1`: Invalid value, treated as 0 (no limit) with a warning logged
+
+This limit applies to both Karpenter drift detection and node label detection features. When multiple nodes are eligible for parking but the limit would be exceeded, k8s-shredder will park the nodes in the order they are discovered and skip the remaining nodes until the next eviction interval.
+
+**Use cases:**
+- **Gradual node replacement**: Control the pace of node cycling during cluster upgrades
+- **Resource management**: Prevent excessive resource pressure from too many parked nodes
+- **Application stability**: Ensure applications have sufficient capacity during node transitions
+- **Cost optimization**: Balance between node replacement speed and cluster stability
+
+#### ExtraParkingLabels
+
+The `ExtraParkingLabels` option allows you to specify a map of additional Kubernetes labels that will be applied to all nodes and pods during the parking process. This is useful for custom automation, monitoring, or compliance workflows.
+
+**Configuration:**
+```yaml
+ExtraParkingLabels:
+  example.com/owner: "infrastructure"
+  example.com/maintenance: "true"
+  example.com/upgrade-batch: "batch-1"
+```
+
+**Use cases:**
+- **Team ownership**: Mark parked nodes with team ownership labels for accountability
+- **Maintenance tracking**: Add labels to track maintenance windows or upgrade batches
+- **Compliance**: Apply labels required by compliance or governance policies
+- **Monitoring**: Enable custom alerting or monitoring based on parking labels
+- **Automation**: Trigger external automation workflows based on parking labels
+
+**Behavior:**
+- Labels are applied to both nodes and their non-DaemonSet pods during parking
+- Labels are removed during the unparking process (if `EvictionSafetyCheck` triggers unparking)
+- If not set or empty, no extra labels are applied
+- Labels are applied in addition to the standard parking labels (`UpgradeStatusLabel`, `ExpiresOnLabel`, `ParkedByLabel`)
+
+#### EvictionSafetyCheck
+
+The `EvictionSafetyCheck` feature provides an additional safety mechanism to prevent force eviction of pods that weren't properly prepared for parking. When enabled (default: `true`), k8s-shredder performs a safety check before force evicting pods from expired parked nodes.
+
+**How it works:**
+
+1. **Before force eviction**: When a node's TTL expires and force eviction is about to begin, k8s-shredder checks all non-DaemonSet and non-static pods on the node
+2. **Required labels check**: Each pod must have:
+   - `UpgradeStatusLabel` set to "parked"
+   - `ExpiresOnLabel` present with any value
+3. **Safety decision**:
+   - If **all** pods have the required labels → proceed with force eviction
+   - If **any** pod is missing required labels → unpark the node instead of force evicting
+
+**Unparking process:**
+When safety check fails, k8s-shredder automatically unparks the node by:
+- Removing `ExpiresOnLabel` and `ExtraParkingLabels` from nodes and pods
+- Removing the `ParkedNodeTaint`
+- Uncordoning the node (making it schedulable again)
+- Setting `UpgradeStatusLabel` to "unparked" on nodes and pods
+- Setting `ParkedByLabel` to the configured `ParkedByValue`
+
+**Use cases:**
+- **Safety during manual parking**: If nodes are manually parked but pods weren't properly labeled
+- **Partial parking failures**: When parking automation fails to label all pods
+- **Emergency recovery**: Provides a safe way to recover from parking mistakes
+- **Compliance**: Ensures only properly prepared workloads are force evicted
+
+**Configuration:**
+```yaml
+EvictionSafetyCheck: true # Enable safety checks (default)
+EvictionSafetyCheck: false # Disable safety checks (force eviction always proceeds)
+```
+
+**Logging:**
+When safety checks fail, k8s-shredder logs detailed information about which pods are missing required labels, helping operators understand why the node was unparked instead of force evicted.
+
 ## Metrics
 
 k8s-shredder exposes comprehensive metrics for monitoring its operation. You can find detailed information about all available metrics in the [metrics documentation](docs/metrics.md).
 
 #### Creating a new release
 
-See [RELEASE.md](RELEASE.md).
+See [RELEASE.md](RELEASE.md).
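
Taken together, the three options documented in this README change can be set in one place. The snippet below is an illustrative sketch that combines them using the flat option names from the table above; the specific values and label keys are made up for the example and are not part of this commit:

```yaml
# Illustrative k8s-shredder settings combining the new options
# (defaults are MaxParkedNodes: 0, EvictionSafetyCheck: true, ExtraParkingLabels: {}).
MaxParkedNodes: 5               # keep at most 5 nodes parked at the same time
EvictionSafetyCheck: true       # unpark a node instead of force-evicting unlabeled pods
ExtraParkingLabels:
  example.com/owner: "infrastructure"
  example.com/upgrade-batch: "batch-1"
EnableNodeLabelDetection: true  # park nodes carrying any of the labels below
NodeLabelsToDetect:
  - "upgrade=required"
```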

charts/k8s-shredder/Chart.yaml

Lines changed: 2 additions & 2 deletions
@@ -12,5 +12,5 @@ maintainers:
 - name: sfotony
 
   url: https://adobe.com
-version: 0.2.4
-appVersion: v0.3.1
+version: 0.2.5
+appVersion: v0.3.5

charts/k8s-shredder/README.md

Lines changed: 7 additions & 3 deletions
@@ -1,6 +1,6 @@
 # k8s-shredder
 
-![Version: 0.2.4](https://img.shields.io/badge/Version-0.2.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.3.1](https://img.shields.io/badge/AppVersion-v0.3.1-informational?style=flat-square)
+![Version: 0.2.5](https://img.shields.io/badge/Version-0.2.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.3.5](https://img.shields.io/badge/AppVersion-v0.3.5-informational?style=flat-square)
 
 a novel way of dealing with kubernetes nodes blocked from draining
 
@@ -23,9 +23,10 @@ a novel way of dealing with kubernetes nodes blocked from draining
 | dryRun | bool | `false` | Enable dry-run mode - when true, k8s-shredder will log actions but not execute them |
 | environmentVars | list | `[]` | Additional environment variables to set in the container |
 | fullnameOverride | string | `""` | Override the full name used for resources |
-| image | object | `{"pullPolicy":"IfNotPresent","registry":"ghcr.io/adobe/k8s-shredder"}` | Container image configuration |
+| image | object | `{"pullPolicy":"IfNotPresent","registry":"ghcr.io/adobe/k8s-shredder","tag":"latest"}` | Container image configuration |
 | image.pullPolicy | string | `"IfNotPresent"` | Image pull policy - IfNotPresent, Always, or Never |
 | image.registry | string | `"ghcr.io/adobe/k8s-shredder"` | Container registry where the k8s-shredder image is hosted |
+| image.tag | string | `"latest"` | Image tag to use |
 | imagePullSecrets | list | `[]` | Secrets for pulling images from private registries |
 | initContainers | list | `[]` | Init containers to run before the main k8s-shredder container starts |
 | logFormat | string | `"text"` | Log output format: text (human-readable) or json (structured logging) |
@@ -63,13 +64,16 @@ a novel way of dealing with kubernetes nodes blocked from draining
 | serviceAccount.annotations | object | `{}` | Additional annotations for the service account (useful for IAM roles, etc.) |
 | serviceAccount.create | bool | `true` | Create a service account for k8s-shredder |
 | serviceAccount.name | string | `"k8s-shredder"` | Name of the service account |
-| shredder | object | `{"AllowEvictionLabel":"shredder.ethos.adobe.net/allow-eviction","ArgoRolloutsAPIVersion":"v1alpha1","EnableKarpenterDriftDetection":false,"EnableNodeLabelDetection":false,"EvictionLoopInterval":"1h","ExpiresOnLabel":"shredder.ethos.adobe.net/parked-node-expires-on","NamespacePrefixSkipInitialEviction":"ns-ethos-","NodeLabelsToDetect":[],"ParkedByLabel":"shredder.ethos.adobe.net/parked-by","ParkedByValue":"k8s-shredder","ParkedNodeTTL":"168h","ParkedNodeTaint":"shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule","RestartedAtAnnotation":"shredder.ethos.adobe.net/restartedAt","RollingRestartThreshold":0.1,"ToBeDeletedTaint":"ToBeDeletedByClusterAutoscaler","UpgradeStatusLabel":"shredder.ethos.adobe.net/upgrade-status"}` | Core k8s-shredder configuration |
+| shredder | object | `{"AllowEvictionLabel":"shredder.ethos.adobe.net/allow-eviction","ArgoRolloutsAPIVersion":"v1alpha1","EnableKarpenterDriftDetection":false,"EnableNodeLabelDetection":false,"EvictionLoopInterval":"1h","EvictionSafetyCheck":true,"ExpiresOnLabel":"shredder.ethos.adobe.net/parked-node-expires-on","ExtraParkingLabels":{},"MaxParkedNodes":0,"NamespacePrefixSkipInitialEviction":"ns-ethos-","NodeLabelsToDetect":[],"ParkedByLabel":"shredder.ethos.adobe.net/parked-by","ParkedByValue":"k8s-shredder","ParkedNodeTTL":"168h","ParkedNodeTaint":"shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule","RestartedAtAnnotation":"shredder.ethos.adobe.net/restartedAt","RollingRestartThreshold":0.1,"ToBeDeletedTaint":"ToBeDeletedByClusterAutoscaler","UpgradeStatusLabel":"shredder.ethos.adobe.net/upgrade-status"}` | Core k8s-shredder configuration |
 | shredder.AllowEvictionLabel | string | `"shredder.ethos.adobe.net/allow-eviction"` | Label to explicitly allow eviction on specific resources |
 | shredder.ArgoRolloutsAPIVersion | string | `"v1alpha1"` | API version for Argo Rollouts integration |
 | shredder.EnableKarpenterDriftDetection | bool | `false` | Enable Karpenter drift detection for node lifecycle management |
 | shredder.EnableNodeLabelDetection | bool | `false` | Enable detection of nodes based on specific labels |
 | shredder.EvictionLoopInterval | string | `"1h"` | How often to run the main eviction loop |
+| shredder.EvictionSafetyCheck | bool | `true` | Controls whether to perform safety checks before force eviction |
 | shredder.ExpiresOnLabel | string | `"shredder.ethos.adobe.net/parked-node-expires-on"` | Label used to track when a parked node expires |
+| shredder.ExtraParkingLabels | object | `{}` | Additional labels to apply to nodes and pods during parking |
+| shredder.MaxParkedNodes | int | `0` | Maximum number of nodes that can be parked simultaneously (0 = no limit) |
 | shredder.NamespacePrefixSkipInitialEviction | string | `"ns-ethos-"` | Namespace prefix to skip during initial eviction (useful for system namespaces) |
 | shredder.NodeLabelsToDetect | list | `[]` | List of node labels to monitor for triggering shredder actions |
 | shredder.ParkedByLabel | string | `"shredder.ethos.adobe.net/parked-by"` | Label to track which component parked a node |

charts/k8s-shredder/templates/configmap.yaml

Lines changed: 3 additions & 0 deletions
@@ -23,3 +23,6 @@ data:
   ParkedNodeTaint: "{{.Values.shredder.ParkedNodeTaint}}"
   EnableNodeLabelDetection: {{.Values.shredder.EnableNodeLabelDetection}}
   NodeLabelsToDetect: {{.Values.shredder.NodeLabelsToDetect | toJson}}
+  MaxParkedNodes: {{.Values.shredder.MaxParkedNodes}}
+  EvictionSafetyCheck: {{.Values.shredder.EvictionSafetyCheck}}
+  ExtraParkingLabels: {{.Values.shredder.ExtraParkingLabels | toJson}}
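
For reference, a rough sketch of how these three template lines might render with the chart's default values; this output is inferred rather than taken from the repository, and the surrounding ConfigMap structure is omitted:

```yaml
# Inferred rendering with default values (MaxParkedNodes: 0,
# EvictionSafetyCheck: true, ExtraParkingLabels: {});
# `toJson` of an empty map produces {}.
MaxParkedNodes: 0
EvictionSafetyCheck: true
ExtraParkingLabels: {}
```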

charts/k8s-shredder/values.yaml

Lines changed: 11 additions & 0 deletions
@@ -5,6 +5,8 @@ image:
   registry: ghcr.io/adobe/k8s-shredder
   # -- Image pull policy - IfNotPresent, Always, or Never
   pullPolicy: IfNotPresent
+  # -- Image tag to use
+  tag: latest
 # -- Number of k8s-shredder pods to run
 replicaCount: 1
 # -- Deployment strategy for rolling updates (e.g., RollingUpdate, Recreate)
@@ -58,6 +60,15 @@ shredder:
   EnableNodeLabelDetection: false
   # -- List of node labels to monitor for triggering shredder actions
   NodeLabelsToDetect: []
+  # -- Maximum number of nodes that can be parked simultaneously (0 = no limit)
+  MaxParkedNodes: 0
+  # -- Controls whether to perform safety checks before force eviction
+  EvictionSafetyCheck: true
+  # -- Additional labels to apply to nodes and pods during parking
+  ExtraParkingLabels: {}
+  # Example configuration:
+  #   example.com/owner: "infrastructure"
+  #   example.com/maintenance: "true"
 # -- RBAC (Role-Based Access Control) configuration
 rbac:
   # -- Create RBAC resources (ClusterRole, ClusterRoleBinding)
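
A sketch of how an operator might override these chart defaults; `my-values.yaml` is a hypothetical file name and the label key is only an example:

```yaml
# my-values.yaml -- pass with: helm install k8s-shredder ./charts/k8s-shredder -f my-values.yaml
shredder:
  # Allow at most three nodes to be parked at the same time
  MaxParkedNodes: 3
  # Keep the pre-eviction safety check enabled (chart default)
  EvictionSafetyCheck: true
  # Extra labels stamped onto parked nodes and their pods
  ExtraParkingLabels:
    example.com/owner: "infrastructure"
```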

cmd/park-node/main.go

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+// Copyright 2025 Adobe. All rights reserved.
+package main
+
+import (
+	"flag"
+	"log"
+	"os"
+
+	e2e "github.com/adobe/k8s-shredder/internal/testing"
+)
+
+func main() {
+	var nodeName, kubeconfigPath string
+
+	// Use a custom flag set to avoid conflicts with client-go flags
+	fs := flag.NewFlagSet("park-node", flag.ExitOnError)
+	fs.StringVar(&nodeName, "node", "", "Name of the node to park")
+	fs.StringVar(&kubeconfigPath, "park-kubeconfig", "", "Path to kubeconfig file")
+	if err := fs.Parse(os.Args[1:]); err != nil {
+		log.Fatal(err)
+	}
+
+	if nodeName == "" {
+		log.Fatal("Node name is required. Use -node flag")
+	}
+	if kubeconfigPath == "" {
+		log.Fatal("Kubeconfig path is required. Use -park-kubeconfig flag")
+	}
+
+	if err := e2e.ParkNodeForTesting(nodeName, kubeconfigPath); err != nil {
+		log.Fatalf("Failed to park node: %v", err)
+	}
+
+	log.Printf("Successfully parked node %s", nodeName)
+	os.Exit(0)
+}
