From 8b53c8ace01504794f8746564593477725fe67a8 Mon Sep 17 00:00:00 2001 From: Tony Gosselin Date: Thu, 23 Oct 2025 18:12:41 -0400 Subject: [PATCH] Max parked nodes by using percentages (#403) * Allow MaxParkedNodes to work with percentages * Park nodes by ago, oldest first --- .github/workflows/ci-chart.yaml | 2 +- .goreleaser.yml | 5 +- README.md | 46 ++- charts/k8s-shredder/Chart.yaml | 4 +- charts/k8s-shredder/README.md | 6 +- charts/k8s-shredder/values.yaml | 4 +- cmd/root.go | 22 +- config.yaml | 2 +- internal/testing/k8s-shredder-karpenter.yaml | 2 +- .../testing/k8s-shredder-node-labels.yaml | 2 +- internal/testing/k8s-shredder.yaml | 2 +- pkg/config/config.go | 7 +- pkg/utils/karpenter_detection_test.go | 18 +- pkg/utils/node_label_detection_test.go | 10 +- pkg/utils/parking.go | 192 +++++++++++-- pkg/utils/parking_test.go | 267 ++++++++++++++++-- 16 files changed, 501 insertions(+), 90 deletions(-) diff --git a/.github/workflows/ci-chart.yaml b/.github/workflows/ci-chart.yaml index 5f217c7..2e03867 100644 --- a/.github/workflows/ci-chart.yaml +++ b/.github/workflows/ci-chart.yaml @@ -18,7 +18,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v6 with: - python-version: '3.14' + python-version: '3.12' check-latest: true - name: Set up chart-testing uses: helm/chart-testing-action@v2.7.0 diff --git a/.goreleaser.yml b/.goreleaser.yml index bfa2bc2..2e70262 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -22,11 +22,10 @@ builds: signs: - cmd: cosign env: [COSIGN_EXPERIMENTAL=1] - certificate: ${artifact}.pem + signature: ${artifact}.bundle args: - sign-blob - - --output-certificate=${certificate} - - --output-signature=${signature} + - --bundle=${signature} - ${artifact} - --yes # needed on cosign 2.0.0+ artifacts: checksum diff --git a/README.md b/README.md index b680821..edeaf71 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ The following options can be used to customize the k8s-shredder controller: | ParkedNodeTaint | "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule" | Taint to apply to parked nodes in format key=value:effect | | EnableNodeLabelDetection | false | Controls whether to scan for nodes with specific labels and automatically park them | | NodeLabelsToDetect | [] | List of node labels to detect. Supports both key-only and key=value formats | -| MaxParkedNodes | 0 | Maximum number of nodes that can be parked simultaneously. Set to 0 (default) for no limit. | +| MaxParkedNodes | "0" | Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). Set to "0" (default) for no limit. | | ExtraParkingLabels | {} | (Optional) Map of extra labels to apply to nodes and pods during parking. Example: `{ "example.com/owner": "infrastructure" }` | | EvictionSafetyCheck | true | Controls whether to perform safety checks before force eviction. If true, nodes will be unparked if pods don't have required parking labels. | | ParkingReasonLabel | "shredder.ethos.adobe.net/parked-reason" | Label used to track why a node or pod was parked (values: node-label, karpenter-drifted, karpenter-disrupted) | @@ -131,25 +131,49 @@ This integration allows k8s-shredder to automatically handle node lifecycle mana k8s-shredder supports limiting the maximum number of nodes that can be parked simultaneously using the `MaxParkedNodes` configuration option. This feature helps prevent overwhelming the cluster with too many parked nodes at once, which could impact application availability. 
-When `MaxParkedNodes` is set to a positive integer: +`MaxParkedNodes` can be specified as either: +- **Integer value** (e.g., `"5"`): Absolute maximum number of nodes that can be parked +- **Percentage value** (e.g., `"20%"`): Maximum percentage of total cluster nodes that can be parked (calculated dynamically each cycle) -1. **Before parking nodes**: k8s-shredder counts the number of currently parked nodes -2. **Calculate available slots**: `availableSlots = MaxParkedNodes - currentlyParked` -3. **Limit parking**: If the number of eligible nodes exceeds available slots, only the first `availableSlots` nodes are parked -4. **Skip if full**: If no slots are available (currentlyParked >= MaxParkedNodes), parking is skipped for that eviction interval +When `MaxParkedNodes` is set to a non-zero value: -**Examples:** -- `MaxParkedNodes: 0` (default): No limit, all eligible nodes are parked -- `MaxParkedNodes: 5`: Maximum 5 nodes can be parked at any time -- `MaxParkedNodes: -1`: Invalid value, treated as 0 (no limit) with a warning logged +1. **Parse the limit**: The configuration is parsed to determine the actual limit + - For percentages: `limit = (percentage / 100) * totalNodes` (rounded down) + - For integers: `limit = configured value` +2. **Count parked nodes**: k8s-shredder counts the number of currently parked nodes +3. **Calculate available slots**: `availableSlots = limit - currentlyParked` +4. **Sort by age**: Eligible nodes are sorted by creation timestamp (oldest first) to ensure predictable parking order +5. **Limit parking**: If the number of eligible nodes exceeds available slots, only the oldest `availableSlots` nodes are parked +6. **Skip if full**: If no slots are available (currentlyParked >= limit), parking is skipped for that eviction interval -This limit applies to both Karpenter drift detection and node label detection features. When multiple nodes are eligible for parking but the limit would be exceeded, k8s-shredder will park the nodes in the order they are discovered and skip the remaining nodes until the next eviction interval. +**Examples:** +- `MaxParkedNodes: "0"` (default): No limit, all eligible nodes are parked +- `MaxParkedNodes: "5"`: Maximum 5 nodes can be parked at any time +- `MaxParkedNodes: "20%"`: Maximum 20% of total cluster nodes can be parked (e.g., 2 nodes in a 10-node cluster) +- Invalid values (e.g., `"-1"`, `"invalid"`): Treated as 0 (no limit) with a warning logged + +**Percentage Benefits:** +- **Dynamic scaling**: Limit automatically adjusts as cluster size changes +- **Proportional safety**: Maintains a consistent percentage of available capacity regardless of cluster size +- **Auto-scaling friendly**: Works well with cluster auto-scaling by recalculating limits each cycle + +**Predictable Parking Order:** +Eligible nodes are **always sorted by creation timestamp (oldest first)**, regardless of whether `MaxParkedNodes` is set. This ensures: +- **Consistent behavior**: The same nodes will be parked first across multiple eviction cycles +- **Fair rotation**: Oldest nodes are prioritized for replacement during rolling upgrades +- **Predictable capacity**: You can anticipate which nodes will be parked next when slots become available +- **Deterministic ordering**: Even when parking all eligible nodes (no limit), they are processed in a predictable order + +This sorting behavior applies to both Karpenter drift detection and node label detection features. 
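To make the arithmetic concrete, here is a minimal sketch of the limit and slot calculation: a simplified, client-free version of the `ParseMaxParkedNodes` / `LimitNodesToPark` logic added in `pkg/utils/parking.go`, with the node counts hard-coded purely for illustration.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// resolveLimit applies the rules described above: "20%" is taken as a share of
// the cluster size (rounded down), a plain integer is used as-is, and anything
// invalid or non-positive means "no limit" (0).
func resolveLimit(maxParkedNodes string, totalNodes int) int {
	if strings.HasSuffix(maxParkedNodes, "%") {
		pct, err := strconv.ParseFloat(strings.TrimSuffix(maxParkedNodes, "%"), 64)
		if err != nil || pct <= 0 {
			return 0
		}
		return int(pct / 100.0 * float64(totalNodes)) // rounded down
	}
	n, err := strconv.Atoi(maxParkedNodes)
	if err != nil || n < 0 {
		return 0
	}
	return n
}

func main() {
	totalNodes := 10     // nodes currently in the cluster
	currentlyParked := 1 // nodes already labeled as parked

	limit := resolveLimit("20%", totalNodes) // 20% of 10 -> 2
	availableSlots := limit - currentlyParked

	// With one slot left, only the single oldest eligible node is parked this cycle.
	fmt.Printf("limit=%d availableSlots=%d\n", limit, availableSlots) // limit=2 availableSlots=1
}
```

The real implementation lists the nodes from the API server on every eviction loop, so a percentage limit tracks cluster scale automatically.
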
When multiple nodes are eligible for parking: +- **With no limit** (`MaxParkedNodes: "0"`): All nodes are parked in order from oldest to newest +- **With a limit**: Only the oldest nodes up to the limit are parked; newer nodes wait for the next eviction interval **Use cases:** - **Gradual node replacement**: Control the pace of node cycling during cluster upgrades - **Resource management**: Prevent excessive resource pressure from too many parked nodes - **Application stability**: Ensure applications have sufficient capacity during node transitions - **Cost optimization**: Balance between node replacement speed and cluster stability +- **Auto-scaling clusters**: Use percentage-based limits to maintain consistent safety margins as cluster size changes #### ExtraParkingLabels diff --git a/charts/k8s-shredder/Chart.yaml b/charts/k8s-shredder/Chart.yaml index 1b59bda..ad8e37e 100644 --- a/charts/k8s-shredder/Chart.yaml +++ b/charts/k8s-shredder/Chart.yaml @@ -12,5 +12,5 @@ maintainers: - name: sfotony email: gosselin@adobe.com url: https://adobe.com -version: 0.2.6 -appVersion: v0.3.6 +version: 0.2.7 +appVersion: v0.3.7 diff --git a/charts/k8s-shredder/README.md b/charts/k8s-shredder/README.md index 897b8f0..b562d24 100644 --- a/charts/k8s-shredder/README.md +++ b/charts/k8s-shredder/README.md @@ -1,6 +1,6 @@ # k8s-shredder -![Version: 0.2.6](https://img.shields.io/badge/Version-0.2.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.3.6](https://img.shields.io/badge/AppVersion-v0.3.6-informational?style=flat-square) +![Version: 0.2.7](https://img.shields.io/badge/Version-0.2.7-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.3.7](https://img.shields.io/badge/AppVersion-v0.3.7-informational?style=flat-square) a novel way of dealing with kubernetes nodes blocked from draining @@ -64,7 +64,7 @@ a novel way of dealing with kubernetes nodes blocked from draining | serviceAccount.annotations | object | `{}` | Additional annotations for the service account (useful for IAM roles, etc.) 
| | serviceAccount.create | bool | `true` | Create a service account for k8s-shredder | | serviceAccount.name | string | `"k8s-shredder"` | Name of the service account | -| shredder | object | `{"AllowEvictionLabel":"shredder.ethos.adobe.net/allow-eviction","ArgoRolloutsAPIVersion":"v1alpha1","EnableKarpenterDisruptionDetection":false,"EnableKarpenterDriftDetection":false,"EnableNodeLabelDetection":false,"EvictionLoopInterval":"1h","EvictionSafetyCheck":true,"ExpiresOnLabel":"shredder.ethos.adobe.net/parked-node-expires-on","ExtraParkingLabels":{},"MaxParkedNodes":0,"NamespacePrefixSkipInitialEviction":"ns-ethos-","NodeLabelsToDetect":[],"ParkedByLabel":"shredder.ethos.adobe.net/parked-by","ParkedByValue":"k8s-shredder","ParkedNodeTTL":"168h","ParkedNodeTaint":"shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule","ParkingReasonLabel":"shredder.ethos.adobe.net/parked-reason","RestartedAtAnnotation":"shredder.ethos.adobe.net/restartedAt","RollingRestartThreshold":0.1,"ToBeDeletedTaint":"ToBeDeletedByClusterAutoscaler","UpgradeStatusLabel":"shredder.ethos.adobe.net/upgrade-status"}` | Core k8s-shredder configuration | +| shredder | object | `{"AllowEvictionLabel":"shredder.ethos.adobe.net/allow-eviction","ArgoRolloutsAPIVersion":"v1alpha1","EnableKarpenterDisruptionDetection":false,"EnableKarpenterDriftDetection":false,"EnableNodeLabelDetection":false,"EvictionLoopInterval":"1h","EvictionSafetyCheck":true,"ExpiresOnLabel":"shredder.ethos.adobe.net/parked-node-expires-on","ExtraParkingLabels":{},"MaxParkedNodes":"0","NamespacePrefixSkipInitialEviction":"ns-ethos-","NodeLabelsToDetect":[],"ParkedByLabel":"shredder.ethos.adobe.net/parked-by","ParkedByValue":"k8s-shredder","ParkedNodeTTL":"168h","ParkedNodeTaint":"shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule","ParkingReasonLabel":"shredder.ethos.adobe.net/parked-reason","RestartedAtAnnotation":"shredder.ethos.adobe.net/restartedAt","RollingRestartThreshold":0.1,"ToBeDeletedTaint":"ToBeDeletedByClusterAutoscaler","UpgradeStatusLabel":"shredder.ethos.adobe.net/upgrade-status"}` | Core k8s-shredder configuration | | shredder.AllowEvictionLabel | string | `"shredder.ethos.adobe.net/allow-eviction"` | Label to explicitly allow eviction on specific resources | | shredder.ArgoRolloutsAPIVersion | string | `"v1alpha1"` | API version for Argo Rollouts integration | | shredder.EnableKarpenterDisruptionDetection | bool | `false` | Enable Karpenter disruption detection for node lifecycle management | @@ -74,7 +74,7 @@ a novel way of dealing with kubernetes nodes blocked from draining | shredder.EvictionSafetyCheck | bool | `true` | Controls whether to perform safety checks before force eviction | | shredder.ExpiresOnLabel | string | `"shredder.ethos.adobe.net/parked-node-expires-on"` | Label used to track when a parked node expires | | shredder.ExtraParkingLabels | object | `{}` | Additional labels to apply to nodes and pods during parking | -| shredder.MaxParkedNodes | int | `0` | Maximum number of nodes that can be parked simultaneously (0 = no limit) | +| shredder.MaxParkedNodes | string | `"0"` | Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). 
Set to "0" for no limit | | shredder.NamespacePrefixSkipInitialEviction | string | `"ns-ethos-"` | Namespace prefix to skip during initial eviction (useful for system namespaces) | | shredder.NodeLabelsToDetect | list | `[]` | List of node labels to monitor for triggering shredder actions | | shredder.ParkedByLabel | string | `"shredder.ethos.adobe.net/parked-by"` | Label to track which component parked a node | diff --git a/charts/k8s-shredder/values.yaml b/charts/k8s-shredder/values.yaml index d9e79a1..67e54c4 100644 --- a/charts/k8s-shredder/values.yaml +++ b/charts/k8s-shredder/values.yaml @@ -62,8 +62,8 @@ shredder: EnableNodeLabelDetection: false # -- List of node labels to monitor for triggering shredder actions NodeLabelsToDetect: [] - # -- Maximum number of nodes that can be parked simultaneously (0 = no limit) - MaxParkedNodes: 0 + # -- Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). Set to "0" for no limit + MaxParkedNodes: '0' # -- Controls whether to perform safety checks before force eviction EvictionSafetyCheck: true # -- Label used to track why a node or pod was parked diff --git a/cmd/root.go b/cmd/root.go index f8ca3b4..027cbdd 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -12,6 +12,7 @@ governing permissions and limitations under the License. package cmd import ( + "strconv" "strings" "time" @@ -123,7 +124,7 @@ func discoverConfig() { viper.SetDefault("ParkedNodeTaint", "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule") viper.SetDefault("EnableNodeLabelDetection", false) viper.SetDefault("NodeLabelsToDetect", []string{}) - viper.SetDefault("MaxParkedNodes", 0) + viper.SetDefault("MaxParkedNodes", "0") viper.SetDefault("ExtraParkingLabels", map[string]string{}) viper.SetDefault("EvictionSafetyCheck", true) viper.SetDefault("ParkingReasonLabel", "shredder.ethos.adobe.net/parked-reason") @@ -149,9 +150,22 @@ func parseConfig() { } // Validate MaxParkedNodes configuration - if cfg.MaxParkedNodes < 0 { - log.WithField("MaxParkedNodes", cfg.MaxParkedNodes).Warn("MaxParkedNodes is negative, treating as 0 (no limit)") - cfg.MaxParkedNodes = 0 + // Basic validation - detailed parsing happens in LimitNodesToPark + if cfg.MaxParkedNodes != "" && cfg.MaxParkedNodes != "0" { + // Check if it's a percentage + if strings.HasSuffix(cfg.MaxParkedNodes, "%") { + percentageStr := strings.TrimSuffix(cfg.MaxParkedNodes, "%") + if _, err := strconv.ParseFloat(percentageStr, 64); err != nil { + log.WithField("MaxParkedNodes", cfg.MaxParkedNodes).Warn("Invalid MaxParkedNodes percentage format, treating as 0 (no limit)") + cfg.MaxParkedNodes = "0" + } + } else { + // Check if it's a valid integer + if _, err := strconv.Atoi(cfg.MaxParkedNodes); err != nil { + log.WithField("MaxParkedNodes", cfg.MaxParkedNodes).Warn("Invalid MaxParkedNodes format, treating as 0 (no limit)") + cfg.MaxParkedNodes = "0" + } + } } log.WithFields(log.Fields{ diff --git a/config.yaml b/config.yaml index fd4cab8..28645ad 100644 --- a/config.yaml +++ b/config.yaml @@ -32,7 +32,7 @@ NodeLabelsToDetect: [] # List of node labels to detect. Supports both key-only # - "node.example.com/park" # Matches any node with the "node.example.com/park" label # Parking limits -MaxParkedNodes: 0 # Maximum number of nodes that can be parked simultaneously. Set to 0 (default) for no limit. +MaxParkedNodes: '0' # Maximum number of nodes that can be parked simultaneously. Can be an integer (e.g., "5") or percentage (e.g., "20%"). 
Set to "0" (default) for no limit. # Extra labels to apply to parked nodes and pods # ExtraParkingLabels: # (optional) Additional labels to apply to nodes and pods during parking diff --git a/internal/testing/k8s-shredder-karpenter.yaml b/internal/testing/k8s-shredder-karpenter.yaml index 08b186d..5a04a0c 100644 --- a/internal/testing/k8s-shredder-karpenter.yaml +++ b/internal/testing/k8s-shredder-karpenter.yaml @@ -80,7 +80,7 @@ data: ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule" EnableNodeLabelDetection: false NodeLabelsToDetect: [] - MaxParkedNodes: 0 + MaxParkedNodes: "0" --- apiVersion: v1 kind: Service diff --git a/internal/testing/k8s-shredder-node-labels.yaml b/internal/testing/k8s-shredder-node-labels.yaml index e656381..422ab0f 100644 --- a/internal/testing/k8s-shredder-node-labels.yaml +++ b/internal/testing/k8s-shredder-node-labels.yaml @@ -83,7 +83,7 @@ data: - "test-label" - "maintenance=scheduled" - "node.test.io/park" - MaxParkedNodes: 0 + MaxParkedNodes: "0" --- apiVersion: v1 kind: Service diff --git a/internal/testing/k8s-shredder.yaml b/internal/testing/k8s-shredder.yaml index 1868360..3236d36 100644 --- a/internal/testing/k8s-shredder.yaml +++ b/internal/testing/k8s-shredder.yaml @@ -80,7 +80,7 @@ data: ParkedNodeTaint: "shredder.ethos.adobe.net/upgrade-status=parked:NoSchedule" EnableNodeLabelDetection: false NodeLabelsToDetect: [] - MaxParkedNodes: 0 + MaxParkedNodes: "0" --- apiVersion: v1 kind: Service diff --git a/pkg/config/config.go b/pkg/config/config.go index 9ddca37..1fd7ea3 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -49,8 +49,11 @@ type Config struct { EnableNodeLabelDetection bool // NodeLabelsToDetect is a list of node labels to look for. Can be just keys or key=value pairs NodeLabelsToDetect []string - // MaxParkedNodes is the maximum number of nodes that can be parked simultaneously. If set to 0 (default), no limit is applied. - MaxParkedNodes int + // MaxParkedNodes is the maximum number of nodes that can be parked simultaneously. + // Can be either an integer (e.g. "5") or a percentage (e.g. "20%"). + // If set to "0" or empty (default), no limit is applied. + // When a percentage is specified, the limit is calculated as (percentage/100) * (total nodes in cluster). + MaxParkedNodes string // ExtraParkingLabels is a map of additional labels to apply to nodes and pods during the parking process. If not set, no extra labels are applied. ExtraParkingLabels map[string]string // EvictionSafetyCheck controls whether to perform safety checks before force eviction. If true, nodes will be unparked if pods don't have required parking labels. 
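The `Config` field above only carries the raw string; the oldest-first ordering described in the README is enforced at parking time. Below is a standalone sketch of that ordering, a simplified stand-in for the `sortNodesByCreationTime` helper added to `pkg/utils/parking.go` later in this patch (the real helper reads Node creation timestamps from the API; the struct and timestamps here are illustrative only).

```go
package main

import (
	"fmt"
	"slices"
	"time"
)

// nodeInfo is a stand-in for the NodeInfo struct used by the detection code;
// only the fields needed for ordering are shown here.
type nodeInfo struct {
	name    string
	created time.Time
}

func main() {
	now := time.Now()
	eligible := []nodeInfo{
		{name: "node-newest", created: now.Add(-1 * time.Hour)},
		{name: "node-oldest", created: now.Add(-10 * time.Hour)},
		{name: "node-middle", created: now.Add(-3 * time.Hour)},
	}

	// Oldest creation timestamp first: the same order LimitNodesToPark uses
	// before trimming the list to the available parking slots.
	slices.SortFunc(eligible, func(a, b nodeInfo) int {
		return a.created.Compare(b.created)
	})

	for _, n := range eligible {
		fmt.Println(n.name) // node-oldest, node-middle, node-newest
	}
}
```

Parking slots are then filled from the front of this list, so nodes skipped in one cycle remain first in line for the next.
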
diff --git a/pkg/utils/karpenter_detection_test.go b/pkg/utils/karpenter_detection_test.go index 1d3cae5..f3d10c1 100644 --- a/pkg/utils/karpenter_detection_test.go +++ b/pkg/utils/karpenter_detection_test.go @@ -205,7 +205,7 @@ func TestLabelDriftedNodes(t *testing.T) { name: "No drifted node claims", driftedNodeClaims: []KarpenterNodeClaimInfo{}, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", ParkingReasonLabel: "parked-reason", }, @@ -225,7 +225,7 @@ func TestLabelDriftedNodes(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", ParkingReasonLabel: "parked-reason", }, @@ -245,7 +245,7 @@ func TestLabelDriftedNodes(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", ParkingReasonLabel: "parked-reason", }, @@ -265,7 +265,7 @@ func TestLabelDriftedNodes(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", ParkingReasonLabel: "parked-reason", }, @@ -484,7 +484,7 @@ func TestLabelDisruptedNodes(t *testing.T) { name: "No disrupted node claims", disruptedNodeClaims: []KarpenterNodeClaimInfo{}, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", ParkingReasonLabel: "parked-reason", }, @@ -505,7 +505,7 @@ func TestLabelDisruptedNodes(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", ParkingReasonLabel: "parked-reason", }, @@ -526,7 +526,7 @@ func TestLabelDisruptedNodes(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", ParkingReasonLabel: "parked-reason", }, @@ -547,7 +547,7 @@ func TestLabelDisruptedNodes(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", ParkingReasonLabel: "parked-reason", }, @@ -614,7 +614,7 @@ func TestProcessDriftedKarpenterNodes(t *testing.T) { appContext: &AppContext{ Config: config.Config{ UpgradeStatusLabel: "upgrade-status", - MaxParkedNodes: 5, + MaxParkedNodes: "5", ParkingReasonLabel: "parked-reason", }, K8sClient: fake.NewSimpleClientset(), diff --git a/pkg/utils/node_label_detection_test.go b/pkg/utils/node_label_detection_test.go index bb547b8..25d1180 100644 --- a/pkg/utils/node_label_detection_test.go +++ b/pkg/utils/node_label_detection_test.go @@ -430,7 +430,7 @@ func TestParkNodesWithLabels(t *testing.T) { name: "No matching nodes", matchingNodes: []NodeLabelInfo{}, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", }, dryRun: false, @@ -448,7 +448,7 @@ func TestParkNodesWithLabels(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", }, dryRun: false, @@ -472,7 +472,7 @@ func TestParkNodesWithLabels(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", }, dryRun: false, @@ -490,7 +490,7 @@ func TestParkNodesWithLabels(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 5, + MaxParkedNodes: "5", UpgradeStatusLabel: "upgrade-status", }, dryRun: true, @@ -520,7 +520,7 @@ func TestParkNodesWithLabels(t *testing.T) { }, }, cfg: config.Config{ - MaxParkedNodes: 2, + MaxParkedNodes: "2", UpgradeStatusLabel: "upgrade-status", }, dryRun: 
false, diff --git a/pkg/utils/parking.go b/pkg/utils/parking.go index d564648..543d849 100644 --- a/pkg/utils/parking.go +++ b/pkg/utils/parking.go @@ -476,14 +476,167 @@ func CountParkedNodes(ctx context.Context, k8sClient kubernetes.Interface, upgra return count, nil } +// ParseMaxParkedNodes parses the MaxParkedNodes configuration and returns the actual limit +// It supports both integer values (e.g., "5") and percentage values (e.g., "20%") +// For percentage values, it calculates the limit as (percentage/100) * totalNodes +// Returns 0 for invalid values or when no limit should be applied +func ParseMaxParkedNodes(ctx context.Context, k8sClient kubernetes.Interface, maxParkedNodesStr string, logger *log.Entry) (int, error) { + logger = logger.WithField("function", "ParseMaxParkedNodes") + + // Handle empty or "0" values + if maxParkedNodesStr == "" || maxParkedNodesStr == "0" { + logger.Debug("MaxParkedNodes is empty or 0, no limit will be applied") + return 0, nil + } + + // Check if it's a percentage + if strings.HasSuffix(maxParkedNodesStr, "%") { + // Parse percentage value + percentageStr := strings.TrimSuffix(maxParkedNodesStr, "%") + percentage, err := strconv.ParseFloat(percentageStr, 64) + if err != nil { + logger.WithError(err).WithField("value", maxParkedNodesStr).Warn("Failed to parse MaxParkedNodes percentage, treating as 0 (no limit)") + return 0, nil + } + + if percentage < 0 { + logger.WithField("percentage", percentage).Warn("MaxParkedNodes percentage is negative, treating as 0 (no limit)") + return 0, nil + } + + if percentage == 0 { + logger.Debug("MaxParkedNodes percentage is 0, no limit will be applied") + return 0, nil + } + + // Get total number of nodes in the cluster + nodeList, err := k8sClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + logger.WithError(err).Error("Failed to list nodes for percentage calculation") + return 0, errors.Wrap(err, "failed to list nodes for percentage calculation") + } + + totalNodes := len(nodeList.Items) + if totalNodes == 0 { + logger.Warn("No nodes found in cluster, cannot calculate percentage-based limit") + return 0, nil + } + + // Calculate the limit: (percentage/100) * totalNodes, rounded down + limit := int((percentage / 100.0) * float64(totalNodes)) + + logger.WithFields(log.Fields{ + "percentage": percentage, + "totalNodes": totalNodes, + "limit": limit, + }).Info("Calculated MaxParkedNodes limit from percentage") + + return limit, nil + } + + // Not a percentage, try to parse as integer + limit, err := strconv.Atoi(maxParkedNodesStr) + if err != nil { + logger.WithError(err).WithField("value", maxParkedNodesStr).Warn("Failed to parse MaxParkedNodes as integer, treating as 0 (no limit)") + return 0, nil + } + + if limit < 0 { + logger.WithField("limit", limit).Warn("MaxParkedNodes is negative, treating as 0 (no limit)") + return 0, nil + } + + logger.WithField("limit", limit).Debug("Using MaxParkedNodes as integer limit") + return limit, nil +} + +// sortNodesByCreationTime sorts a list of NodeInfo by creation timestamp (oldest first) +// Returns a new sorted slice without modifying the original +func sortNodesByCreationTime(ctx context.Context, k8sClient kubernetes.Interface, nodes []NodeInfo, logger *log.Entry) ([]NodeInfo, error) { + if len(nodes) <= 1 { + return nodes, nil + } + + // Create a map to store creation times + type nodeWithTime struct { + info NodeInfo + createTime time.Time + } + + nodesWithTime := make([]nodeWithTime, 0, len(nodes)) + + // Fetch creation times for all nodes + 
for _, node := range nodes { + k8sNode, err := k8sClient.CoreV1().Nodes().Get(ctx, node.Name, metav1.GetOptions{}) + if err != nil { + logger.WithError(err).WithField("nodeName", node.Name).Warn("Failed to get node for sorting, will use current position") + // If we can't get the node, add it with zero time (will be sorted to end) + nodesWithTime = append(nodesWithTime, nodeWithTime{ + info: node, + createTime: time.Time{}, + }) + continue + } + + nodesWithTime = append(nodesWithTime, nodeWithTime{ + info: node, + createTime: k8sNode.CreationTimestamp.Time, + }) + } + + // Sort by creation time (oldest first) + slices.SortFunc(nodesWithTime, func(a, b nodeWithTime) int { + // Nodes with zero time (failed to fetch) go to the end + if a.createTime.IsZero() && !b.createTime.IsZero() { + return 1 + } + if !a.createTime.IsZero() && b.createTime.IsZero() { + return -1 + } + // Both have valid times or both are zero - compare normally + if a.createTime.Before(b.createTime) { + return -1 + } + if a.createTime.After(b.createTime) { + return 1 + } + return 0 + }) + + // Extract sorted NodeInfo + sortedNodes := make([]NodeInfo, len(nodesWithTime)) + for i, nwt := range nodesWithTime { + sortedNodes[i] = nwt.info + } + + logger.WithField("nodeCount", len(sortedNodes)).Debug("Sorted nodes by creation time (oldest first)") + + return sortedNodes, nil +} + // LimitNodesToPark limits the number of nodes to park based on MaxParkedNodes configuration // It returns the nodes that should be parked, prioritizing the oldest nodes first -func LimitNodesToPark(ctx context.Context, k8sClient kubernetes.Interface, nodes []NodeInfo, maxParkedNodes int, upgradeStatusLabel string, logger *log.Entry) ([]NodeInfo, error) { +func LimitNodesToPark(ctx context.Context, k8sClient kubernetes.Interface, nodes []NodeInfo, maxParkedNodesStr string, upgradeStatusLabel string, logger *log.Entry) ([]NodeInfo, error) { logger = logger.WithField("function", "LimitNodesToPark") + // Always sort nodes by creation timestamp (oldest first) to ensure predictable parking order + // This happens regardless of whether MaxParkedNodes is set + sortedNodes, err := sortNodesByCreationTime(ctx, k8sClient, nodes, logger) + if err != nil { + logger.WithError(err).Warn("Failed to sort nodes by creation time, using original order") + sortedNodes = nodes + } + + // Parse MaxParkedNodes to get the actual limit + maxParkedNodes, err := ParseMaxParkedNodes(ctx, k8sClient, maxParkedNodesStr, logger) + if err != nil { + logger.WithError(err).Error("Failed to parse MaxParkedNodes") + return nil, errors.Wrap(err, "failed to parse MaxParkedNodes") + } + if maxParkedNodes <= 0 { - logger.Debug("MaxParkedNodes is not set or invalid, parking all eligible nodes") - return nodes, nil + logger.Debug("MaxParkedNodes is not set or invalid, parking all eligible nodes in order (oldest first)") + return sortedNodes, nil } // Count currently parked nodes @@ -494,49 +647,50 @@ func LimitNodesToPark(ctx context.Context, k8sClient kubernetes.Interface, nodes } logger.WithFields(log.Fields{ - "currentlyParked": currentlyParked, - "maxParkedNodes": maxParkedNodes, - "eligibleNodes": len(nodes), + "currentlyParked": currentlyParked, + "maxParkedNodes": maxParkedNodes, + "maxParkedNodesStr": maxParkedNodesStr, + "eligibleNodes": len(sortedNodes), }).Info("Checking parking limits") // Calculate how many nodes we can park availableSlots := maxParkedNodes - currentlyParked if availableSlots <= 0 { logger.WithFields(log.Fields{ - "currentlyParked": currentlyParked, - 
"maxParkedNodes": maxParkedNodes, - "availableSlots": availableSlots, + "currentlyParked": currentlyParked, + "maxParkedNodes": maxParkedNodes, + "maxParkedNodesStr": maxParkedNodesStr, + "availableSlots": availableSlots, }).Warn("No available parking slots, skipping parking for this interval") return []NodeInfo{}, nil } // If we have more eligible nodes than available slots, limit to the oldest nodes - if len(nodes) > availableSlots { + if len(sortedNodes) > availableSlots { logger.WithFields(log.Fields{ - "eligibleNodes": len(nodes), + "eligibleNodes": len(sortedNodes), "availableSlots": availableSlots, "nodesToPark": availableSlots, - "nodesToSkip": len(nodes) - availableSlots, + "nodesToSkip": len(sortedNodes) - availableSlots, }).Info("Limiting nodes to park based on MaxParkedNodes configuration") - // For now, we'll take the first availableSlots nodes - // In a future enhancement, we could sort by node creation time or other criteria - limitedNodes := nodes[:availableSlots] + // Take the first availableSlots nodes (oldest nodes due to sorting) + limitedNodes := sortedNodes[:availableSlots] // Log which nodes are being skipped - for i := availableSlots; i < len(nodes); i++ { - logger.WithField("skippedNode", nodes[i].Name).Debug("Skipping node due to MaxParkedNodes limit") + for i := availableSlots; i < len(sortedNodes); i++ { + logger.WithField("skippedNode", sortedNodes[i].Name).Debug("Skipping node due to MaxParkedNodes limit") } return limitedNodes, nil } logger.WithFields(log.Fields{ - "eligibleNodes": len(nodes), + "eligibleNodes": len(sortedNodes), "availableSlots": availableSlots, }).Info("All eligible nodes can be parked within MaxParkedNodes limit") - return nodes, nil + return sortedNodes, nil } // UnparkNode unparks a node by removing parking labels, taints, and uncordoning it diff --git a/pkg/utils/parking_test.go b/pkg/utils/parking_test.go index f3eea38..c89523d 100644 --- a/pkg/utils/parking_test.go +++ b/pkg/utils/parking_test.go @@ -13,6 +13,7 @@ package utils import ( "context" + "fmt" "testing" "time" @@ -25,21 +26,44 @@ import ( ) func TestLimitNodesToPark_NoLimit(t *testing.T) { - // Test case: MaxParkedNodes = 0 (no limit) + // Test case: MaxParkedNodes = "0" (no limit) + // Even with no limit, nodes should be sorted by creation time (oldest first) cfg := config.Config{ - MaxParkedNodes: 0, + MaxParkedNodes: "0", UpgradeStatusLabel: "test-upgrade-status", } - nodes := []NodeInfo{ - {Name: "node1"}, - {Name: "node2"}, - {Name: "node3"}, - } - // Create a fake k8s client fakeClient := fake.NewSimpleClientset() + // Create nodes with different creation times (node3 is oldest, node1 is newest) + baseTime := time.Now() + node3 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node3", + CreationTimestamp: metav1.Time{Time: baseTime.Add(-3 * time.Hour)}, // Oldest + }, + } + node2 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + CreationTimestamp: metav1.Time{Time: baseTime.Add(-2 * time.Hour)}, + }, + } + node1 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: metav1.Time{Time: baseTime.Add(-1 * time.Hour)}, // Newest + }, + } + + _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), node1, metav1.CreateOptions{}) + assert.NoError(t, err) + _, err = fakeClient.CoreV1().Nodes().Create(context.Background(), node2, metav1.CreateOptions{}) + assert.NoError(t, err) + _, err = fakeClient.CoreV1().Nodes().Create(context.Background(), node3, metav1.CreateOptions{}) + assert.NoError(t, err) + // Add some 
parked nodes to simulate existing parked nodes parkedNode := &v1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -49,34 +73,67 @@ func TestLimitNodesToPark_NoLimit(t *testing.T) { }, }, } - _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), parkedNode, metav1.CreateOptions{}) + _, err = fakeClient.CoreV1().Nodes().Create(context.Background(), parkedNode, metav1.CreateOptions{}) assert.NoError(t, err) + // Pass nodes in arbitrary order - they should be sorted by creation time + nodes := []NodeInfo{ + {Name: "node1"}, + {Name: "node2"}, + {Name: "node3"}, + } + logger := log.WithField("test", "TestLimitNodesToPark_NoLimit") result, err := LimitNodesToPark(context.Background(), fakeClient, nodes, cfg.MaxParkedNodes, cfg.UpgradeStatusLabel, logger) assert.NoError(t, err) - assert.Equal(t, len(nodes), len(result)) - assert.Equal(t, nodes, result) + assert.Equal(t, 3, len(result), "Should return all nodes when no limit") + // Verify nodes are sorted by creation time (oldest first) + assert.Equal(t, "node3", result[0].Name, "Oldest node should be first") + assert.Equal(t, "node2", result[1].Name, "Middle node should be second") + assert.Equal(t, "node1", result[2].Name, "Newest node should be last") } func TestLimitNodesToPark_WithLimit(t *testing.T) { - // Test case: MaxParkedNodes = 2, 1 already parked, 3 eligible nodes + // Test case: MaxParkedNodes = "2", 1 already parked, 3 eligible nodes + // Nodes should be sorted by creation time (oldest first) cfg := config.Config{ - MaxParkedNodes: 2, + MaxParkedNodes: "2", UpgradeStatusLabel: "test-upgrade-status", } - nodes := []NodeInfo{ - {Name: "node1"}, - {Name: "node2"}, - {Name: "node3"}, - } - // Create a fake k8s client fakeClient := fake.NewSimpleClientset() + // Create nodes with different creation times (node3 is oldest, node1 is newest) + baseTime := time.Now() + node3 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node3", + CreationTimestamp: metav1.Time{Time: baseTime.Add(-3 * time.Hour)}, // Oldest + }, + } + node2 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + CreationTimestamp: metav1.Time{Time: baseTime.Add(-2 * time.Hour)}, + }, + } + node1 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + CreationTimestamp: metav1.Time{Time: baseTime.Add(-1 * time.Hour)}, // Newest + }, + } + + _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), node1, metav1.CreateOptions{}) + assert.NoError(t, err) + _, err = fakeClient.CoreV1().Nodes().Create(context.Background(), node2, metav1.CreateOptions{}) + assert.NoError(t, err) + _, err = fakeClient.CoreV1().Nodes().Create(context.Background(), node3, metav1.CreateOptions{}) + assert.NoError(t, err) + // Add one parked node to simulate existing parked nodes parkedNode := &v1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -86,9 +143,16 @@ func TestLimitNodesToPark_WithLimit(t *testing.T) { }, }, } - _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), parkedNode, metav1.CreateOptions{}) + _, err = fakeClient.CoreV1().Nodes().Create(context.Background(), parkedNode, metav1.CreateOptions{}) assert.NoError(t, err) + // Pass nodes in arbitrary order - they should be sorted by creation time + nodes := []NodeInfo{ + {Name: "node1"}, + {Name: "node2"}, + {Name: "node3"}, + } + logger := log.WithField("test", "TestLimitNodesToPark_WithLimit") result, err := LimitNodesToPark(context.Background(), fakeClient, nodes, cfg.MaxParkedNodes, cfg.UpgradeStatusLabel, logger) @@ -96,13 +160,14 @@ func TestLimitNodesToPark_WithLimit(t *testing.T) { 
assert.NoError(t, err) // Should only park 1 node (2 max - 1 already parked = 1 available slot) assert.Equal(t, 1, len(result)) - assert.Equal(t, "node1", result[0].Name) + // Should park node3 as it's the oldest + assert.Equal(t, "node3", result[0].Name) } func TestLimitNodesToPark_NoAvailableSlots(t *testing.T) { - // Test case: MaxParkedNodes = 2, 2 already parked, 3 eligible nodes + // Test case: MaxParkedNodes = "2", 2 already parked, 3 eligible nodes cfg := config.Config{ - MaxParkedNodes: 2, + MaxParkedNodes: "2", UpgradeStatusLabel: "test-upgrade-status", } @@ -147,9 +212,9 @@ func TestLimitNodesToPark_NoAvailableSlots(t *testing.T) { } func TestLimitNodesToPark_NegativeLimit(t *testing.T) { - // Test case: MaxParkedNodes = -1 (invalid, should be treated as 0) + // Test case: MaxParkedNodes = "-1" (invalid, should be treated as 0) cfg := config.Config{ - MaxParkedNodes: -1, + MaxParkedNodes: "-1", UpgradeStatusLabel: "test-upgrade-status", } @@ -171,6 +236,158 @@ func TestLimitNodesToPark_NegativeLimit(t *testing.T) { assert.Equal(t, nodes, result) } +func TestLimitNodesToPark_PercentageLimit(t *testing.T) { + // Test case: MaxParkedNodes = "20%", 10 total nodes, 1 already parked, 3 eligible nodes + // Expected: 20% of 10 = 2 nodes max, 1 already parked, so 1 more can be parked + cfg := config.Config{ + MaxParkedNodes: "20%", + UpgradeStatusLabel: "test-upgrade-status", + } + + nodes := []NodeInfo{ + {Name: "node1"}, + {Name: "node2"}, + {Name: "node3"}, + } + + // Create a fake k8s client + fakeClient := fake.NewSimpleClientset() + + // Add 10 total nodes to the cluster + for i := 1; i <= 10; i++ { + nodeName := fmt.Sprintf("cluster-node-%d", i) + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + } + _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), node, metav1.CreateOptions{}) + assert.NoError(t, err) + } + + // Add one parked node to simulate existing parked nodes + parkedNode := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "already-parked-node", + Labels: map[string]string{ + "test-upgrade-status": "parked", + }, + }, + } + _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), parkedNode, metav1.CreateOptions{}) + assert.NoError(t, err) + + logger := log.WithField("test", "TestLimitNodesToPark_PercentageLimit") + + result, err := LimitNodesToPark(context.Background(), fakeClient, nodes, cfg.MaxParkedNodes, cfg.UpgradeStatusLabel, logger) + + assert.NoError(t, err) + // 11 total nodes (10 cluster + 1 parked), 20% = 2.2 -> floor to 2 + // 1 already parked, so should park 1 more + assert.Equal(t, 1, len(result)) + assert.Equal(t, "node1", result[0].Name) +} + +func TestLimitNodesToPark_PercentageLimit_NoSlots(t *testing.T) { + // Test case: MaxParkedNodes = "10%", 10 total nodes (10% = 1), 1 already parked + // Expected: No slots available + cfg := config.Config{ + MaxParkedNodes: "10%", + UpgradeStatusLabel: "test-upgrade-status", + } + + nodes := []NodeInfo{ + {Name: "node1"}, + {Name: "node2"}, + } + + // Create a fake k8s client with 10 nodes + fakeClient := fake.NewSimpleClientset() + + for i := 1; i <= 9; i++ { + nodeName := fmt.Sprintf("cluster-node-%d", i) + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + } + _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), node, metav1.CreateOptions{}) + assert.NoError(t, err) + } + + // Add one parked node (total = 10) + parkedNode := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "already-parked-node", + Labels: 
map[string]string{ + "test-upgrade-status": "parked", + }, + }, + } + _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), parkedNode, metav1.CreateOptions{}) + assert.NoError(t, err) + + logger := log.WithField("test", "TestLimitNodesToPark_PercentageLimit_NoSlots") + + result, err := LimitNodesToPark(context.Background(), fakeClient, nodes, cfg.MaxParkedNodes, cfg.UpgradeStatusLabel, logger) + + assert.NoError(t, err) + // 10 total nodes, 10% = 1, 1 already parked, no slots available + assert.Equal(t, 0, len(result)) +} + +func TestLimitNodesToPark_SortingByAge(t *testing.T) { + // Test case: Verify nodes are sorted by creation time (oldest first) + cfg := config.Config{ + MaxParkedNodes: "2", + UpgradeStatusLabel: "test-upgrade-status", + } + + // Create a fake k8s client + fakeClient := fake.NewSimpleClientset() + + // Create 5 nodes with different creation times + baseTime := time.Now() + nodeCreationTimes := map[string]time.Time{ + "node-newest": baseTime.Add(-1 * time.Hour), + "node-older": baseTime.Add(-2 * time.Hour), + "node-middle": baseTime.Add(-3 * time.Hour), + "node-very-old": baseTime.Add(-5 * time.Hour), + "node-oldest-ever": baseTime.Add(-10 * time.Hour), // Should be parked first + } + + for name, createTime := range nodeCreationTimes { + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + CreationTimestamp: metav1.Time{Time: createTime}, + }, + } + _, err := fakeClient.CoreV1().Nodes().Create(context.Background(), node, metav1.CreateOptions{}) + assert.NoError(t, err) + } + + // Pass nodes in arbitrary order + nodes := []NodeInfo{ + {Name: "node-newest"}, + {Name: "node-middle"}, + {Name: "node-oldest-ever"}, + {Name: "node-older"}, + {Name: "node-very-old"}, + } + + logger := log.WithField("test", "TestLimitNodesToPark_SortingByAge") + + result, err := LimitNodesToPark(context.Background(), fakeClient, nodes, cfg.MaxParkedNodes, cfg.UpgradeStatusLabel, logger) + + assert.NoError(t, err) + // Should park 2 oldest nodes + assert.Equal(t, 2, len(result)) + // Should be sorted oldest first + assert.Equal(t, "node-oldest-ever", result[0].Name, "Oldest node should be first") + assert.Equal(t, "node-very-old", result[1].Name, "Second oldest node should be second") +} + func TestCountParkedNodes(t *testing.T) { // Test case: Count parked nodes upgradeStatusLabel := "test-upgrade-status"