Commit 2693c3e

Merge branch 'main' into feature/support-tz-for-disruption-budgets-schedule
2 parents 1e24532 + 740331f commit 2693c3e

70 files changed (+2647 / -504 lines)


.github/actions/install-deps/action.yaml

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ runs:
   # Root path permission workaround for caching https://github.com/actions/cache/issues/845#issuecomment-1252594999
   - run: sudo chown "$USER" /usr/local
     shell: bash
-  - uses: actions/cache@0c907a75c2c80ebcb7f088228285e798b750cf8f # v4.2.1
+  - uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
     id: cache-toolchain
     with:
       path: |

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ coverage.html
 *.heapprofile
 go.work
 go.work.sum
+report.json

 # Common in OSs and IDEs
 .idea

OWNERS_ALIASES

Lines changed: 3 additions & 3 deletions
@@ -9,10 +9,10 @@ aliases:
     - ellistarn
     - jonathan-innis
     - tzneal
-    - bwagner5
     - njtran
+    - jmdeal
   karpenter-reviewers:
-    - jackfrancis
     - tallaxes
     - engedaam
-    - jmdeal
+  emeritus-karpenter-maintainers:
+    - bwagner5

contributing-guidelines.md

Lines changed: 6 additions & 2 deletions
@@ -4,7 +4,7 @@ This document’s goal is to define clear, scalable, and transparent criteria to

 Ultimately, the criteria in this doc are aspirational. No set of written requirements can encapsulate the full criteria when determining if someone meets the bar to be a reviewer or approver, as some of the criteria are subjective and rely on the trust that each nominee has established with the community. To help guide readers, this document outlines ways to demonstrate expertise of the code base, sound judgement on decision tradeoffs, end user advocacy, care for community, and ability to work as a distributed team.

-Much of this document uses the [SIG-Node Contributor Ladder](https://github.com/kubernetes/community/blob/master/sig-node/sig-node-contributor-ladder.md) as prior art. The goal is to mold these requirements to fit the Karpenter’s community. These requirements also lean on the established Kubernetes[membership documentation](https://github.com/kubernetes/community/blob/master/community-membership.md) for terminology.
+Much of this document uses the [SIG-Node Contributor Ladder](https://github.com/kubernetes/community/blob/master/sig-node/sig-node-contributor-ladder.md) as prior art. The goal is to mold these requirements to fit the Karpenter community. These requirements also lean on the established Kubernetes [membership documentation](https://github.com/kubernetes/community/blob/master/community-membership.md) for terminology.

 As a final precursor, to become a reviewer or approver, users must nominate themselves. They are responsible for cutting a PR to the upstream repository, providing evidence in line with the suggested requirements. Users should feel free to reach out to an existing approver to understand how they land with respect to the criteria. The following sections are guiding criteria and guidelines, where the final decision lies with the maintainers.

@@ -92,4 +92,8 @@ In addition to the formal requirements for the [approver role](https://github.co
 * Have approval rights in a well-known cloud provider implementation of Karpenter or in an adjacent SIG Autoscaling sub-project.
 * Be a primary PR reviewer for numerous PRs in multiple areas listed as a requirement for a reviewer.
 * Actively triage issues and PRs, provide support to contributors to drive their PRs to completion.
-* Be present, and participate in Karpenter meetings by speaking about features or improvements driven, or find some other way to prove the identity behind GitHub handle.
+* Be present and participate in Karpenter meetings by speaking about features or improvements driven, or find some other way to prove the identity behind the GitHub handle.
+
+### Cleanup and Emeritus
+
+The `kubernetes-sigs/karpenter` sub-project abides by the same cleanup process prescribed in https://github.com/kubernetes/community/blob/master/contributors/guide/owners.md#cleanup. It is generally recommended that reviewers or approvers who know that they are no longer going to be actively involved _remove themselves_ from the OWNERS_ALIASES file; however, approvers may also initiate a PR and reach out to the relevant reviewer/approver if they recognize that the user is no longer actively involved in the project (as defined in the community contributors doc linked above).

designs/capacity-reservations.md

Lines changed: 3 additions & 3 deletions
@@ -22,13 +22,13 @@ Karpenter doesn't currently support reasoning about this capacity type. Karpente
 3. Karpenter should add logic to its scheduler to reason about this availability as an `int` -- ensuring that the scheduler never schedules more offerings of an instance type for a capacity type than are available
 4. Karpenter should extend its CloudProvider [InstanceType](https://github.com/kubernetes-sigs/karpenter/blob/35d6197e38e64cd6abfef71a082aee80e38d09fd/pkg/cloudprovider/types.go#L75) struct to allow offerings to represent availability of an offering as an `int` rather than a `bool` -- allowing Cloud Providers to represent the constrained capacity of `reserved`
 5. Karpenter should consolidate between `on-demand` and/or `spot` instance types to `reserved` when the capacity type is available
-6. Karpenter should introduce a feature flag `FEATURE_FLAG=CapacityReservations` to gate this new feature in `ALPHA` when it's introduced
+6. Karpenter should introduce a feature flag `FEATURE_FLAG=ReservedCapacity` to gate this new feature in `ALPHA` when it's introduced

 ### `karpenter.sh/capacity-type` API

 _Note: Some excerpts taken from [`aws/karpenter-provider-aws` RFC](https://github.com/aws/karpenter-provider-aws/blob/main/designs/odcr.md#nodepool-api)._

-This RFC proposes the addition of a new `karpenter.sh/capacity-type` label value, called `reserved`. A cluster admin could then select to support only launching reserved node capacity and falling back between reserved capacity to on-demand (or even spot) capacity respectively.
+This RFC proposes the addition of a new `karpenter.sh/capacity-type` label value, called `reserved`. A cluster admin could then select to support only launching reserved node capacity and falling back between reserved capacity to on-demand (or even spot) capacity respectively.

 _Note: This option requires any applications (pods) that are using node selection on `karpenter.sh/capacity-type: "on-demand"` to expand their selection to include `reserved` or to update it to perform a `NotIn` node affinity on `karpenter.sh/capacity-type: spot`_

@@ -140,4 +140,4 @@ In practice, this means that if a user has two capacity reservation offerings av

 ## Appendix

-1. AWS Cloud Provider's RFC for On-Demand Capacity Reservations: https://github.com/aws/karpenter-provider-aws/blob/main/designs/odcr.md
+1. AWS Cloud Provider's RFC for On-Demand Capacity Reservations: https://github.com/aws/karpenter-provider-aws/blob/main/designs/odcr.md
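The offering-availability idea in items 3-4 of the first hunk above can be illustrated with a small Go sketch. The `offering` type, its field names, and the `schedulable` helper are assumptions for illustration only; they do not mirror Karpenter's actual `cloudprovider` types.

```go
package main

import "fmt"

// offering is a hypothetical, simplified stand-in for a cloud provider offering.
// Instead of a boolean "available", it carries an integer count so a scheduler
// can reason about how many reserved instances remain.
type offering struct {
	CapacityType string // e.g. "spot", "on-demand", "reserved"
	Zone         string
	Available    int // remaining capacity; effectively unbounded for non-reserved types
}

// schedulable reports whether another node can be launched against this offering,
// given how many nodes the scheduler has already planned against it in this pass.
func schedulable(o offering, alreadyPlanned int) bool {
	return alreadyPlanned < o.Available
}

func main() {
	reserved := offering{CapacityType: "reserved", Zone: "us-west-2a", Available: 2}
	for planned := 0; planned < 4; planned++ {
		fmt.Printf("planned=%d -> can launch another: %v\n", planned, schedulable(reserved, planned))
	}
}
```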
Lines changed: 94 additions & 0 deletions (new file)

# RFC: NodeRegistrationHealthy Status Condition on NodePool

## Motivation

Karpenter may initiate the creation of nodes based on a NodePool configuration, but these nodes might fail to join the cluster due to unforeseen registration issues that Karpenter cannot anticipate or prevent. One example is network connectivity being impeded by an incorrect cluster security group configuration, such as a missing outbound rule that allows outbound access to any IPv4 address. In such cases, Karpenter will continue its attempts to provision compute resources, but these resources will fail to join the cluster until the security group's outbound rule is updated. Currently, there isn't a way to surface these kinds of failures to users.

This RFC proposes enhancing the visibility of these failure modes by introducing a `NodeRegistrationHealthy` status condition on the NodePool. We can then create new metrics around this status condition, which will improve observability by alerting cluster administrators to potential issues within a NodePool that require investigation and resolution.

The `NodeRegistrationHealthy` status would specifically highlight instance launch/registration failures that Karpenter cannot fully diagnose or predict. However, this status should not be a mechanism to catch all types of launch/registration failures. Karpenter should not use `NodeRegistrationHealthy` for failures it can definitively determine, based on the NodePool/NodeClass configurations or through a dry run, will occur. For instance, if a NodePool is restricted to a specific zone using the `topology.kubernetes.io/zone` label, but the specified zone is not accessible through the provided subnet configurations, this inconsistency shouldn't trigger a `NodeRegistrationHealthy: False` status. CloudProviders should also try to implement deterministic mechanisms to identify such launch failures, like the [validation controller](https://github.com/aws/karpenter-provider-aws/blob/main/pkg/controllers/nodeclass/validation.go) added for the AWS provider.

Currently, while launch and registration processes have defined timeouts, the initialization phase does not. As a result, there is no concept of initialization failures today. However, the proposed design can be extended to incorporate initialization failure detection in future iterations.

## 🔑 Introduce a NodeRegistrationHealthy Status Condition on the NodePool Status

```
// 'NodeRegistrationHealthy' condition indicates if a misconfiguration exists that prevents the normal, successful use of a Karpenter resource
Status:
  Conditions:
    Last Transition Time:  2025-01-13T18:57:20Z
    Message:
    Observed Generation:   1
    Reason:                NodeRegistrationHealthy
    Status:                True
    Type:                  NodeRegistrationHealthy
```

The `NodeRegistrationHealthy` status condition is introduced in the NodePool status and can be set to:
1. Unknown - When the NodePool is first created, `NodeRegistrationHealthy` is set to Unknown. This means that we don't have enough data to tell whether nodes launched using this NodePool can successfully register or not.
2. False - The NodePool has configuration issues that require user investigation and resolution. Since Karpenter cannot automatically detect these specific launch or registration failures, we will document common failure scenarios and possible fixes in our troubleshooting guide to assist users. The cause of the failure will also be surfaced through the status condition's reason and message fields.
3. True - There has been a successful node registration using this unique combination of NodePool and NodeClass spec.

A NodePool marked with `NodeRegistrationHealthy: False` can still be used for provisioning workloads, as this status isn't a precondition for readiness. We can expand this in a follow-up that adds a cooldown period before trying to schedule with a NodePool that has `NodeRegistrationHealthy: False`.

## Goals
1. Tolerate transient errors.
2. Respond to corrections in external configuration (i.e. remove the `NodeRegistrationHealthy` status condition from a NodePool if an external fix allows nodes to register).

### Option 1: Track if a node was successfully launched using a NodePool - Recommended
This option sets the `NodeRegistrationHealthy` status condition on the NodePool by checking whether a node launched using this NodePool failed to launch/register (a minimal sketch follows the list below).

![](./images/noderegistrationhealthy-nodepools2.png)

Evaluation conditions -

1. When a NodePool is first created, set `NodeRegistrationHealthy: Unknown`. A NodePool which already has `NodeRegistrationHealthy: True` will not go back to Unknown unless there is an update to the NodePool or the referenced NodeClass.
2. On a failed launch/registration, set `NodeRegistrationHealthy: False`. Do not update the status condition to False if the NodePool already has `NodeRegistrationHealthy: True`.
3. On successful registration, set `NodeRegistrationHealthy: True`.
4. Do not update the `NodeRegistrationHealthy` status condition when Karpenter restarts.

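A minimal Go sketch of the Option 1 transitions, using `meta.SetStatusCondition` from apimachinery. The `registrationOutcome` type and `updateCondition` helper are illustrative assumptions, not Karpenter's actual controller code; resetting to Unknown on NodePool/NodeClass updates (rule 1) would be handled elsewhere.

```go
package nodepoolhealth

import (
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const conditionType = "NodeRegistrationHealthy"

// registrationOutcome is a hypothetical summary of a single launch/registration attempt.
type registrationOutcome struct {
	Succeeded bool
	Reason    string
	Message   string
}

// updateCondition applies the Option 1 rules to a NodePool's conditions in place:
// a success always sets True; a failure sets False only if the condition is not already True.
func updateCondition(conditions *[]metav1.Condition, outcome registrationOutcome, generation int64) {
	current := meta.FindStatusCondition(*conditions, conditionType)
	switch {
	case outcome.Succeeded:
		meta.SetStatusCondition(conditions, metav1.Condition{
			Type:               conditionType,
			Status:             metav1.ConditionTrue,
			Reason:             "NodeRegistrationHealthy",
			ObservedGeneration: generation,
		})
	case current != nil && current.Status == metav1.ConditionTrue:
		// Rule 2: a NodePool that has already registered a node successfully
		// is not flipped back to False by later failures.
	default:
		meta.SetStatusCondition(conditions, metav1.Condition{
			Type:               conditionType,
			Status:             metav1.ConditionFalse,
			Reason:             outcome.Reason,
			Message:            outcome.Message,
			ObservedGeneration: generation,
		})
	}
}
```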
#### Considerations

1. 👍 This approach is particularly helpful for pod binding/ready metrics for pods that are scheduled against NodePools that have `NodeRegistrationHealthy: True`. Metrics collection will begin only after the NodePool's first successful node registration. Once `NodeRegistrationHealthy: True`, if a node is then launched with a bad AMI, we will still see pod metrics that can help identify node launch failures due to bad AMIs.

### Option 2: In-memory Buffer to store history

This option maintains an in-memory FIFO buffer, which will grow to a maximum size of 10 (this can be changed later). The buffer stores the success or failure of each launch/registration and is evaluated by a controller to determine the relative health of the NodePool. It would be implemented as a `[]bool`, where `true` indicates a launch success and `false` represents a failure. The state of the `NodeRegistrationHealthy` condition would be based on the number of `false` entries in the buffer.

![](./images/noderegistrationhealthy-nodepools1.png)

Evaluation conditions -

1. We start with an empty buffer and `NodeRegistrationHealthy: Unknown`.
2. There have to be at least 2 failures in the buffer for `NodeRegistrationHealthy` to transition to `False`.
3. If the buffer starts with a success then `NodeRegistrationHealthy: True`.
4. If Karpenter restarts, we flush the buffer but don't change the existing state of the `NodeRegistrationHealthy` status condition.
5. If there is an update to a NodePool/NodeClass, flush the buffer and set `NodeRegistrationHealthy: Unknown`.
6. Since the buffer is FIFO, we remove the oldest launch result when the maximum buffer size is reached.
7. If no new launches/registrations happen for 2 consecutive registration TTL cycles (currently 15 minutes each), expire/remove the oldest entry in the buffer and re-evaluate the status condition if it was previously set to False. This ensures that a NodePool's unhealthy status does not persist indefinitely when no new launch attempts have occurred within 30 minutes.

See below for example evaluations (a sketch of this evaluation logic follows the example block):

```
Successful Launch: true
Unsuccessful Launch: false

[] = 'NodeRegistrationHealthy: Unknown'
[true] = 'NodeRegistrationHealthy: True'
[false, true] = 'NodeRegistrationHealthy: True'
[false, false] = 'NodeRegistrationHealthy: False'
[false, true, false] = 'NodeRegistrationHealthy: False'
[false, true, true, true, true, true, true, true, true, true] = 'NodeRegistrationHealthy: True'
```

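A self-contained Go sketch of one way the Option 2 buffer and its evaluation could work. The `buffer` type and the tie-breaking rules (two or more failures wins over any success) are inferred from the examples above and are assumptions rather than a settled design.

```go
package main

import "fmt"

const maxEntries = 10 // proposed max buffer size; could be tuned later

// buffer is a FIFO history of launch/registration results: true = success, false = failure.
type buffer struct {
	results []bool
}

// record appends a result, evicting the oldest entry once the buffer is full.
func (b *buffer) record(success bool) {
	if len(b.results) == maxEntries {
		b.results = b.results[1:]
	}
	b.results = append(b.results, success)
}

// evaluate maps the buffer contents to a condition status, matching the examples:
// two or more failures -> "False"; otherwise any success -> "True"; otherwise "Unknown".
func (b *buffer) evaluate() string {
	failures, successes := 0, 0
	for _, ok := range b.results {
		if ok {
			successes++
		} else {
			failures++
		}
	}
	switch {
	case failures >= 2:
		return "False"
	case successes > 0:
		return "True"
	default:
		return "Unknown"
	}
}

func main() {
	b := &buffer{}
	fmt.Println(b.evaluate()) // Unknown
	b.record(false)
	b.record(true)
	fmt.Println(b.evaluate()) // True
	b.record(false)
	fmt.Println(b.evaluate()) // False
}
```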
#### Considerations

1. 👍 Tolerates transient failures, such as those caused by underlying hardware failure, because we keep track of recent launch history and set `NodeRegistrationHealthy: False` only when there are 2 or more launch/registration failures.
2. 👍 Can be easily extended if we want to adjust the buffer size depending on the cluster size.
3. 👎 This approach tends to get more complex as we need to scale with the cluster size.
4. 👎 Managing buffer entry expiration adds another layer of complexity. We need to ensure that a NodePool's unhealthy status doesn't persist indefinitely when no new launch attempts have occurred within a specified timeframe.

### How Does this Improve Observability?
The introduction of the `NodeRegistrationHealthy` status condition serves two key purposes:

1. It provides users with clearer visibility into NodeClaim launch and registration failures (one possible metric is sketched below).
2. It establishes groundwork for future scheduling optimizations, where we can assign lower priority to NodePools marked with `NodeRegistrationHealthy: False`.
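One way the proposed condition could back a metric, sketched with `prometheus/client_golang` (already a dependency per the go.mod diff below). The metric name, labels, and registration on the default registry are assumptions for illustration, not metrics the project has committed to.

```go
package metrics

import "github.com/prometheus/client_golang/prometheus"

// nodeRegistrationHealthy is a hypothetical gauge: 1 when a NodePool's
// NodeRegistrationHealthy condition is True, 0 when it is False or Unknown.
var nodeRegistrationHealthy = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: "karpenter",
		Subsystem: "nodepools",
		Name:      "node_registration_healthy",
		Help:      "Whether the NodePool's NodeRegistrationHealthy status condition is True.",
	},
	[]string{"nodepool"},
)

func init() {
	// Registering on the default registry; Karpenter's actual metrics wiring may differ.
	prometheus.MustRegister(nodeRegistrationHealthy)
}

// Record updates the gauge for a NodePool from its current condition status.
func Record(nodePoolName string, conditionTrue bool) {
	value := 0.0
	if conditionTrue {
		value = 1.0
	}
	nodeRegistrationHealthy.WithLabelValues(nodePoolName).Set(value)
}
```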

go.mod

Lines changed: 8 additions & 8 deletions
@@ -6,21 +6,21 @@ require (
   github.com/Pallinder/go-randomdata v1.2.0
   github.com/avast/retry-go v3.0.0+incompatible
   github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115
-  github.com/docker/docker v28.0.0+incompatible
+  github.com/docker/docker v28.0.1+incompatible
   github.com/go-logr/logr v1.4.2
   github.com/imdario/mergo v0.3.16
   github.com/klauspost/compress v1.17.11 // indirect
   github.com/mitchellh/hashstructure/v2 v2.0.2
-  github.com/onsi/ginkgo/v2 v2.22.2
+  github.com/onsi/ginkgo/v2 v2.23.0
   github.com/onsi/gomega v1.36.2
   github.com/patrickmn/go-cache v2.1.0+incompatible
-  github.com/prometheus/client_golang v1.21.0
+  github.com/prometheus/client_golang v1.21.1
   github.com/prometheus/client_model v0.6.1
   github.com/samber/lo v1.49.1
   go.uber.org/multierr v1.11.0
   go.uber.org/zap v1.27.0
-  golang.org/x/text v0.22.0
-  golang.org/x/time v0.10.0
+  golang.org/x/text v0.23.0
+  golang.org/x/time v0.11.0
   k8s.io/api v0.32.2
   k8s.io/apiextensions-apiserver v0.32.2
   k8s.io/apimachinery v0.32.2
@@ -30,7 +30,7 @@ require (
   k8s.io/csi-translation-lib v0.32.2
   k8s.io/klog/v2 v2.130.1
   k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738
-  sigs.k8s.io/controller-runtime v0.20.2
+  sigs.k8s.io/controller-runtime v0.20.3
 )

 require (
@@ -66,7 +66,7 @@ require (
   github.com/spf13/cobra v1.8.1 // indirect
   github.com/spf13/pflag v1.0.6 // indirect
   github.com/x448/float16 v0.8.4 // indirect
-  golang.org/x/net v0.35.0 // indirect
+  golang.org/x/net v0.36.0 // indirect
   golang.org/x/oauth2 v0.24.0 // indirect
   golang.org/x/sys v0.30.0 // indirect
   golang.org/x/term v0.29.0 // indirect
@@ -86,7 +86,7 @@ require (
   github.com/fsnotify/fsnotify v1.7.0 // indirect
   github.com/google/btree v1.1.3 // indirect
   github.com/rogpeppe/go-internal v1.13.1 // indirect
-  golang.org/x/sync v0.11.0 // indirect
+  golang.org/x/sync v0.12.0 // indirect
 )

 retract (

0 commit comments
