Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions LICENSE-3rdparty.csv
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ core,github.com/aws/aws-sdk-go-v2/internal/configsources,Apache-2.0
core,github.com/aws/aws-sdk-go-v2/internal/endpoints/v2,Apache-2.0
core,github.com/aws/aws-sdk-go-v2/internal/ini,Apache-2.0
core,github.com/aws/aws-sdk-go-v2/internal/sync/singleflight,BSD-3-Clause
core,github.com/aws/aws-sdk-go-v2/service/autoscaling,Apache-2.0
core,github.com/aws/aws-sdk-go-v2/service/cloudformation,Apache-2.0
core,github.com/aws/aws-sdk-go-v2/service/ec2,Apache-2.0
core,github.com/aws/aws-sdk-go-v2/service/eks,Apache-2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
awssdk "github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/aws/arn"
"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/autoscaling"
"github.com/aws/aws-sdk-go-v2/service/cloudformation"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/eks"
Expand Down Expand Up @@ -38,6 +39,7 @@ import (
type Clients struct {
// AWS clients
Config awssdk.Config
Autoscaling *autoscaling.Client
CloudFormation *cloudformation.Client
EC2 *ec2.Client
EKS *eks.Client
Expand Down Expand Up @@ -101,6 +103,7 @@ func Build(ctx context.Context, configFlags *genericclioptions.ConfigFlags, k8sC

return &Clients{
Config: awsConfig,
Autoscaling: autoscaling.NewFromConfig(awsConfig),
CloudFormation: cloudformation.NewFromConfig(awsConfig),
EC2: ec2.NewFromConfig(awsConfig),
EKS: eks.NewFromConfig(awsConfig),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
package clusterinfo

import (
"context"
"fmt"
"log"
"maps"
"regexp"
"strings"
"time"

"github.com/aws/aws-sdk-go-v2/service/autoscaling"
astypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types"
"github.com/samber/lo"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)

const (
nodeListChunkSize = 100
// describeASGInstancesMaxIDs is the documented per-call limit of
// autoscaling:DescribeAutoScalingInstances. Sending more triggers a
// ValidationError at the API.
describeASGInstancesMaxIDs = 50
)

// awsProviderIDRegexp matches the AWS provider ID for EC2-backed nodes.
// Format: aws:///<az>/i-<hex>. Fargate nodes use a different shape and
// must therefore be classified by label before reaching this regex.
var awsProviderIDRegexp = regexp.MustCompile(`^aws:///[^/]+/(i-[0-9a-f]+)$`)

// AutoscalingDescriber is the subset of *autoscaling.Client used by Classify.
// Defined as an interface so tests can substitute a fake without spinning up
// AWS SDK middleware.
type AutoscalingDescriber interface {
DescribeAutoScalingInstances(ctx context.Context, in *autoscaling.DescribeAutoScalingInstancesInput, opts ...func(*autoscaling.Options)) (*autoscaling.DescribeAutoScalingInstancesOutput, error)
}

// Classify inspects every node in the cluster, groups them by management
// method, and returns the resulting snapshot.
func Classify(ctx context.Context, k8sClient kubernetes.Interface, asg AutoscalingDescriber, clusterName string) (*ClusterInfo, error) {
info := &ClusterInfo{
APIVersion: APIVersion,
ClusterName: clusterName,
GeneratedAt: time.Now().UTC(),
NodeManagement: map[NodeManager]map[string][]string{},
}

asgCandidates, err := classifyByLabels(ctx, k8sClient, info)
if err != nil {
return nil, err
}

if err = resolveASGs(ctx, asg, asgCandidates, info); err != nil {
return nil, err
}

info.ClusterAutoscaler, err = detectClusterAutoscaler(ctx, k8sClient)
if err != nil {
return nil, fmt.Errorf("failed to detect cluster-autoscaler: %w", err)
}

return info, nil
}

// asgCandidate is a node that needs an AWS API call to determine whether
// it's in an ASG (asg bucket) or not (standalone bucket).
type asgCandidate struct {
instanceID string
nodeName string
}

// classifyByLabels walks all nodes and applies the label-only branches of the
// decision tree (Fargate, Karpenter, EKS managed node group, unknown). Nodes
// with an AWS EC2 providerID that don't match any of the above are returned
// as ASG candidates for resolveASGs to bucket as asg or standalone.
func classifyByLabels(ctx context.Context, k8sClient kubernetes.Interface, info *ClusterInfo) ([]asgCandidate, error) {
var candidates []asgCandidate

var cont string
for {
list, err := k8sClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{
Limit: nodeListChunkSize,
Continue: cont,
})
if err != nil {
return nil, fmt.Errorf("failed to list nodes: %w", err)
}

for _, node := range list.Items {
if mgr, entity, ok := classifyNodeByLabel(&node); ok {
addToBucket(info, mgr, entity, node.Name)
continue
}

matches := awsProviderIDRegexp.FindStringSubmatch(node.Spec.ProviderID)
if len(matches) == 2 {
candidates = append(candidates, asgCandidate{
instanceID: matches[1],
nodeName: node.Name,
})
} else {
addToBucket(info, NodeManagerUnknown, node.Spec.ProviderID, node.Name)
}
}

cont = list.Continue
if cont == "" {
return candidates, nil
}
}
}

// classifyNodeByLabel applies steps 1-3 of the decision tree using only the
// Node labels and name. Returns false when the node needs an AWS API lookup
// or the unknown-bucket fallback.
func classifyNodeByLabel(node *corev1.Node) (NodeManager, string, bool) {
if node.Labels["eks.amazonaws.com/compute-type"] == "fargate" || strings.HasPrefix(node.Name, "fargate-ip-") {
return NodeManagerFargate, node.Labels["eks.amazonaws.com/fargate-profile"], true
}

if v := node.Labels["karpenter.sh/nodepool"]; v != "" {
return NodeManagerKarpenter, v, true
}
if v := node.Labels["karpenter.k8s.aws/ec2nodeclass"]; v != "" {
return NodeManagerKarpenter, v, true
}
// Legacy Karpenter v0.x (pre-NodePool) uses Provisioner instead.
if v := node.Labels["karpenter.sh/provisioner-name"]; v != "" {
return NodeManagerKarpenter, v, true
}

if v := node.Labels["eks.amazonaws.com/nodegroup"]; v != "" {
return NodeManagerEKSManagedNodeGroup, v, true
}

return "", "", false
}

// resolveASGs batches DescribeAutoScalingInstances calls (50 IDs per call,
// the documented limit) to map instance IDs to ASGs. Instances reported
// without an AutoScalingGroupName fall into the standalone bucket.
func resolveASGs(ctx context.Context, asg AutoscalingDescriber, candidates []asgCandidate, info *ClusterInfo) error {
byInstance := make(map[string]string, len(candidates))

for _, batch := range lo.Chunk(candidates, describeASGInstancesMaxIDs) {
ids := lo.Map(batch, func(c asgCandidate, _ int) string { return c.instanceID })
out, err := asg.DescribeAutoScalingInstances(ctx, &autoscaling.DescribeAutoScalingInstancesInput{
InstanceIds: ids,
})
if err != nil {
return fmt.Errorf("failed to describe autoscaling instances: %w", err)
}
maps.Copy(byInstance, lo.FilterSliceToMap(out.AutoScalingInstances, func(ai astypes.AutoScalingInstanceDetails) (string, string, bool) {
if ai.InstanceId == nil || ai.AutoScalingGroupName == nil {
return "", "", false
}
return *ai.InstanceId, *ai.AutoScalingGroupName, true
}))
}

for _, c := range candidates {
if name := byInstance[c.instanceID]; name != "" {
addToBucket(info, NodeManagerASG, name, c.nodeName)
} else {
addToBucket(info, NodeManagerStandalone, "", c.nodeName)
}
}
return nil
}

func addToBucket(info *ClusterInfo, mgr NodeManager, entity, nodeName string) {
bucket := info.NodeManagement[mgr]
if bucket == nil {
bucket = map[string][]string{}
info.NodeManagement[mgr] = bucket
}
bucket[entity] = append(bucket[entity], nodeName)
}

// detectClusterAutoscaler scans Deployments cluster-wide and returns the
// first match. A match is any Deployment with name "cluster-autoscaler", a
// well-known label, or a container image referencing "cluster-autoscaler".
// Multiple matches yield a warning but only the first is recorded.
func detectClusterAutoscaler(ctx context.Context, k8sClient kubernetes.Interface) (ClusterAutoscaler, error) {
list, err := k8sClient.AppsV1().Deployments(corev1.NamespaceAll).List(ctx, metav1.ListOptions{})
if err != nil {
return ClusterAutoscaler{}, err
}

matches := lo.Filter(list.Items, isClusterAutoscaler)
if len(matches) == 0 {
return ClusterAutoscaler{}, nil
}
if len(matches) > 1 {
log.Printf("Warning: %d Deployments match cluster-autoscaler heuristics; recording the first one (%s/%s).",
len(matches), matches[0].Namespace, matches[0].Name)
}
return ClusterAutoscaler{
Present: true,
Namespace: matches[0].Namespace,
Name: matches[0].Name,
Version: extractClusterAutoscalerVersion(matches[0]),
}, nil
}

// isClusterAutoscaler matches lo.Filter's predicate signature so it can be
// passed directly without a wrapper.
func isClusterAutoscaler(d appsv1.Deployment, _ int) bool {
// A Deployment scaled to zero is effectively disabled; ignoring it lets
// users who already stopped CA (per the Karpenter migration guide) get
// `Present: false` in the snapshot. A nil Replicas defaults to 1 per the
// Kubernetes API, so it counts as active.
if d.Spec.Replicas != nil && *d.Spec.Replicas == 0 {
return false
}
if d.Name == "cluster-autoscaler" ||
d.Labels["app.kubernetes.io/name"] == "cluster-autoscaler" ||
d.Labels["k8s-app"] == "cluster-autoscaler" {
return true
}
return lo.SomeBy(d.Spec.Template.Spec.Containers, func(c corev1.Container) bool {
return strings.Contains(c.Image, "cluster-autoscaler")
})
}

// extractClusterAutoscalerVersion returns the running cluster-autoscaler
// version. Prefers the image tag of the matching container (the source of
// truth) and falls back to the `app.kubernetes.io/version` label on the
// Deployment or its pod template (set by most Helm charts). Empty when
// neither is available — e.g. an image referenced by digest only and no
// version label.
func extractClusterAutoscalerVersion(d appsv1.Deployment) string {
for _, c := range d.Spec.Template.Spec.Containers {
if !strings.Contains(c.Image, "cluster-autoscaler") {
continue
}
if tag := imageTag(c.Image); tag != "" {
return tag
}
}
if v := d.Labels["app.kubernetes.io/version"]; v != "" {
return v
}
return d.Spec.Template.Labels["app.kubernetes.io/version"]
}

// imageTag extracts the tag portion of an OCI image reference, stripping
// any `@sha256:...` digest. Returns empty when no tag is set (for instance,
// digest-only references or bare image names).
func imageTag(image string) string {
if i := strings.Index(image, "@"); i >= 0 {
image = image[:i]
}
// The last colon is the tag separator only if it is not followed by a
// path component — otherwise it's a registry port (e.g. `localhost:5000/foo`).
if i := strings.LastIndex(image, ":"); i >= 0 && !strings.Contains(image[i+1:], "/") {
return image[i+1:]
}
return ""
}
Loading
Loading