From db319403138700d24a0fb13219ca9352136b7901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Mon, 27 Apr 2026 17:49:50 +0200 Subject: [PATCH 1/4] [CASCL-1304] Snapshot node managers in ConfigMap during install Once Karpenter is installed on an existing EKS cluster, the user must migrate workloads off the existing nodes. The right migration action depends on how each node is currently managed (Fargate / Karpenter / EKS managed node group / EC2 ASG / standalone) and whether a legacy cluster-autoscaler is still running. Capture that topology in a `dd-cluster-info` ConfigMap so the follow-up migration step has a ground truth to drive from. Errors are logged as a warning and do not fail `install`: the snapshot is informational, and Karpenter is already up by the time it runs. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cluster/common/clients/clients.go | 3 + .../cluster/common/clusterinfo/classify.go | 226 ++++++++++++++++ .../common/clusterinfo/classify_test.go | 254 ++++++++++++++++++ .../cluster/common/clusterinfo/persist.go | 41 +++ .../common/clusterinfo/persist_test.go | 70 +++++ .../cluster/common/clusterinfo/types.go | 47 ++++ .../autoscaling/cluster/install/install.go | 20 ++ go.mod | 9 +- go.sum | 18 +- 9 files changed, 676 insertions(+), 12 deletions(-) create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify_test.go create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/persist.go create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/persist_test.go create mode 100644 cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/types.go diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clients/clients.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clients/clients.go index b2c53ed80f..bfd11280c2 100644 --- 
a/cmd/kubectl-datadog/autoscaling/cluster/common/clients/clients.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clients/clients.go @@ -11,6 +11,7 @@ import ( awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/aws/arn" "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/autoscaling" "github.com/aws/aws-sdk-go-v2/service/cloudformation" "github.com/aws/aws-sdk-go-v2/service/ec2" "github.com/aws/aws-sdk-go-v2/service/eks" @@ -38,6 +39,7 @@ import ( type Clients struct { // AWS clients Config awssdk.Config + Autoscaling *autoscaling.Client CloudFormation *cloudformation.Client EC2 *ec2.Client EKS *eks.Client @@ -101,6 +103,7 @@ func Build(ctx context.Context, configFlags *genericclioptions.ConfigFlags, k8sC return &Clients{ Config: awsConfig, + Autoscaling: autoscaling.NewFromConfig(awsConfig), CloudFormation: cloudformation.NewFromConfig(awsConfig), EC2: ec2.NewFromConfig(awsConfig), EKS: eks.NewFromConfig(awsConfig), diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go new file mode 100644 index 0000000000..d08cb7272e --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go @@ -0,0 +1,226 @@ +package clusterinfo + +import ( + "context" + "fmt" + "log" + "maps" + "regexp" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/service/autoscaling" + astypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" + "github.com/samber/lo" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +const ( + nodeListChunkSize = 100 + // describeASGInstancesMaxIDs is the documented per-call limit of + // autoscaling:DescribeAutoScalingInstances. Sending more triggers a + // ValidationError at the API. 
+ describeASGInstancesMaxIDs = 50 +) + +// awsProviderIDRegexp matches the AWS provider ID for EC2-backed nodes. +// Format: aws:///<availability-zone>/i-<hex-instance-id>. Fargate nodes use a different shape and +// must therefore be classified by label before reaching this regex. +var awsProviderIDRegexp = regexp.MustCompile(`^aws:///[^/]+/(i-[0-9a-f]+)$`) + +// AutoscalingDescriber is the subset of *autoscaling.Client used by Classify. +// Defined as an interface so tests can substitute a fake without spinning up +// AWS SDK middleware. +type AutoscalingDescriber interface { + DescribeAutoScalingInstances(ctx context.Context, in *autoscaling.DescribeAutoScalingInstancesInput, opts ...func(*autoscaling.Options)) (*autoscaling.DescribeAutoScalingInstancesOutput, error) +} + +// Classify inspects every node in the cluster, groups them by management +// method, and returns the resulting snapshot. +func Classify(ctx context.Context, k8sClient kubernetes.Interface, asg AutoscalingDescriber, clusterName string) (*ClusterInfo, error) { + info := &ClusterInfo{ + APIVersion: APIVersion, + ClusterName: clusterName, + GeneratedAt: time.Now().UTC(), + NodeManagement: map[NodeManager]map[string][]string{}, + } + + asgCandidates, err := classifyByLabels(ctx, k8sClient, info) + if err != nil { + return nil, err + } + + if err = resolveASGs(ctx, asg, asgCandidates, info); err != nil { + return nil, err + } + + info.ClusterAutoscaler, err = detectClusterAutoscaler(ctx, k8sClient) + if err != nil { + return nil, fmt.Errorf("failed to detect cluster-autoscaler: %w", err) + } + + return info, nil +} + +// asgCandidate is a node that needs an AWS API call to determine whether +// it's in an ASG (asg bucket) or not (standalone bucket). +type asgCandidate struct { + instanceID string + nodeName string +} + +// classifyByLabels walks all nodes and applies the label-only branches of the +// decision tree (Fargate, Karpenter, EKS managed node group, unknown).
Nodes +// with an AWS EC2 providerID that don't match any of the above are returned +// as ASG candidates for resolveASGs to bucket as asg or standalone. +func classifyByLabels(ctx context.Context, k8sClient kubernetes.Interface, info *ClusterInfo) ([]asgCandidate, error) { + var candidates []asgCandidate + + var cont string + for { + list, err := k8sClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{ + Limit: nodeListChunkSize, + Continue: cont, + }) + if err != nil { + return nil, fmt.Errorf("failed to list nodes: %w", err) + } + + for _, node := range list.Items { + if mgr, entity, ok := classifyNodeByLabel(&node); ok { + addToBucket(info, mgr, entity, node.Name) + continue + } + + matches := awsProviderIDRegexp.FindStringSubmatch(node.Spec.ProviderID) + if len(matches) == 2 { + candidates = append(candidates, asgCandidate{ + instanceID: matches[1], + nodeName: node.Name, + }) + } else { + addToBucket(info, NodeManagerUnknown, node.Spec.ProviderID, node.Name) + } + } + + cont = list.Continue + if cont == "" { + return candidates, nil + } + } +} + +// classifyNodeByLabel applies steps 1-3 of the decision tree using only the +// Node labels and name. Returns false when the node needs an AWS API lookup +// or the unknown-bucket fallback. +func classifyNodeByLabel(node *corev1.Node) (NodeManager, string, bool) { + if node.Labels["eks.amazonaws.com/compute-type"] == "fargate" || strings.HasPrefix(node.Name, "fargate-ip-") { + return NodeManagerFargate, node.Labels["eks.amazonaws.com/fargate-profile"], true + } + + if v := node.Labels["karpenter.sh/nodepool"]; v != "" { + return NodeManagerKarpenter, v, true + } + if v := node.Labels["karpenter.k8s.aws/ec2nodeclass"]; v != "" { + return NodeManagerKarpenter, v, true + } + // Legacy Karpenter v0.x (pre-NodePool) uses Provisioner instead. 
+ if v := node.Labels["karpenter.sh/provisioner-name"]; v != "" { + return NodeManagerKarpenter, v, true + } + + if v := node.Labels["eks.amazonaws.com/nodegroup"]; v != "" { + return NodeManagerEKSManagedNodeGroup, v, true + } + + return "", "", false +} + +// resolveASGs batches DescribeAutoScalingInstances calls (50 IDs per call, +// the documented limit) to map instance IDs to ASGs. Instances reported +// without an AutoScalingGroupName fall into the standalone bucket. +func resolveASGs(ctx context.Context, asg AutoscalingDescriber, candidates []asgCandidate, info *ClusterInfo) error { + byInstance := make(map[string]string, len(candidates)) + + for _, batch := range lo.Chunk(candidates, describeASGInstancesMaxIDs) { + ids := lo.Map(batch, func(c asgCandidate, _ int) string { return c.instanceID }) + out, err := asg.DescribeAutoScalingInstances(ctx, &autoscaling.DescribeAutoScalingInstancesInput{ + InstanceIds: ids, + }) + if err != nil { + return fmt.Errorf("failed to describe autoscaling instances: %w", err) + } + maps.Copy(byInstance, lo.FilterSliceToMap(out.AutoScalingInstances, func(ai astypes.AutoScalingInstanceDetails) (string, string, bool) { + if ai.InstanceId == nil || ai.AutoScalingGroupName == nil { + return "", "", false + } + return *ai.InstanceId, *ai.AutoScalingGroupName, true + })) + } + + for _, c := range candidates { + if name := byInstance[c.instanceID]; name != "" { + addToBucket(info, NodeManagerASG, name, c.nodeName) + } else { + addToBucket(info, NodeManagerStandalone, "", c.nodeName) + } + } + return nil +} + +func addToBucket(info *ClusterInfo, mgr NodeManager, entity, nodeName string) { + bucket := info.NodeManagement[mgr] + if bucket == nil { + bucket = map[string][]string{} + info.NodeManagement[mgr] = bucket + } + bucket[entity] = append(bucket[entity], nodeName) +} + +// detectClusterAutoscaler scans Deployments cluster-wide and returns the +// first match. 
A match is any Deployment with name "cluster-autoscaler", a +// well-known label, or a container image referencing "cluster-autoscaler". +// Multiple matches yield a warning but only the first is recorded. +func detectClusterAutoscaler(ctx context.Context, k8sClient kubernetes.Interface) (ClusterAutoscaler, error) { + list, err := k8sClient.AppsV1().Deployments(corev1.NamespaceAll).List(ctx, metav1.ListOptions{}) + if err != nil { + return ClusterAutoscaler{}, err + } + + matches := lo.Filter(list.Items, isClusterAutoscaler) + if len(matches) == 0 { + return ClusterAutoscaler{}, nil + } + if len(matches) > 1 { + log.Printf("Warning: %d Deployments match cluster-autoscaler heuristics; recording the first one (%s/%s).", + len(matches), matches[0].Namespace, matches[0].Name) + } + return ClusterAutoscaler{ + Present: true, + Namespace: matches[0].Namespace, + Name: matches[0].Name, + }, nil +} + +// isClusterAutoscaler matches lo.Filter's predicate signature so it can be +// passed directly without a wrapper. +func isClusterAutoscaler(d appsv1.Deployment, _ int) bool { + // A Deployment scaled to zero is effectively disabled; ignoring it lets + // users who already stopped CA (per the Karpenter migration guide) get + // `Present: false` in the snapshot. A nil Replicas defaults to 1 per the + // Kubernetes API, so it counts as active. 
+ if d.Spec.Replicas != nil && *d.Spec.Replicas == 0 { + return false + } + if d.Name == "cluster-autoscaler" || + d.Labels["app.kubernetes.io/name"] == "cluster-autoscaler" || + d.Labels["k8s-app"] == "cluster-autoscaler" { + return true + } + return lo.SomeBy(d.Spec.Template.Spec.Containers, func(c corev1.Container) bool { + return strings.Contains(c.Image, "cluster-autoscaler") + }) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify_test.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify_test.go new file mode 100644 index 0000000000..e5018cef18 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify_test.go @@ -0,0 +1,254 @@ +package clusterinfo + +import ( + "context" + "fmt" + "testing" + + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/autoscaling" + astypes "github.com/aws/aws-sdk-go-v2/service/autoscaling/types" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" +) + +// fakeASG implements AutoscalingDescriber, returning a static instance->ASG +// map and recording the inputs of every call so tests can assert batching. 
+type fakeASG struct { + instances map[string]string + calls []*autoscaling.DescribeAutoScalingInstancesInput + err error +} + +func (f *fakeASG) DescribeAutoScalingInstances(_ context.Context, in *autoscaling.DescribeAutoScalingInstancesInput, _ ...func(*autoscaling.Options)) (*autoscaling.DescribeAutoScalingInstancesOutput, error) { + f.calls = append(f.calls, in) + if f.err != nil { + return nil, f.err + } + out := &autoscaling.DescribeAutoScalingInstancesOutput{} + for _, id := range in.InstanceIds { + if asgName, ok := f.instances[id]; ok { + out.AutoScalingInstances = append(out.AutoScalingInstances, astypes.AutoScalingInstanceDetails{ + InstanceId: awssdk.String(id), + AutoScalingGroupName: awssdk.String(asgName), + }) + } + } + return out, nil +} + +func node(name string, providerID string, labels map[string]string) *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: name, Labels: labels}, + Spec: corev1.NodeSpec{ProviderID: providerID}, + } +} + +func deploymentWith(namespace, name string, labels map[string]string, image string) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: namespace, Name: name, Labels: labels}, + Spec: appsv1.DeploymentSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "c", Image: image}}}, + }, + }, + } +} + +func deploymentWithReplicas(namespace, name string, replicas int32, image string) *appsv1.Deployment { + d := deploymentWith(namespace, name, nil, image) + d.Spec.Replicas = &replicas + return d +} + +func TestClassify_EmptyCluster(t *testing.T) { + clientset := fake.NewSimpleClientset() + asg := &fakeASG{} + + info, err := Classify(t.Context(), clientset, asg, "test-cluster") + + require.NoError(t, err) + assert.Equal(t, APIVersion, info.APIVersion) + assert.Equal(t, "test-cluster", info.ClusterName) + assert.False(t, info.GeneratedAt.IsZero()) + assert.Empty(t, info.NodeManagement) + assert.False(t, 
info.ClusterAutoscaler.Present) + assert.Empty(t, asg.calls, "no candidates should mean no AWS API calls") +} + +func TestClassify_AllBucketsByLabel(t *testing.T) { + objs := []runtime.Object{ + // fargate via label + node("fargate-by-label", "aws:///us-east-1a/fargate-abc", map[string]string{ + "eks.amazonaws.com/compute-type": "fargate", + "eks.amazonaws.com/fargate-profile": "fp-default", + }), + // fargate via name fallback (no compute-type label) + node("fargate-ip-10-0-0-1.eu-west-3.compute.internal", "", nil), + // karpenter via primary label + node("kp-primary", "aws:///us-east-1a/i-0aaa", map[string]string{ + "karpenter.sh/nodepool": "default-np", + }), + // karpenter via fallback (only the EC2NodeClass label) + node("kp-fallback", "aws:///us-east-1a/i-0bbb", map[string]string{ + "karpenter.k8s.aws/ec2nodeclass": "default-nc", + }), + // karpenter v0.x legacy label + node("kp-legacy", "aws:///us-east-1a/i-0ddd", map[string]string{ + "karpenter.sh/provisioner-name": "legacy-provisioner", + }), + // EKS managed node group + node("mng", "aws:///us-east-1a/i-0ccc", map[string]string{ + "eks.amazonaws.com/nodegroup": "workers", + }), + // non-AWS providerID -> unknown + node("gke", "gce://project/zone/instance", nil), + // empty providerID -> unknown + node("orphan", "", nil), + } + clientset := fake.NewSimpleClientset(objs...) 
+ asg := &fakeASG{} + + info, err := Classify(t.Context(), clientset, asg, "c") + require.NoError(t, err) + + assert.Equal(t, []string{"fargate-by-label"}, + info.NodeManagement[NodeManagerFargate]["fp-default"]) + assert.Equal(t, []string{"fargate-ip-10-0-0-1.eu-west-3.compute.internal"}, + info.NodeManagement[NodeManagerFargate][""]) + assert.Equal(t, []string{"kp-primary"}, + info.NodeManagement[NodeManagerKarpenter]["default-np"]) + assert.Equal(t, []string{"kp-fallback"}, + info.NodeManagement[NodeManagerKarpenter]["default-nc"]) + assert.Equal(t, []string{"kp-legacy"}, + info.NodeManagement[NodeManagerKarpenter]["legacy-provisioner"]) + assert.Equal(t, []string{"mng"}, + info.NodeManagement[NodeManagerEKSManagedNodeGroup]["workers"]) + assert.Equal(t, []string{"gke"}, + info.NodeManagement[NodeManagerUnknown]["gce://project/zone/instance"]) + assert.Equal(t, []string{"orphan"}, + info.NodeManagement[NodeManagerUnknown][""]) + assert.Empty(t, asg.calls, "label-only nodes must not trigger AWS calls") +} + +func TestClassify_ASGAndStandalone(t *testing.T) { + clientset := fake.NewSimpleClientset( + node("worker-1", "aws:///us-east-1a/i-1111", nil), + node("worker-2", "aws:///us-east-1a/i-2222", nil), + node("solo", "aws:///us-east-1a/i-3333", nil), + ) + asg := &fakeASG{ + instances: map[string]string{ + "i-1111": "legacy-asg", + "i-2222": "legacy-asg", + // i-3333 is intentionally absent -> standalone + }, + } + + info, err := Classify(t.Context(), clientset, asg, "c") + require.NoError(t, err) + + assert.ElementsMatch(t, []string{"worker-1", "worker-2"}, + info.NodeManagement[NodeManagerASG]["legacy-asg"]) + assert.Equal(t, []string{"solo"}, + info.NodeManagement[NodeManagerStandalone][""]) +} + +func TestClassify_ASGBatching(t *testing.T) { + const total = 75 // 50 + 25 -> exactly 2 batches + + objs := make([]runtime.Object, 0, total) + asg := &fakeASG{instances: map[string]string{}} + for i := range total { + id := fmt.Sprintf("i-%016x", i) + objs = 
append(objs, node(fmt.Sprintf("n-%d", i), "aws:///us-east-1a/"+id, nil)) + asg.instances[id] = "asg-1" + } + clientset := fake.NewSimpleClientset(objs...) + + info, err := Classify(t.Context(), clientset, asg, "c") + require.NoError(t, err) + + require.Len(t, asg.calls, 2) + assert.Len(t, asg.calls[0].InstanceIds, describeASGInstancesMaxIDs) + assert.Len(t, asg.calls[1].InstanceIds, total-describeASGInstancesMaxIDs) + assert.Len(t, info.NodeManagement[NodeManagerASG]["asg-1"], total) +} + +func TestClassify_ASGAPIError(t *testing.T) { + clientset := fake.NewSimpleClientset(node("n", "aws:///us-east-1a/i-1111", nil)) + asg := &fakeASG{err: fmt.Errorf("AccessDenied")} + + _, err := Classify(t.Context(), clientset, asg, "c") + require.Error(t, err) + assert.Contains(t, err.Error(), "failed to describe autoscaling instances") +} + +func TestClassify_ClusterAutoscalerDetection(t *testing.T) { + for _, tc := range []struct { + name string + deploy *appsv1.Deployment + want ClusterAutoscaler + }{ + { + name: "by name", + deploy: deploymentWith("kube-system", "cluster-autoscaler", nil, "registry.k8s.io/some-image:v1"), + want: ClusterAutoscaler{Present: true, Namespace: "kube-system", Name: "cluster-autoscaler"}, + }, + { + name: "by app.kubernetes.io/name", + deploy: deploymentWith("autoscaler", "ca-renamed", + map[string]string{"app.kubernetes.io/name": "cluster-autoscaler"}, + "registry.k8s.io/foo:v1"), + want: ClusterAutoscaler{Present: true, Namespace: "autoscaler", Name: "ca-renamed"}, + }, + { + name: "by k8s-app", + deploy: deploymentWith("autoscaler", "ca-renamed", + map[string]string{"k8s-app": "cluster-autoscaler"}, + "registry.k8s.io/foo:v1"), + want: ClusterAutoscaler{Present: true, Namespace: "autoscaler", Name: "ca-renamed"}, + }, + { + name: "by image substring", + deploy: deploymentWith("custom", "scaler", nil, "registry.k8s.io/autoscaling/cluster-autoscaler:v1.30.0"), + want: ClusterAutoscaler{Present: true, Namespace: "custom", Name: "scaler"}, + }, + } 
{ + t.Run(tc.name, func(t *testing.T) { + clientset := fake.NewSimpleClientset(tc.deploy) + info, err := Classify(t.Context(), clientset, &fakeASG{}, "c") + require.NoError(t, err) + assert.Equal(t, tc.want, info.ClusterAutoscaler) + }) + } +} + +func TestClassify_NoClusterAutoscaler(t *testing.T) { + clientset := fake.NewSimpleClientset( + // Karpenter must not be detected as cluster-autoscaler. + deploymentWith("dd-karpenter", "karpenter", nil, "public.ecr.aws/karpenter/karpenter:v1.9.0"), + ) + info, err := Classify(t.Context(), clientset, &fakeASG{}, "c") + require.NoError(t, err) + assert.False(t, info.ClusterAutoscaler.Present) + assert.Empty(t, info.ClusterAutoscaler.Namespace) + assert.Empty(t, info.ClusterAutoscaler.Name) +} + +func TestClassify_ClusterAutoscalerScaledToZero(t *testing.T) { + // A user following the Karpenter migration guide may have already + // scaled the cluster-autoscaler Deployment to 0. We want Present: false + // so the migration tooling doesn't repeatedly nag the user about it. 
+ clientset := fake.NewSimpleClientset( + deploymentWithReplicas("kube-system", "cluster-autoscaler", 0, "registry.k8s.io/autoscaling/cluster-autoscaler:v1.30.0"), + ) + info, err := Classify(t.Context(), clientset, &fakeASG{}, "c") + require.NoError(t, err) + assert.False(t, info.ClusterAutoscaler.Present) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/persist.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/persist.go new file mode 100644 index 0000000000..071208a731 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/persist.go @@ -0,0 +1,41 @@ +package clusterinfo + +import ( + "context" + "fmt" + + "gopkg.in/yaml.v3" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + commonk8s "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/k8s" +) + +// Persist marshals info as YAML and writes it to a ConfigMap in namespace. +// The ConfigMap is created or updated idempotently. 
+func Persist(ctx context.Context, cli client.Client, namespace string, info *ClusterInfo) error { + payload, err := yaml.Marshal(info) + if err != nil { + return fmt.Errorf("failed to marshal cluster info: %w", err) + } + + cm := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: ConfigMapName, + Namespace: namespace, + Labels: map[string]string{ + "app.kubernetes.io/managed-by": "kubectl-datadog", + }, + }, + Data: map[string]string{ + ConfigMapDataKey: string(payload), + }, + } + + return commonk8s.CreateOrUpdate(ctx, cli, cm) +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/persist_test.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/persist_test.go new file mode 100644 index 0000000000..f13ae55c7a --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/persist_test.go @@ -0,0 +1,70 @@ +package clusterinfo + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "gopkg.in/yaml.v3" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func sampleInfo() *ClusterInfo { + return &ClusterInfo{ + APIVersion: APIVersion, + ClusterName: "test-cluster", + GeneratedAt: time.Date(2026, 4, 27, 14, 33, 0, 0, time.UTC), + NodeManagement: map[NodeManager]map[string][]string{ + NodeManagerFargate: {"fp-default": {"fargate-ip-10-0-0-1.eu-west-3.compute.internal"}}, + NodeManagerStandalone: {"": {"ip-10-0-0-9.eu-west-3.compute.internal"}}, + }, + ClusterAutoscaler: ClusterAutoscaler{Present: true, Namespace: "kube-system", Name: "cluster-autoscaler"}, + } +} + +func TestPersist_CreatesConfigMap(t *testing.T) { + cli := fake.NewClientBuilder().WithScheme(scheme.Scheme).Build() + info := sampleInfo() + + err := Persist(t.Context(), cli, "dd-karpenter", info) + 
require.NoError(t, err) + + got := &corev1.ConfigMap{} + require.NoError(t, cli.Get(t.Context(), types.NamespacedName{Namespace: "dd-karpenter", Name: ConfigMapName}, got)) + assert.Equal(t, "kubectl-datadog", got.Labels["app.kubernetes.io/managed-by"]) + + var roundTrip ClusterInfo + require.NoError(t, yaml.Unmarshal([]byte(got.Data[ConfigMapDataKey]), &roundTrip)) + assert.Equal(t, info.APIVersion, roundTrip.APIVersion) + assert.Equal(t, info.ClusterName, roundTrip.ClusterName) + assert.True(t, info.GeneratedAt.Equal(roundTrip.GeneratedAt)) + assert.Equal(t, info.NodeManagement, roundTrip.NodeManagement) + assert.Equal(t, info.ClusterAutoscaler, roundTrip.ClusterAutoscaler) +} + +func TestPersist_UpdatesExistingConfigMap(t *testing.T) { + cli := fake.NewClientBuilder().WithScheme(scheme.Scheme).Build() + + first := sampleInfo() + require.NoError(t, Persist(t.Context(), cli, "dd-karpenter", first)) + + second := sampleInfo() + second.ClusterName = "renamed-cluster" + second.NodeManagement = map[NodeManager]map[string][]string{ + NodeManagerASG: {"asg-1": {"node-x"}}, + } + require.NoError(t, Persist(t.Context(), cli, "dd-karpenter", second)) + + got := &corev1.ConfigMap{} + require.NoError(t, cli.Get(t.Context(), types.NamespacedName{Namespace: "dd-karpenter", Name: ConfigMapName}, got)) + + var roundTrip ClusterInfo + require.NoError(t, yaml.Unmarshal([]byte(got.Data[ConfigMapDataKey]), &roundTrip)) + assert.Equal(t, "renamed-cluster", roundTrip.ClusterName) + assert.Equal(t, []string{"node-x"}, roundTrip.NodeManagement[NodeManagerASG]["asg-1"]) + assert.NotContains(t, roundTrip.NodeManagement, NodeManagerFargate, "previous buckets should be gone") +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/types.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/types.go new file mode 100644 index 0000000000..b2105ea097 --- /dev/null +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/types.go @@ -0,0 +1,47 @@ +// Package 
clusterinfo classifies cluster nodes by their management method +// (Fargate, Karpenter, EKS managed node group, ASG, standalone, unknown) and +// persists the classification in a ConfigMap. The snapshot drives the +// follow-up migration of workloads from the existing nodes to Karpenter. +package clusterinfo + +import "time" + +// APIVersion is the schema version of the ConfigMap payload. Bump on +// backward-incompatible shape changes so future readers can branch. +const APIVersion = "v1" + +// ConfigMapName is the name of the ConfigMap that stores the snapshot. +const ConfigMapName = "dd-cluster-info" + +// ConfigMapDataKey is the key under .data containing the YAML payload. +const ConfigMapDataKey = "cluster-info" + +// NodeManager identifies the management method for a Kubernetes node. +type NodeManager string + +const ( + NodeManagerFargate NodeManager = "fargate" + NodeManagerKarpenter NodeManager = "karpenter" + NodeManagerEKSManagedNodeGroup NodeManager = "eksManagedNodeGroup" + NodeManagerASG NodeManager = "asg" + NodeManagerStandalone NodeManager = "standalone" + NodeManagerUnknown NodeManager = "unknown" +) + +// ClusterInfo is the snapshot persisted in the ConfigMap. +type ClusterInfo struct { + APIVersion string `yaml:"apiVersion"` + ClusterName string `yaml:"clusterName"` + GeneratedAt time.Time `yaml:"generatedAt"` + NodeManagement map[NodeManager]map[string][]string `yaml:"nodeManagement"` + ClusterAutoscaler ClusterAutoscaler `yaml:"clusterAutoscaler"` +} + +// ClusterAutoscaler captures whether a legacy cluster-autoscaler Deployment +// is running and where, so the migration can warn the user to stop it before +// scaling EKS managed node groups (per the Karpenter migration guide). 
+type ClusterAutoscaler struct { + Present bool `yaml:"present"` + Namespace string `yaml:"namespace,omitempty"` + Name string `yaml:"name,omitempty"` +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go index 60821f32c2..e6fbf90d9e 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/install/install.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/install/install.go @@ -28,6 +28,7 @@ import ( "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/aws" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clients" + "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/display" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/helm" "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/install/guess" @@ -308,6 +309,10 @@ func (o *options) run(cmd *cobra.Command) error { return err } + if err = recordClusterInfo(ctx, cli, clusterName, karpenterNamespace); err != nil { + log.Printf("Warning: %v", err) + } + return displaySuccessMessage(cmd, clusterName, createKarpenterResources) } @@ -597,6 +602,21 @@ func createNodePoolResources(ctx context.Context, cmd *cobra.Command, cli *clien return nil } +// recordClusterInfo classifies every node by its current management method +// and writes the snapshot to a ConfigMap. The information is consumed by the +// follow-up migration step. 
+func recordClusterInfo(ctx context.Context, cli *clients.Clients, clusterName, namespace string) error { + info, err := clusterinfo.Classify(ctx, cli.K8sClientset, cli.Autoscaling, clusterName) + if err != nil { + return fmt.Errorf("failed to classify cluster nodes: %w", err) + } + if err := clusterinfo.Persist(ctx, cli.K8sClient, namespace, info); err != nil { + return fmt.Errorf("failed to write %s ConfigMap: %w", clusterinfo.ConfigMapName, err) + } + log.Printf("Wrote node-management snapshot to ConfigMap %s/%s.", namespace, clusterinfo.ConfigMapName) + return nil +} + func openAutoscalingSettingsURL(cmd *cobra.Command, clusterName string) string { autoscalingSettingsURL := (&url.URL{ Scheme: "https", diff --git a/go.mod b/go.mod index d0288befc1..460b691e90 100644 --- a/go.mod +++ b/go.mod @@ -44,15 +44,16 @@ require ( github.com/DataDog/datadog-agent/pkg/proto v0.63.0-rc.1 github.com/DataDog/datadog-agent/pkg/remoteconfig/state v0.77.3 github.com/DataDog/datadog-operator/api v0.0.0-20250130131115-7f198adcc856 - github.com/aws/aws-sdk-go-v2 v1.41.1 + github.com/aws/aws-sdk-go-v2 v1.41.6 github.com/aws/aws-sdk-go-v2/config v1.32.7 + github.com/aws/aws-sdk-go-v2/service/autoscaling v1.66.1 github.com/aws/aws-sdk-go-v2/service/cloudformation v1.68.3 github.com/aws/aws-sdk-go-v2/service/ec2 v1.279.2 github.com/aws/aws-sdk-go-v2/service/eks v1.77.0 github.com/aws/aws-sdk-go-v2/service/iam v1.53.2 github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 github.com/aws/karpenter-provider-aws v1.9.0 - github.com/aws/smithy-go v1.24.0 + github.com/aws/smithy-go v1.25.0 github.com/cenkalti/backoff v2.2.1+incompatible github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc github.com/fatih/color v1.18.0 @@ -106,8 +107,8 @@ require ( github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.19.7 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 // indirect - 
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 // indirect diff --git a/go.sum b/go.sum index b92dfc6fdd..e641f1a17f 100644 --- a/go.sum +++ b/go.sum @@ -186,20 +186,22 @@ github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:W github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.15.11/go.mod h1:mFuSZ37Z9YOHbQEwBWztmVzqXrEkub65tZoCYDt7FT0= -github.com/aws/aws-sdk-go-v2 v1.41.1 h1:ABlyEARCDLN034NhxlRUSZr4l71mh+T5KAeGh6cerhU= -github.com/aws/aws-sdk-go-v2 v1.41.1/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0= +github.com/aws/aws-sdk-go-v2 v1.41.6 h1:1AX0AthnBQzMx1vbmir3Y4WsnJgiydmnJjiLu+LvXOg= +github.com/aws/aws-sdk-go-v2 v1.41.6/go.mod h1:dy0UzBIfwSeot4grGvY1AqFWN5zgziMmWGzysDnHFcQ= github.com/aws/aws-sdk-go-v2/config v1.32.7 h1:vxUyWGUwmkQ2g19n7JY/9YL8MfAIl7bTesIUykECXmY= github.com/aws/aws-sdk-go-v2/config v1.32.7/go.mod h1:2/Qm5vKUU/r7Y+zUk/Ptt2MDAEKAfUtKc1+3U1Mo3oY= github.com/aws/aws-sdk-go-v2/credentials v1.19.7 h1:tHK47VqqtJxOymRrNtUXN5SP/zUTvZKeLx4tH6PGQc8= github.com/aws/aws-sdk-go-v2/credentials v1.19.7/go.mod h1:qOZk8sPDrxhf+4Wf4oT2urYJrYt3RejHSzgAquYeppw= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 h1:I0GyV8wiYrP8XpA70g1HBcQO1JlQxCMTW9npl5UbDHY= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17/go.mod h1:tyw7BOl5bBe/oqvoIeECFJjMdzXoa/dfVz3QQ5lgHGA= 
-github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 h1:xOLELNKGp2vsiteLsvLPwxC+mYmO6OZ8PYgiuPJzF8U= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17/go.mod h1:5M5CI3D12dNOtH3/mk6minaRwI2/37ifCURZISxA/IQ= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 h1:WWLqlh79iO48yLkj1v3ISRNiv+3KdQoZ6JWyfcsyQik= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17/go.mod h1:EhG22vHRrvF8oXSTYStZhJc1aUgKtnJe+aOiFEV90cM= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 h1:GmLa5Kw1ESqtFpXsx5MmC84QWa/ZrLZvlJGa2y+4kcQ= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22/go.mod h1:6sW9iWm9DK9YRpRGga/qzrzNLgKpT2cIxb7Vo2eNOp0= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22 h1:dY4kWZiSaXIzxnKlj17nHnBcXXBfac6UlsAx2qL6XrU= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22/go.mod h1:KIpEUx0JuRZLO7U6cbV204cWAEco2iC3l061IxlwLtI= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= +github.com/aws/aws-sdk-go-v2/service/autoscaling v1.66.1 h1:kGlbhb5GMfkP/bcqcbt3oDi50kwDTpRmNzYUY9LqbLk= +github.com/aws/aws-sdk-go-v2/service/autoscaling v1.66.1/go.mod h1:z45kurrOonQepd3SN5LIgropAn1NGHwBn1yOMF+QVFU= github.com/aws/aws-sdk-go-v2/service/cloudformation v1.68.3 h1:H4jVDatTYCt6WSG7oC0dlZl8kfKHT2anADHQiQI1HVo= github.com/aws/aws-sdk-go-v2/service/cloudformation v1.68.3/go.mod h1:llucikq1Q6I1Ps8rNV3St0bOY5RQMxYh1lpCaskyhPw= github.com/aws/aws-sdk-go-v2/service/ec2 v1.279.2 h1:MG12Z/W1zzJLkw2gCU2gKZ872rqLM0pi9LdkZ/z3FHc= @@ -232,8 +234,8 @@ github.com/aws/aws-sdk-go-v2/service/timestreamwrite v1.35.16 h1:0lxNpE8zuNIvxUS github.com/aws/aws-sdk-go-v2/service/timestreamwrite v1.35.16/go.mod h1:3FcOfkSHwdxE2w0pDKTXkt1PmloObRPokcCt1fkLSK0= github.com/aws/karpenter-provider-aws v1.9.0 h1:JER51ivJqzz4FeAuRpA5ozEUEmJhHpyqHQlscJW7M14= github.com/aws/karpenter-provider-aws v1.9.0/go.mod 
h1:luLmzj7d9IhXS9O4OuO44QYySzKwkZc0TFb8yGHxfyw= -github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk= -github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= +github.com/aws/smithy-go v1.25.0 h1:Sz/XJ64rwuiKtB6j98nDIPyYrV1nVNJ4YU74gttcl5U= +github.com/aws/smithy-go v1.25.0/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20240229193347-cfab22a10647 h1:8yRBVsjGmI7qQsPWtIrbWP+XfwHO9Wq7gdLVzjqiZFs= github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20240229193347-cfab22a10647/go.mod h1:9NafTAUHL0FlMeL6Cu5PXnMZ1q/LnC9X2emLXHsVbM8= github.com/awslabs/operatorpkg v0.0.0-20251222193911-34e9a1898737 h1:hF8FFDPnboX/ABn1r8oS77t8tG4TVS8i99iPXMaL8Jk= From 6864d256379e6f8a4c5a4ad81d95734442b81e23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Mon, 27 Apr 2026 17:58:41 +0200 Subject: [PATCH 2/4] [CASCL-1304] Sync test/e2e go.mod/go.sum Run `make update-golang` to align test/e2e/go.mod and go.sum with the indirect aws-sdk-go-v2 / smithy-go patch bumps pulled in by `go work sync`. Required by the GitLab `check-golang-version` job. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/e2e/go.mod | 8 ++++---- test/e2e/go.sum | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/test/e2e/go.mod b/test/e2e/go.mod index 2efbdc2401..420247ecc0 100644 --- a/test/e2e/go.mod +++ b/test/e2e/go.mod @@ -8,10 +8,10 @@ require ( github.com/DataDog/datadog-agent/test/e2e-framework v0.78.0-devel github.com/DataDog/datadog-agent/test/fakeintake v0.78.0-devel github.com/DataDog/datadog-api-client-go/v2 v2.56.0 // indirect - github.com/aws/aws-sdk-go-v2 v1.41.5 + github.com/aws/aws-sdk-go-v2 v1.41.6 github.com/aws/aws-sdk-go-v2/config v1.32.13 github.com/aws/aws-sdk-go-v2/service/cloudformation v1.71.9 - github.com/aws/smithy-go v1.24.2 // indirect + github.com/aws/smithy-go v1.25.0 // indirect github.com/pulumi/pulumi-kubernetes/sdk/v4 v4.28.0 github.com/pulumi/pulumi/sdk/v3 v3.228.0 github.com/stretchr/testify v1.11.1 @@ -66,8 +66,8 @@ require ( github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.19.13 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6 // indirect github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.16 // indirect github.com/aws/aws-sdk-go-v2/service/ec2 v1.285.0 // indirect diff --git a/test/e2e/go.sum b/test/e2e/go.sum index 752b172bc4..dcc0ee0e7b 100644 --- a/test/e2e/go.sum +++ b/test/e2e/go.sum @@ -81,8 +81,8 @@ github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= github.com/aws/aws-sdk-go v1.55.8 
h1:JRmEUbU52aJQZ2AjX4q4Wu7t4uZjOu71uyNmaWlUkJQ= github.com/aws/aws-sdk-go v1.55.8/go.mod h1:ZkViS9AqA6otK+JBBNH2++sx1sgxrPKcSzPPvQkUtXk= -github.com/aws/aws-sdk-go-v2 v1.41.5 h1:dj5kopbwUsVUVFgO4Fi5BIT3t4WyqIDjGKCangnV/yY= -github.com/aws/aws-sdk-go-v2 v1.41.5/go.mod h1:mwsPRE8ceUUpiTgF7QmQIJ7lgsKUPQOUl3o72QBrE1o= +github.com/aws/aws-sdk-go-v2 v1.41.6 h1:1AX0AthnBQzMx1vbmir3Y4WsnJgiydmnJjiLu+LvXOg= +github.com/aws/aws-sdk-go-v2 v1.41.6/go.mod h1:dy0UzBIfwSeot4grGvY1AqFWN5zgziMmWGzysDnHFcQ= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 h1:489krEF9xIGkOaaX3CE/Be2uWjiXrkCH6gUX+bZA/BU= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4/go.mod h1:IOAPF6oT9KCsceNTvvYMNHy0+kMF8akOjeDvPENWxp4= github.com/aws/aws-sdk-go-v2/config v1.32.13 h1:5KgbxMaS2coSWRrx9TX/QtWbqzgQkOdEa3sZPhBhCSg= @@ -91,10 +91,10 @@ github.com/aws/aws-sdk-go-v2/credentials v1.19.13 h1:mA59E3fokBvyEGHKFdnpNNrvaR3 github.com/aws/aws-sdk-go-v2/credentials v1.19.13/go.mod h1:yoTXOQKea18nrM69wGF9jBdG4WocSZA1h38A+t/MAsk= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21 h1:NUS3K4BTDArQqNu2ih7yeDLaS3bmHD0YndtA6UP884g= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21/go.mod h1:YWNWJQNjKigKY1RHVJCuupeWDrrHjRqHm0N9rdrWzYI= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21 h1:Rgg6wvjjtX8bNHcvi9OnXWwcE0a2vGpbwmtICOsvcf4= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21/go.mod h1:A/kJFst/nm//cyqonihbdpQZwiUhhzpqTsdbhDdRF9c= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21 h1:PEgGVtPoB6NTpPrBgqSE5hE/o47Ij9qk/SEZFbUOe9A= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21/go.mod h1:p+hz+PRAYlY3zcpJhPwXlLC4C+kqn70WIHwnzAfs6ps= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22 h1:GmLa5Kw1ESqtFpXsx5MmC84QWa/ZrLZvlJGa2y+4kcQ= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.22/go.mod h1:6sW9iWm9DK9YRpRGga/qzrzNLgKpT2cIxb7Vo2eNOp0= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22 
h1:dY4kWZiSaXIzxnKlj17nHnBcXXBfac6UlsAx2qL6XrU= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.22/go.mod h1:KIpEUx0JuRZLO7U6cbV204cWAEco2iC3l061IxlwLtI= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6 h1:qYQ4pzQ2Oz6WpQ8T3HvGHnZydA72MnLuFK9tJwmrbHw= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.6/go.mod h1:O3h0IK87yXci+kg6flUKzJnWeziQUKciKrLjcatSNcY= github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.16 h1:CjMzUs78RDDv4ROu3JnJn/Ig1r6ZD7/T2DXLLRpejic= @@ -131,8 +131,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.41.10 h1:p8ogvvLugcR/zLBXTXrTkj0RYBU github.com/aws/aws-sdk-go-v2/service/sts v1.41.10/go.mod h1:60dv0eZJfeVXfbT1tFJinbHrDfSJ2GZl4Q//OSSNAVw= github.com/aws/session-manager-plugin v0.0.0-20241119210807-82dc72922492 h1:Ihams/fjKo4iWwM313ng2gCJWoetsL7ZQkXhOTmVUq4= github.com/aws/session-manager-plugin v0.0.0-20241119210807-82dc72922492/go.mod h1:7n17tunRPUsniNBu5Ja9C7WwJWTdOzaLqr/H0Ns3uuI= -github.com/aws/smithy-go v1.24.2 h1:FzA3bu/nt/vDvmnkg+R8Xl46gmzEDam6mZ1hzmwXFng= -github.com/aws/smithy-go v1.24.2/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= +github.com/aws/smithy-go v1.25.0 h1:Sz/XJ64rwuiKtB6j98nDIPyYrV1nVNJ4YU74gttcl5U= +github.com/aws/smithy-go v1.25.0/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= github.com/bazelbuild/buildtools v0.0.0-20260211083412-859bfffeef82 h1:PmoVmwzAnGb0iCjulb7Mgsaqw2Wj36LQJ8VyYaFe/ak= From 6a5a25b67c023d42b158d614bf3be796854b6693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Mon, 27 Apr 2026 18:10:46 +0200 Subject: [PATCH 3/4] [CASCL-1304] Add autoscaling SDK to LICENSE-3rdparty.csv The new clusterinfo package imports `github.com/aws/aws-sdk-go-v2/service/autoscaling` for DescribeAutoScalingInstances batching, so the third-party license manifest needs the matching entry. 
Patch matches the diff produced by the CI's `make verify-licenses` step verbatim. Co-Authored-By: Claude Opus 4.7 (1M context) --- LICENSE-3rdparty.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index a71b5c1110..5f3b279c45 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -50,6 +50,7 @@ core,github.com/aws/aws-sdk-go-v2/internal/configsources,Apache-2.0 core,github.com/aws/aws-sdk-go-v2/internal/endpoints/v2,Apache-2.0 core,github.com/aws/aws-sdk-go-v2/internal/ini,Apache-2.0 core,github.com/aws/aws-sdk-go-v2/internal/sync/singleflight,BSD-3-Clause +core,github.com/aws/aws-sdk-go-v2/service/autoscaling,Apache-2.0 core,github.com/aws/aws-sdk-go-v2/service/cloudformation,Apache-2.0 core,github.com/aws/aws-sdk-go-v2/service/ec2,Apache-2.0 core,github.com/aws/aws-sdk-go-v2/service/eks,Apache-2.0 From ec857027f13832e7026676ff648655c2cda25304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9na=C3=AFc=20Huard?= Date: Tue, 28 Apr 2026 10:10:50 +0200 Subject: [PATCH 4/4] [CASCL-1304] Capture cluster-autoscaler version in the snapshot Per review feedback: the migration consumer needs to know which CA version is running so it can branch on known-bad versions or surface deprecation guidance. Capture the version, preferring the image tag of the matching container (the source of truth) and falling back to the `app.kubernetes.io/version` label set by most Helm charts. The field is omitted from the YAML when neither signal is available (e.g. an image referenced by digest only and no version label). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cluster/common/clusterinfo/classify.go | 37 ++++++++++++++ .../common/clusterinfo/classify_test.go | 48 ++++++++++++++++++- .../cluster/common/clusterinfo/types.go | 1 + 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go index d08cb7272e..03cefe2ac2 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify.go @@ -202,6 +202,7 @@ func detectClusterAutoscaler(ctx context.Context, k8sClient kubernetes.Interface Present: true, Namespace: matches[0].Namespace, Name: matches[0].Name, + Version: extractClusterAutoscalerVersion(matches[0]), }, nil } @@ -224,3 +225,39 @@ func isClusterAutoscaler(d appsv1.Deployment, _ int) bool { return strings.Contains(c.Image, "cluster-autoscaler") }) } + +// extractClusterAutoscalerVersion returns the running cluster-autoscaler +// version. Prefers the image tag of the matching container (the source of +// truth) and falls back to the `app.kubernetes.io/version` label on the +// Deployment or its pod template (set by most Helm charts). Empty when +// neither is available — e.g. an image referenced by digest only and no +// version label. +func extractClusterAutoscalerVersion(d appsv1.Deployment) string { + for _, c := range d.Spec.Template.Spec.Containers { + if !strings.Contains(c.Image, "cluster-autoscaler") { + continue + } + if tag := imageTag(c.Image); tag != "" { + return tag + } + } + if v := d.Labels["app.kubernetes.io/version"]; v != "" { + return v + } + return d.Spec.Template.Labels["app.kubernetes.io/version"] +} + +// imageTag extracts the tag portion of an OCI image reference, stripping +// any `@sha256:...` digest. Returns empty when no tag is set (for instance, +// digest-only references or bare image names). 
+func imageTag(image string) string { + if i := strings.Index(image, "@"); i >= 0 { + image = image[:i] + } + // The last colon is the tag separator only if it is not followed by a + // path component — otherwise it's a registry port (e.g. `localhost:5000/foo`). + if i := strings.LastIndex(image, ":"); i >= 0 && !strings.Contains(image[i+1:], "/") { + return image[i+1:] + } + return "" +} diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify_test.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify_test.go index e5018cef18..e428b1c93c 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify_test.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/classify_test.go @@ -217,7 +217,7 @@ func TestClassify_ClusterAutoscalerDetection(t *testing.T) { { name: "by image substring", deploy: deploymentWith("custom", "scaler", nil, "registry.k8s.io/autoscaling/cluster-autoscaler:v1.30.0"), - want: ClusterAutoscaler{Present: true, Namespace: "custom", Name: "scaler"}, + want: ClusterAutoscaler{Present: true, Namespace: "custom", Name: "scaler", Version: "v1.30.0"}, }, } { t.Run(tc.name, func(t *testing.T) { @@ -229,6 +229,52 @@ func TestClassify_ClusterAutoscalerDetection(t *testing.T) { } } +func TestClassify_ClusterAutoscalerVersion(t *testing.T) { + for _, tc := range []struct { + name string + deploy *appsv1.Deployment + want string + }{ + { + name: "from image tag", + deploy: deploymentWith("kube-system", "cluster-autoscaler", nil, "registry.k8s.io/autoscaling/cluster-autoscaler:v1.30.0"), + want: "v1.30.0", + }, + { + name: "image tag wins over label", + deploy: deploymentWith("kube-system", "cluster-autoscaler", map[string]string{"app.kubernetes.io/version": "v9.9.9"}, "registry.k8s.io/autoscaling/cluster-autoscaler:v1.30.0"), + want: "v1.30.0", + }, + { + name: "tag with digest suffix", + deploy: deploymentWith("kube-system", "cluster-autoscaler", nil, 
"registry.k8s.io/autoscaling/cluster-autoscaler:v1.30.0@sha256:abcdef"), + want: "v1.30.0", + }, + { + name: "registry with port and tag", + deploy: deploymentWith("kube-system", "cluster-autoscaler", nil, "localhost:5000/cluster-autoscaler:v1.31.0"), + want: "v1.31.0", + }, + { + name: "fallback to deployment label when image is digest only", + deploy: deploymentWith("kube-system", "cluster-autoscaler", map[string]string{"app.kubernetes.io/version": "v1.32.0"}, "registry.k8s.io/autoscaling/cluster-autoscaler@sha256:abcdef"), + want: "v1.32.0", + }, + { + name: "no tag, no label", + deploy: deploymentWith("kube-system", "cluster-autoscaler", nil, "registry.k8s.io/autoscaling/cluster-autoscaler@sha256:abcdef"), + want: "", + }, + } { + t.Run(tc.name, func(t *testing.T) { + clientset := fake.NewSimpleClientset(tc.deploy) + info, err := Classify(t.Context(), clientset, &fakeASG{}, "c") + require.NoError(t, err) + assert.Equal(t, tc.want, info.ClusterAutoscaler.Version) + }) + } +} + func TestClassify_NoClusterAutoscaler(t *testing.T) { clientset := fake.NewSimpleClientset( // Karpenter must not be detected as cluster-autoscaler. diff --git a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/types.go b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/types.go index b2105ea097..bc8038bc8b 100644 --- a/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/types.go +++ b/cmd/kubectl-datadog/autoscaling/cluster/common/clusterinfo/types.go @@ -44,4 +44,5 @@ type ClusterAutoscaler struct { Present bool `yaml:"present"` Namespace string `yaml:"namespace,omitempty"` Name string `yaml:"name,omitempty"` + Version string `yaml:"version,omitempty"` }