Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package guess

import (
"context"
"fmt"
"log"
"slices"

"github.com/samber/lo"
rbacv1 "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)

// Datadog-namespaced labels written via the Karpenter chart's additionalLabels.
// We avoid overriding standard app.kubernetes.io/* keys: the chart's
// _helpers.tpl emits them before additionalLabels, producing duplicate YAML
// keys whose deduplication at the API server is non-deterministic.
const (
InstalledByLabel = "autoscaling.datadoghq.com/installed-by"
InstalledByValue = "kubectl-datadog"
InstallerVersionLabel = "autoscaling.datadoghq.com/installer-version"
)

// karpenterAPIGroup is the API group every Karpenter controller's ClusterRole
// must reference: the chart's clusterrole.yaml and clusterrole-core.yaml
// templates hard-code rules on `karpenter.sh` for nodepools/nodeclaims, which
// is the structural fingerprint of a Karpenter install. Unlike the
// `app.kubernetes.io/name` label this is not affected by the chart's
// `nameOverride`, raw kubectl apply renames, or ArgoCD label rewrites.
const karpenterAPIGroup = "karpenter.sh"

// clusterRoleListChunkSize bounds the size of a single List response so we
// don't pull thousands of ClusterRoles into memory at once on dense clusters.
// Matches the chunk size used by GetNodesProperties.
const clusterRoleListChunkSize = 100

// IsForeignKarpenterInstalled reports whether a Karpenter installation that
// was not produced by this plugin is running on the cluster. Detection lists
// every ClusterRole and looks for rules on the `karpenter.sh` API group,
// which the chart hard-codes for nodepools/nodeclaims regardless of
// `nameOverride` or other metadata customizations. ClusterRoles bearing our
// InstalledByLabel sentinel are skipped. ClusterRoles are deleted by `helm
// uninstall`, unlike the CRDs in `crds/`, so a leftover-only state returns
// false.
//
// The list is paginated with an early exit on the first foreign match: dense
// clusters with thousands of ClusterRoles do not need to be fully materialised
// in memory just to answer "is there at least one foreign Karpenter".
func IsForeignKarpenterInstalled(ctx context.Context, clientset kubernetes.Interface) (bool, error) {
var cont string
for {
crs, err := clientset.RbacV1().ClusterRoles().List(ctx, metav1.ListOptions{
Limit: clusterRoleListChunkSize,
Continue: cont,
})
if err != nil {
return false, fmt.Errorf("failed to list ClusterRoles: %w", err)
}

for _, cr := range crs.Items {
if !hasKarpenterAPIGroupRule(cr.Rules) {
continue
}
if cr.Labels[InstalledByLabel] == InstalledByValue {
continue
}
log.Printf("Detected foreign Karpenter ClusterRole")
return true, nil
}

cont = crs.Continue
if cont == "" {
return false, nil
}
}
}

// hasKarpenterAPIGroupRule reports whether any rule grants permissions on the
// karpenter.sh API group. We don't constrain on resource names: any rule
// touching the group is enough to identify a Karpenter ClusterRole, and a
// looser check stays robust against upstream resource additions.
func hasKarpenterAPIGroupRule(rules []rbacv1.PolicyRule) bool {
return lo.ContainsBy(rules, func(rule rbacv1.PolicyRule) bool {
return slices.Contains(rule.APIGroups, karpenterAPIGroup)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Require a real Karpenter fingerprint before no-oping

On clusters that already have the Datadog Operator installed but no Karpenter controller, this API-group-only check returns true because the operator's own ClusterRole grants karpenter.sh permissions (config/rbac/role.yaml:378-389, generated from internal/controller/datadogagent_controller.go:103). Since install.go now exits when this helper reports true, those users are incorrectly told Karpenter is installed and cannot use the plugin to install autoscaling; please exclude the operator role or match a more Karpenter-specific RBAC shape.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 68b5e73: tightened the predicate to require both the karpenter.sh API group AND at least one explicit controller resource (nodepools or nodeclaims). The Datadog Operator's wildcard rule (apiGroups: [karpenter.sh], resources: ['*']) no longer matches because slices.Contains(["*"], "nodepools") is false. Added a regression test Datadog Operator role with karpenter.sh wildcard is not a controller and a defensive split-rule case to make sure either resource alone is enough to trigger the guard.

})
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
package guess

import (
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
rbacv1 "k8s.io/api/rbac/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
k8stesting "k8s.io/client-go/testing"
)

// karpenterCoreRules is the minimal set of rules that uniquely identifies a
// Karpenter ClusterRole: the chart's clusterrole-core.yaml hard-codes the
// karpenter.sh API group with nodepools/nodeclaims, so any real Karpenter
// install produces a ClusterRole containing them.
var karpenterCoreRules = []rbacv1.PolicyRule{
{
APIGroups: []string{"karpenter.sh"},
Resources: []string{"nodepools", "nodeclaims"},
Verbs: []string{"get", "list", "watch"},
},
}

// TestKarpenterAPIGroupContract pins the API group fingerprint we match
// against. Subsequent tests build fake objects via the same constant, so a
// typo here would silently make them pass while real Karpenter installs stop
// matching — this assertion locks down the contract against the chart's
// hard-coded apiGroup.
func TestKarpenterAPIGroupContract(t *testing.T) {
assert.Equal(t, "karpenter.sh", karpenterAPIGroup)
}

func TestIsForeignKarpenterInstalled(t *testing.T) {
for _, tc := range []struct {
name string
objects []runtime.Object
expected bool
}{
{
name: "no ClusterRoles on the cluster",
objects: nil,
expected: false,
},
{
name: "no Karpenter ClusterRoles among unrelated ones",
objects: []runtime.Object{
clusterRole("system:auth-delegator", nil, []rbacv1.PolicyRule{
{APIGroups: []string{"authentication.k8s.io"}, Resources: []string{"tokenreviews"}, Verbs: []string{"create"}},
}),
},
expected: false,
},
{
name: "only kubectl-datadog ClusterRoles",
objects: []runtime.Object{
clusterRole("karpenter", map[string]string{
"app.kubernetes.io/name": "karpenter",
"app.kubernetes.io/instance": "karpenter",
InstalledByLabel: InstalledByValue,
}, karpenterCoreRules),
clusterRole("karpenter-core", map[string]string{
"app.kubernetes.io/name": "karpenter",
"app.kubernetes.io/instance": "karpenter",
InstalledByLabel: InstalledByValue,
}, karpenterCoreRules),
},
expected: false,
},
{
name: "foreign ClusterRole without our sentinel",
objects: []runtime.Object{
clusterRole("karpenter", map[string]string{
"app.kubernetes.io/name": "karpenter",
"app.kubernetes.io/instance": "karpenter",
"app.kubernetes.io/managed-by": "Helm",
}, karpenterCoreRules),
},
expected: true,
},
{
name: "foreign Karpenter installed with custom nameOverride",
// Helm chart with `nameOverride: my-karpenter` renames both the
// ClusterRole and its app.kubernetes.io/name label. The rules,
// however, still reference karpenter.sh — so we must detect it.
objects: []runtime.Object{
clusterRole("my-karpenter-core", map[string]string{
"app.kubernetes.io/name": "my-karpenter",
"app.kubernetes.io/instance": "my-karpenter",
"app.kubernetes.io/managed-by": "Helm",
}, karpenterCoreRules),
},
expected: true,
},
{
name: "mix of ours and foreign returns true",
objects: []runtime.Object{
clusterRole("karpenter-core", map[string]string{
"app.kubernetes.io/name": "karpenter",
"app.kubernetes.io/instance": "karpenter",
InstalledByLabel: InstalledByValue,
}, karpenterCoreRules),
clusterRole("karpenter", map[string]string{
"app.kubernetes.io/name": "karpenter",
"app.kubernetes.io/instance": "their-release",
}, karpenterCoreRules),
},
expected: true,
},
{
name: "ClusterRole with karpenter-looking labels but no karpenter.sh rules is ignored",
// Defensive: a user-authored ClusterRole that happens to carry
// `app.kubernetes.io/name=karpenter` but no actual Karpenter
// rules must not trigger the guard.
objects: []runtime.Object{
clusterRole("fake-karpenter", map[string]string{
"app.kubernetes.io/name": "karpenter",
}, []rbacv1.PolicyRule{
{APIGroups: []string{""}, Resources: []string{"configmaps"}, Verbs: []string{"get"}},
}),
},
expected: false,
},
{
name: "foreign sentinel value is treated as foreign",
objects: []runtime.Object{
clusterRole("karpenter", map[string]string{
"app.kubernetes.io/name": "karpenter",
InstalledByLabel: "someone-else",
}, karpenterCoreRules),
},
expected: true,
},
} {
t.Run(tc.name, func(t *testing.T) {
clientset := fake.NewSimpleClientset(tc.objects...)

result, err := IsForeignKarpenterInstalled(t.Context(), clientset)

require.NoError(t, err)
assert.Equal(t, tc.expected, result)
})
}

t.Run("API list error propagates", func(t *testing.T) {
clientset := fake.NewSimpleClientset()
clientset.PrependReactor("list", "clusterroles", func(_ k8stesting.Action) (bool, runtime.Object, error) {
return true, nil, apierrors.NewServiceUnavailable("test failure")
})

_, err := IsForeignKarpenterInstalled(t.Context(), clientset)

require.Error(t, err)
assert.Contains(t, err.Error(), "failed to list ClusterRoles")
})

t.Run("pagination follows Continue token across pages and short-circuits on first foreign match", func(t *testing.T) {
// Three pages: an empty page with a non-empty Continue token
// (the API server may legitimately return one), a page with only
// our own ClusterRole, and a page where the foreign install lives.
// We expect the function to walk pages 1 and 2, find the foreign on
// page 3, and stop. Page 4 must never be requested.
pages := []*rbacv1.ClusterRoleList{
{
ListMeta: metav1.ListMeta{Continue: "page2"},
Items: nil,
},
{
ListMeta: metav1.ListMeta{Continue: "page3"},
Items: []rbacv1.ClusterRole{
*clusterRole("karpenter", map[string]string{
InstalledByLabel: InstalledByValue,
}, karpenterCoreRules),
},
},
{
ListMeta: metav1.ListMeta{Continue: "page4"},
Items: []rbacv1.ClusterRole{
*clusterRole("their-karpenter", map[string]string{
"app.kubernetes.io/instance": "their-release",
}, karpenterCoreRules),
},
},
{
Items: []rbacv1.ClusterRole{
*clusterRole("never-fetched", nil, karpenterCoreRules),
},
},
}

clientset := fake.NewSimpleClientset()
var calls []string
clientset.PrependReactor("list", "clusterroles", func(action k8stesting.Action) (bool, runtime.Object, error) {
opts := action.(k8stesting.ListActionImpl).GetListOptions()
calls = append(calls, opts.Continue)
assert.EqualValues(t, clusterRoleListChunkSize, opts.Limit, "Limit must be set so the API server can chunk")
require.Less(t, len(calls)-1, len(pages),
"reactor would over-fetch beyond the synthetic pages — early-exit broken")
return true, pages[len(calls)-1], nil
})

result, err := IsForeignKarpenterInstalled(t.Context(), clientset)

require.NoError(t, err)
assert.True(t, result)
assert.Equal(t, []string{"", "page2", "page3"}, calls,
"each call must forward the previous page's Continue token, and page 4 must never be requested")
})
}

func clusterRole(name string, labels map[string]string, rules []rbacv1.PolicyRule) *rbacv1.ClusterRole {
return &rbacv1.ClusterRole{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: labels,
},
Rules: rules,
}
}
27 changes: 25 additions & 2 deletions cmd/kubectl-datadog/autoscaling/cluster/install/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,12 @@ func (o *options) run(cmd *cobra.Command) error {
return displayEKSAutoModeMessage(cmd, clusterName)
}

if foreign, err := guess.IsForeignKarpenterInstalled(ctx, o.Clientset); err != nil {
Copy link
Copy Markdown
Collaborator

@clamoriniere clamoriniere Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nits: can we get more info about other installation (the namespace/name for example) so we can display it to the user too

return fmt.Errorf("failed to check for an existing Karpenter installation: %w", err)
} else if foreign {
return displayForeignKarpenterMessage(cmd, clusterName)
}

display.PrintBox(cmd.OutOrStdout(), "Installing Karpenter on cluster "+clusterName+".")

cli, err := clients.Build(ctx, o.ConfigFlags, o.Clientset)
Expand Down Expand Up @@ -513,9 +519,10 @@ func (o *options) installHelmChart(ctx context.Context, clusterName, karpenterNa
// the controller can assume the role via sts:AssumeRoleWithWebIdentity.
func karpenterHelmValues(clusterName string, mode InstallMode, irsaRoleArn string) map[string]any {
values := map[string]any{
// See guess.InstalledByLabel for why these keys are Datadog-namespaced.
"additionalLabels": map[string]any{
"app.kubernetes.io/managed-by": "kubectl-datadog",
"app.kubernetes.io/version": version.GetVersion(),
guess.InstalledByLabel: guess.InstalledByValue,
guess.InstallerVersionLabel: version.GetVersion(),
},
"settings": map[string]any{
"clusterName": clusterName,
Expand Down Expand Up @@ -631,6 +638,22 @@ func displayEKSAutoModeMessage(cmd *cobra.Command, clusterName string) error {
return nil
}

func displayForeignKarpenterMessage(cmd *cobra.Command, clusterName string) error {
coloredURL := openAutoscalingSettingsURL(cmd, clusterName)

display.PrintBox(cmd.OutOrStdout(),
"Karpenter is already installed on cluster "+clusterName+".",
"",
"kubectl-datadog has nothing to install.",
"",
"Navigate to the Autoscaling settings page",
"and select cluster to start generating recommendations:",
coloredURL,
)

return nil
}

func displaySuccessMessage(cmd *cobra.Command, clusterName string, createResources CreateKarpenterResources) error {
coloredURL := openAutoscalingSettingsURL(cmd, clusterName)

Expand Down
Loading
Loading