diff --git a/pkg/cli/snapshot.go b/pkg/cli/snapshot.go index 2a672f7e..50b901d0 100644 --- a/pkg/cli/snapshot.go +++ b/pkg/cli/snapshot.go @@ -62,6 +62,14 @@ func parseSnapshotTemplateOptions(cmd *cli.Command, outFormat serializer.Format) }, nil } +// createSnapshotSerializer creates the output serializer based on template options. +func createSnapshotSerializer(tmplOpts *snapshotTemplateOptions) (serializer.Serializer, error) { + if tmplOpts.templatePath != "" { + return serializer.NewTemplateFileWriter(tmplOpts.templatePath, tmplOpts.outputPath) + } + return serializer.NewFileWriterOrStdout(tmplOpts.format, tmplOpts.outputPath) +} + func snapshotCmd() *cli.Command { return &cli.Command{ Name: "snapshot", @@ -177,6 +185,14 @@ See examples/templates/snapshot-template.md.tmpl for a sample template. Name: "template", Usage: "Path to Go template file for custom output formatting (requires YAML format)", }, + &cli.StringSliceFlag{ + Name: "helm-namespaces", + Usage: "Namespaces for Helm release collection (creates scoped RBAC for secrets access). Mutually exclusive with --helm-all-namespaces.", + }, + &cli.BoolFlag{ + Name: "helm-all-namespaces", + Usage: "Grant cluster-wide secrets access for Helm release collection. Mutually exclusive with --helm-namespaces.", + }, outputFlag, formatFlag, kubeconfigFlag, @@ -203,19 +219,9 @@ See examples/templates/snapshot-template.md.tmpl for a sample template. 
factory := collector.NewDefaultFactory() // Create output serializer - var ser serializer.Serializer - if tmplOpts.templatePath != "" { - // Use template writer - ser, err = serializer.NewTemplateFileWriter(tmplOpts.templatePath, tmplOpts.outputPath) - if err != nil { - return errors.Wrap(errors.ErrCodeInternal, "failed to create template writer", err) - } - } else { - // Use standard format writer - ser, err = serializer.NewFileWriterOrStdout(tmplOpts.format, tmplOpts.outputPath) - if err != nil { - return errors.Wrap(errors.ErrCodeInternal, "failed to create output writer", err) - } + ser, err := createSnapshotSerializer(tmplOpts) + if err != nil { + return errors.Wrap(errors.ErrCodeInternal, "failed to create output serializer", err) } // Build snapshotter configuration @@ -238,6 +244,13 @@ See examples/templates/snapshot-template.md.tmpl for a sample template. return errors.Wrap(errors.ErrCodeInvalidRequest, "invalid toleration", err) } + // Validate mutual exclusivity of helm flags + helmNamespaces := cmd.StringSlice("helm-namespaces") + helmAllNamespaces := cmd.Bool("helm-all-namespaces") + if len(helmNamespaces) > 0 && helmAllNamespaces { + return errors.New(errors.ErrCodeInvalidRequest, "--helm-namespaces and --helm-all-namespaces are mutually exclusive") + } + // When running inside an agent Job, collect locally instead of // deploying another agent (prevents infinite nesting). if os.Getenv("AICR_AGENT_MODE") == "true" { @@ -261,6 +274,8 @@ See examples/templates/snapshot-template.md.tmpl for a sample template. 
Privileged: cmd.Bool("privileged"), RequireGPU: cmd.Bool("require-gpu"), TemplatePath: tmplOpts.templatePath, + HelmNamespaces: helmNamespaces, + HelmAllNamespaces: helmAllNamespaces, } return ns.Measure(ctx) diff --git a/pkg/cli/validate.go b/pkg/cli/validate.go index c52849c7..94d77dbc 100644 --- a/pkg/cli/validate.go +++ b/pkg/cli/validate.go @@ -18,6 +18,7 @@ import ( "context" "fmt" "log/slog" + "sort" "time" "github.com/urfave/cli/v3" @@ -47,6 +48,8 @@ type validateAgentConfig struct { debug bool privileged bool requireGPU bool + helmNamespaces []string + helmAllNamespaces bool } // parseValidateAgentConfig parses agent deployment flags from the command. @@ -75,6 +78,8 @@ func parseValidateAgentConfig(cmd *cli.Command) (*validateAgentConfig, error) { debug: cmd.Bool("debug"), privileged: cmd.Bool("privileged"), requireGPU: cmd.Bool("require-gpu"), + helmNamespaces: cmd.StringSlice("helm-namespaces"), + helmAllNamespaces: cmd.Bool("helm-all-namespaces"), }, nil } @@ -132,6 +137,8 @@ func deployAgentForValidation(ctx context.Context, cfg *validateAgentConfig) (*s Debug: cfg.debug, Privileged: cfg.privileged, RequireGPU: cfg.requireGPU, + HelmNamespaces: cfg.helmNamespaces, + HelmAllNamespaces: cfg.helmAllNamespaces, } snap, err := snapshotter.DeployAndGetSnapshot(ctx, agentConfig) @@ -263,6 +270,25 @@ func runValidation( return nil } +// helmNamespacesFromRecipe extracts unique namespaces from Helm ComponentRefs. 
+func helmNamespacesFromRecipe(rec *recipe.RecipeResult) []string { + seen := make(map[string]bool) + for _, ref := range rec.ComponentRefs { + if ref.Type == recipe.ComponentTypeHelm && ref.Namespace != "" { + seen[ref.Namespace] = true + } + } + if len(seen) == 0 { + return nil + } + namespaces := make([]string, 0, len(seen)) + for ns := range seen { + namespaces = append(namespaces, ns) + } + sort.Strings(namespaces) + return namespaces +} + func validateCmdFlags() []cli.Flag { return []cli.Flag{ &cli.StringFlag{ @@ -367,6 +393,18 @@ func validateCmdFlags() []cli.Flag { Name: "result", Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Note: saved results do not include diagnostic artifacts captured during live runs. Requires --phase conformance and --evidence-dir.", }, + &cli.BoolFlag{ + Name: "skip-helm-check", + Usage: "Skip Helm values deployment check and don't create secrets RBAC", + }, + &cli.StringSliceFlag{ + Name: "helm-namespaces", + Usage: "Override namespaces for Helm release collection (creates scoped RBAC). Mutually exclusive with --helm-all-namespaces.", + }, + &cli.BoolFlag{ + Name: "helm-all-namespaces", + Usage: "Grant cluster-wide secrets access for Helm release collection. 
Mutually exclusive with --helm-namespaces.", + }, outputFlag, formatFlag, kubeconfigFlag, @@ -494,6 +532,28 @@ Use a saved result file for evidence instead of the live run: return errors.Wrap(errors.ErrCodeInternal, fmt.Sprintf("failed to load recipe from %q", recipeFilePath), err) } + // Resolve helm namespace config for agent RBAC + skipHelmCheck := cmd.Bool("skip-helm-check") + helmNamespaces := cmd.StringSlice("helm-namespaces") + helmAllNamespaces := cmd.Bool("helm-all-namespaces") + + if len(helmNamespaces) > 0 && helmAllNamespaces { + return errors.New(errors.ErrCodeInvalidRequest, "--helm-namespaces and --helm-all-namespaces are mutually exclusive") + } + + if !skipHelmCheck && !cmd.IsSet("helm-namespaces") && !helmAllNamespaces { + // Auto-derive from recipe ComponentRefs + helmNamespaces = helmNamespacesFromRecipe(rec) + if len(helmNamespaces) > 0 { + slog.Info("auto-derived helm namespaces from recipe", "namespaces", helmNamespaces) + } + } + + if skipHelmCheck { + helmNamespaces = nil + helmAllNamespaces = false + } + // Get snapshot - either from file or by deploying an agent var snap *snapshotter.Snapshot var snapshotSource string @@ -515,6 +575,10 @@ Use a saved result file for evidence instead of the live run: return cfgErr } + // Apply resolved helm namespace config + agentCfg.helmNamespaces = helmNamespaces + agentCfg.helmAllNamespaces = helmAllNamespaces + var deployErr error snap, snapshotSource, deployErr = deployAgentForValidation(ctx, agentCfg) if deployErr != nil { diff --git a/pkg/cli/validate_test.go b/pkg/cli/validate_test.go index df17df75..0ae2df14 100644 --- a/pkg/cli/validate_test.go +++ b/pkg/cli/validate_test.go @@ -19,6 +19,7 @@ import ( "strings" "testing" + "github.com/NVIDIA/aicr/pkg/recipe" "github.com/NVIDIA/aicr/pkg/validator" ) @@ -213,6 +214,84 @@ func TestValidateCmd_AgentFlags(t *testing.T) { } } +func TestHelmNamespacesFromRecipe(t *testing.T) { + tests := []struct { + name string + rec *recipe.RecipeResult + expected 
[]string + }{ + { + name: "nil recipe", + rec: &recipe.RecipeResult{}, + expected: nil, + }, + { + name: "no helm components", + rec: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "app", Type: recipe.ComponentTypeKustomize, Namespace: "default"}, + }, + }, + expected: nil, + }, + { + name: "helm components with namespaces", + rec: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "gpu-operator", Type: recipe.ComponentTypeHelm, Namespace: "gpu-operator"}, + {Name: "network-operator", Type: recipe.ComponentTypeHelm, Namespace: "network-operator"}, + }, + }, + expected: []string{"gpu-operator", "network-operator"}, + }, + { + name: "deduplicates namespaces", + rec: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "gpu-operator", Type: recipe.ComponentTypeHelm, Namespace: "gpu-operator"}, + {Name: "gpu-feature-discovery", Type: recipe.ComponentTypeHelm, Namespace: "gpu-operator"}, + }, + }, + expected: []string{"gpu-operator"}, + }, + { + name: "skips helm without namespace", + rec: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "gpu-operator", Type: recipe.ComponentTypeHelm, Namespace: "gpu-operator"}, + {Name: "orphan", Type: recipe.ComponentTypeHelm, Namespace: ""}, + }, + }, + expected: []string{"gpu-operator"}, + }, + { + name: "mixed helm and kustomize", + rec: &recipe.RecipeResult{ + ComponentRefs: []recipe.ComponentRef{ + {Name: "gpu-operator", Type: recipe.ComponentTypeHelm, Namespace: "gpu-operator"}, + {Name: "kustomize-app", Type: recipe.ComponentTypeKustomize, Namespace: "default"}, + {Name: "network-operator", Type: recipe.ComponentTypeHelm, Namespace: "network-operator"}, + }, + }, + expected: []string{"gpu-operator", "network-operator"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := helmNamespacesFromRecipe(tt.rec) + if len(got) != len(tt.expected) { + t.Fatalf("got %d namespaces, want %d: %v", len(got), len(tt.expected), 
got) + } + for i, ns := range got { + if ns != tt.expected[i] { + t.Errorf("namespace[%d] = %q, want %q", i, ns, tt.expected[i]) + } + } + }) + } +} + // hasFlag checks if a cli.Flag has the given name func hasFlag(flag interface{ Names() []string }, name string) bool { return slices.Contains(flag.Names(), name) diff --git a/pkg/collector/factory.go b/pkg/collector/factory.go index 12a17dd3..2c3b9d76 100644 --- a/pkg/collector/factory.go +++ b/pkg/collector/factory.go @@ -41,10 +41,19 @@ func WithSystemDServices(services []string) Option { } } +// WithHelmNamespaces configures the namespaces for Helm release collection. +// nil/empty = skip, ["*"] = all namespaces, ["ns1","ns2"] = scoped. +func WithHelmNamespaces(namespaces []string) Option { + return func(f *DefaultFactory) { + f.HelmNamespaces = namespaces + } +} + // DefaultFactory is the standard implementation of Factory that creates collectors // with production dependencies. It configures default systemd services to monitor. type DefaultFactory struct { SystemDServices []string + HelmNamespaces []string } // NewDefaultFactory creates a new DefaultFactory with default configuration. @@ -86,5 +95,7 @@ func (f *DefaultFactory) CreateOSCollector() Collector { // CreateKubernetesCollector creates a Kubernetes API collector. 
func (f *DefaultFactory) CreateKubernetesCollector() Collector { - return &k8s.Collector{} + return &k8s.Collector{ + HelmNamespaces: f.HelmNamespaces, + } } diff --git a/pkg/collector/factory_test.go b/pkg/collector/factory_test.go index 21668be3..5d7bd13c 100644 --- a/pkg/collector/factory_test.go +++ b/pkg/collector/factory_test.go @@ -18,6 +18,7 @@ import ( "context" "testing" + "github.com/NVIDIA/aicr/pkg/collector/k8s" "github.com/NVIDIA/aicr/pkg/collector/systemd" ) @@ -87,6 +88,50 @@ func TestWithSystemDServices(t *testing.T) { } } +func TestWithHelmNamespaces(t *testing.T) { + tests := []struct { + name string + namespaces []string + wantLen int + }{ + { + name: "nil namespaces", + namespaces: nil, + wantLen: 0, + }, + { + name: "all namespaces", + namespaces: []string{"*"}, + wantLen: 1, + }, + { + name: "scoped namespaces", + namespaces: []string{"gpu-operator", "network-operator"}, + wantLen: 2, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + factory := NewDefaultFactory(WithHelmNamespaces(tt.namespaces)) + + if len(factory.HelmNamespaces) != tt.wantLen { + t.Errorf("expected %d namespaces, got %d", tt.wantLen, len(factory.HelmNamespaces)) + } + + // Verify K8s collector gets the namespaces + col := factory.CreateKubernetesCollector() + k8sCol, ok := col.(*k8s.Collector) + if !ok { + t.Fatal("expected *k8s.Collector") + } + if len(k8sCol.HelmNamespaces) != tt.wantLen { + t.Errorf("K8s collector expected %d namespaces, got %d", tt.wantLen, len(k8sCol.HelmNamespaces)) + } + }) + } +} + func TestNewDefaultFactory_Defaults(t *testing.T) { factory := NewDefaultFactory() diff --git a/pkg/collector/k8s/helm.go b/pkg/collector/k8s/helm.go index ebe0b795..17412b1e 100644 --- a/pkg/collector/k8s/helm.go +++ b/pkg/collector/k8s/helm.go @@ -26,41 +26,67 @@ import ( "helm.sh/helm/v3/pkg/storage/driver" ) -// collectHelmReleases discovers all deployed Helm releases across all namespaces -// and returns their metadata and user-supplied 
values as measurement readings. +// collectHelmReleasesScoped collects Helm releases based on HelmNamespaces config. +// nil/empty = skip collection, ["*"] = all namespaces, ["ns1","ns2"] = scoped. +func (k *Collector) collectHelmReleasesScoped(ctx context.Context) map[string]measurement.Reading { + if len(k.HelmNamespaces) == 0 { + slog.Debug("helm collection skipped - no namespaces configured") + return make(map[string]measurement.Reading) + } + + if len(k.HelmNamespaces) == 1 && k.HelmNamespaces[0] == "*" { + return k.collectHelmReleasesInNamespace(ctx, "") + } + + data := make(map[string]measurement.Reading) + for _, ns := range k.HelmNamespaces { + if err := ctx.Err(); err != nil { + slog.Debug("helm collector context cancelled", slog.String("error", err.Error())) + return data + } + nsData := k.collectHelmReleasesInNamespace(ctx, ns) + for key, val := range nsData { + data[key] = val + } + } + return data +} + +// collectHelmReleasesInNamespace discovers deployed Helm releases in a single namespace +// (or all namespaces when namespace is ""). // On any error, it degrades gracefully by returning an empty map. -func (k *Collector) collectHelmReleases(ctx context.Context) map[string]measurement.Reading { +func (k *Collector) collectHelmReleasesInNamespace(ctx context.Context, namespace string) map[string]measurement.Reading { if err := ctx.Err(); err != nil { slog.Debug("helm collector context cancelled", slog.String("error", err.Error())) return make(map[string]measurement.Reading) } - // Create a Helm secrets storage driver using all-namespaces (""). 
- d := driver.NewSecrets(k.ClientSet.CoreV1().Secrets("")) + d := driver.NewSecrets(k.ClientSet.CoreV1().Secrets(namespace)) store := storage.Init(d) releases, err := store.ListDeployed() if err != nil { - slog.Warn("failed to list helm releases", slog.String("error", err.Error())) + slog.Warn("failed to list helm releases", + slog.String("namespace", namespace), + slog.String("error", err.Error())) return make(map[string]measurement.Reading) } - // Deduplicate: keep only the highest revision per release name+namespace. releases = latestReleases(releases) data := make(map[string]measurement.Reading) - for _, rel := range releases { if err := ctx.Err(); err != nil { slog.Debug("helm collector context cancelled during iteration", slog.String("error", err.Error())) return data } - mapRelease(rel, data) } - slog.Debug("collected helm releases", slog.Int("count", len(releases))) + slog.Debug("collected helm releases", + slog.String("namespace", namespace), + slog.Int("count", len(releases))) return data } diff --git a/pkg/collector/k8s/k8s.go b/pkg/collector/k8s/k8s.go index c38207e9..4b5b59b1 100644 --- a/pkg/collector/k8s/k8s.go +++ b/pkg/collector/k8s/k8s.go @@ -28,8 +28,9 @@ import ( // Collector collects information about the Kubernetes cluster. type Collector struct { - ClientSet kubernetes.Interface - RestConfig *rest.Config + ClientSet kubernetes.Interface + RestConfig *rest.Config + HelmNamespaces []string // nil/empty=skip, ["*"]=all, ["ns1","ns2"]=scoped } // Collect retrieves Kubernetes cluster information from the API server. 
@@ -68,7 +69,7 @@ func (k *Collector) Collect(ctx context.Context) (*measurement.Measurement, erro return k.collectNode(ctx) }) - helm := k.collectHelmReleases(ctx) + helm := k.collectHelmReleasesScoped(ctx) argocd := k.collectArgocdApplications(ctx) diff --git a/pkg/k8s/agent/deployer.go b/pkg/k8s/agent/deployer.go index 68104c08..3d67d584 100644 --- a/pkg/k8s/agent/deployer.go +++ b/pkg/k8s/agent/deployer.go @@ -54,6 +54,12 @@ func (d *Deployer) Deploy(ctx context.Context) error { return aicrerrors.Wrap(aicrerrors.ErrCodeInternal, "failed to create ClusterRoleBinding", err) } + if len(d.config.HelmNamespaces) > 0 { + if err := d.ensureHelmSecretRoles(ctx); err != nil { + return aicrerrors.Wrap(aicrerrors.ErrCodeInternal, "failed to create Helm secrets RBAC", err) + } + } + // Step 2: Ensure Job (delete existing + recreate) if err := d.ensureJob(ctx); err != nil { return aicrerrors.Wrap(aicrerrors.ErrCodeInternal, "failed to create Job", err) @@ -124,6 +130,14 @@ func (d *Deployer) Cleanup(ctx context.Context, opts CleanupOptions) error { deleted = append(deleted, fmt.Sprintf("ClusterRoleBinding %q", clusterRoleName)) } + if len(d.config.HelmNamespaces) > 0 { + if err := d.deleteHelmSecretRoles(ctx); err != nil { + errs = append(errs, fmt.Sprintf("Helm secrets RBAC: %v", err)) + } else { + deleted = append(deleted, fmt.Sprintf("Helm secrets RBAC (%d namespaces)", len(d.config.HelmNamespaces))) + } + } + // Log successful deletions if len(deleted) > 0 { slog.Debug("cleanup completed", slog.Int("deleted", len(deleted)), slog.Any("resources", deleted)) diff --git a/pkg/k8s/agent/deployer_test.go b/pkg/k8s/agent/deployer_test.go index d28d5a74..d9ef3bd2 100644 --- a/pkg/k8s/agent/deployer_test.go +++ b/pkg/k8s/agent/deployer_test.go @@ -124,9 +124,9 @@ func TestDeployer_EnsureRBAC(t *testing.T) { t.Fatalf("ClusterRole not found: %v", err) } - // Verify policy rules (nodes, pods, secrets, clusterpolicies, services) - if len(cr.Rules) != 5 { - t.Errorf("expected 5 
rules, got %d", len(cr.Rules)) + // Default: 4 rules (nodes, pods, clusterpolicies, services) - no secrets + if len(cr.Rules) != 4 { + t.Errorf("expected 4 rules (no secrets by default), got %d", len(cr.Rules)) } }) @@ -844,6 +844,235 @@ func TestDeployer_StreamLogs_NoPod(t *testing.T) { } } +func TestDeployer_EnsureClusterRole_AllNamespaces(t *testing.T) { + clientset := fake.NewClientset() + config := Config{ + Namespace: "test-namespace", + ServiceAccountName: testName, + JobName: testName, + Image: "ghcr.io/nvidia/aicr-validator:latest", + Output: "cm://test-namespace/aicr-snapshot", + HelmAllNamespaces: true, + } + deployer := NewDeployer(clientset, config) + ctx := context.Background() + + if err := deployer.ensureClusterRole(ctx); err != nil { + t.Fatalf("failed to create ClusterRole: %v", err) + } + + cr, err := clientset.RbacV1().ClusterRoles(). + Get(ctx, "aicr-node-reader", metav1.GetOptions{}) + if err != nil { + t.Fatalf("ClusterRole not found: %v", err) + } + + // HelmAllNamespaces: 5 rules (nodes, pods, clusterpolicies, services, secrets) + if len(cr.Rules) != 5 { + t.Errorf("expected 5 rules with HelmAllNamespaces, got %d", len(cr.Rules)) + } + + // Verify secrets rule is present + hasSecrets := false + for _, rule := range cr.Rules { + if len(rule.Resources) == 1 && rule.Resources[0] == "secrets" { + hasSecrets = true + break + } + } + if !hasSecrets { + t.Error("expected secrets rule in ClusterRole when HelmAllNamespaces=true") + } +} + +func TestDeployer_EnsureHelmSecretRoles(t *testing.T) { + clientset := fake.NewClientset() + config := Config{ + Namespace: "test-namespace", + ServiceAccountName: testName, + JobName: testName, + Image: "ghcr.io/nvidia/aicr-validator:latest", + Output: "cm://test-namespace/aicr-snapshot", + HelmNamespaces: []string{"gpu-operator", "network-operator"}, + } + deployer := NewDeployer(clientset, config) + ctx := context.Background() + + if err := deployer.ensureHelmSecretRoles(ctx); err != nil { + t.Fatalf("failed to 
create Helm secret roles: %v", err) + } + + for _, ns := range config.HelmNamespaces { + // Verify Role + role, err := clientset.RbacV1().Roles(ns). + Get(ctx, helmSecretRoleName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Role not found in namespace %q: %v", ns, err) + } + if len(role.Rules) != 1 || role.Rules[0].Resources[0] != "secrets" { + t.Errorf("expected secrets rule in Role for namespace %q", ns) + } + if role.Labels["app.kubernetes.io/component"] != "helm-secrets" { + t.Errorf("expected helm-secrets component label on Role in namespace %q", ns) + } + + // Verify RoleBinding + rb, err := clientset.RbacV1().RoleBindings(ns). + Get(ctx, helmSecretRoleName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("RoleBinding not found in namespace %q: %v", ns, err) + } + if rb.Subjects[0].Name != testName { + t.Errorf("expected subject %q, got %q in namespace %q", testName, rb.Subjects[0].Name, ns) + } + if rb.Subjects[0].Namespace != "test-namespace" { + t.Errorf("expected subject namespace %q, got %q", "test-namespace", rb.Subjects[0].Namespace) + } + } +} + +func TestDeployer_EnsureHelmSecretRoles_Idempotent(t *testing.T) { + clientset := fake.NewClientset() + config := Config{ + Namespace: "test-namespace", + ServiceAccountName: testName, + JobName: testName, + Image: "ghcr.io/nvidia/aicr-validator:latest", + Output: "cm://test-namespace/aicr-snapshot", + HelmNamespaces: []string{"gpu-operator"}, + } + deployer := NewDeployer(clientset, config) + ctx := context.Background() + + if err := deployer.ensureHelmSecretRoles(ctx); err != nil { + t.Fatalf("first create failed: %v", err) + } + + if err := deployer.ensureHelmSecretRoles(ctx); err != nil { + t.Fatalf("second create failed (not idempotent): %v", err) + } +} + +func TestDeployer_Cleanup_WithHelmNamespaces(t *testing.T) { + clientset := fake.NewClientset() + + clientset.PrependReactor("create", "selfsubjectaccessreviews", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, 
&authv1.SelfSubjectAccessReview{ + Status: authv1.SubjectAccessReviewStatus{ + Allowed: true, + Reason: "test permissions allowed", + }, + }, nil + }) + + config := Config{ + Namespace: "test-namespace", + ServiceAccountName: testName, + JobName: testName, + Image: "ghcr.io/nvidia/aicr-validator:latest", + Output: "cm://test-namespace/aicr-snapshot", + HelmNamespaces: []string{"gpu-operator", "network-operator"}, + } + deployer := NewDeployer(clientset, config) + ctx := context.Background() + + if err := deployer.Deploy(ctx); err != nil { + t.Fatalf("Deploy() failed: %v", err) + } + + // Verify Helm secret roles were created + for _, ns := range config.HelmNamespaces { + _, err := clientset.RbacV1().Roles(ns).Get(ctx, helmSecretRoleName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Helm Role not found in namespace %q: %v", ns, err) + } + } + + // Cleanup + if err := deployer.Cleanup(ctx, CleanupOptions{Enabled: true}); err != nil { + t.Fatalf("Cleanup() failed: %v", err) + } + + // Verify Helm secret roles were deleted + for _, ns := range config.HelmNamespaces { + _, err := clientset.RbacV1().Roles(ns).Get(ctx, helmSecretRoleName, metav1.GetOptions{}) + if err == nil { + t.Errorf("Helm Role should be deleted in namespace %q", ns) + } + _, err = clientset.RbacV1().RoleBindings(ns).Get(ctx, helmSecretRoleName, metav1.GetOptions{}) + if err == nil { + t.Errorf("Helm RoleBinding should be deleted in namespace %q", ns) + } + } +} + +func TestBuildJob_HelmNamespacesEnvVar(t *testing.T) { + tests := []struct { + name string + helmNamespaces []string + helmAllNamespaces bool + wantEnvValue string + wantEnvPresent bool + }{ + { + name: "no helm config", + wantEnvPresent: false, + }, + { + name: "all namespaces", + helmAllNamespaces: true, + wantEnvValue: "*", + wantEnvPresent: true, + }, + { + name: "scoped namespaces", + helmNamespaces: []string{"gpu-operator", "network-operator"}, + wantEnvValue: "gpu-operator,network-operator", + wantEnvPresent: true, + }, + { + 
name: "single namespace", + helmNamespaces: []string{"gpu-operator"}, + wantEnvValue: "gpu-operator", + wantEnvPresent: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + config := Config{ + Namespace: "test-namespace", + ServiceAccountName: testName, + JobName: testName, + Image: "ghcr.io/nvidia/aicr-validator:latest", + Output: "cm://test-namespace/aicr-snapshot", + HelmNamespaces: tt.helmNamespaces, + HelmAllNamespaces: tt.helmAllNamespaces, + } + deployer := NewDeployer(fake.NewClientset(), config) + job := deployer.buildJob() + + container := job.Spec.Template.Spec.Containers[0] + var found bool + var gotValue string + for _, env := range container.Env { + if env.Name == "AICR_HELM_NAMESPACES" { + found = true + gotValue = env.Value + break + } + } + + if found != tt.wantEnvPresent { + t.Errorf("AICR_HELM_NAMESPACES present=%v, want=%v", found, tt.wantEnvPresent) + } + if found && gotValue != tt.wantEnvValue { + t.Errorf("AICR_HELM_NAMESPACES=%q, want=%q", gotValue, tt.wantEnvValue) + } + }) + } +} + // Helper function func containsVerb(verbs []string, verb string) bool { for _, v := range verbs { diff --git a/pkg/k8s/agent/job.go b/pkg/k8s/agent/job.go index be47b8b0..2c102b4f 100644 --- a/pkg/k8s/agent/job.go +++ b/pkg/k8s/agent/job.go @@ -16,6 +16,7 @@ package agent import ( "context" + "strings" "time" aicrerrors "github.com/NVIDIA/aicr/pkg/errors" @@ -117,24 +118,7 @@ func (d *Deployer) buildPodSpec(args []string) corev1.PodSpec { Image: d.config.Image, Command: []string{"aicr"}, Args: args, - Env: []corev1.EnvVar{ - { - Name: "AICR_AGENT_MODE", - Value: "true", - }, - { - Name: "AICR_LOG_PREFIX", - Value: "agent", - }, - { - Name: "NODE_NAME", - ValueFrom: &corev1.EnvVarSource{ - FieldRef: &corev1.ObjectFieldSelector{ - FieldPath: "spec.nodeName", - }, - }, - }, - }, + Env: d.buildEnvVars(), VolumeMounts: []corev1.VolumeMount{ { Name: "tmp", @@ -276,6 +260,50 @@ func (d *Deployer) applyRestrictedSettings(spec 
*corev1.PodSpec) { } } +// buildEnvVars constructs the environment variables for the agent container. +func (d *Deployer) buildEnvVars() []corev1.EnvVar { + envVars := []corev1.EnvVar{ + { + Name: "AICR_AGENT_MODE", + Value: "true", + }, + { + Name: "AICR_LOG_PREFIX", + Value: "agent", + }, + { + Name: "NODE_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "spec.nodeName", + }, + }, + }, + } + + helmNS := d.helmNamespacesEnvValue() + if helmNS != "" { + envVars = append(envVars, corev1.EnvVar{ + Name: "AICR_HELM_NAMESPACES", + Value: helmNS, + }) + } + + return envVars +} + +// helmNamespacesEnvValue returns the value for AICR_HELM_NAMESPACES env var. +// Returns "*" for all-namespaces, comma-joined for scoped, or "" for none. +func (d *Deployer) helmNamespacesEnvValue() string { + if d.config.HelmAllNamespaces { + return "*" + } + if len(d.config.HelmNamespaces) > 0 { + return strings.Join(d.config.HelmNamespaces, ",") + } + return "" +} + // deleteJob deletes the Job. 
func (d *Deployer) deleteJob(ctx context.Context) error { propagationPolicy := metav1.DeletePropagationForeground diff --git a/pkg/k8s/agent/permissions.go b/pkg/k8s/agent/permissions.go index a31f83fc..72b0d499 100644 --- a/pkg/k8s/agent/permissions.go +++ b/pkg/k8s/agent/permissions.go @@ -39,26 +39,37 @@ type PermissionCheck struct { func (d *Deployer) CheckPermissions(ctx context.Context) ([]PermissionCheck, error) { checks := []PermissionCheck{} - // Required permissions for deployment - requiredChecks := []struct { + type permCheck struct { resource string verb string namespace string - }{ + } + + // Required permissions for deployment + requiredChecks := make([]permCheck, 0, 9+len(d.config.HelmNamespaces)*2) + requiredChecks = append(requiredChecks, // Namespace-scoped resources - {"serviceaccounts", "create", d.config.Namespace}, - {"roles", "create", d.config.Namespace}, - {"rolebindings", "create", d.config.Namespace}, - {"jobs", "create", d.config.Namespace}, - {"configmaps", "get", d.config.Namespace}, - {"configmaps", "list", d.config.Namespace}, + permCheck{"serviceaccounts", "create", d.config.Namespace}, + permCheck{"roles", "create", d.config.Namespace}, + permCheck{"rolebindings", "create", d.config.Namespace}, + permCheck{"jobs", "create", d.config.Namespace}, + permCheck{"configmaps", "get", d.config.Namespace}, + permCheck{"configmaps", "list", d.config.Namespace}, // Cluster-scoped resources - {"clusterroles", "create", ""}, - {"clusterrolebindings", "create", ""}, + permCheck{"clusterroles", "create", ""}, + permCheck{"clusterrolebindings", "create", ""}, // Cleanup permissions - {"jobs", "delete", d.config.Namespace}, + permCheck{"jobs", "delete", d.config.Namespace}, + ) + + // Add per-namespace permission checks for Helm secrets RBAC + for _, ns := range d.config.HelmNamespaces { + requiredChecks = append(requiredChecks, + permCheck{"roles", "create", ns}, + permCheck{"rolebindings", "create", ns}, + ) } var missingPermissions []string 
diff --git a/pkg/k8s/agent/rbac.go b/pkg/k8s/agent/rbac.go index 46b580a6..c700f8f6 100644 --- a/pkg/k8s/agent/rbac.go +++ b/pkg/k8s/agent/rbac.go @@ -89,46 +89,133 @@ func (d *Deployer) ensureRoleBinding(ctx context.Context) error { return k8s.IgnoreAlreadyExists(err) } +// helmSecretRoleName is the name used for per-namespace Helm secrets Roles and RoleBindings. +const helmSecretRoleName = "aicr-helm-secrets" + // ensureClusterRole creates the ClusterRole for node and cluster-wide resource access. +// Secrets access is only included when HelmAllNamespaces is true. // If the ClusterRole already exists, this is a no-op (idempotent). func (d *Deployer) ensureClusterRole(ctx context.Context) error { + rules := []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"nodes"}, + Verbs: []string{"get", "list"}, + }, + { + APIGroups: []string{""}, + Resources: []string{"pods"}, + Verbs: []string{"get", "list"}, + }, + { + APIGroups: []string{"nvidia.com"}, + Resources: []string{"clusterpolicies"}, + Verbs: []string{"get", "list"}, + }, + { + APIGroups: []string{""}, + Resources: []string{"services"}, + Verbs: []string{"get", "list"}, + }, + } + + if d.config.HelmAllNamespaces { + rules = append(rules, rbacv1.PolicyRule{ + APIGroups: []string{""}, + Resources: []string{"secrets"}, + Verbs: []string{"get", "list"}, + }) + } + cr := &rbacv1.ClusterRole{ ObjectMeta: metav1.ObjectMeta{ Name: clusterRoleName, }, - Rules: []rbacv1.PolicyRule{ - { - APIGroups: []string{""}, - Resources: []string{"nodes"}, - Verbs: []string{"get", "list"}, - }, - { - APIGroups: []string{""}, - Resources: []string{"pods"}, - Verbs: []string{"get", "list"}, + Rules: rules, + } + + _, err := d.clientset.RbacV1().ClusterRoles().Create(ctx, cr, metav1.CreateOptions{}) + return k8s.IgnoreAlreadyExists(err) +} + +// ensureHelmSecretRoles creates per-namespace Roles and RoleBindings for Helm secrets access. 
+// Each namespace gets a Role with secrets get/list and a RoleBinding to the agent ServiceAccount. +func (d *Deployer) ensureHelmSecretRoles(ctx context.Context) error { + for _, ns := range d.config.HelmNamespaces { + if err := d.ensureHelmSecretRole(ctx, ns); err != nil { + return err + } + if err := d.ensureHelmSecretRoleBinding(ctx, ns); err != nil { + return err + } + } + return nil +} + +func (d *Deployer) ensureHelmSecretRole(ctx context.Context, namespace string) error { + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: helmSecretRoleName, + Namespace: namespace, + Labels: map[string]string{ + "app.kubernetes.io/name": "aicr", + "app.kubernetes.io/component": "helm-secrets", }, + }, + Rules: []rbacv1.PolicyRule{ { APIGroups: []string{""}, Resources: []string{"secrets"}, Verbs: []string{"get", "list"}, }, - { - APIGroups: []string{"nvidia.com"}, - Resources: []string{"clusterpolicies"}, - Verbs: []string{"get", "list"}, + }, + } + + _, err := d.clientset.RbacV1().Roles(namespace).Create(ctx, role, metav1.CreateOptions{}) + return k8s.IgnoreAlreadyExists(err) +} + +func (d *Deployer) ensureHelmSecretRoleBinding(ctx context.Context, namespace string) error { + rb := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: helmSecretRoleName, + Namespace: namespace, + Labels: map[string]string{ + "app.kubernetes.io/name": "aicr", + "app.kubernetes.io/component": "helm-secrets", }, + }, + Subjects: []rbacv1.Subject{ { - APIGroups: []string{""}, - Resources: []string{"services"}, - Verbs: []string{"get", "list"}, + Kind: "ServiceAccount", + Name: d.config.ServiceAccountName, + Namespace: d.config.Namespace, }, }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "Role", + Name: helmSecretRoleName, + }, } - _, err := d.clientset.RbacV1().ClusterRoles().Create(ctx, cr, metav1.CreateOptions{}) + _, err := d.clientset.RbacV1().RoleBindings(namespace).Create(ctx, rb, metav1.CreateOptions{}) return 
k8s.IgnoreAlreadyExists(err) } +// deleteHelmSecretRoles removes per-namespace Roles and RoleBindings for Helm secrets. +func (d *Deployer) deleteHelmSecretRoles(ctx context.Context) error { + for _, ns := range d.config.HelmNamespaces { + if err := d.clientset.RbacV1().Roles(ns).Delete(ctx, helmSecretRoleName, metav1.DeleteOptions{}); k8s.IgnoreNotFound(err) != nil { + return err + } + if err := d.clientset.RbacV1().RoleBindings(ns).Delete(ctx, helmSecretRoleName, metav1.DeleteOptions{}); k8s.IgnoreNotFound(err) != nil { + return err + } + } + return nil +} + // ensureClusterRoleBinding creates the ClusterRoleBinding to bind the ClusterRole to the ServiceAccount. // If the ClusterRoleBinding already exists, this is a no-op (idempotent). func (d *Deployer) ensureClusterRoleBinding(ctx context.Context) error { diff --git a/pkg/k8s/agent/types.go b/pkg/k8s/agent/types.go index ceb19ca1..f46dec39 100644 --- a/pkg/k8s/agent/types.go +++ b/pkg/k8s/agent/types.go @@ -33,8 +33,10 @@ type Config struct { Tolerations []corev1.Toleration Output string Debug bool - Privileged bool // If true, run with privileged security context (required for GPU/SystemD collectors) - RequireGPU bool // If true, request nvidia.com/gpu resource (required for CDI environments) + Privileged bool // If true, run with privileged security context (required for GPU/SystemD collectors) + RequireGPU bool // If true, request nvidia.com/gpu resource (required for CDI environments) + HelmNamespaces []string // Per-NS secrets access for Helm release collection + HelmAllNamespaces bool // Cluster-wide secrets access for Helm collection } // Deployer manages the deployment and lifecycle of the agent Job. diff --git a/pkg/snapshotter/agent.go b/pkg/snapshotter/agent.go index f1309559..93fa3dca 100644 --- a/pkg/snapshotter/agent.go +++ b/pkg/snapshotter/agent.go @@ -89,6 +89,12 @@ type AgentConfig struct { // TemplatePath is the path to a Go template file for custom output formatting. 
// When set, the snapshot output will be processed through this template.
 	TemplatePath string
+
+	// HelmNamespaces lists namespaces for scoped Helm release collection.
+	HelmNamespaces []string
+
+	// HelmAllNamespaces enables cluster-wide Helm secrets access.
+	HelmAllNamespaces bool
 }
 
 // DeployAndGetSnapshot deploys an agent to capture a snapshot and returns the Snapshot struct.
@@ -130,6 +136,8 @@ func DeployAndGetSnapshot(ctx context.Context, config *AgentConfig) (*Snapshot,
 		Debug: config.Debug,
 		Privileged: config.Privileged,
 		RequireGPU: config.RequireGPU,
+		HelmNamespaces: config.HelmNamespaces,
+		HelmAllNamespaces: config.HelmAllNamespaces,
 	}
 
 	// Create deployer
@@ -392,6 +400,8 @@ func (n *NodeSnapshotter) measureWithAgent(ctx context.Context) error {
 		Debug: n.AgentConfig.Debug,
 		Privileged: n.AgentConfig.Privileged,
 		RequireGPU: n.AgentConfig.RequireGPU,
+		HelmNamespaces: n.AgentConfig.HelmNamespaces,
+		HelmAllNamespaces: n.AgentConfig.HelmAllNamespaces,
 	}
 
 	// Create deployer
diff --git a/pkg/snapshotter/snapshot.go b/pkg/snapshotter/snapshot.go
index 3d5b83fc..38ae2f68 100644
--- a/pkg/snapshotter/snapshot.go
+++ b/pkg/snapshotter/snapshot.go
@@ -17,6 +17,8 @@ package snapshotter
 import (
 	"context"
 	"log/slog"
+	"os"
+	"strings"
 	"sync"
 	"time"
 
@@ -63,10 +65,41 @@ func (n *NodeSnapshotter) Measure(ctx context.Context) error {
 	return n.measure(ctx)
 }
 
+// parseHelmNamespacesEnv reads the AICR_HELM_NAMESPACES env var set by the agent Job.
+// Returns nil for empty (skip), ["*"] for all, or split namespaces for scoped.
+// Entries are whitespace-trimmed and blank entries are dropped, so "a, b" and
+// "a,b," both yield ["a" "b"]; a value holding only separators returns nil.
+func parseHelmNamespacesEnv() []string {
+	val := strings.TrimSpace(os.Getenv("AICR_HELM_NAMESPACES"))
+	if val == "" {
+		return nil
+	}
+	if val == "*" {
+		return []string{"*"}
+	}
+
+	parts := strings.Split(val, ",")
+	namespaces := make([]string, 0, len(parts))
+	for _, part := range parts {
+		// Skip blanks left by stray commas: an empty or padded string is not a valid namespace name.
+		if ns := strings.TrimSpace(part); ns != "" {
+			namespaces = append(namespaces, ns)
+		}
+	}
+	if len(namespaces) == 0 {
+		return nil
+	}
+	return namespaces
+}
+
 // measure collects configuration measurements from the current node.
func (n *NodeSnapshotter) measure(ctx context.Context) error {
 	if n.Factory == nil {
-		n.Factory = collector.NewDefaultFactory()
+		var opts []collector.Option
+		if helmNS := parseHelmNamespacesEnv(); len(helmNS) > 0 {
+			opts = append(opts, collector.WithHelmNamespaces(helmNS))
+		}
+		n.Factory = collector.NewDefaultFactory(opts...)
 	}
 
 	slog.Debug("starting node snapshot")
diff --git a/pkg/validator/checks/deployment/helm_values_check.go b/pkg/validator/checks/deployment/helm_values_check.go
index c79544ff..3b432a62 100644
--- a/pkg/validator/checks/deployment/helm_values_check.go
+++ b/pkg/validator/checks/deployment/helm_values_check.go
@@ -19,6 +19,7 @@ import (
 	"fmt"
 	"log/slog"
 	"sort"
+	"strconv"
 	"strings"
 
 	"github.com/NVIDIA/aicr/pkg/errors"
@@ -163,7 +164,41 @@ func flattenValuesRecursive(data map[string]any, prefix string, result map[strin
 }
 
 // valuesEqual compares two string representations of values, normalizing
-// common type differences (e.g., "true"/"true", "1"/"1").
+// common type differences between YAML parsing and Helm's JSON storage.
+// For example, YAML parses 1 as int while JSON parses it as float64;
+// both flatten to "1" but edge cases like "1.0" vs "1" need handling.
+//
+// After trimming surrounding whitespace the values are equal when they are
+// identical strings, when both parse as float64 to the same number, or when
+// both spell the same boolean word ("true"/"false") in any letter case.
+//
+// Booleans are matched by word rather than with strconv.ParseBool, because
+// ParseBool also accepts "1", "0", "t" and "f"; that would make "1" equal
+// "true" and let a number-vs-boolean drift pass unnoticed.
 func valuesEqual(expected, actual string) bool {
-	return strings.TrimSpace(expected) == strings.TrimSpace(actual)
+	expected = strings.TrimSpace(expected)
+	actual = strings.TrimSpace(actual)
+
+	if expected == actual {
+		return true
+	}
+
+	// Numeric normalization: parse both as float64 to handle
+	// int vs float differences (YAML int 1 vs JSON float64 1.0).
+	if ef, ee := strconv.ParseFloat(expected, 64); ee == nil {
+		if af, ae := strconv.ParseFloat(actual, 64); ae == nil {
+			return ef == af
+		}
+	}
+
+	// Boolean normalization: case-insensitive comparison
+	// handles "True"/"true", "FALSE"/"false".
+	isBoolWord := func(s string) bool {
+		return strings.EqualFold(s, "true") || strings.EqualFold(s, "false")
+	}
+	if isBoolWord(expected) && isBoolWord(actual) {
+		return strings.EqualFold(expected, actual)
+	}
+
+	return false
 }
diff --git a/pkg/validator/checks/deployment/helm_values_check_test.go b/pkg/validator/checks/deployment/helm_values_check_test.go
new file mode 100644
index 00000000..667ce76a
--- /dev/null
+++ b/pkg/validator/checks/deployment/helm_values_check_test.go
@@ -0,0 +1,53 @@
+// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package deployment
+
+import (
+	"testing"
+
+	"github.com/NVIDIA/aicr/pkg/validator/checks"
+)
+
+// TestCheckHelmValues is the integration test for helm-values.
+// This runs inside validator Jobs and invokes the validator.
+func TestCheckHelmValues(t *testing.T) {
+	if testing.Short() { // integration-only: skipped when tests run with -short
+		t.Skip("Skipping integration test in short mode")
+	}
+
+	// Load Job environment
+	runner, err := checks.NewTestRunner(t)
+	if err != nil {
+		t.Skipf("Not in Job environment: %v", err) // a runner that cannot be built is reported as a skip, not a failure
+	}
+	defer runner.Cancel()
+
+	// Check if this check is enabled in recipe
+	if !runner.HasCheck("deployment", "helm-values") {
+		t.Skip("Check helm-values not enabled in recipe")
+	}
+
+	t.Logf("Running check: helm-values")
+
+	// Run the validator
+	ctx := runner.Context()
+	err = validateHelmValues(ctx) // the check under test; a non-nil error fails the test below
+
+	if err != nil {
+		t.Errorf("Check failed: %v", err)
+	} else {
+		t.Logf("✓ Check passed: helm-values")
+	}
+}
diff --git a/pkg/validator/checks/deployment/helm_values_check_unit_test.go b/pkg/validator/checks/deployment/helm_values_check_unit_test.go
index 6d23268f..86f4a4c6 100644
--- a/pkg/validator/checks/deployment/helm_values_check_unit_test.go
+++ b/pkg/validator/checks/deployment/helm_values_check_unit_test.go
@@ -26,7 +26,7 @@ import (
 	"github.com/NVIDIA/aicr/pkg/validator/checks"
 )
 
-func TestCheckHelmValues(t *testing.T) {
+func TestValidateHelmValues(t *testing.T) { // presumably renamed because helm_values_check_test.go now defines TestCheckHelmValues — confirm both files share package deployment
 	tests := []struct {
 		name string
 		setup func() *checks.ValidationContext
@@ -246,6 +246,38 @@ func TestCheckHelmValues(t *testing.T) {
 			},
 			wantErr: false,
 		},
+		{
+			name: "float vs int normalization - 1.0 equals 1",
+			setup: func() *checks.ValidationContext {
+				return &checks.ValidationContext{
+					Context: context.Background(),
+					Snapshot: snapshotWithHelm(map[string]string{
+						"gpu-operator.chart": "gpu-operator",
+						"gpu-operator.values.replicas": "1.0", // snapshot side carries the float spelling
+					}),
+					Recipe: recipeWithOverrides(map[string]any{
+						"replicas": 1, // recipe side carries the int
+					}),
+				}
+			},
+			wantErr: false,
+		},
+		{
+			name: "boolean case normalization - True equals true",
+			setup: func() *checks.ValidationContext {
+				return &checks.ValidationContext{
+					Context: context.Background(),
+					Snapshot: snapshotWithHelm(map[string]string{
+						"gpu-operator.chart": "gpu-operator",
+						"gpu-operator.values.driver.enabled": "True", // capitalised on the snapshot side only
+					}),
+					Recipe: recipeWithOverrides(map[string]any{
+						"driver": map[string]any{"enabled": true},
+					}),
+				}
+			},
+			wantErr: false,
+		},
 		{
 			name: "snapshot key not present for recipe key - skip that key",
 			setup: func() *checks.ValidationContext {
@@ -402,6 +434,38 @@ func TestFlattenValues(t *testing.T) {
 	}
 }
 
+func TestValuesEqual(t *testing.T) { // table-driven cases calling valuesEqual(expected, actual) directly
+	tests := []struct {
+		name string
+		expected string
+		actual string
+		want bool
+	}{
+		{"exact match", "foo", "foo", true},
+		{"whitespace trim", " foo ", "foo", true},
+		{"different strings", "foo", "bar", false},
+		{"int vs float same value", "1", "1.0", true},
+		{"float vs int same value", "3.0", "3", true},
+		{"different numbers", "1", "2", false},
+		{"float precision", "3.14", "3.14", true},
+		{"float mismatch", "3.14", "3.15", false},
+		{"bool true case insensitive", "true", "True", true},
+		{"bool false case insensitive", "false", "FALSE", true},
+		{"bool mismatch", "true", "false", false},
+		{"bool vs string", "true", "yes", false},
+		{"numeric string vs non-numeric", "1", "one", false},
+		{"empty strings", "", "", true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := valuesEqual(tt.expected, tt.actual); got != tt.want {
+				t.Errorf("valuesEqual(%q, %q) = %v, want %v", tt.expected, tt.actual, got, tt.want)
+			}
+		})
+	}
+}
+
 // snapshotWithHelm creates a snapshot with K8s helm subtype data from flat key-value pairs.
 func snapshotWithHelm(data map[string]string) *snapshotter.Snapshot {
 	helmData := make(map[string]measurement.Reading, len(data))
diff --git a/tests/chainsaw/cli/helm-values-discovery/chainsaw-test.yaml b/tests/chainsaw/cli/helm-values-discovery/chainsaw-test.yaml
new file mode 100644
index 00000000..c91e9a3c
--- /dev/null
+++ b/tests/chainsaw/cli/helm-values-discovery/chainsaw-test.yaml
@@ -0,0 +1,238 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+  name: helm-values-discovery
+spec:  # NOTE(review): the Flow in the description stops at step 5; the steps below also cover --skip-helm-check (6) and flag discovery (7)
+  description: |
+    Validates the helm-values deployment check discovers deployed Helm components.
+    Flow:
+    1. Generate recipe with Helm components (gpu-operator)
+    2. Create mock snapshot with matching Helm release data
+    3. Run deployment-phase validate (--no-cluster)
+    4. Verify helm-values check passes (values match)
+    5. Create mismatched snapshot, verify check detects drift
+    No cluster needed. Run with:
+    AICR_BIN=$(pwd)/dist/e2e/aicr chainsaw test --no-cluster --test-dir tests/chainsaw/cli/helm-values-discovery
+  timeouts:
+    exec: 30s  # presumably bounds each script operation below — see the Chainsaw timeouts reference
+  steps:  # every step shares the fixed work directory /tmp/chainsaw-helm-values-discovery
+
+  # ── Step 1: Generate recipe ──────────────────────────────────────
+  - name: generate-recipe  # recreates the work directory and writes recipe.yaml for the later steps
+    description: Generate an EKS H100 training recipe with gpu-operator component.
+    try:
+    - script:
+        content: |
+          AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          rm -rf "${WORK}" && mkdir -p "${WORK}"
+          ${AICR_BIN} recipe \
+            --service eks \
+            --accelerator h100 \
+            --intent training \
+            --os ubuntu \
+            --output "${WORK}/recipe.yaml"
+
+  - name: verify-recipe-has-helm-components
+    description: Verify recipe includes gpu-operator as a Helm component.
+    try:
+    - script:
+        content: |
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          grep -q "gpu-operator" "${WORK}/recipe.yaml"
+          grep -q "type: Helm" "${WORK}/recipe.yaml"
+        check:  # the step passes only when the script finished without an error
+          ($error == null): true
+
+  # ── Step 2: Create mock snapshot with matching Helm data ─────────
+  - name: create-matching-snapshot  # the quoted 'SNAP' delimiter keeps the heredoc body literal (no shell expansion)
+    description: Create a mock snapshot with Helm release data that matches the recipe values.
+    try:
+    - script:
+        content: |
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          cat > "${WORK}/snapshot-matching.yaml" << 'SNAP'
+          measurements:
+            - type: K8s
+              subtypes:
+                - name: server
+                  data:
+                    version: v1.33.0
+                - name: helm
+                  data:
+                    gpu-operator.chart: gpu-operator
+                    gpu-operator.namespace: gpu-operator
+                    gpu-operator.revision: "1"
+                    gpu-operator.status: deployed
+                    gpu-operator.version: "25.3.0"
+                    gpu-operator.values.driver.version: "580.105.08"
+                    gpu-operator.values.driver.enabled: "true"
+                    gpu-operator.values.driver.useOpenKernelModules: "true"
+                    gpu-operator.values.driver.rdma.enabled: "true"
+                    gpu-operator.values.dcgm.enabled: "true"
+                    gpu-operator.values.toolkit.enabled: "true"
+                    gpu-operator.values.gfd.enabled: "true"
+                    gpu-operator.values.gdrcopy.enabled: "true"
+                    gpu-operator.values.gdrcopy.version: v2.5
+                    gpu-operator.values.fullnameOverride: gpu-operator
+                    gpu-operator.values.operator.upgradeCRD: "true"
+                    gpu-operator.values.migManager.enabled: "true"
+            - type: OS
+              subtypes:
+                - name: release
+                  data:
+                    ID: ubuntu
+                    VERSION_ID: "24.04"
+          SNAP
+
+  # ── Step 3: Validate with matching snapshot (should pass) ────────
+  - name: validate-deployment-matching  # --fail-on-error=false is passed, so the exit status alone does not prove the check passed
+    description: Run deployment-phase validation. Helm-values check should pass with matching data.
+    try:
+    - script:
+        content: |
+          AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          ${AICR_BIN} validate \
+            --recipe "${WORK}/recipe.yaml" \
+            --snapshot "${WORK}/snapshot-matching.yaml" \
+            --phase deployment \
+            --no-cluster \
+            --fail-on-error=false \
+            --output "${WORK}/result-matching.yaml" 2>&1
+        check:
+          ($error == null): true
+
+  - name: assert-helm-values-check-ran  # only confirms the text "helm-values" appears in the result file
+    description: Verify the helm-values check appears in the deployment phase results.
+    try:
+    - script:
+        content: |
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          grep -q "helm-values" "${WORK}/result-matching.yaml"
+        check:
+          ($error == null): true
+
+  # ── Step 4: Create mismatched snapshot (driver version drift) ────
+  - name: create-mismatched-snapshot  # driver.version is 550.99.99 here versus 580.105.08 in the matching snapshot
+    description: Create a mock snapshot where driver.version differs from recipe.
+    try:
+    - script:
+        content: |
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          cat > "${WORK}/snapshot-mismatched.yaml" << 'SNAP'
+          measurements:
+            - type: K8s
+              subtypes:
+                - name: server
+                  data:
+                    version: v1.33.0
+                - name: helm
+                  data:
+                    gpu-operator.chart: gpu-operator
+                    gpu-operator.namespace: gpu-operator
+                    gpu-operator.revision: "1"
+                    gpu-operator.status: deployed
+                    gpu-operator.version: "25.3.0"
+                    gpu-operator.values.driver.version: "550.99.99"
+                    gpu-operator.values.driver.enabled: "true"
+                    gpu-operator.values.dcgm.enabled: "true"
+                    gpu-operator.values.toolkit.enabled: "true"
+                    gpu-operator.values.fullnameOverride: gpu-operator
+            - type: OS
+              subtypes:
+                - name: release
+                  data:
+                    ID: ubuntu
+                    VERSION_ID: "24.04"
+          SNAP
+
+  # ── Step 5: Validate with mismatched snapshot (should detect drift) ──
+  - name: validate-deployment-mismatched  # the command itself must still exit cleanly; drift is asserted by the next step
+    description: Run deployment-phase validation. Helm-values check should detect driver version drift.
+    try:
+    - script:
+        content: |
+          AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          ${AICR_BIN} validate \
+            --recipe "${WORK}/recipe.yaml" \
+            --snapshot "${WORK}/snapshot-mismatched.yaml" \
+            --phase deployment \
+            --no-cluster \
+            --fail-on-error=false \
+            --output "${WORK}/result-mismatched.yaml" 2>&1
+        check:
+          ($error == null): true
+
+  - name: assert-drift-detected  # NOTE(review): the grep pattern is loose — any "mismatch", "fail" or "driver.version" text in the result satisfies it
+    description: Verify the validation result detects helm values mismatch.
+    try:
+    - script:
+        content: |
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          grep -q "mismatch\|fail\|driver.version" "${WORK}/result-mismatched.yaml"
+        check:
+          ($error == null): true
+
+  # ── Step 6: Validate with skip-helm-check flag ───────────────────
+  - name: validate-skip-helm-check  # NOTE(review): only the exit status is asserted; result-skipped.yaml is never inspected
+    description: Verify --skip-helm-check suppresses the helm-values check entirely.
+    try:
+    - script:
+        content: |
+          AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+          WORK="/tmp/chainsaw-helm-values-discovery"
+          ${AICR_BIN} validate \
+            --recipe "${WORK}/recipe.yaml" \
+            --snapshot "${WORK}/snapshot-mismatched.yaml" \
+            --phase deployment \
+            --no-cluster \
+            --skip-helm-check \
+            --fail-on-error=false \
+            --output "${WORK}/result-skipped.yaml" 2>&1
+        check:
+          ($error == null): true
+
+  # ── Step 7: Verify new helm RBAC flags exist ─────────────────────
+  - name: check-helm-flags  # greps the --help output of snapshot and validate for the flag names
+    description: Verify both snapshot and validate commands expose helm namespace flags.
+    try:
+    - script:
+        content: |
+          AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+          missing=""
+          for cmd_name in snapshot validate; do
+            help_output=$(${AICR_BIN} ${cmd_name} --help 2>&1)
+            for flag in helm-namespaces helm-all-namespaces; do
+              echo "$help_output" | grep -q "$flag" || missing="$missing ${cmd_name}:${flag}"
+            done
+          done
+          # validate also has skip-helm-check
+          help_output=$(${AICR_BIN} validate --help 2>&1)
+          echo "$help_output" | grep -q "skip-helm-check" || missing="$missing validate:skip-helm-check"
+          if [ -n "$missing" ]; then
+            echo "Missing flags:$missing" >&2
+            exit 1
+          fi
+        check:
+          ($error == null): true
+    cleanup:  # removes the shared work directory created in step 1
+    - script:
+        content: |
+          rm -rf /tmp/chainsaw-helm-values-discovery