diff --git a/docs/src/content/docs/cli-flags/cluster/cluster-diagnose.mdx b/docs/src/content/docs/cli-flags/cluster/cluster-diagnose.mdx index 34d3d6f1c6..5c91fb8b80 100644 --- a/docs/src/content/docs/cli-flags/cluster/cluster-diagnose.mdx +++ b/docs/src/content/docs/cli-flags/cluster/cluster-diagnose.mdx @@ -33,6 +33,7 @@ Usage: ksail cluster diagnose [flags] Flags: + --format string Output format: text or json. Use json for machine-readable structured output. (default "text") -n, --name string Name of the cluster to target -p, --provider Provider Provider to use ([Docker Hetzner Omni AWS]) diff --git a/pkg/cli/cmd/cluster/cluster.go b/pkg/cli/cmd/cluster/cluster.go index 25845abbe3..bbb38e1cb0 100644 --- a/pkg/cli/cmd/cluster/cluster.go +++ b/pkg/cli/cmd/cluster/cluster.go @@ -2843,6 +2843,7 @@ func NewDiagnoseCmd(_ *di.Runtime) *cobra.Command { var ( nameFlag string providerFlag v1alpha1.Provider + formatFlag string ) cmd := &cobra.Command{ @@ -2851,7 +2852,7 @@ func NewDiagnoseCmd(_ *di.Runtime) *cobra.Command { Long: diagnoseLongDesc, SilenceUsage: true, RunE: func(cmd *cobra.Command, _ []string) error { - return runDiagnoseCmd(cmd, nameFlag, providerFlag) + return runDiagnoseCmd(cmd, nameFlag, providerFlag, formatFlag) }, } @@ -2870,17 +2871,36 @@ func NewDiagnoseCmd(_ *di.Runtime) *cobra.Command { fmt.Sprintf("Provider to use (%s)", providerFlag.ValidValues()), ) + cmd.Flags().StringVar( + &formatFlag, + "format", + "text", + "Output format: text or json. Use json for machine-readable structured output.", + ) + return cmd } // runDiagnoseCmd inspects the live cluster via the Kubernetes API and writes -// a human-readable diagnostic report to the command's stdout. When every +// a human-readable or JSON diagnostic report to the command's stdout. When every // resource looks healthy the command writes a short "all healthy" banner. 
func runDiagnoseCmd( cmd *cobra.Command, nameFlag string, providerFlag v1alpha1.Provider, + formatFlag string, ) error { + format := strings.ToLower(formatFlag) + if format != outputFormatText && format != outputFormatJSON { + return fmt.Errorf( + "%w: %q (expected %q or %q)", + ErrUnsupportedOutputFormat, + format, + outputFormatText, + outputFormatJSON, + ) + } + resolved, err := lifecycle.ResolveClusterInfo(cmd, nameFlag, providerFlag, "") if err != nil { return fmt.Errorf("resolve cluster info: %w", err) @@ -2891,13 +2911,22 @@ func runDiagnoseCmd( return fmt.Errorf("build kubernetes client: %w", err) } + writer := cmd.OutOrStdout() + + if format == outputFormatJSON { + report, diagErr := k8s.DiagnoseClusterReport(cmd.Context(), clientset, resolved.ClusterName) + if diagErr != nil { + return fmt.Errorf("diagnose cluster %q: %w", resolved.ClusterName, diagErr) + } + + return runDiagnoseJSONReport(report, writer) + } + report, err := k8s.DiagnoseCluster(cmd.Context(), clientset) if err != nil { return fmt.Errorf("diagnose cluster %q: %w", resolved.ClusterName, err) } - writer := cmd.OutOrStdout() - if report == "" { _, _ = fmt.Fprintf( writer, @@ -2914,6 +2943,21 @@ func runDiagnoseCmd( return nil } +// runDiagnoseJSONReport serialises the structured DiagnoseReport as indented +// JSON to w, wrapping any encoder error. It is extracted from runDiagnoseCmd to +// keep that function within the allowed line-count limit. +func runDiagnoseJSONReport(report k8s.DiagnoseReport, w io.Writer) error { + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + + err := enc.Encode(report) + if err != nil { + return fmt.Errorf("encode diagnose report: %w", err) + } + + return nil +} + // runInfoCmd orchestrates the cluster info command flow: // 1. Resolve cluster identity (name, provider, kubeconfig) // 2. 
Query provider API for cluster status diff --git a/pkg/cli/cmd/cluster/cluster_test.go b/pkg/cli/cmd/cluster/cluster_test.go index b3174f1b30..ac41d53874 100644 --- a/pkg/cli/cmd/cluster/cluster_test.go +++ b/pkg/cli/cmd/cluster/cluster_test.go @@ -6729,6 +6729,47 @@ func TestNewDiagnoseCmd(t *testing.T) { providerFlag := diagnoseCmd.Flags().Lookup("provider") require.NotNil(t, providerFlag) assert.Equal(t, "p", providerFlag.Shorthand) + + formatFlag := diagnoseCmd.Flags().Lookup("format") + require.NotNil(t, formatFlag) + assert.Equal(t, "text", formatFlag.DefValue) +} + +// TestDiagnoseCmd_InvalidFormatRejectsEarly verifies that an unknown --format +// value is rejected before any cluster interaction takes place. +// This guards against typos like "--format jsn" silently falling back to the +// text path instead of returning an actionable error. +func TestDiagnoseCmd_InvalidFormatRejectsEarly(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + format string + }{ + {name: "typo jsn", format: "jsn"}, + {name: "empty format", format: ""}, + {name: "xml", format: "xml"}, + {name: "pretty", format: "pretty"}, + } + + for _, testCase := range tests { + t.Run(testCase.name, func(t *testing.T) { + t.Parallel() + + diagnoseCmd := cluster.NewDiagnoseCmd(nil) + diagnoseCmd.SetOut(io.Discard) + diagnoseCmd.SetErr(io.Discard) + diagnoseCmd.SetArgs([]string{"--format", testCase.format}) + + err := diagnoseCmd.Execute() + + require.Error(t, err) + assert.ErrorIs(t, err, cluster.ErrUnsupportedOutputFormat, + "expected ErrUnsupportedOutputFormat for format %q, got: %v", + testCase.format, err, + ) + }) + } } // TestClusterCmd_RegistersDiagnoseSubcommand verifies that NewClusterCmd wires diff --git a/pkg/k8s/diagnostics.go b/pkg/k8s/diagnostics.go index 53a7ac7224..333be5ec73 100644 --- a/pkg/k8s/diagnostics.go +++ b/pkg/k8s/diagnostics.go @@ -10,6 +10,149 @@ import ( "k8s.io/client-go/kubernetes" ) +// DiagnoseSeverity represents the severity level of a 
diagnostic finding. +type DiagnoseSeverity string + +const ( + // DiagnoseSeverityCritical indicates a resource is failing and requires immediate attention. + // DiagnoseClusterReport assigns it to nodes flagged by describeNotReadyNode and to pods rejected by isPodHealthy. + DiagnoseSeverityCritical DiagnoseSeverity = "critical" + // DiagnoseSeverityWarning indicates a resource is degraded but not yet failing. + // DiagnoseClusterReport assigns it to a namespace whose pods could not be listed. + DiagnoseSeverityWarning DiagnoseSeverity = "warning" +) + +const ( + // diagnoseMaxHealthScore is the starting score for a fully healthy cluster. + diagnoseMaxHealthScore = 100 + // diagnoseCriticalPenalty is the score deduction for each critical finding. + diagnoseCriticalPenalty = 25 + // diagnoseWarningPenalty is the score deduction for each warning finding. + diagnoseWarningPenalty = 10 +) + +// DiagnoseFinding describes a single unhealthy resource detected during diagnosis. +type DiagnoseFinding struct { + // Severity is the impact level: critical or warning. + Severity DiagnoseSeverity `json:"severity"` + // Resource is a short identifier, e.g. "node/node-1", "pod/boom (default)" + // or "namespace/kube-system". + Resource string `json:"resource"` + // Reason is a one-line description of the failure. + Reason string `json:"reason"` +} + +// DiagnoseReport is the structured result of DiagnoseClusterReport. It is +// the JSON-serialisable form of the cluster health snapshot produced by +// DiagnoseCluster. The HealthScore field (0–100) gives AI assistants and +// automation a single numeric signal; Findings carry the details. +type DiagnoseReport struct { + // ClusterName is the name of the inspected cluster. + ClusterName string `json:"clusterName"` + // HealthScore is an integer from 0 (completely broken) to 100 (fully healthy). + // Each critical finding deducts 25 points; each warning deducts 10 points. + // The result is clamped at 0, so it never goes negative. + HealthScore int `json:"healthScore"` + // Findings lists every unhealthy resource discovered. + // DiagnoseClusterReport starts from an empty, non-nil slice, so a healthy + // cluster encodes as [] rather than null. + Findings []DiagnoseFinding `json:"findings"` +} + +// DiagnoseClusterReport is the structured equivalent of DiagnoseCluster. 
It +// returns a DiagnoseReport suitable for JSON serialisation and AI consumption +// via the cluster_read MCP tool. The plain-text representation produced by +// DiagnoseCluster remains the default; this function is used when the caller +// requests --format json. +// +// Node findings are collected first, then pod findings namespace by namespace. +// A failure to list nodes or namespaces aborts with an error; the report +// returned alongside that error is incomplete and still carries the initial +// score of 100, so callers must not use it. A failure to list pods in a single +// namespace is recorded as a warning finding instead and does not abort the run. +func DiagnoseClusterReport( + ctx context.Context, + clientset kubernetes.Interface, + clusterName string, +) (DiagnoseReport, error) { + report := DiagnoseReport{ + ClusterName: clusterName, + HealthScore: diagnoseMaxHealthScore, + // Non-nil so a healthy cluster encodes "findings" as [] rather than null. + Findings: []DiagnoseFinding{}, + } + + nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return report, fmt.Errorf("list nodes: %w", err) + } + + for i := range nodes.Items { + node := &nodes.Items[i] + if reason := describeNotReadyNode(node); reason != "" { + report.Findings = append(report.Findings, DiagnoseFinding{ + Severity: DiagnoseSeverityCritical, + Resource: "node/" + node.Name, + Reason: reason, + }) + } + } + + namespaces, err := listNamespaceNames(ctx, clientset) + if err != nil { + // NOTE(review): returned unwrapped; presumably listNamespaceNames already adds context — confirm. + return report, err + } + + for _, namespace := range namespaces { + appendNamespacePodFindings(ctx, clientset, namespace, &report.Findings) + } + + // The score is derived once, after every finding has been collected. + report.HealthScore = diagnoseComputeScore(report.Findings) + + return report, nil +} + +// appendNamespacePodFindings lists all pods in namespace and appends a finding +// for each unhealthy pod (or a warning finding when the pod list itself fails). +// The list error is recorded in findings rather than returned, so the caller's +// loop carries on with the remaining namespaces. 
+func appendNamespacePodFindings( + ctx context.Context, + clientset kubernetes.Interface, + namespace string, + findings *[]DiagnoseFinding, +) { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + *findings = append(*findings, DiagnoseFinding{ + Severity: DiagnoseSeverityWarning, + Resource: "namespace/" + namespace, + Reason: fmt.Sprintf("failed to list pods: %v", err), + }) + + return + } + + for j := range pods.Items { + pod := &pods.Items[j] + if isPodHealthy(pod) { + continue + } + + *findings = append(*findings, DiagnoseFinding{ + Severity: DiagnoseSeverityCritical, + Resource: fmt.Sprintf("pod/%s (%s)", pod.Name, namespace), + Reason: describePodFailure(pod), + }) + } +} + +// diagnoseComputeScore returns a health score in [0, diagnoseMaxHealthScore] +// by deducting penalty points for each finding: diagnoseCriticalPenalty per +// critical finding and diagnoseWarningPenalty per warning finding. Findings +// with any other severity leave the score unchanged, and an empty or nil +// slice yields diagnoseMaxHealthScore. +func diagnoseComputeScore(findings []DiagnoseFinding) int { + score := diagnoseMaxHealthScore + + for _, f := range findings { + switch f.Severity { + case DiagnoseSeverityCritical: + score -= diagnoseCriticalPenalty + case DiagnoseSeverityWarning: + score -= diagnoseWarningPenalty + } + } + + // Clamp: enough findings would otherwise push the score below zero. + if score < 0 { + score = 0 + } + + return score +} + // DiagnoseCluster produces a combined human-readable diagnostic report for // a running Kubernetes cluster. 
It enumerates every namespace, surfaces any // failing pods via DiagnosePodFailures, and reports any nodes that are not diff --git a/pkg/k8s/diagnostics_test.go b/pkg/k8s/diagnostics_test.go index aae1ba7d64..d382417672 100644 --- a/pkg/k8s/diagnostics_test.go +++ b/pkg/k8s/diagnostics_test.go @@ -3,6 +3,7 @@ package k8s_test import ( "context" "errors" + "fmt" "testing" "github.com/devantler-tech/ksail/v7/pkg/k8s" @@ -502,3 +503,144 @@ func TestDiagnoseCluster_NodeListErrorIsSurfaced(t *testing.T) { require.Error(t, err) assert.Contains(t, err.Error(), "list nodes") } + +func TestDiagnoseClusterReport_HealthyClusterReturns100(t *testing.T) { + t.Parallel() + + clientset := k8sfake.NewClientset( + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}}, + &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-1"}, + Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, + }}, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "healthy", Namespace: "default"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{Ready: true}}, + }, + }, + ) + + report, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "my-cluster") + + require.NoError(t, err) + assert.Equal(t, "my-cluster", report.ClusterName) + assert.Equal(t, 100, report.HealthScore) + assert.Empty(t, report.Findings) +} + +func TestDiagnoseClusterReport_FailingPodReducesScore(t *testing.T) { + t.Parallel() + + clientset := k8sfake.NewClientset( + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}}, + &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-ok"}, + Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, + }}, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "crasher", Namespace: "default"}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + }, + ) + + report, 
err := k8s.DiagnoseClusterReport(context.Background(), clientset, "test") + + require.NoError(t, err) + assert.Equal(t, 75, report.HealthScore) + require.Len(t, report.Findings, 1) + assert.Equal(t, k8s.DiagnoseSeverityCritical, report.Findings[0].Severity) + assert.Contains(t, report.Findings[0].Resource, "crasher") +} + +func TestDiagnoseClusterReport_NotReadyNodeReducesScore(t *testing.T) { + t.Parallel() + + clientset := k8sfake.NewClientset( + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}}, + &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "broken-node"}, + Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionFalse, Message: "disk pressure"}, + }}, + }, + ) + + report, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "test") + + require.NoError(t, err) + assert.Equal(t, 75, report.HealthScore) + require.Len(t, report.Findings, 1) + assert.Equal(t, k8s.DiagnoseSeverityCritical, report.Findings[0].Severity) + assert.Equal(t, "node/broken-node", report.Findings[0].Resource) +} + +func TestDiagnoseClusterReport_ScoreFloorsAtZero(t *testing.T) { + t.Parallel() + + pods := make([]runtime.Object, 0, 5) + pods = append(pods, + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}}, + ) + + for i := range 5 { + pods = append(pods, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("fail-%d", i), Namespace: "default"}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + }) + } + + clientset := k8sfake.NewClientset(pods...) 
+ + report, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "overloaded") + + require.NoError(t, err) + assert.Equal(t, 0, report.HealthScore) +} + +func TestDiagnoseClusterReport_NodeListErrorIsSurfaced(t *testing.T) { + t.Parallel() + + clientset := k8sfake.NewClientset() + clientset.PrependReactor( + "list", + "nodes", + func(_ k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, errConnectionRefused + }, + ) + + _, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "test") + + require.Error(t, err) + assert.Contains(t, err.Error(), "list nodes") +} + +func TestDiagnoseClusterReport_PodListErrorCreatesWarningFinding(t *testing.T) { + t.Parallel() + + clientset := k8sfake.NewClientset( + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "broken-ns"}}, + ) + clientset.PrependReactor( + "list", + "pods", + func(_ k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, errConnectionRefused + }, + ) + + report, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "test") + + require.NoError(t, err) + require.Len(t, report.Findings, 1) + assert.Equal(t, k8s.DiagnoseSeverityWarning, report.Findings[0].Severity) + assert.Equal(t, "namespace/broken-ns", report.Findings[0].Resource) + assert.Contains(t, report.Findings[0].Reason, "failed to list pods") + assert.Equal(t, 90, report.HealthScore) +}