Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Usage:
ksail cluster diagnose [flags]

Flags:
--format string Output format: text or json. Use json for machine-readable structured output. (default "text")
-n, --name string Name of the cluster to target
-p, --provider Provider Provider to use ([Docker Hetzner Omni AWS])

Expand Down
29 changes: 25 additions & 4 deletions pkg/cli/cmd/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -2843,6 +2843,7 @@
var (
nameFlag string
providerFlag v1alpha1.Provider
formatFlag string
)

cmd := &cobra.Command{
Expand All @@ -2851,7 +2852,7 @@
Long: diagnoseLongDesc,
SilenceUsage: true,
RunE: func(cmd *cobra.Command, _ []string) error {
return runDiagnoseCmd(cmd, nameFlag, providerFlag)
return runDiagnoseCmd(cmd, nameFlag, providerFlag, formatFlag)
},
}

Expand All @@ -2870,16 +2871,24 @@
fmt.Sprintf("Provider to use (%s)", providerFlag.ValidValues()),
)

cmd.Flags().StringVar(
&formatFlag,
"format",
"text",
"Output format: text or json. Use json for machine-readable structured output.",
)

return cmd
}

// runDiagnoseCmd inspects the live cluster via the Kubernetes API and writes
// a diagnostic report to the command's stdout, either as human-readable text
// or, when the format flag is "json", as indented JSON. In text format, when
// every resource looks healthy the command writes a short "all healthy" banner.
func runDiagnoseCmd(
cmd *cobra.Command,
nameFlag string,
providerFlag v1alpha1.Provider,
formatFlag string,
) error {
resolved, err := lifecycle.ResolveClusterInfo(cmd, nameFlag, providerFlag, "")
if err != nil {
Expand All @@ -2891,13 +2900,25 @@
return fmt.Errorf("build kubernetes client: %w", err)
}

writer := cmd.OutOrStdout()

if formatFlag == "json" {
Comment thread
devantler marked this conversation as resolved.
Outdated
report, diagErr := k8s.DiagnoseClusterReport(cmd.Context(), clientset, resolved.ClusterName)
if diagErr != nil {
return fmt.Errorf("diagnose cluster %q: %w", resolved.ClusterName, diagErr)
}

enc := json.NewEncoder(writer)
enc.SetIndent("", " ")

return enc.Encode(report)

Check failure on line 2914 in pkg/cli/cmd/cluster/cluster.go

View workflow job for this annotation

GitHub Actions / 🧹 Lint - golangci-lint

error returned from external package is unwrapped: sig: func (*encoding/json.Encoder).Encode(v any) error (wrapcheck)
}

report, err := k8s.DiagnoseCluster(cmd.Context(), clientset)
if err != nil {
return fmt.Errorf("diagnose cluster %q: %w", resolved.ClusterName, err)
}

writer := cmd.OutOrStdout()

if report == "" {
_, _ = fmt.Fprintf(
writer,
Expand Down
4 changes: 4 additions & 0 deletions pkg/cli/cmd/cluster/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6729,6 +6729,10 @@ func TestNewDiagnoseCmd(t *testing.T) {
providerFlag := diagnoseCmd.Flags().Lookup("provider")
require.NotNil(t, providerFlag)
assert.Equal(t, "p", providerFlag.Shorthand)

formatFlag := diagnoseCmd.Flags().Lookup("format")
require.NotNil(t, formatFlag)
assert.Equal(t, "text", formatFlag.DefValue)
}

// TestClusterCmd_RegistersDiagnoseSubcommand verifies that NewClusterCmd wires
Expand Down
112 changes: 112 additions & 0 deletions pkg/k8s/diagnostics.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,118 @@
"k8s.io/client-go/kubernetes"
)

// DiagnoseSeverity classifies how serious a diagnostic finding is. Its string
// value is what appears in the "severity" field of the JSON report.
type DiagnoseSeverity string

// Severity levels that a finding can carry.
const (
	// DiagnoseSeverityCritical marks a resource that is failing and needs
	// immediate attention.
	DiagnoseSeverityCritical = DiagnoseSeverity("critical")
	// DiagnoseSeverityWarning marks a resource that is degraded but not yet
	// failing.
	DiagnoseSeverityWarning = DiagnoseSeverity("warning")
)

// DiagnoseFinding describes a single unhealthy resource detected during
// diagnosis. It is one entry of DiagnoseReport.Findings and is serialised
// with the lower-case JSON keys given in the field tags.
type DiagnoseFinding struct {
	// Severity is the impact level: critical or warning. DiagnoseClusterReport
	// records not-ready nodes and unhealthy pods as critical, and a namespace
	// whose pods could not be listed as a warning.
	Severity DiagnoseSeverity `json:"severity"`
	// Resource is a short identifier, e.g. "node/node-1", "pod/boom (default)"
	// (pod name followed by its namespace), or "namespace/default".
	Resource string `json:"resource"`
	// Reason is a one-line description of the failure.
	Reason string `json:"reason"`
}

// DiagnoseReport is the structured result of DiagnoseClusterReport. It is
// the JSON-serialisable form of the cluster health snapshot produced by
// DiagnoseCluster. The HealthScore field (0–100) gives AI assistants and
// automation a single numeric signal; Findings carry the details.
type DiagnoseReport struct {
	// ClusterName is the name of the inspected cluster, as passed to
	// DiagnoseClusterReport by the caller.
	ClusterName string `json:"clusterName"`
	// HealthScore is an integer from 0 (completely broken) to 100 (fully healthy).
	// Each critical finding deducts 25 points; each warning deducts 10 points.
	// The result is clamped so it never drops below 0.
	HealthScore int `json:"healthScore"`
	// Findings lists every unhealthy resource discovered. DiagnoseClusterReport
	// starts from an empty, non-nil slice, so a healthy cluster has zero entries.
	Findings []DiagnoseFinding `json:"findings"`
}

// DiagnoseClusterReport is the structured equivalent of DiagnoseCluster. It
// returns a DiagnoseReport suitable for JSON serialisation and AI consumption
// via the cluster_read MCP tool. The plain-text representation produced by
// DiagnoseCluster remains the default; this function is used when the caller
// requests --format json.
func DiagnoseClusterReport(ctx context.Context, clientset kubernetes.Interface, clusterName string) (DiagnoseReport, error) {

Check failure on line 52 in pkg/k8s/diagnostics.go

View workflow job for this annotation

GitHub Actions / 🧹 Lint - golangci-lint

calculated cyclomatic complexity for function DiagnoseClusterReport is 13, max is 10 (cyclop)
report := DiagnoseReport{
ClusterName: clusterName,
HealthScore: 100,

Check failure on line 55 in pkg/k8s/diagnostics.go

View workflow job for this annotation

GitHub Actions / 🧹 Lint - golangci-lint

Magic number: 100, in <assign> detected (mnd)
Findings: []DiagnoseFinding{},
}

nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return report, fmt.Errorf("list nodes: %w", err)
}

for i := range nodes.Items {
node := &nodes.Items[i]
if reason := describeNotReadyNode(node); reason != "" {
report.Findings = append(report.Findings, DiagnoseFinding{
Severity: DiagnoseSeverityCritical,
Resource: "node/" + node.Name,
Reason: reason,
})
}
}

namespaces, err := listNamespaceNames(ctx, clientset)
if err != nil {
return report, err
}

for _, namespace := range namespaces {
pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
if err != nil {
report.Findings = append(report.Findings, DiagnoseFinding{
Severity: DiagnoseSeverityWarning,
Resource: "namespace/" + namespace,
Reason: fmt.Sprintf("failed to list pods: %v", err),
})

continue
}

for j := range pods.Items {
pod := &pods.Items[j]
if isPodHealthy(pod) {
continue
}

report.Findings = append(report.Findings, DiagnoseFinding{
Severity: DiagnoseSeverityCritical,
Resource: fmt.Sprintf("pod/%s (%s)", pod.Name, namespace),
Reason: describePodFailure(pod),
})
}
}

score := 100
for _, f := range report.Findings {
switch f.Severity {
case DiagnoseSeverityCritical:
score -= 25
case DiagnoseSeverityWarning:
score -= 10
}
}

if score < 0 {
score = 0
}

report.HealthScore = score

return report, nil
}

// DiagnoseCluster produces a combined human-readable diagnostic report for
// a running Kubernetes cluster. It enumerates every namespace, surfaces any
// failing pods via DiagnosePodFailures, and reports any nodes that are not
Expand Down
118 changes: 118 additions & 0 deletions pkg/k8s/diagnostics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package k8s_test
import (
"context"
"errors"
"fmt"
"testing"

"github.com/devantler-tech/ksail/v7/pkg/k8s"
Expand Down Expand Up @@ -502,3 +503,120 @@ func TestDiagnoseCluster_NodeListErrorIsSurfaced(t *testing.T) {
require.Error(t, err)
assert.Contains(t, err.Error(), "list nodes")
}

func TestDiagnoseClusterReport_HealthyClusterReturns100(t *testing.T) {
t.Parallel()

clientset := k8sfake.NewClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}},
&corev1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "node-1"},
Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
}},
},
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "healthy", Namespace: "default"},
Status: corev1.PodStatus{
Phase: corev1.PodRunning,
ContainerStatuses: []corev1.ContainerStatus{{Ready: true}},
},
},
)

report, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "my-cluster")

require.NoError(t, err)
assert.Equal(t, "my-cluster", report.ClusterName)
assert.Equal(t, 100, report.HealthScore)
assert.Empty(t, report.Findings)
}

func TestDiagnoseClusterReport_FailingPodReducesScore(t *testing.T) {
t.Parallel()

clientset := k8sfake.NewClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}},
&corev1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "node-ok"},
Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
}},
},
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "crasher", Namespace: "default"},
Status: corev1.PodStatus{Phase: corev1.PodFailed},
},
)

report, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "test")

require.NoError(t, err)
assert.Equal(t, 75, report.HealthScore)
require.Len(t, report.Findings, 1)
assert.Equal(t, k8s.DiagnoseSeverityCritical, report.Findings[0].Severity)
assert.Contains(t, report.Findings[0].Resource, "crasher")
}

func TestDiagnoseClusterReport_NotReadyNodeReducesScore(t *testing.T) {
t.Parallel()

clientset := k8sfake.NewClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}},
&corev1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "broken-node"},
Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{
{Type: corev1.NodeReady, Status: corev1.ConditionFalse, Message: "disk pressure"},
}},
},
)

report, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "test")

require.NoError(t, err)
assert.Equal(t, 75, report.HealthScore)
require.Len(t, report.Findings, 1)
assert.Equal(t, k8s.DiagnoseSeverityCritical, report.Findings[0].Severity)
assert.Equal(t, "node/broken-node", report.Findings[0].Resource)
}

// TestDiagnoseClusterReport_ScoreFloorsAtZero verifies that the score is
// clamped at 0: five failed pods would deduct more than 100 points in total,
// yet the reported score must not go negative.
func TestDiagnoseClusterReport_ScoreFloorsAtZero(t *testing.T) {
	t.Parallel()

	const failingPodCount = 5

	objects := []runtime.Object{
		&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}},
	}

	for idx := range failingPodCount {
		failedPod := &corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name:      fmt.Sprintf("fail-%d", idx),
				Namespace: "default",
			},
			Status: corev1.PodStatus{Phase: corev1.PodFailed},
		}
		objects = append(objects, failedPod)
	}

	fakeClient := k8sfake.NewClientset(objects...)

	got, err := k8s.DiagnoseClusterReport(context.Background(), fakeClient, "overloaded")

	require.NoError(t, err)
	assert.Equal(t, 0, got.HealthScore)
}

func TestDiagnoseClusterReport_NodeListErrorIsSurfaced(t *testing.T) {
t.Parallel()

clientset := k8sfake.NewClientset()
clientset.PrependReactor(
"list",
"nodes",
func(_ k8stesting.Action) (bool, runtime.Object, error) {
return true, nil, errConnectionRefused
},
)

_, err := k8s.DiagnoseClusterReport(context.Background(), clientset, "test")

require.Error(t, err)
assert.Contains(t, err.Error(), "list nodes")
}
Loading