Skip to content

Commit a89e059

Browse files
committed
feat: add cli doc for backwards compatibly and warnings
1 parent 90b70a5 commit a89e059

File tree

10 files changed

+374
-79
lines changed

10 files changed

+374
-79
lines changed

docs/cli.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,50 @@ kubectl plugin for managing Skyhook deployments, packages, and nodes.
66

77
The Skyhook CLI (`kubectl skyhook`) provides SRE tooling for managing Skyhook operators and their packages across Kubernetes cluster nodes. It supports inspecting node/package state, forcing re-runs, managing node lifecycle, and retrieving logs.
88

9+
## Compatibility
10+
11+
### Minimum Operator Version
12+
13+
The CLI requires **operator version v0.8.0 or later** for full functionality of all commands.
14+
15+
### Command Compatibility Matrix
16+
17+
| Command | v0.7.x and earlier | v0.8.0+ |
18+
|---------|-------------------|---------|
19+
| `version` | ✅ Full | ✅ Full |
20+
| `node status` | ✅ Full | ✅ Full |
21+
| `node list` | ✅ Full | ✅ Full |
22+
| `node reset` | ✅ Full | ✅ Full |
23+
| `node ignore/unignore` | ✅ Full | ✅ Full |
24+
| `package status` | ✅ Full | ✅ Full |
25+
| `package rerun` | ✅ Full | ✅ Full |
26+
| `package logs` | ✅ Full | ✅ Full |
27+
| `reset` | ✅ Full | ✅ Full |
28+
| `pause` | ❌ Not supported | ✅ Full |
29+
| `resume` | ❌ Not supported | ✅ Full |
30+
| `disable` | ❌ Not supported | ✅ Full |
31+
| `enable` | ❌ Not supported | ✅ Full |
32+
33+
### Breaking Change: Pause/Disable Mechanism
34+
35+
In operator versions **v0.7.x and earlier**, pausing and disabling a Skyhook was done via spec fields:
36+
37+
```yaml
38+
spec:
39+
pause: true # Old method - no longer used by operator
40+
```
41+
42+
Starting with **v0.8.0**, the operator uses **annotations** instead:
43+
44+
```yaml
45+
metadata:
46+
annotations:
47+
skyhook.nvidia.com/pause: "true"
48+
skyhook.nvidia.com/disable: "true"
49+
```
50+
51+
The CLI's `pause`, `resume`, `disable`, and `enable` commands set these annotations. If you're running an older operator (v0.7.x or earlier), these commands will appear to succeed but the operator won't recognize the annotations - you'll need to edit the Skyhook spec directly using `kubectl edit`.
52+
953
## Installation
1054

1155
```bash
@@ -56,6 +100,8 @@ kubectl skyhook version --timeout 10s
56100

57101
Control Skyhook processing state.
58102

103+
> **Note:** Requires operator v0.8.0+. See [Compatibility](#compatibility) for details.
104+
59105
```bash
60106
# Pause a Skyhook (stops processing new nodes)
61107
kubectl skyhook pause my-skyhook
@@ -69,6 +115,8 @@ kubectl skyhook resume my-skyhook
69115

70116
Completely disable or re-enable a Skyhook.
71117

118+
> **Note:** Requires operator v0.8.0+. See [Compatibility](#compatibility) for details.
119+
72120
```bash
73121
# Disable a Skyhook completely
74122
kubectl skyhook disable my-skyhook
@@ -205,6 +253,9 @@ kubectl skyhook node status --skyhook my-skyhook -o json
205253
```
206254

207255
### Emergency Stop
256+
257+
> **Note:** Requires operator v0.8.0+. For older operators, use `kubectl edit skyhook my-skyhook` and set `spec.pause: true`.
258+
208259
```bash
209260
# Pause all processing
210261
kubectl skyhook pause my-skyhook --confirm

operator/Makefile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,11 +334,10 @@ ensure-test-symlinks: ## Ensure symlinks exist in test directories (idempotent)
334334
done
335335

336336
.PHONY: build-cli
337-
CLI_LDFLAGS := -ldflags "-X github.com/NVIDIA/skyhook/operator/internal/version.VERSION=$(VERSION) -X github.com/NVIDIA/skyhook/operator/internal/version.GIT_SHA=$(GIT_SHA)"
337+
CLI_VERSION ?= $(CLI_TAG_LAST)
338+
CLI_LDFLAGS := -ldflags "-X github.com/NVIDIA/skyhook/operator/internal/version.VERSION=$(CLI_VERSION) -X github.com/NVIDIA/skyhook/operator/internal/version.GIT_SHA=$(GIT_SHA)"
338339
build-cli: ensure-test-symlinks ## Build CLI binary.
339340
go build $(GOFLAGS) $(CLI_LDFLAGS) -o bin/skyhook cmd/cli/main.go
340-
341-
CLI_VERSION ?= $(CLI_TAG_LAST)
342341
CLI_DIST_DIR ?= dist
343342
cli-release-build: ## Build CLI binaries for all platforms (used by CI release workflow).
344343
@echo "Building CLI $(CLI_VERSION) for all platforms..."

operator/cmd/cli/app/cli_test.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"testing"
2525

2626
"github.com/NVIDIA/skyhook/operator/internal/cli/context"
27+
"github.com/NVIDIA/skyhook/operator/internal/cli/utils"
2728
. "github.com/onsi/ginkgo/v2"
2829
. "github.com/onsi/gomega"
2930
"github.com/spf13/cobra"
@@ -177,34 +178,34 @@ var _ = Describe("Skyhook CLI Tests", func() {
177178
})
178179
})
179180

180-
Describe("extractImageTag", func() {
181+
Describe("ExtractImageTag", func() {
181182
It("should extract tag from simple image reference", func() {
182-
tag := extractImageTag("ghcr.io/nvidia/skyhook/operator:v1.2.3")
183+
tag := utils.ExtractImageTag("ghcr.io/nvidia/skyhook/operator:v1.2.3")
183184
Expect(tag).To(Equal("v1.2.3"))
184185
})
185186

186187
It("should extract tag from image with digest", func() {
187-
tag := extractImageTag("ghcr.io/nvidia/skyhook/operator:v1.2.3@sha256:abc123")
188+
tag := utils.ExtractImageTag("ghcr.io/nvidia/skyhook/operator:v1.2.3@sha256:abc123")
188189
Expect(tag).To(Equal("v1.2.3"))
189190
})
190191

191192
It("should return empty for image without tag", func() {
192-
tag := extractImageTag("ghcr.io/nvidia/skyhook/operator")
193+
tag := utils.ExtractImageTag("ghcr.io/nvidia/skyhook/operator")
193194
Expect(tag).To(BeEmpty())
194195
})
195196

196197
It("should handle image with only digest", func() {
197-
tag := extractImageTag("ghcr.io/nvidia/skyhook/operator@sha256:abc123")
198+
tag := utils.ExtractImageTag("ghcr.io/nvidia/skyhook/operator@sha256:abc123")
198199
Expect(tag).To(BeEmpty())
199200
})
200201

201202
It("should extract latest tag", func() {
202-
tag := extractImageTag("nginx:latest")
203+
tag := utils.ExtractImageTag("nginx:latest")
203204
Expect(tag).To(Equal("latest"))
204205
})
205206

206207
It("should handle image with port in registry", func() {
207-
tag := extractImageTag("localhost:5000/myimage:v1.0")
208+
tag := utils.ExtractImageTag("localhost:5000/myimage:v1.0")
208209
Expect(tag).To(Equal("v1.0"))
209210
})
210211
})

operator/cmd/cli/app/lifecycle.go

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,42 @@ func newLifecycleCmd(ctx *cliContext.CLIContext, cfg lifecycleConfig) *cobra.Com
8989
return fmt.Errorf("initializing kubernetes client: %w", err)
9090
}
9191

92+
// Fetch the Skyhook to check operator version from its annotation
93+
skyhook, err := utils.GetSkyhook(cmd.Context(), kubeClient.Dynamic(), skyhookName)
94+
if err != nil {
95+
return err
96+
}
97+
98+
// Check operator version compatibility using Skyhook's version annotation
99+
// If annotation is missing or invalid (e.g., "dev"), fall back to deployment version
100+
opVersion := utils.GetSkyhookVersion(skyhook)
101+
if opVersion == "" || !utils.IsValidVersion(opVersion) {
102+
// Try to get version from deployment instead
103+
deployVersion, err := utils.DiscoverOperatorVersion(cmd.Context(), kubeClient.Kubernetes(), ctx.GlobalFlags.Namespace())
104+
if err == nil && utils.IsValidVersion(deployVersion) {
105+
opVersion = deployVersion
106+
} else {
107+
_, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: unable to determine operator version; "+
108+
"cannot verify compatibility (requires %s+)\n", utils.MinAnnotationSupportVersion)
109+
}
110+
}
111+
112+
if utils.IsValidVersion(opVersion) && utils.CompareVersions(opVersion, utils.MinAnnotationSupportVersion) < 0 {
113+
// Operator is older than v0.8.0 - annotations won't work
114+
if cfg.annotation == utils.PauseAnnotation {
115+
// pause/resume - feature exists but uses spec field instead of annotation
116+
specValue := "true"
117+
if cfg.action == "remove" {
118+
specValue = "false"
119+
}
120+
return fmt.Errorf("operator version %s uses spec.pause instead of annotations; "+
121+
"use 'kubectl edit skyhook %s' and set spec.pause: %s", opVersion, skyhookName, specValue)
122+
}
123+
// disable/enable - feature doesn't exist at all in older versions
124+
return fmt.Errorf("operator version %s does not support %s; this feature was added in %s",
125+
opVersion, cfg.confirmVerb, utils.MinAnnotationSupportVersion)
126+
}
127+
92128
if cfg.action == "set" {
93129
err = utils.SetSkyhookAnnotation(cmd.Context(), kubeClient.Dynamic(), skyhookName, cfg.annotation, "true")
94130
} else {
@@ -141,12 +177,15 @@ func NewResumeCmd(ctx *cliContext.CLIContext) *cobra.Command {
141177
142178
The operator will resume processing nodes after this command.`,
143179
example: ` # Resume a paused Skyhook
144-
kubectl skyhook resume gpu-init`,
180+
kubectl skyhook resume gpu-init
181+
182+
# Resume without confirmation
183+
kubectl skyhook resume gpu-init --confirm`,
145184
annotation: utils.PauseAnnotation,
146185
action: "remove",
147186
verb: "resumed",
148187
confirmVerb: "resume",
149-
needsConfirm: false,
188+
needsConfirm: true,
150189
})
151190
}
152191

@@ -181,11 +220,14 @@ func NewEnableCmd(ctx *cliContext.CLIContext) *cobra.Command {
181220
182221
The operator will resume normal processing after this command.`,
183222
example: ` # Enable a disabled Skyhook
184-
kubectl skyhook enable gpu-init`,
223+
kubectl skyhook enable gpu-init
224+
225+
# Enable without confirmation
226+
kubectl skyhook enable gpu-init --confirm`,
185227
annotation: utils.DisableAnnotation,
186228
action: "remove",
187229
verb: "enabled",
188230
confirmVerb: "enable",
189-
needsConfirm: false,
231+
needsConfirm: true,
190232
})
191233
}

operator/cmd/cli/app/lifecycle_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ var _ = Describe("Lifecycle Commands", func() {
4949
cmdFactory: NewResumeCmd,
5050
expectedUse: "resume <skyhook-name>",
5151
expectedVerb: "Resume",
52-
hasConfirmFlag: false,
52+
hasConfirmFlag: true,
5353
},
5454
{
5555
name: "Disable",
@@ -63,7 +63,7 @@ var _ = Describe("Lifecycle Commands", func() {
6363
cmdFactory: NewEnableCmd,
6464
expectedUse: "enable <skyhook-name>",
6565
expectedVerb: "Enable",
66-
hasConfirmFlag: false,
66+
hasConfirmFlag: true,
6767
},
6868
}
6969

operator/cmd/cli/app/version.go

Lines changed: 2 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,13 @@ package app
2121
import (
2222
"context"
2323
"fmt"
24-
"strings"
2524
"time"
2625

2726
"github.com/spf13/cobra"
28-
"k8s.io/apimachinery/pkg/api/errors"
29-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30-
"k8s.io/client-go/kubernetes"
3127

3228
"github.com/NVIDIA/skyhook/operator/internal/cli/client"
3329
cliContext "github.com/NVIDIA/skyhook/operator/internal/cli/context"
30+
"github.com/NVIDIA/skyhook/operator/internal/cli/utils"
3431
"github.com/NVIDIA/skyhook/operator/internal/version"
3532
)
3633

@@ -72,7 +69,7 @@ func NewVersionCmd(ctx *cliContext.CLIContext) *cobra.Command {
7269
cmdCtx, cancel := context.WithTimeout(cmd.Context(), timeout)
7370
defer cancel()
7471

75-
opVersion, err := discoverOperatorVersion(cmdCtx, kubeClient.Kubernetes(), ctx.GlobalFlags.Namespace())
72+
opVersion, err := utils.DiscoverOperatorVersion(cmdCtx, kubeClient.Kubernetes(), ctx.GlobalFlags.Namespace())
7673
if err != nil {
7774
_, _ = fmt.Fprintf(cmd.OutOrStdout(), "Skyhook operator:\tunknown (%v)\n", err)
7875
return nil
@@ -88,56 +85,3 @@ func NewVersionCmd(ctx *cliContext.CLIContext) *cobra.Command {
8885

8986
return versionCmd
9087
}
91-
92-
func discoverOperatorVersion(ctx context.Context, kube kubernetes.Interface, namespace string) (string, error) {
93-
if kube == nil {
94-
return "", fmt.Errorf("nil kubernetes client")
95-
}
96-
if namespace == "" {
97-
namespace = cliContext.DefaultNamespace
98-
}
99-
100-
deploymentName := "skyhook-operator-controller-manager"
101-
deployment, err := kube.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
102-
if err != nil {
103-
if errors.IsNotFound(err) {
104-
return "", fmt.Errorf("skyhook operator deployment %q not found in namespace %q", deploymentName, namespace)
105-
}
106-
return "", fmt.Errorf("querying operator deployment: %w", err)
107-
}
108-
109-
// Try to get version from Helm label (preferred for Helm deployments)
110-
if v := deployment.Labels["app.kubernetes.io/version"]; strings.TrimSpace(v) != "" {
111-
return strings.TrimSpace(v), nil
112-
}
113-
114-
// Fallback: parse version from container image tag (works for kustomize deployments)
115-
if len(deployment.Spec.Template.Spec.Containers) > 0 {
116-
image := deployment.Spec.Template.Spec.Containers[0].Image
117-
if tag := extractImageTag(image); tag != "" && tag != "latest" {
118-
return tag, nil
119-
}
120-
}
121-
122-
return "", fmt.Errorf("unable to determine operator version from deployment labels or image tag")
123-
}
124-
125-
// extractImageTag extracts the tag from a container image reference.
126-
// Examples:
127-
// - "ghcr.io/nvidia/skyhook/operator:v1.2.3" -> "v1.2.3"
128-
// - "ghcr.io/nvidia/skyhook/operator:v1.2.3@sha256:..." -> "v1.2.3"
129-
// - "ghcr.io/nvidia/skyhook/operator" -> ""
130-
func extractImageTag(image string) string {
131-
// Remove digest if present (e.g., @sha256:...)
132-
if idx := strings.Index(image, "@"); idx > 0 {
133-
image = image[:idx]
134-
}
135-
136-
// Split on ":" to get tag
137-
parts := strings.Split(image, ":")
138-
if len(parts) < 2 {
139-
return ""
140-
}
141-
142-
return strings.TrimSpace(parts[len(parts)-1])
143-
}

operator/cmd/cli/app/version_test.go

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"k8s.io/client-go/kubernetes/fake"
3131

3232
cliContext "github.com/NVIDIA/skyhook/operator/internal/cli/context"
33+
"github.com/NVIDIA/skyhook/operator/internal/cli/utils"
3334
)
3435

3536
// TestNewVersionCmd_ClientOnly verifies the version command works with --client-only flag
@@ -88,6 +89,7 @@ func TestDiscoverOperatorVersion(t *testing.T) {
8889
Namespace: "skyhook",
8990
Labels: map[string]string{
9091
"app.kubernetes.io/version": "v0.9.0",
92+
"app.kubernetes.io/name": "skyhook-operator", // needed for detection
9193
},
9294
},
9395
},
@@ -146,10 +148,40 @@ func TestDiscoverOperatorVersion(t *testing.T) {
146148
ObjectMeta: metav1.ObjectMeta{
147149
Name: "skyhook-operator-controller-manager",
148150
Namespace: "skyhook",
151+
Labels: map[string]string{
152+
"app.kubernetes.io/name": "skyhook-operator", // needed for detection
153+
},
149154
},
150155
},
151156
wantErr: true,
152157
},
158+
{
159+
name: "detection by image name (no skyhook label)",
160+
deployment: &appsv1.Deployment{
161+
ObjectMeta: metav1.ObjectMeta{
162+
Name: "custom-named-deployment",
163+
Namespace: "skyhook",
164+
Labels: map[string]string{
165+
"app.kubernetes.io/version": "v1.5.0",
166+
"app": "my-operator", // no skyhook in labels
167+
},
168+
},
169+
Spec: appsv1.DeploymentSpec{
170+
Template: corev1.PodTemplateSpec{
171+
Spec: corev1.PodSpec{
172+
Containers: []corev1.Container{
173+
{
174+
Name: "manager",
175+
Image: "nvcr.io/nvidia/skyhook/operator:v1.5.0", // detected by image
176+
},
177+
},
178+
},
179+
},
180+
},
181+
},
182+
wantErr: false,
183+
wantVer: "v1.5.0",
184+
},
153185
{
154186
name: "latest tag should fail",
155187
deployment: &appsv1.Deployment{
@@ -189,7 +221,7 @@ func TestDiscoverOperatorVersion(t *testing.T) {
189221
}
190222
}
191223

192-
version, err := discoverOperatorVersion(context.Background(), clientset, "skyhook")
224+
version, err := utils.DiscoverOperatorVersion(context.Background(), clientset, "skyhook")
193225

194226
if (err != nil) != tt.wantErr {
195227
t.Errorf("discoverOperatorVersion() error = %v, wantErr %v", err, tt.wantErr)
@@ -244,7 +276,7 @@ func TestExtractImageTag(t *testing.T) {
244276

245277
for _, tt := range tests {
246278
t.Run(tt.name, func(t *testing.T) {
247-
got := extractImageTag(tt.image)
279+
got := utils.ExtractImageTag(tt.image)
248280
if got != tt.want {
249281
t.Errorf("extractImageTag(%q) = %q, want %q", tt.image, got, tt.want)
250282
}

0 commit comments

Comments
 (0)