Skip to content

Commit 9bb6101

Browse files
committed
feat: add sysrq based manual reboot
Signed-off-by: Ajay Mishra <ajmishra@nvidia.com>
1 parent bf0b5e4 commit 9bb6101

7 files changed

Lines changed: 204 additions & 24 deletions

File tree

distros/kubernetes/nvsentinel/charts/janitor-provider/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ spec:
8282
# Generic provider environment variables
8383
- name: GENERIC_REBOOT_IMAGE
8484
value: {{ .Values.csp.generic.rebootImage | quote }}
85+
- name: GENERIC_REBOOT_USE_SYSRQ
86+
value: {{ .Values.csp.generic.useSysrqReboot | default false | quote }}
8587
- name: GENERIC_REBOOT_JOB_NAMESPACE
8688
value: {{ .Values.csp.generic.rebootJobNamespace | default .Release.Namespace | quote }}
8789
- name: GENERIC_REBOOT_JOB_TTL

distros/kubernetes/nvsentinel/charts/janitor-provider/values.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,16 @@ csp:
106106
# - azure: For Microsoft Azure AKS clusters
107107
# - oci: For Oracle Cloud Infrastructure OKE clusters
108108
# - nebius: For Nebius Managed Kubernetes (MK8s) clusters
109-
# - generic: For bare-metal / on-premises clusters (reboots via privileged Job running chroot /host reboot)
109+
# - generic: For bare-metal / on-premises clusters (reboots via privileged Job)
110110
provider: "kind"
111111

112112
# Generic provider configuration (only used when provider=generic)
113113
generic:
114114
# Container image used for the reboot Job
115115
rebootImage: "public.ecr.aws/docker/library/busybox:1.37.0"
116+
# Use the Linux Magic SysRq trigger for an immediate kernel reboot instead of
117+
# running chroot /host reboot.
118+
useSysrqReboot: false
116119
# Namespace for reboot Jobs (defaults to the janitor-provider's namespace)
117120
rebootJobNamespace: ""
118121
# TTL in seconds for completed reboot Jobs (auto-cleanup)

docs/configuration/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,6 @@ Each module has additional configuration options documented in its dedicated gui
104104
- [Fault Quarantine](./fault-quarantine.md)
105105
- [Node Drainer](./node-drainer.md)
106106
- [Fault Remediation](./fault-remediation.md)
107+
- [Janitor Provider](./janitor-provider.md)
107108
- [Preflight](./preflight.md)
108109
- [Event Exporter](./event-exporter.md)
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Janitor Provider Configuration
2+
3+
## Overview
4+
5+
The janitor-provider module executes node lifecycle operations requested by Janitor, including reboot signals, node readiness checks, and provider-specific termination signals. This document covers the Helm configuration options for selecting and configuring the provider backend.
6+
7+
## Configuration Reference
8+
9+
### Module Enable/Disable
10+
11+
Controls whether the janitor-provider module is deployed in the cluster.
12+
13+
```yaml
14+
global:
15+
janitorProvider:
16+
enabled: true
17+
```
18+
19+
### CSP Provider
20+
21+
Selects the provider implementation used by janitor-provider.
22+
23+
```yaml
24+
janitor-provider:
25+
csp:
26+
provider: "kind"
27+
```
28+
29+
Supported providers are `kind`, `kwok`, `aws`, `gcp`, `azure`, `oci`, `nebius`, and `generic`.
30+
31+
## Generic Provider
32+
33+
The `generic` provider is intended for bare-metal and on-premises clusters where there is no cloud provider reboot API. It creates a privileged Kubernetes Job on the target node and uses the node `bootID` to verify that a reboot occurred.
34+
35+
```yaml
36+
janitor-provider:
37+
csp:
38+
provider: "generic"
39+
generic:
40+
rebootImage: "public.ecr.aws/docker/library/busybox:1.37.0"
41+
useSysrqReboot: false
42+
rebootJobNamespace: ""
43+
rebootJobTTLSeconds: 3600
44+
imagePullSecrets: ""
45+
```
46+
47+
### Generic Provider Options
48+
49+
`rebootImage` sets the container image used for the privileged reboot Job.
50+
51+
`useSysrqReboot` switches the reboot command from `chroot /host reboot` to Linux Magic SysRq by writing `b` to the host `/proc/sysrq-trigger`. Keep this disabled unless the standard reboot path leaves nodes stuck `NotReady`.
52+
53+
`rebootJobNamespace` sets the namespace where reboot Jobs are created. If empty, the janitor-provider release namespace is used.
54+
55+
`rebootJobTTLSeconds` controls how long completed reboot Jobs are retained before Kubernetes garbage-collects them.
56+
57+
`imagePullSecrets` is a comma-separated list of image pull secret names used by the reboot Job.
58+
59+
### SysRq Reboot
60+
61+
When `useSysrqReboot` is enabled, the Helm chart sets `GENERIC_REBOOT_USE_SYSRQ=true` on the janitor-provider container, and the reboot Job writes `b` to the host SysRq trigger instead of invoking the normal reboot command.
62+
63+
This path bypasses the normal userspace shutdown flow: SysRq `b` reboots the kernel immediately, without stopping services, syncing, or unmounting filesystems, so unflushed data can be lost. It is useful for environments where `chroot /host reboot` or `sudo reboot` is accepted but leaves the node stuck `NotReady`, but it should remain an explicit opt-in because it is more abrupt than the default reboot path.

docs/designs/028-generic-baremetal-reboot-provider.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ flowchart TD
3333

3434
## Decision
3535

36-
Add a **generic provider** (`CSP=generic`) that reboots nodes via a privileged Kubernetes Job running `chroot /host /sbin/reboot`, following the Job-based pattern from GPU Reset ([ADR-019](019-janitor-gpu-reset.md)). It is a named provider in the factory switch, just like `aws`, `gcp`, or `kind`.
36+
Add a **generic provider** (`CSP=generic`) that reboots nodes via a privileged Kubernetes Job. By default the Job runs `chroot /host reboot`; deployments can opt into a Linux Magic SysRq reboot for environments where the normal reboot path wedges the node. This follows the Job-based pattern from GPU Reset ([ADR-019](019-janitor-gpu-reset.md)). It is a named provider in the factory switch, just like `aws`, `gcp`, or `kind`.
3737

3838
## Implementation
3939

@@ -69,7 +69,7 @@ sequenceDiagram
6969
GP-->>JC: requestID = pre-reboot bootID
7070
7171
K8s->>Job: Schedule pod on target node
72-
Job->>Node: chroot /host /sbin/reboot
72+
Job->>Node: chroot /host reboot or echo b > /proc/sysrq-trigger
7373
Note over Node: Node reboots, pod is killed
7474
7575
Note over Node: Node boots back up, new bootID assigned
@@ -87,6 +87,13 @@ sequenceDiagram
8787

8888
Records the node's current `bootID` from `node.Status.NodeInfo.BootID`, creates a privileged Job on the target node, and returns the pre-reboot `bootID` as the `requestID`.
8989

90+
The reboot Job supports two reboot paths:
91+
92+
- Default: `chroot /host reboot`
93+
- SysRq opt-in: `echo b > /proc/sysrq-trigger` via the host `/proc` mount
94+
95+
The SysRq path is intended for bare-metal environments where the standard reboot command is accepted but leaves the node stuck `NotReady`. It bypasses the normal userspace shutdown path, so it is controlled by an explicit feature flag.
96+
9097
**Job specification:**
9198

9299
```yaml
@@ -166,6 +173,7 @@ csp:
166173
167174
generic: # config for the generic provider (when provider=generic)
168175
rebootImage: "busybox:1.37"
176+
useSysrqReboot: false # true to use echo b > /proc/sysrq-trigger
169177
rebootJobNamespace: "" # defaults to the janitor-provider's own namespace
170178
rebootJobTTLSeconds: 3600
171179
```
@@ -177,6 +185,8 @@ env:
177185
value: {{ .Values.csp.provider | default "kind" | quote }}
178186
- name: GENERIC_REBOOT_IMAGE
179187
value: {{ .Values.csp.generic.rebootImage | default "busybox:1.37" | quote }}
188+
- name: GENERIC_REBOOT_USE_SYSRQ
189+
value: {{ .Values.csp.generic.useSysrqReboot | default false | quote }}
180190
- name: GENERIC_REBOOT_JOB_NAMESPACE
181191
value: {{ .Values.csp.generic.rebootJobNamespace | quote }}
182192
- name: GENERIC_REBOOT_JOB_TTL

janitor-provider/pkg/csp/generic/generic.go

Lines changed: 61 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -38,20 +38,22 @@ const (
3838
jobLabelKey = "nvsentinel.nvidia.com/reboot-job"
3939
jobNodeLabelKey = "nvsentinel.nvidia.com/reboot-node"
4040
hostMountPath = "/host"
41+
hostProcMountPath = "/host-proc"
4142
)
4243

4344
var _ model.CSPClient = (*Client)(nil)
4445

4546
// Config holds the configuration for the generic provider.
4647
type Config struct {
4748
// RebootImage is the container image used for the reboot Job's container.
RebootImage string
49+
// UseSysrqReboot, when true, makes the reboot Job mount the host /proc and
// write "b" to sysrq-trigger instead of running chroot + reboot.
UseSysrqReboot bool
4850
// RebootJobNamespace is the namespace in which the reboot Job is created.
RebootJobNamespace string
4951
// RebootJobTTL is the reboot Job TTL, loaded from the environment with a
// default of defaultRebootJobTTLSeconds.
// NOTE(review): presumably applied as the Job's TTL-after-finished in seconds — confirm in buildRebootJob.
RebootJobTTL int32
5052
// RebootJobPullSecrets holds image pull secret names for the reboot Job.
// NOTE(review): usage is outside the visible hunks — confirm how they are attached to the pod spec.
RebootJobPullSecrets []string
5153
}
5254

5355
// Client is the generic bare-metal implementation of the CSP Client interface.
54-
// It reboots nodes by creating a privileged Job that runs chroot /host reboot.
56+
// It reboots nodes by creating a privileged Job that runs the configured reboot method.
5557
type Client struct {
5658
k8sClient kubernetes.Interface
5759
config Config
@@ -89,7 +91,7 @@ func NewClientWithK8s(k8sClient kubernetes.Interface, config Config) *Client {
8991
}
9092

9193
// SendRebootSignal creates a privileged Job on the target node that executes
92-
// chroot /host reboot. Returns the node's pre-reboot bootID as the requestID.
94+
// the configured reboot command. Returns the node's pre-reboot bootID as the requestID.
9395
func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model.ResetSignalRequestRef, error) {
9496
preRebootBootID := node.Status.NodeInfo.BootID
9597
if preRebootBootID == "" {
@@ -99,15 +101,17 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model.
99101

100102
job := c.buildRebootJob(node.Name)
101103

102-
slog.InfoContext(ctx, "Creating reboot Job", "node", node.Name, "namespace", c.config.RebootJobNamespace)
104+
slog.InfoContext(ctx, "Creating reboot Job", "node", node.Name, "namespace", c.config.RebootJobNamespace,
105+
"useSysrqReboot", c.config.UseSysrqReboot)
103106

104107
created, err := c.k8sClient.BatchV1().Jobs(c.config.RebootJobNamespace).Create(ctx, job, metav1.CreateOptions{})
105108
if err != nil {
106109
return "", fmt.Errorf("failed to create reboot job for node %s: %w", node.Name, err)
107110
}
108111

109112
slog.InfoContext(ctx, "Reboot Job created", "node", node.Name, "job", created.Name,
110-
"jobNamespace", c.config.RebootJobNamespace, "bootID", preRebootBootID)
113+
"jobNamespace", c.config.RebootJobNamespace, "bootID", preRebootBootID,
114+
"useSysrqReboot", c.config.UseSysrqReboot)
111115

112116
return model.ResetSignalRequestRef(preRebootBootID), nil
113117
}
@@ -151,6 +155,37 @@ func (c *Client) SendTerminateSignal(ctx context.Context, node corev1.Node) (mod
151155
func (c *Client) buildRebootJob(nodeName string) *batchv1.Job {
152156
image := c.config.RebootImage
153157
ttl := c.config.RebootJobTTL
158+
command := c.config.rebootCommand()
159+
volumeMounts := []corev1.VolumeMount{
160+
{
161+
Name: "host-root",
162+
MountPath: hostMountPath,
163+
},
164+
}
165+
volumes := []corev1.Volume{
166+
{
167+
Name: "host-root",
168+
VolumeSource: corev1.VolumeSource{
169+
HostPath: &corev1.HostPathVolumeSource{
170+
Path: "/",
171+
},
172+
},
173+
},
174+
}
175+
if c.config.UseSysrqReboot {
176+
volumeMounts = append(volumeMounts, corev1.VolumeMount{
177+
Name: "host-proc",
178+
MountPath: hostProcMountPath,
179+
})
180+
volumes = append(volumes, corev1.Volume{
181+
Name: "host-proc",
182+
VolumeSource: corev1.VolumeSource{
183+
HostPath: &corev1.HostPathVolumeSource{
184+
Path: "/proc",
185+
},
186+
},
187+
})
188+
}
154189

155190
return &batchv1.Job{
156191
ObjectMeta: metav1.ObjectMeta{
@@ -182,34 +217,28 @@ func (c *Client) buildRebootJob(nodeName string) *batchv1.Job {
182217
{
183218
Name: "reboot",
184219
Image: image,
185-
Command: []string{"chroot", hostMountPath, "reboot"},
220+
Command: command,
186221
SecurityContext: &corev1.SecurityContext{
187222
Privileged: ptr.To(true),
188223
},
189-
VolumeMounts: []corev1.VolumeMount{
190-
{
191-
Name: "host-root",
192-
MountPath: hostMountPath,
193-
},
194-
},
195-
},
196-
},
197-
Volumes: []corev1.Volume{
198-
{
199-
Name: "host-root",
200-
VolumeSource: corev1.VolumeSource{
201-
HostPath: &corev1.HostPathVolumeSource{
202-
Path: "/",
203-
},
204-
},
224+
VolumeMounts: volumeMounts,
205225
},
206226
},
227+
Volumes: volumes,
207228
},
208229
},
209230
},
210231
}
211232
}
212233

234+
// rebootCommand returns the command for the reboot container.
// When UseSysrqReboot is set it returns a shell invocation that writes "b" to
// sysrq-trigger under hostProcMountPath; otherwise it returns chroot into
// hostMountPath followed by reboot.
func (c Config) rebootCommand() []string {
235+
if c.UseSysrqReboot {
236+
// The redirect needs a shell, so the command is wrapped in sh -c.
return []string{"sh", "-c", fmt.Sprintf("echo b > %s/sysrq-trigger", hostProcMountPath)}
237+
}
238+
239+
return []string{"chroot", hostMountPath, "reboot"}
240+
}
241+
213242
// checkRebootJobPodStatus checks the reboot Job's pod for terminal failures
214243
// (e.g., ImagePullBackOff) that indicate the reboot was never attempted.
215244
func (c *Client) checkRebootJobPodStatus(ctx context.Context, nodeName string) error {
@@ -319,6 +348,16 @@ func loadConfigFromEnv() Config {
319348
image = defaultRebootImage
320349
}
321350

351+
useSysrqReboot := false
352+
if useSysrqStr := os.Getenv("GENERIC_REBOOT_USE_SYSRQ"); useSysrqStr != "" {
353+
parsed, err := strconv.ParseBool(useSysrqStr)
354+
if err != nil {
355+
slog.Warn("Invalid GENERIC_REBOOT_USE_SYSRQ, using default", "value", useSysrqStr, "default", false)
356+
} else {
357+
useSysrqReboot = parsed
358+
}
359+
}
360+
322361
namespace := os.Getenv("GENERIC_REBOOT_JOB_NAMESPACE")
323362
ttl := int32(defaultRebootJobTTLSeconds)
324363

@@ -343,6 +382,7 @@ func loadConfigFromEnv() Config {
343382

344383
return Config{
345384
RebootImage: image,
385+
UseSysrqReboot: useSysrqReboot,
346386
RebootJobNamespace: namespace,
347387
RebootJobTTL: ttl,
348388
RebootJobPullSecrets: pullSecrets,

0 commit comments

Comments
 (0)