Skip to content

Commit ebbc00c

Browse files
committed
chore: minor changes
Signed-off-by: Ajay Mishra <ajmishra@nvidia.com>
1 parent 887f799 commit ebbc00c

5 files changed

Lines changed: 45 additions & 35 deletions

File tree

distros/kubernetes/nvsentinel/charts/janitor-provider/values.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,9 @@ csp:
114114
# Container image used for the reboot Job
115115
rebootImage: "public.ecr.aws/docker/library/busybox:1.37.0"
116116
# Use the Linux Magic SysRq trigger for an immediate kernel reboot instead of
117-
# running chroot /host reboot.
117+
# running chroot /host reboot. Only enable this when target nodes support
118+
# Linux Magic SysRq and their kernel configuration and permissions allow an
119+
# immediate reboot through /proc/sysrq-trigger. SysRq 'b' reboots at once, without syncing or unmounting disks.
118120
useSysrqReboot: false
119121
# Namespace for reboot Jobs (defaults to the janitor-provider's namespace)
120122
rebootJobNamespace: ""

docs/configuration/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,5 @@ Each module has additional configuration options documented in its dedicated gui
104104
- [Fault Quarantine](./fault-quarantine.md)
105105
- [Node Drainer](./node-drainer.md)
106106
- [Fault Remediation](./fault-remediation.md)
107-
- [Janitor Provider](./janitor-provider.md)
108107
- [Preflight](./preflight.md)
109108
- [Event Exporter](./event-exporter.md)

docs/designs/028-generic-baremetal-reboot-provider.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ flowchart TD
3333

3434
## Decision
3535

36-
Add a **generic provider** (`CSP=generic`) that reboots nodes via a privileged Kubernetes Job. By default the Job runs `chroot /host /sbin/reboot`; deployments can opt into a Linux Magic SysRq reboot for environments where the normal reboot path wedges the node. This follows the Job-based pattern from GPU Reset ([ADR-019](019-janitor-gpu-reset.md)). It is a named provider in the factory switch, just like `aws`, `gcp`, or `kind`.
36+
Add a **generic provider** (`CSP=generic`) that reboots nodes via a privileged Kubernetes Job. By default the Job runs `chroot /host reboot`; deployments can opt into a Linux Magic SysRq reboot for environments where the normal reboot path wedges the node. This follows the Job-based pattern from GPU Reset ([ADR-019](019-janitor-gpu-reset.md)). It is a named provider in the factory switch, just like `aws`, `gcp`, or `kind`.
3737

3838
## Implementation
3939

@@ -69,7 +69,7 @@ sequenceDiagram
6969
GP-->>JC: requestID = pre-reboot bootID
7070
7171
K8s->>Job: Schedule pod on target node
72-
Job->>Node: chroot /host /sbin/reboot or echo b > /proc/sysrq-trigger
72+
Job->>Node: chroot /host reboot (default) or echo b > /proc/sysrq-trigger (SysRq mode)
7373
Note over Node: Node reboots, pod is killed
7474
7575
Note over Node: Node boots back up, new bootID assigned
@@ -116,7 +116,7 @@ spec:
116116
containers:
117117
- name: reboot
118118
image: busybox:1.37
119-
command: ["chroot", "/host", "/sbin/reboot"]
119+
command: ["chroot", "/host", "reboot"]
120120
securityContext:
121121
privileged: true
122122
volumeMounts:
@@ -138,7 +138,7 @@ spec:
138138
| `ttlSecondsAfterFinished` | `3600` | Auto-cleanup after 1h |
139139
| `tolerations` | `[{operator: Exists}]` | Target node is likely cordoned/tainted |
140140
| `restartPolicy` | `Never` | Do not restart after reboot |
141-
| Image | `busybox:1.37` | Only needs `chroot` and host `/sbin/reboot` |
141+
| Image | `busybox:1.37` | Only needs `chroot` and the host `reboot` command |
142142

143143
#### IsNodeReady
144144

janitor-provider/pkg/csp/generic/generic.go

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -156,35 +156,42 @@ func (c *Client) buildRebootJob(nodeName string) *batchv1.Job {
156156
image := c.config.RebootImage
157157
ttl := c.config.RebootJobTTL
158158
command := c.config.rebootCommand()
159-
volumeMounts := []corev1.VolumeMount{
160-
{
161-
Name: "host-root",
162-
MountPath: hostMountPath,
163-
},
164-
}
165-
volumes := []corev1.Volume{
166-
{
167-
Name: "host-root",
168-
VolumeSource: corev1.VolumeSource{
169-
HostPath: &corev1.HostPathVolumeSource{
170-
Path: "/",
159+
var volumeMounts []corev1.VolumeMount
160+
var volumes []corev1.Volume
161+
if c.config.UseSysrqReboot {
162+
volumeMounts = []corev1.VolumeMount{
163+
{
164+
Name: "host-proc",
165+
MountPath: hostProcMountPath,
166+
},
167+
}
168+
volumes = []corev1.Volume{
169+
{
170+
Name: "host-proc",
171+
VolumeSource: corev1.VolumeSource{
172+
HostPath: &corev1.HostPathVolumeSource{
173+
Path: "/proc",
174+
},
171175
},
172176
},
173-
},
174-
}
175-
if c.config.UseSysrqReboot {
176-
volumeMounts = append(volumeMounts, corev1.VolumeMount{
177-
Name: "host-proc",
178-
MountPath: hostProcMountPath,
179-
})
180-
volumes = append(volumes, corev1.Volume{
181-
Name: "host-proc",
182-
VolumeSource: corev1.VolumeSource{
183-
HostPath: &corev1.HostPathVolumeSource{
184-
Path: "/proc",
177+
}
178+
} else {
179+
volumeMounts = []corev1.VolumeMount{
180+
{
181+
Name: "host-root",
182+
MountPath: hostMountPath,
183+
},
184+
}
185+
volumes = []corev1.Volume{
186+
{
187+
Name: "host-root",
188+
VolumeSource: corev1.VolumeSource{
189+
HostPath: &corev1.HostPathVolumeSource{
190+
Path: "/",
191+
},
185192
},
186193
},
187-
})
194+
}
188195
}
189196

190197
return &batchv1.Job{

janitor-provider/pkg/csp/generic/generic_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,18 +116,20 @@ func TestSendRebootSignal_CreatesSysrqJob(t *testing.T) {
116116
container := job.Spec.Template.Spec.Containers[0]
117117
assert.Equal(t, []string{"sh", "-c", "echo b > /host-proc/sysrq-trigger"}, container.Command)
118118

119-
assert.Contains(t, container.VolumeMounts, corev1.VolumeMount{
119+
require.Len(t, container.VolumeMounts, 1)
120+
assert.Equal(t, corev1.VolumeMount{
120121
Name: "host-proc",
121122
MountPath: hostProcMountPath,
122-
})
123-
assert.Contains(t, job.Spec.Template.Spec.Volumes, corev1.Volume{
123+
}, container.VolumeMounts[0])
124+
require.Len(t, job.Spec.Template.Spec.Volumes, 1)
125+
assert.Equal(t, corev1.Volume{
124126
Name: "host-proc",
125127
VolumeSource: corev1.VolumeSource{
126128
HostPath: &corev1.HostPathVolumeSource{
127129
Path: "/proc",
128130
},
129131
},
130-
})
132+
}, job.Spec.Template.Spec.Volumes[0])
131133
}
132134

133135
func TestBuildRebootJob_UsesCommandRebootByDefault(t *testing.T) {

0 commit comments

Comments
 (0)