Skip to content

Commit 03fbfe3

Browse files
committed
fix: deadlock if reboot pods are missing, adds them back
Also increases the resource requests and limits; at times we were seeing some pods hang in different clusters.
1 parent c8d67a5 commit 03fbfe3

File tree

7 files changed

+39
-28
lines changed

7 files changed

+39
-28
lines changed

chart/templates/skyhook-crd.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -390,14 +390,14 @@ spec:
390390
anyOf:
391391
- type: integer
392392
- type: string
393-
default: 250m
393+
default: 500m
394394
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
395395
x-kubernetes-int-or-string: true
396396
cpuRequest:
397397
anyOf:
398398
- type: integer
399399
- type: string
400-
default: 50m
400+
default: 500m
401401
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
402402
x-kubernetes-int-or-string: true
403403
memoryLimit:
@@ -411,7 +411,7 @@ spec:
411411
anyOf:
412412
- type: integer
413413
- type: string
414-
default: 64Mi
414+
default: 256Mi
415415
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
416416
x-kubernetes-int-or-string: true
417417
type: object

k8s-tests/chainsaw/skyhook/interrupt-grouping/skyhook.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,4 @@ spec:
3434
type: service
3535
services: [containerd, foobar]
3636
resources:
37-
cpuLimit: 123m
37+
cpuLimit: 523m

k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,23 +30,23 @@ spec:
3030
(length(@)): 3
3131
resources: ## including these to test that defaults are working
3232
limits:
33-
cpu: 250m
33+
cpu: 500m
3434
memory: 256Mi
3535
requests:
36-
cpu: 50m
37-
memory: 64Mi
36+
cpu: 500m
37+
memory: 256Mi
3838
- name: cats-uninstallcheck
3939
args:
4040
([0]): uninstall-check
4141
([1]): /root
4242
(length(@)): 3
4343
resources: ## including these to test that defaults are working
4444
limits:
45-
cpu: 250m
45+
cpu: 500m
4646
memory: 256Mi
4747
requests:
48-
cpu: 50m
49-
memory: 64Mi
48+
cpu: 500m
49+
memory: 256Mi
5050
---
5151
kind: Pod
5252
apiVersion: v1

k8s-tests/chainsaw/skyhook/validate-packages/update.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ spec:
1919
version: "1.2.3"
2020
image: ghcr.io/nvidia/skyhook/agentless
2121
resources:
22-
cpuRequest: 50m
23-
cpuLimit: 250m
24-
memoryRequest: 64Mi
22+
cpuRequest: 500m
23+
cpuLimit: 500m
24+
memoryRequest: 256Mi
2525
memoryLimit: 256Mi
2626
invalid-env:
2727
version: "5.4.3"

operator/api/v1alpha1/skyhook_types.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,11 +171,11 @@ func (p *PackageRef) GetUniqueName() string {
171171
}
172172

173173
type ResourceRequirements struct {
174-
// +kubebuilder:default="50m"
174+
// +kubebuilder:default="500m"
175175
CPURequest resource.Quantity `json:"cpuRequest,omitempty"`
176-
// +kubebuilder:default="250m"
176+
// +kubebuilder:default="500m"
177177
CPULimit resource.Quantity `json:"cpuLimit,omitempty"`
178-
// +kubebuilder:default="64Mi"
178+
// +kubebuilder:default="256Mi"
179179
MemoryRequest resource.Quantity `json:"memoryRequest,omitempty"`
180180
// +kubebuilder:default="256Mi"
181181
MemoryLimit resource.Quantity `json:"memoryLimit,omitempty"`

operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -377,14 +377,14 @@ spec:
377377
anyOf:
378378
- type: integer
379379
- type: string
380-
default: 250m
380+
default: 500m
381381
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
382382
x-kubernetes-int-or-string: true
383383
cpuRequest:
384384
anyOf:
385385
- type: integer
386386
- type: string
387-
default: 50m
387+
default: 500m
388388
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
389389
x-kubernetes-int-or-string: true
390390
memoryLimit:
@@ -398,7 +398,7 @@ spec:
398398
anyOf:
399399
- type: integer
400400
- type: string
401-
default: 64Mi
401+
default: 256Mi
402402
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
403403
x-kubernetes-int-or-string: true
404404
type: object

operator/internal/controller/skyhook_controller.go

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,12 +1467,12 @@ func (r *SkyhookReconciler) CreateInterruptPodForPackage(_interrupt *v1alpha1.In
14671467
VolumeMounts: volumeMounts,
14681468
Resources: corev1.ResourceRequirements{
14691469
Limits: corev1.ResourceList{
1470-
corev1.ResourceCPU: resource.MustParse("100m"),
1471-
corev1.ResourceMemory: resource.MustParse("20Mi"),
1470+
corev1.ResourceCPU: resource.MustParse("500m"),
1471+
corev1.ResourceMemory: resource.MustParse("256Mi"),
14721472
},
14731473
Requests: corev1.ResourceList{
1474-
corev1.ResourceCPU: resource.MustParse("100m"),
1475-
corev1.ResourceMemory: resource.MustParse("20Mi"),
1474+
corev1.ResourceCPU: resource.MustParse("500m"),
1475+
corev1.ResourceMemory: resource.MustParse("256Mi"),
14761476
},
14771477
},
14781478
},
@@ -1633,14 +1633,14 @@ func (r *SkyhookReconciler) CreatePodFromPackage(_package *v1alpha1.Package, sky
16331633
Privileged: ptr(true),
16341634
},
16351635
VolumeMounts: volumeMounts,
1636-
Resources: corev1.ResourceRequirements{ // setting this
1636+
Resources: corev1.ResourceRequirements{
16371637
Limits: corev1.ResourceList{
1638-
corev1.ResourceCPU: resource.MustParse("100m"),
1639-
corev1.ResourceMemory: resource.MustParse("20Mi"),
1638+
corev1.ResourceCPU: resource.MustParse("500m"),
1639+
corev1.ResourceMemory: resource.MustParse("256Mi"),
16401640
},
16411641
Requests: corev1.ResourceList{
1642-
corev1.ResourceCPU: resource.MustParse("100m"),
1643-
corev1.ResourceMemory: resource.MustParse("20Mi"),
1642+
corev1.ResourceCPU: resource.MustParse("500m"),
1643+
corev1.ResourceMemory: resource.MustParse("256Mi"),
16441644
},
16451645
},
16461646
},
@@ -1896,6 +1896,17 @@ func (r *SkyhookReconciler) ProcessInterrupt(ctx context.Context, skyhookNode wr
18961896
return false, nil
18971897
}
18981898

1899+
// There is a race condition when a node reboots and the API might clean up the interrupt pod
1900+
// so we need to check if the pod exists and if it does not, we need to recreate it
1901+
if status != nil && (status.State == v1alpha1.StateInProgress || status.State == v1alpha1.StateErroring) && status.Stage == v1alpha1.StageInterrupt {
1902+
// call interrupt to recreate the pod if missing
1903+
// this is safe because the agent is idempotent
1904+
err = r.Interrupt(ctx, skyhookNode, _package, interrupt)
1905+
if err != nil {
1906+
return false, err
1907+
}
1908+
}
1909+
18991910
if nextStage != nil && *nextStage == v1alpha1.StageInterrupt && runInterrupt { // time to do the interrupt
19001911

19011912
hasWork, err := r.HasNonInterruptWork(ctx, skyhookNode)

0 commit comments

Comments
 (0)