@@ -1467,12 +1467,12 @@ func (r *SkyhookReconciler) CreateInterruptPodForPackage(_interrupt *v1alpha1.In
14671467 VolumeMounts : volumeMounts ,
14681468 Resources : corev1.ResourceRequirements {
14691469 Limits : corev1.ResourceList {
1470- corev1 .ResourceCPU : resource .MustParse ("100m " ),
1471- corev1 .ResourceMemory : resource .MustParse ("20Mi " ),
1470+ corev1 .ResourceCPU : resource .MustParse ("500m " ),
1471+ corev1 .ResourceMemory : resource .MustParse ("256Mi " ),
14721472 },
14731473 Requests : corev1.ResourceList {
1474- corev1 .ResourceCPU : resource .MustParse ("100m " ),
1475- corev1 .ResourceMemory : resource .MustParse ("20Mi " ),
1474+ corev1 .ResourceCPU : resource .MustParse ("500m " ),
1475+ corev1 .ResourceMemory : resource .MustParse ("256Mi " ),
14761476 },
14771477 },
14781478 },
@@ -1633,14 +1633,14 @@ func (r *SkyhookReconciler) CreatePodFromPackage(_package *v1alpha1.Package, sky
16331633 Privileged : ptr (true ),
16341634 },
16351635 VolumeMounts : volumeMounts ,
1636- Resources : corev1.ResourceRequirements { // setting this
1636+ Resources : corev1.ResourceRequirements {
16371637 Limits : corev1.ResourceList {
1638- corev1 .ResourceCPU : resource .MustParse ("100m " ),
1639- corev1 .ResourceMemory : resource .MustParse ("20Mi " ),
1638+ corev1 .ResourceCPU : resource .MustParse ("500m " ),
1639+ corev1 .ResourceMemory : resource .MustParse ("256Mi " ),
16401640 },
16411641 Requests : corev1.ResourceList {
1642- corev1 .ResourceCPU : resource .MustParse ("100m " ),
1643- corev1 .ResourceMemory : resource .MustParse ("20Mi " ),
1642+ corev1 .ResourceCPU : resource .MustParse ("500m " ),
1643+ corev1 .ResourceMemory : resource .MustParse ("256Mi " ),
16441644 },
16451645 },
16461646 },
@@ -1896,6 +1896,17 @@ func (r *SkyhookReconciler) ProcessInterrupt(ctx context.Context, skyhookNode wr
18961896 return false , nil
18971897 }
18981898
1899+ // There is a race condition when a node reboots: the API might clean up the interrupt pod,
1900+ // so we need to check whether the pod still exists and, if it does not, recreate it
1901+ if status != nil && (status .State == v1alpha1 .StateInProgress || status .State == v1alpha1 .StateErroring ) && status .Stage == v1alpha1 .StageInterrupt {
1902+ // call Interrupt to recreate the pod if it is missing
1903+ // this is safe because the agent is idempotent
1904+ err = r .Interrupt (ctx , skyhookNode , _package , interrupt )
1905+ if err != nil {
1906+ return false , err
1907+ }
1908+ }
1909+
18991910 if nextStage != nil && * nextStage == v1alpha1 .StageInterrupt && runInterrupt { // time to do the interrupt
19001911
19011912 hasWork , err := r .HasNonInterruptWork (ctx , skyhookNode )
0 commit comments