Skip to content

Commit 078d270

Browse files
authored
Fixed ns termination status by reconciling (#91)
Fixes #88
Fixed ns termination status by reconciling
Signed-off-by: nitishm <[email protected]>
Co-authored-by: nitishm <[email protected]>
1 parent dfd038a commit 078d270

File tree

3 files changed

+47
-24
lines changed

3 files changed

+47
-24
lines changed

controllers/appgroup_controller.go

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ type ApplicationGroupReconciler struct {
6060

6161
// Recorder generates kubernetes events
6262
Recorder record.EventRecorder
63+
64+
lastSuccessfulApplicationGroup *orkestrav1alpha1.ApplicationGroup
6365
}
6466

6567
// +kubebuilder:rbac:groups=orkestra.azure.microsoft.com,resources=applicationgroups,verbs=get;list;watch;create;update;patch;delete
@@ -83,16 +85,17 @@ func (r *ApplicationGroupReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
8385
}
8486

8587
if appGroup.GetAnnotations() != nil {
86-
// TODO (nitishm) Use this in error remediation by reapplying last successful appgroup spec
87-
// lastSuccessfulApplicationGroup := appGroup.Annotations[lastSuccessfulApplicationGroupKey]
88-
_ = appGroup.Annotations[lastSuccessfulApplicationGroupKey]
88+
last := &orkestrav1alpha1.ApplicationGroup{}
89+
s := appGroup.Annotations[lastSuccessfulApplicationGroupKey]
90+
_ = json.Unmarshal([]byte(s), last)
91+
r.lastSuccessfulApplicationGroup = last
8992
}
9093

91-
// handle DELETE if deletion timestamp is non-zero
94+
// handle deletes if deletion timestamp is non-zero
9295
if !appGroup.DeletionTimestamp.IsZero() {
9396
// If finalizer is found, remove it and requeue
9497
if appGroup.Finalizers != nil {
95-
logr.Info("Cleaning up")
98+
logr.Info("cleaning up the applicationgroup resource")
9699
// TODO: Take remediation action
97100
// Reverse the entire workflow to remove all the Helm Releases
98101
appGroup.Finalizers = nil
@@ -113,34 +116,34 @@ func (r *ApplicationGroupReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
113116
return ctrl.Result{Requeue: true}, nil
114117
}
115118

116-
// handle UPDATE if checksum mismatched
119+
// handle first time install and subsequent updates
117120
checksums, err := pkg.Checksum(&appGroup)
118121
if err != nil {
119122
// TODO (nitishm) Handle different error types here to decide remediation action
120123
if errors.Is(err, pkg.ErrChecksumAppGroupSpecMismatch) {
121124
if appGroup.Status.Checksums != nil {
122125
appGroup.Status.Update = true
123126
}
124-
appGroup.Status.Checksums = checksums
125127
requeue, err = r.reconcile(ctx, logr, r.WorkflowNS, &appGroup)
126128
if err != nil {
127129
logr.Error(err, "failed to reconcile ApplicationGroup instance")
128-
r.updateStatusAndEvent(ctx, appGroup, requeue, err)
130+
r.handleResponseAndEvent(ctx, appGroup, requeue, err)
129131
return ctrl.Result{Requeue: requeue}, err
130132
}
131133

134+
appGroup.Status.Checksums = checksums
135+
132136
if appGroup.Status.Phase != v1alpha12.NodeSucceeded {
133-
r.updateStatusAndEvent(ctx, appGroup, requeue, err)
137+
r.handleResponseAndEvent(ctx, appGroup, requeue, err)
134138
return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
135139
}
136140

137-
r.updateStatusAndEvent(ctx, appGroup, requeue, err)
138-
return ctrl.Result{Requeue: false}, nil
141+
r.handleResponseAndEvent(ctx, appGroup, requeue, err)
142+
return ctrl.Result{Requeue: requeue}, err
139143
}
140144

141-
appGroup.Status.Error = err.Error()
142-
_ = r.Status().Update(ctx, &appGroup)
143145
logr.Error(err, "failed to calculate checksum annotations for application group specs")
146+
r.handleResponseAndEvent(ctx, appGroup, false, err)
144147
return ctrl.Result{Requeue: false}, err
145148
}
146149

@@ -155,8 +158,7 @@ func (r *ApplicationGroupReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
155158
err = r.List(ctx, &wfs, listOption)
156159
if err != nil {
157160
logr.Error(err, "failed to find generate workflow instance")
158-
appGroup.Status.Error = err.Error()
159-
_ = r.Status().Update(ctx, &appGroup)
161+
r.handleResponseAndEvent(ctx, appGroup, false, err)
160162
return ctrl.Result{Requeue: false}, err
161163
}
162164

@@ -169,18 +171,16 @@ func (r *ApplicationGroupReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
169171
switch appGroup.Status.Phase {
170172
case v1alpha12.NodeRunning, v1alpha12.NodePending:
171173
logr.V(1).Info("workflow in pending/running state. requeue and reconcile after a short period")
172-
_ = r.Status().Update(ctx, &appGroup)
174+
r.handleResponseAndEvent(ctx, appGroup, true, nil)
173175
return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
174176
case v1alpha12.NodeSucceeded:
175177
logr.V(1).Info("workflow ran to completion and succeeded")
176-
appGroup.Status.Error = ""
177-
r.updateStatusAndEvent(ctx, appGroup, false, nil)
178+
r.handleResponseAndEvent(ctx, appGroup, false, nil)
178179
return ctrl.Result{Requeue: false}, nil
179180
case v1alpha12.NodeError, v1alpha12.NodeFailed:
180181
err = fmt.Errorf("workflow in failure/error condition")
181182
logr.Error(err, "workflow in failure/error condition")
182-
appGroup.Status.Error = err.Error()
183-
_ = r.Status().Update(ctx, &appGroup)
183+
r.handleResponseAndEvent(ctx, appGroup, false, err)
184184
return ctrl.Result{Requeue: false}, err
185185
}
186186

@@ -195,9 +195,11 @@ func (r *ApplicationGroupReconciler) SetupWithManager(mgr ctrl.Manager) error {
195195
Complete(r)
196196
}
197197

198-
func (r *ApplicationGroupReconciler) updateStatusAndEvent(ctx context.Context, grp orkestrav1alpha1.ApplicationGroup, requeue bool, err error) {
198+
func (r *ApplicationGroupReconciler) handleResponseAndEvent(ctx context.Context, grp orkestrav1alpha1.ApplicationGroup, requeue bool, err error) {
199199
errStr := ""
200200
if err != nil {
201+
// Handle the error by remediating the workflow
202+
r.handleRemediation(ctx, err)
201203
errStr = err.Error()
202204
}
203205

@@ -206,9 +208,11 @@ func (r *ApplicationGroupReconciler) updateStatusAndEvent(ctx context.Context, g
206208
_ = r.Status().Update(ctx, &grp)
207209

208210
if grp.Status.Phase == v1alpha12.NodeSucceeded {
211+
// Annotate the resource with the last successful ApplicationGroup spec
209212
b, _ := json.Marshal(&grp)
210213
grp.SetAnnotations(map[string]string{lastSuccessfulApplicationGroupKey: string(b)})
211214
_ = r.Update(ctx, &grp)
215+
212216
r.Recorder.Event(&grp, "Normal", "ReconcileSuccess", fmt.Sprintf("Successfully reconciled ApplicationGroup %s", grp.Name))
213217
}
214218

@@ -264,3 +268,10 @@ func initApplications(appGroup *orkestrav1alpha1.ApplicationGroup) {
264268
}
265269
appGroup.Spec.Applications = v.DeepCopy().Spec.Applications
266270
}
271+
272+
func (r *ApplicationGroupReconciler) handleRemediation(ctx context.Context, err error) {
273+
if r.lastSuccessfulApplicationGroup != nil {
274+
r.lastSuccessfulApplicationGroup.Status.Checksums = nil
275+
_ = r.Update(ctx, r.lastSuccessfulApplicationGroup)
276+
}
277+
}

controllers/appgroup_reconciler.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@ package controllers
22

33
import (
44
"context"
5+
"errors"
56
"os"
67

78
"fmt"
89

910
"github.com/Azure/Orkestra/api/v1alpha1"
1011
orkestrav1alpha1 "github.com/Azure/Orkestra/api/v1alpha1"
1112
"github.com/Azure/Orkestra/pkg/registry"
13+
"github.com/Azure/Orkestra/pkg/workflow"
1214
v1alpha12 "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1"
1315
"github.com/go-logr/logr"
1416
)
@@ -184,6 +186,10 @@ func (r *ApplicationGroupReconciler) generateWorkflow(ctx context.Context, logr
184186

185187
err = r.Engine.Submit(ctx, logr, g)
186188
if err != nil {
189+
if errors.Is(err, workflow.ErrNamespaceTerminating) {
190+
logr.V(1).Info("namespace is in terminating state")
191+
return true, err
192+
}
187193
logr.Error(err, "engine failed to submit workflow")
188194
return false, err
189195
}

pkg/workflow/argo.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package workflow
22

33
import (
44
"context"
5+
kerrors "errors"
56
"fmt"
67
"os"
78
"strings"
@@ -31,6 +32,10 @@ const (
3132
valuesKeyGlobal = "global"
3233
)
3334

35+
var (
36+
ErrNamespaceTerminating = kerrors.New("namespace is in terminating phase")
37+
)
38+
3439
type argo struct {
3540
scheme *runtime.Scheme
3641
cli client.Client
@@ -122,7 +127,6 @@ func (a *argo) Submit(ctx context.Context, l logr.Logger, g *v1alpha1.Applicatio
122127
// FIXME (nitishm) Handle namespace in termination state by requeueing
123128
err := a.cli.Get(ctx, types.NamespacedName{Name: ns.Name}, &ns)
124129
if err != nil {
125-
fmt.Printf("\n\n\n\n\nGET %s\n\n\n\n", err)
126130
if errors.IsNotFound(err) {
127131
// Add OwnershipReference
128132
err = controllerutil.SetControllerReference(g, &ns, a.scheme)
@@ -132,12 +136,14 @@ func (a *argo) Submit(ctx context.Context, l logr.Logger, g *v1alpha1.Applicatio
132136

133137
err = a.cli.Create(ctx, &ns)
134138
if err != nil {
135-
fmt.Printf("\n\n\n\n\nCREATE %s\n\n\n\n", err)
136-
137139
return fmt.Errorf("failed to CREATE namespace %s object : %w", ns.Name, err)
138140
}
139141
}
140142
}
143+
144+
if ns.Status.Phase == corev1.NamespaceTerminating {
145+
return ErrNamespaceTerminating
146+
}
141147
}
142148

143149
err := a.cli.Get(ctx, types.NamespacedName{Namespace: a.wf.Namespace, Name: a.wf.Name}, obj)

0 commit comments

Comments
 (0)