Skip to content

Commit 515e407

Browse files
fix: improve error handling for retryable operations and add tests (#2342)
This change adds retry logic for handling transient errors in Kubernetes operations (update, create, and patch). It properly retries operations when encountering conflict errors, server timeouts, too many requests, and service unavailability errors. Non-retryable errors are properly propagated. Tests are added to verify the retry behavior. Signed-off-by: Karthik babu Manam <[email protected]> Co-authored-by: Charles-Edouard Brétéché <[email protected]>
1 parent ad0ee39 commit 515e407

File tree

6 files changed

+703
-136
lines changed

6 files changed

+703
-136
lines changed

pkg/engine/operations/create/operation.go

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,32 @@ func (o *operation) execute(ctx context.Context, bindings apis.Bindings, obj uns
8686
var outputs outputs.Outputs
8787
err := wait.PollUntilContextCancel(ctx, client.PollInterval, false, func(ctx context.Context) (bool, error) {
8888
outputs, lastErr = o.tryCreateResource(ctx, bindings, obj)
89-
// TODO: determine if the error can be retried
90-
return lastErr == nil, nil
89+
// Check if the error is retryable
90+
if lastErr != nil {
91+
// Conflict errors should be retried
92+
if kerrors.IsConflict(lastErr) {
93+
return false, nil
94+
}
95+
// Server timeout errors should be retried
96+
if kerrors.IsServerTimeout(lastErr) {
97+
return false, nil
98+
}
99+
// Too many requests errors should be retried
100+
if kerrors.IsTooManyRequests(lastErr) {
101+
return false, nil
102+
}
103+
// Service unavailable errors should be retried
104+
if kerrors.IsServiceUnavailable(lastErr) {
105+
return false, nil
106+
}
107+
// AlreadyExists error should not be retried as it's a permanent condition
108+
if kerrors.IsAlreadyExists(lastErr) {
109+
return false, lastErr
110+
}
111+
// Non-retryable error
112+
return false, lastErr
113+
}
114+
return true, nil
91115
})
92116
if err == nil {
93117
return outputs, nil
@@ -98,26 +122,45 @@ func (o *operation) execute(ctx context.Context, bindings apis.Bindings, obj uns
98122
return outputs, err
99123
}
100124

101-
// TODO: could be replaced by checking the already exists error
102125
func (o *operation) tryCreateResource(ctx context.Context, bindings apis.Bindings, obj unstructured.Unstructured) (outputs.Outputs, error) {
103-
var actual unstructured.Unstructured
104-
actual.SetGroupVersionKind(obj.GetObjectKind().GroupVersionKind())
105-
err := o.client.Get(ctx, client.Key(&obj), &actual)
106-
if err == nil {
126+
// First check if the resource exists
127+
key := client.Key(&obj)
128+
var existing unstructured.Unstructured
129+
existing.SetGroupVersionKind(obj.GetObjectKind().GroupVersionKind())
130+
131+
err := o.client.Get(ctx, key, &existing)
132+
if err != nil {
133+
// If there was an error other than NotFound, propagate it
134+
if !kerrors.IsNotFound(err) {
135+
return nil, err
136+
}
137+
138+
// Resource doesn't exist, try to create it
139+
createErr := o.client.Create(ctx, &obj)
140+
if createErr == nil {
141+
// Resource created successfully
142+
if o.cleaner != nil {
143+
o.cleaner.Add(o.client, &obj)
144+
}
145+
return o.handleCheck(ctx, bindings, obj, nil)
146+
}
147+
148+
// Check if the error matches any expectations
149+
if len(o.expect) > 0 {
150+
return o.handleCheck(ctx, bindings, obj, createErr)
151+
}
152+
153+
// If the error is not AlreadyExists, propagate it
154+
if !kerrors.IsAlreadyExists(createErr) {
155+
return nil, createErr
156+
}
157+
158+
// Resource already exists (race condition)
107159
return nil, errors.New("the resource already exists in the cluster")
108160
}
109-
if kerrors.IsNotFound(err) {
110-
return o.createResource(ctx, bindings, obj)
111-
}
112-
return nil, err
113-
}
114161

115-
func (o *operation) createResource(ctx context.Context, bindings apis.Bindings, obj unstructured.Unstructured) (outputs.Outputs, error) {
116-
err := o.client.Create(ctx, &obj)
117-
if err == nil && o.cleaner != nil {
118-
o.cleaner.Add(o.client, &obj)
119-
}
120-
return o.handleCheck(ctx, bindings, obj, err)
162+
// Resource already exists
163+
return nil, errors.New("the resource already exists in the cluster")
121164
}
122165

123166
func (o *operation) handleCheck(ctx context.Context, bindings apis.Bindings, obj unstructured.Unstructured, err error) (_outputs outputs.Outputs, _err error) {

pkg/engine/operations/create/operation_test.go

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/stretchr/testify/assert"
1717
kerrors "k8s.io/apimachinery/pkg/api/errors"
1818
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
19+
"k8s.io/apimachinery/pkg/runtime/schema"
1920
"k8s.io/utils/ptr"
2021
)
2122

@@ -52,6 +53,12 @@ func Test_create(t *testing.T) {
5253
*obj.(*unstructured.Unstructured) = pod
5354
return nil
5455
},
56+
CreateFn: func(_ context.Context, _ int, _ client.Object, _ ...client.CreateOption) error {
57+
return kerrors.NewAlreadyExists(
58+
schema.GroupResource{Group: "", Resource: "pods"},
59+
"test-pod",
60+
)
61+
},
5562
},
5663
expect: nil,
5764
expectedErr: errors.New("the resource already exists in the cluster"),
@@ -63,6 +70,12 @@ func Test_create(t *testing.T) {
6370
*obj.(*unstructured.Unstructured) = pod
6471
return nil
6572
},
73+
CreateFn: func(_ context.Context, _ int, _ client.Object, _ ...client.CreateOption) error {
74+
return kerrors.NewAlreadyExists(
75+
schema.GroupResource{Group: "", Resource: "pods"},
76+
"test-pod",
77+
)
78+
},
6679
},
6780
expect: nil,
6881
expectedErr: errors.New("the resource already exists in the cluster"),
@@ -99,6 +112,9 @@ func Test_create(t *testing.T) {
99112
GetFn: func(ctx context.Context, _ int, _ client.ObjectKey, _ client.Object, opts ...client.GetOption) error {
100113
return errors.New("some arbitrary error")
101114
},
115+
CreateFn: func(_ context.Context, _ int, _ client.Object, _ ...client.CreateOption) error {
116+
return errors.New("unexpected create call")
117+
},
102118
},
103119
expect: nil,
104120
expectedErr: errors.New("some arbitrary error"),
@@ -238,3 +254,170 @@ func Test_create(t *testing.T) {
238254
})
239255
}
240256
}
257+
258+
func Test_retry_logic(t *testing.T) {
259+
pod := unstructured.Unstructured{
260+
Object: map[string]any{
261+
"apiVersion": "v1",
262+
"kind": "Pod",
263+
"metadata": map[string]any{
264+
"name": "test-pod",
265+
},
266+
"spec": map[string]any{
267+
"containers": []any{
268+
map[string]any{
269+
"name": "test-container",
270+
"image": "test-image:v1",
271+
},
272+
},
273+
},
274+
},
275+
}
276+
277+
tests := []struct {
278+
name string
279+
object unstructured.Unstructured
280+
client *tclient.FakeClient
281+
cleaner cleaner.Cleaner
282+
expect []v1alpha1.Expectation
283+
expectedErr error
284+
}{
285+
{
286+
name: "conflict error should be retried and eventually succeed",
287+
object: pod,
288+
client: &tclient.FakeClient{
289+
GetFn: func(ctx context.Context, call int, key client.ObjectKey, obj client.Object, _ ...client.GetOption) error {
290+
return kerrors.NewNotFound(obj.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("pod").GroupResource(), key.Name)
291+
},
292+
CreateFn: func(_ context.Context, call int, _ client.Object, _ ...client.CreateOption) error {
293+
if call < 2 {
294+
return kerrors.NewConflict(
295+
schema.GroupResource{Group: "", Resource: "pods"},
296+
"test-pod",
297+
errors.New("conflict error"),
298+
)
299+
}
300+
return nil
301+
},
302+
},
303+
expect: nil,
304+
expectedErr: nil,
305+
},
306+
{
307+
name: "server timeout error should be retried and eventually succeed",
308+
object: pod,
309+
client: &tclient.FakeClient{
310+
GetFn: func(ctx context.Context, call int, key client.ObjectKey, obj client.Object, _ ...client.GetOption) error {
311+
return kerrors.NewNotFound(obj.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("pod").GroupResource(), key.Name)
312+
},
313+
CreateFn: func(_ context.Context, call int, _ client.Object, _ ...client.CreateOption) error {
314+
if call < 2 {
315+
return kerrors.NewServerTimeout(
316+
schema.GroupResource{Group: "", Resource: "pods"},
317+
"create",
318+
10,
319+
)
320+
}
321+
return nil
322+
},
323+
},
324+
expect: nil,
325+
expectedErr: nil,
326+
},
327+
{
328+
name: "too many requests error should be retried and eventually succeed",
329+
object: pod,
330+
client: &tclient.FakeClient{
331+
GetFn: func(ctx context.Context, call int, key client.ObjectKey, obj client.Object, _ ...client.GetOption) error {
332+
return kerrors.NewNotFound(obj.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("pod").GroupResource(), key.Name)
333+
},
334+
CreateFn: func(_ context.Context, call int, _ client.Object, _ ...client.CreateOption) error {
335+
if call < 2 {
336+
return kerrors.NewTooManyRequests(
337+
"too many requests",
338+
10,
339+
)
340+
}
341+
return nil
342+
},
343+
},
344+
expect: nil,
345+
expectedErr: nil,
346+
},
347+
{
348+
name: "service unavailable error should be retried and eventually succeed",
349+
object: pod,
350+
client: &tclient.FakeClient{
351+
GetFn: func(ctx context.Context, call int, key client.ObjectKey, obj client.Object, _ ...client.GetOption) error {
352+
return kerrors.NewNotFound(obj.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("pod").GroupResource(), key.Name)
353+
},
354+
CreateFn: func(_ context.Context, call int, _ client.Object, _ ...client.CreateOption) error {
355+
if call < 2 {
356+
return kerrors.NewServiceUnavailable("service unavailable")
357+
}
358+
return nil
359+
},
360+
},
361+
expect: nil,
362+
expectedErr: nil,
363+
},
364+
{
365+
name: "already exists error should not be retried",
366+
object: pod,
367+
client: &tclient.FakeClient{
368+
GetFn: func(ctx context.Context, call int, key client.ObjectKey, obj client.Object, _ ...client.GetOption) error {
369+
return kerrors.NewNotFound(obj.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("pod").GroupResource(), key.Name)
370+
},
371+
CreateFn: func(_ context.Context, call int, _ client.Object, _ ...client.CreateOption) error {
372+
return kerrors.NewAlreadyExists(
373+
schema.GroupResource{Group: "", Resource: "pods"},
374+
"test-pod",
375+
)
376+
},
377+
},
378+
expect: nil,
379+
expectedErr: errors.New("the resource already exists in the cluster"),
380+
},
381+
{
382+
name: "permanent error should not be retried",
383+
object: pod,
384+
client: &tclient.FakeClient{
385+
GetFn: func(ctx context.Context, call int, key client.ObjectKey, obj client.Object, _ ...client.GetOption) error {
386+
return kerrors.NewNotFound(obj.GetObjectKind().GroupVersionKind().GroupVersion().WithResource("pod").GroupResource(), key.Name)
387+
},
388+
CreateFn: func(_ context.Context, call int, _ client.Object, _ ...client.CreateOption) error {
389+
return kerrors.NewBadRequest("bad request error")
390+
},
391+
},
392+
expect: nil,
393+
expectedErr: errors.New("bad request error"),
394+
},
395+
}
396+
397+
for _, tt := range tests {
398+
t.Run(tt.name, func(t *testing.T) {
399+
logger := &mocks.Logger{}
400+
ctx := logging.WithLogger(context.TODO(), logger)
401+
toCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
402+
defer cancel()
403+
ctx = toCtx
404+
operation := New(
405+
apis.DefaultCompilers,
406+
tt.client,
407+
tt.object,
408+
nil,
409+
nil,
410+
false,
411+
tt.expect,
412+
nil,
413+
)
414+
outputs, err := operation.Exec(ctx, nil)
415+
assert.Nil(t, outputs)
416+
if tt.expectedErr != nil {
417+
assert.EqualError(t, err, tt.expectedErr.Error())
418+
} else {
419+
assert.NoError(t, err)
420+
}
421+
})
422+
}
423+
}

pkg/engine/operations/patch/operation.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/kyverno/chainsaw/pkg/engine/templating"
1717
"github.com/kyverno/chainsaw/pkg/logging"
1818
"github.com/kyverno/kyverno-json/pkg/core/compilers"
19+
kerrors "k8s.io/apimachinery/pkg/api/errors"
1920
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2021
"k8s.io/apimachinery/pkg/types"
2122
"k8s.io/apimachinery/pkg/util/json"
@@ -83,8 +84,32 @@ func (o *operation) execute(ctx context.Context, bindings apis.Bindings, obj uns
8384
var outputs outputs.Outputs
8485
err := wait.PollUntilContextCancel(ctx, client.PollInterval, false, func(ctx context.Context) (bool, error) {
8586
outputs, lastErr = o.tryPatchResource(ctx, bindings, obj)
86-
// TODO: determine if the error can be retried
87-
return lastErr == nil, nil
87+
// Check if the error is retryable
88+
if lastErr != nil {
89+
// Conflict errors should be retried
90+
if kerrors.IsConflict(lastErr) {
91+
return false, nil
92+
}
93+
// Server timeout errors should be retried
94+
if kerrors.IsServerTimeout(lastErr) {
95+
return false, nil
96+
}
97+
// Too many requests errors should be retried
98+
if kerrors.IsTooManyRequests(lastErr) {
99+
return false, nil
100+
}
101+
// Service unavailable errors should be retried
102+
if kerrors.IsServiceUnavailable(lastErr) {
103+
return false, nil
104+
}
105+
// Resource not found errors should not be retried
106+
if kerrors.IsNotFound(lastErr) {
107+
return false, lastErr
108+
}
109+
// Non-retryable error
110+
return false, lastErr
111+
}
112+
return true, nil
88113
})
89114
if err == nil {
90115
return outputs, nil

0 commit comments

Comments
 (0)