Skip to content

Commit 1bf1225

Browse files
chore: Use structured errors in logger (#2136)
1 parent fe016f3 commit 1bf1225

File tree

28 files changed

+101
-76
lines changed

28 files changed

+101
-76
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ go 1.24.2
55
require (
66
github.com/Pallinder/go-randomdata v1.2.0
77
github.com/avast/retry-go v3.0.0+incompatible
8-
github.com/awslabs/operatorpkg v0.0.0-20250320000002-b05af0f15c68
8+
github.com/awslabs/operatorpkg v0.0.0-20250415205441-a0c5d2f39e72
99
github.com/docker/docker v28.0.4+incompatible
1010
github.com/go-logr/logr v1.4.2
1111
github.com/imdario/mergo v0.3.16

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00
22
github.com/Pallinder/go-randomdata v1.2.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y=
33
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
44
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
5-
github.com/awslabs/operatorpkg v0.0.0-20250320000002-b05af0f15c68 h1:llLoYu7EeqtFrCGCJzzXIyDxvCwn/Zr+aX+sRyabXgw=
6-
github.com/awslabs/operatorpkg v0.0.0-20250320000002-b05af0f15c68/go.mod h1:Uu2TsiIC3jUXRxMiDXOsiz3ZuBLTsCj1j4B858r51bs=
5+
github.com/awslabs/operatorpkg v0.0.0-20250415205441-a0c5d2f39e72 h1:L6TgkVk04OH465tCZsdcPBE69yiIOcMRHwRJz3kltqY=
6+
github.com/awslabs/operatorpkg v0.0.0-20250415205441-a0c5d2f39e72/go.mod h1:83w70kHuAH/QyhMnKzql71/fs3kSOiTJq+DU64xKq6M=
77
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
88
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
99
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=

kwok/cloudprovider/cloudprovider.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"strings"
2525
"time"
2626

27+
"github.com/awslabs/operatorpkg/serrors"
2728
"github.com/awslabs/operatorpkg/status"
2829
"github.com/docker/docker/pkg/namesgenerator"
2930
"github.com/samber/lo"
@@ -160,7 +161,7 @@ func (c CloudProvider) getInstanceType(instanceTypeName string) (*cloudprovider.
160161
return it.Name == instanceTypeName
161162
})
162163
if !found {
163-
return nil, fmt.Errorf("unable to find instance type %q", instanceTypeName)
164+
return nil, serrors.Wrap(fmt.Errorf("unable to find instance type"), "instance-type", instanceTypeName)
164165
}
165166
return it, nil
166167
}
@@ -184,7 +185,7 @@ func (c CloudProvider) toNode(nodeClaim *v1.NodeClaim) (*corev1.Node, error) {
184185
for _, val := range req.Values {
185186
it, err := c.getInstanceType(val)
186187
if err != nil {
187-
return nil, fmt.Errorf("instance type %s not found", val)
188+
return nil, serrors.Wrap(fmt.Errorf("instance type not found"), "instance-type", val)
188189
}
189190

190191
availableOfferings := it.Offerings.Available().Compatible(requirements)

pkg/apis/v1/labels.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"fmt"
2121
"strings"
2222

23+
"github.com/awslabs/operatorpkg/serrors"
2324
v1 "k8s.io/api/core/v1"
2425
"k8s.io/apimachinery/pkg/runtime/schema"
2526
"k8s.io/apimachinery/pkg/util/sets"
@@ -111,7 +112,7 @@ func IsRestrictedLabel(key string) error {
111112
return nil
112113
}
113114
if IsRestrictedNodeLabel(key) {
114-
return fmt.Errorf("label %s is restricted; specify a well known label: %v, or a custom label that does not use a restricted domain: %v", key, sets.List(WellKnownLabels), sets.List(RestrictedLabelDomains))
115+
return serrors.Wrap(fmt.Errorf("label is restricted; specify a well known label or a custom label that does not use a restricted domain"), "label", key, "well-known-labels", sets.List(WellKnownLabels), "restricted-labels", sets.List(RestrictedLabelDomains))
115116
}
116117
return nil
117118
}

pkg/apis/v1/nodepool.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"math"
2222
"strconv"
2323

24+
"github.com/awslabs/operatorpkg/serrors"
2425
"github.com/mitchellh/hashstructure/v2"
2526
"github.com/robfig/cron/v3"
2627
"github.com/samber/lo"
@@ -146,7 +147,7 @@ func (l Limits) ExceededBy(resources v1.ResourceList) error {
146147
for resourceName, usage := range resources {
147148
if limit, ok := l[resourceName]; ok {
148149
if usage.Cmp(limit) > 0 {
149-
return fmt.Errorf("%s resource usage of %v exceeds limit of %v", resourceName, usage.AsDec(), limit.AsDec())
150+
return serrors.Wrap(fmt.Errorf("resource usage exceeds limit"), "resource-name", resourceName, "usage", usage.AsDec(), "limit", limit.AsDec())
150151
}
151152
}
152153
}
@@ -358,7 +359,7 @@ func (in *Budget) IsActive(c clock.Clock) (bool, error) {
358359
if err != nil {
359360
// Should only occur if there's a discrepancy
360361
// with the validation regex and the cron package.
361-
return false, fmt.Errorf("invariant violated, invalid cron %s", schedule)
362+
return false, serrors.Wrap(fmt.Errorf("invariant violated, invalid cron"), "cron", schedule)
362363
}
363364
// Walk back in time for the duration associated with the schedule
364365
checkPoint := c.Now().UTC().Add(-lo.FromPtr(in.Duration).Duration)

pkg/cloudprovider/fake/cloudprovider.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"sync"
2525
"time"
2626

27+
"github.com/awslabs/operatorpkg/serrors"
2728
"github.com/awslabs/operatorpkg/status"
2829
"github.com/samber/lo"
2930
corev1 "k8s.io/api/core/v1"
@@ -204,7 +205,7 @@ func (c *CloudProvider) Get(_ context.Context, id string) (*v1.NodeClaim, error)
204205
if nodeClaim, ok := c.CreatedNodeClaims[id]; ok {
205206
return nodeClaim.DeepCopy(), nil
206207
}
207-
return nil, cloudprovider.NewNodeClaimNotFoundError(fmt.Errorf("no nodeclaim exists with id '%s'", id))
208+
return nil, cloudprovider.NewNodeClaimNotFoundError(serrors.Wrap(fmt.Errorf("no nodeclaim exists with id"), "id", id))
208209
}
209210

210211
func (c *CloudProvider) List(_ context.Context) ([]*v1.NodeClaim, error) {
@@ -284,7 +285,7 @@ func (c *CloudProvider) Delete(_ context.Context, nc *v1.NodeClaim) error {
284285
delete(c.CreatedNodeClaims, nc.Status.ProviderID)
285286
return nil
286287
}
287-
return cloudprovider.NewNodeClaimNotFoundError(fmt.Errorf("no nodeclaim exists with provider id '%s'", nc.Status.ProviderID))
288+
return cloudprovider.NewNodeClaimNotFoundError(serrors.Wrap(fmt.Errorf("no nodeclaim exists with provider id"), "provider-id", nc.Status.ProviderID))
288289
}
289290

290291
func (c *CloudProvider) IsDrifted(context.Context, *v1.NodeClaim) (cloudprovider.DriftReason, error) {

pkg/cloudprovider/types.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"sync"
2626
"time"
2727

28+
"github.com/awslabs/operatorpkg/serrors"
2829
"github.com/awslabs/operatorpkg/status"
2930
"github.com/samber/lo"
3031
corev1 "k8s.io/api/core/v1"
@@ -214,7 +215,7 @@ func (its InstanceTypes) SatisfiesMinValues(requirements scheduling.Requirements
214215
}
215216
}
216217
if incompatibleKey != "" {
217-
return len(its), fmt.Errorf("minValues requirement is not met for %q", incompatibleKey)
218+
return len(its), serrors.Wrap(fmt.Errorf("minValues requirement is not met for label"), "label", incompatibleKey)
218219
}
219220
return len(its), nil
220221
}

pkg/controllers/disruption/consolidation.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"sort"
2424
"time"
2525

26+
"github.com/awslabs/operatorpkg/serrors"
2627
"github.com/samber/lo"
2728
corev1 "k8s.io/api/core/v1"
2829
"k8s.io/utils/clock"
@@ -318,7 +319,7 @@ func getCandidatePrices(candidates []*Candidate) (float64, error) {
318319
if reqs.Get(v1.CapacityTypeLabelKey).Has(v1.CapacityTypeReserved) {
319320
return 0.0, nil
320321
}
321-
return 0.0, fmt.Errorf("unable to determine offering for %s/%s/%s", c.instanceType.Name, c.capacityType, c.zone)
322+
return 0.0, serrors.Wrap(fmt.Errorf("unable to determine offering"), "instance-type", c.instanceType.Name, "capacity-type", c.capacityType, "zone", c.zone)
322323
}
323324
price += compatibleOfferings.Cheapest().Price
324325
}

pkg/controllers/disruption/controller.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"sync"
2626
"time"
2727

28+
"github.com/awslabs/operatorpkg/serrors"
2829
"github.com/awslabs/operatorpkg/singleton"
2930
"github.com/samber/lo"
3031
"go.uber.org/multierr"
@@ -131,13 +132,13 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) {
131132
if errors.IsConflict(err) {
132133
return reconcile.Result{Requeue: true}, nil
133134
}
134-
return reconcile.Result{}, fmt.Errorf("removing taint %s from nodes, %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err)
135+
return reconcile.Result{}, serrors.Wrap(fmt.Errorf("removing taint from nodes, %w", err), "taint", pretty.Taint(v1.DisruptedNoScheduleTaint))
135136
}
136137
if err := state.ClearNodeClaimsCondition(ctx, c.kubeClient, v1.ConditionTypeDisruptionReason, outdatedNodes...); err != nil {
137138
if errors.IsConflict(err) {
138139
return reconcile.Result{Requeue: true}, nil
139140
}
140-
return reconcile.Result{}, fmt.Errorf("removing %s condition from nodeclaims, %w", v1.ConditionTypeDisruptionReason, err)
141+
return reconcile.Result{}, serrors.Wrap(fmt.Errorf("removing condition from nodeclaims, %w", err), "condition", v1.ConditionTypeDisruptionReason)
141142
}
142143

143144
// Attempt different disruption methods. We'll only let one method perform an action
@@ -148,7 +149,7 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) {
148149
if errors.IsConflict(err) {
149150
return reconcile.Result{Requeue: true}, nil
150151
}
151-
return reconcile.Result{}, fmt.Errorf("disrupting via reason=%q, %w", strings.ToLower(string(m.Reason())), err)
152+
return reconcile.Result{}, serrors.Wrap(fmt.Errorf("disrupting, %w", err), strings.ToLower(string(m.Reason())), "reason")
152153
}
153154
if success {
154155
return reconcile.Result{RequeueAfter: singleton.RequeueImmediately}, nil
@@ -206,7 +207,7 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command,
206207

207208
// Cordon the old nodes before we launch the replacements to prevent new pods from scheduling to the old nodes
208209
if err := c.MarkDisrupted(ctx, m, cmd.candidates...); err != nil {
209-
return fmt.Errorf("marking disrupted (command-id: %s), %w", commandID, err)
210+
return serrors.Wrap(fmt.Errorf("marking disrupted, %w", err), "command-id", commandID)
210211
}
211212

212213
var nodeClaimNames []string
@@ -215,7 +216,7 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command,
215216
if nodeClaimNames, err = c.createReplacementNodeClaims(ctx, m, cmd); err != nil {
216217
// If we failed to launch the replacement, don't disrupt. If this is some permanent failure,
217218
// we don't want to disrupt workloads with no way to provision new nodes for them.
218-
return fmt.Errorf("launching replacement nodeclaim (command-id: %s), %w", commandID, err)
219+
return serrors.Wrap(fmt.Errorf("launching replacement nodeclaim, %w", err), "command-id", commandID)
219220
}
220221
}
221222

@@ -234,7 +235,7 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command,
234235
if err := c.queue.Add(orchestration.NewCommand(nodeClaimNames, statenodes, commandID, m.Reason(), m.ConsolidationType())); err != nil {
235236
providerIDs := lo.Map(cmd.candidates, func(c *Candidate, _ int) string { return c.ProviderID() })
236237
c.cluster.UnmarkForDeletion(providerIDs...)
237-
return fmt.Errorf("adding command to queue (command-id: %s), %w", commandID, err)
238+
return serrors.Wrap(fmt.Errorf("adding command to queue, %w", err), "command-id", commandID)
238239
}
239240

240241
// An action is only performed and pods/nodes are only disrupted after a successful add to the queue
@@ -254,7 +255,7 @@ func (c *Controller) createReplacementNodeClaims(ctx context.Context, m Method,
254255
}
255256
if len(nodeClaimNames) != len(cmd.replacements) {
256257
// shouldn't ever occur since a partially failed CreateNodeClaims should return an error
257-
return nil, fmt.Errorf("expected %d replacements, got %d", len(cmd.replacements), len(nodeClaimNames))
258+
return nil, serrors.Wrap(fmt.Errorf("expected replacement count did not equal actual replacement count"), "expected-count", len(cmd.replacements), "actual-count", len(nodeClaimNames))
258259
}
259260
return nodeClaimNames, nil
260261
}
@@ -264,7 +265,7 @@ func (c *Controller) MarkDisrupted(ctx context.Context, m Method, candidates ...
264265
return c.StateNode
265266
})
266267
if err := state.RequireNoScheduleTaint(ctx, c.kubeClient, true, stateNodes...); err != nil {
267-
return fmt.Errorf("tainting nodes with %s: %w", pretty.Taint(v1.DisruptedNoScheduleTaint), err)
268+
return serrors.Wrap(fmt.Errorf("tainting nodes, %w", err), "taint", pretty.Taint(v1.DisruptedNoScheduleTaint))
268269
}
269270

270271
providerIDs := lo.Map(candidates, func(c *Candidate, _ int) string { return c.ProviderID() })

pkg/controllers/disruption/orchestration/queue.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"sync"
2525
"time"
2626

27+
"github.com/awslabs/operatorpkg/serrors"
2728
"github.com/awslabs/operatorpkg/singleton"
2829
"github.com/samber/lo"
2930
"go.uber.org/multierr"
@@ -243,7 +244,7 @@ func (q *Queue) Reconcile(ctx context.Context) (reconcile.Result, error) {
243244
// nolint:gocyclo
244245
func (q *Queue) waitOrTerminate(ctx context.Context, cmd *Command) error {
245246
if q.clock.Since(cmd.timeAdded) > maxRetryDuration {
246-
return NewUnrecoverableError(fmt.Errorf("command reached timeout after %s", q.clock.Since(cmd.timeAdded)))
247+
return NewUnrecoverableError(serrors.Wrap(fmt.Errorf("command reached timeout"), "duration", q.clock.Since(cmd.timeAdded)))
247248
}
248249
waitErrs := make([]error, len(cmd.Replacements))
249250
for i := range cmd.Replacements {
@@ -269,7 +270,7 @@ func (q *Queue) waitOrTerminate(ctx context.Context, cmd *Command) error {
269270
initializedStatus := nodeClaim.StatusConditions().Get(v1.ConditionTypeInitialized)
270271
if !initializedStatus.IsTrue() {
271272
q.recorder.Publish(disruptionevents.WaitingOnReadiness(nodeClaim))
272-
waitErrs[i] = fmt.Errorf("nodeclaim %s not initialized", nodeClaim.Name)
273+
waitErrs[i] = serrors.Wrap(fmt.Errorf("nodeclaim not initialized"), "NodeClaim", klog.KRef("", nodeClaim.Name))
273274
continue
274275
}
275276
cmd.Replacements[i].Initialized = true

0 commit comments

Comments
 (0)