Skip to content

Commit 0c7a45c

Browse files
authored
Merge pull request #1254 from k8s-infra-cherrypick-robot/cherry-pick-1253-to-release-0.6
[release-0.6] Fixed node being taken repeatedly causing queue block
2 parents 1cbeae7 + cb4cad6 commit 0c7a45c

File tree

1 file changed

+18
-11
lines changed

1 file changed

+18
-11
lines changed

pkg/kwok/controllers/node_lease_controller.go

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ type NodeLeaseController struct {
5050
mutateLeaseFunc func(*coordinationv1.Lease) error
5151

5252
delayQueue queue.WeightDelayingQueue[string]
53-
holdLeaseSet maps.SyncMap[string, struct{}]
53+
holdLeaseSet maps.SyncMap[string, bool]
5454

5555
holderIdentity string
5656
onNodeManagedFunc func(nodeName string)
@@ -112,14 +112,14 @@ func (c *NodeLeaseController) syncWorker(ctx context.Context) {
112112
if !ok {
113113
return
114114
}
115-
_, ok = c.holdLeaseSet.Load(nodeName)
115+
first, ok := c.holdLeaseSet.Load(nodeName)
116116
if !ok {
117117
continue
118118
}
119119

120120
dur := c.interval()
121121

122-
lease, err := c.sync(ctx, nodeName)
122+
lease, err := c.sync(ctx, nodeName, first)
123123
if err != nil {
124124
logger.Error("Failed to sync lease", err,
125125
"node", nodeName,
@@ -134,6 +134,10 @@ func (c *NodeLeaseController) syncWorker(ctx context.Context) {
134134
continue
135135
}
136136

137+
if first {
138+
c.holdLeaseSet.Store(nodeName, false)
139+
}
140+
137141
now := c.clock.Now()
138142
expireDuration := expireTime.Sub(now)
139143
hold := tryAcquireOrRenew(lease, c.holderIdentity, now)
@@ -148,7 +152,7 @@ func (c *NodeLeaseController) interval() time.Duration {
148152

149153
// TryHold tries to hold a lease for the NodeLeaseController
150154
func (c *NodeLeaseController) TryHold(name string) {
151-
_, loaded := c.holdLeaseSet.LoadOrStore(name, struct{}{})
155+
_, loaded := c.holdLeaseSet.LoadOrStore(name, true)
152156
if !loaded {
153157
c.delayQueue.Add(name)
154158
}
@@ -171,7 +175,7 @@ func (c *NodeLeaseController) Held(name string) bool {
171175
}
172176

173177
// sync syncs a lease for a node
174-
func (c *NodeLeaseController) sync(ctx context.Context, nodeName string) (lease *coordinationv1.Lease, err error) {
178+
func (c *NodeLeaseController) sync(ctx context.Context, nodeName string, first bool) (lease *coordinationv1.Lease, err error) {
175179
logger := log.FromContext(ctx)
176180
logger = logger.With("node", nodeName)
177181

@@ -182,12 +186,15 @@ func (c *NodeLeaseController) sync(ctx context.Context, nodeName string) (lease
182186
return nil, nil
183187
}
184188
logger.Info("Syncing lease")
185-
lease, err := c.renewLease(ctx, lease)
189+
lease, transitions, err := c.renewLease(ctx, lease)
186190
if err != nil {
187191
return nil, fmt.Errorf("failed to update lease using lease: %w", err)
188192
}
189193

190-
c.onNodeManaged(nodeName)
194+
// it is first or it has been transitioned, and then manage the node.
195+
if first || transitions {
196+
c.onNodeManaged(nodeName)
197+
}
191198
return lease, nil
192199
}
193200

@@ -249,7 +256,7 @@ func (c *NodeLeaseController) ensureLease(ctx context.Context, leaseName string)
249256
}
250257

251258
// renewLease attempts to update the lease for maxUpdateRetries, call this once you're sure the lease has been created
252-
func (c *NodeLeaseController) renewLease(ctx context.Context, base *coordinationv1.Lease) (*coordinationv1.Lease, error) {
259+
func (c *NodeLeaseController) renewLease(ctx context.Context, base *coordinationv1.Lease) (*coordinationv1.Lease, bool, error) {
253260
lease := base.DeepCopy()
254261

255262
transitions := format.ElemOrDefault(lease.Spec.HolderIdentity) != c.holderIdentity
@@ -263,15 +270,15 @@ func (c *NodeLeaseController) renewLease(ctx context.Context, base *coordination
263270
if c.mutateLeaseFunc != nil {
264271
err := c.mutateLeaseFunc(lease)
265272
if err != nil {
266-
return nil, err
273+
return nil, false, err
267274
}
268275
}
269276

270277
lease, err := c.typedClient.CoordinationV1().Leases(lease.Namespace).Update(ctx, lease, metav1.UpdateOptions{})
271278
if err != nil {
272-
return nil, err
279+
return nil, false, err
273280
}
274-
return lease, nil
281+
return lease, transitions, nil
275282
}
276283

277284
// setNodeOwnerFunc helps construct a mutateLeaseFunc which sets a node OwnerReference to the given lease object

0 commit comments

Comments
 (0)