Skip to content

Commit 50dfd74

Browse files
Add support for lease stealing (#78)
Fixes #4 Signed-off-by: Connor McKelvey <[email protected]> Signed-off-by: Ali Hobbs <[email protected]> Co-authored-by: Ali Hobbs <[email protected]> Co-authored-by: Ali Hobbs <[email protected]>
1 parent 9ceea63 commit 50dfd74

18 files changed

+1233
-50
lines changed

HyperMake

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ targets:
88
rebuild-toolchain:
99
description: build toolchain image
1010
watches:
11-
- support/docker/toolchain
12-
build: support/docker/toolchain
11+
- support/toolchain/docker
12+
build: support/toolchain/docker
1313

1414
toolchain:
1515
description: placeholder for additional toolchain dependencies

clientlibrary/checkpoint/checkpointer.go

+14
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,13 @@ const (
4040
LeaseTimeoutKey = "LeaseTimeout"
4141
SequenceNumberKey = "Checkpoint"
4242
ParentShardIdKey = "ParentShardId"
43+
ClaimRequestKey = "ClaimRequest"
4344

4445
// We've completely processed all records in this shard.
4546
ShardEnd = "SHARD_END"
47+
48+
// ErrShardClaimed is returned when shard is claimed
49+
ErrShardClaimed = "Shard is already claimed by another node"
4650
)
4751

4852
type ErrLeaseNotAcquired struct {
@@ -72,7 +76,17 @@ type Checkpointer interface {
7276

7377
// RemoveLeaseOwner to remove lease owner for the shard entry to make the shard available for reassignment
7478
RemoveLeaseOwner(string) error
79+
80+
// New Lease Stealing Methods
81+
// ListActiveWorkers returns active workers and their shards
82+
ListActiveWorkers(map[string]*par.ShardStatus) (map[string][]*par.ShardStatus, error)
83+
84+
// ClaimShard claims a shard for stealing
85+
ClaimShard(*par.ShardStatus, string) error
7586
}
7687

7788
// ErrSequenceIDNotFound is returned by FetchCheckpoint when no SequenceID is found
7889
var ErrSequenceIDNotFound = errors.New("SequenceIDNotFoundForShard")
90+
91+
// ErrShardNotAssigned is returned by ListActiveWorkers when no AssignedTo is found
92+
var ErrShardNotAssigned = errors.New("AssignedToNotFoundForShard")

clientlibrary/checkpoint/dynamodb-checkpointer.go

+186-6
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
package checkpoint
2929

3030
import (
31+
"errors"
32+
"fmt"
3133
"time"
3234

3335
"github.com/aws/aws-sdk-go/aws"
@@ -61,6 +63,7 @@ type DynamoCheckpoint struct {
6163
svc dynamodbiface.DynamoDBAPI
6264
kclConfig *config.KinesisClientLibConfiguration
6365
Retries int
66+
lastLeaseSync time.Time
6467
}
6568

6669
func NewDynamoCheckpoint(kclConfig *config.KinesisClientLibConfiguration) *DynamoCheckpoint {
@@ -124,8 +127,22 @@ func (checkpointer *DynamoCheckpoint) GetLease(shard *par.ShardStatus, newAssign
124127
return err
125128
}
126129

130+
isClaimRequestExpired := shard.IsClaimRequestExpired(checkpointer.kclConfig)
131+
132+
var claimRequest string
133+
if checkpointer.kclConfig.EnableLeaseStealing {
134+
if currentCheckpointClaimRequest, ok := currentCheckpoint[ClaimRequestKey]; ok && currentCheckpointClaimRequest.S != nil {
135+
claimRequest = *currentCheckpointClaimRequest.S
136+
if newAssignTo != claimRequest && !isClaimRequestExpired {
137+
checkpointer.log.Debugf("another worker: %s has a claim on this shard. Not going to renew the lease", claimRequest)
138+
return errors.New(ErrShardClaimed)
139+
}
140+
}
141+
}
142+
127143
assignedVar, assignedToOk := currentCheckpoint[LeaseOwnerKey]
128144
leaseVar, leaseTimeoutOk := currentCheckpoint[LeaseTimeoutKey]
145+
129146
var conditionalExpression string
130147
var expressionAttributeValues map[string]*dynamodb.AttributeValue
131148

@@ -140,8 +157,14 @@ func (checkpointer *DynamoCheckpoint) GetLease(shard *par.ShardStatus, newAssign
140157
return err
141158
}
142159

143-
if time.Now().UTC().Before(currentLeaseTimeout) && assignedTo != newAssignTo {
144-
return ErrLeaseNotAcquired{"current lease timeout not yet expired"}
160+
if checkpointer.kclConfig.EnableLeaseStealing {
161+
if time.Now().UTC().Before(currentLeaseTimeout) && assignedTo != newAssignTo && !isClaimRequestExpired {
162+
return ErrLeaseNotAcquired{"current lease timeout not yet expired"}
163+
}
164+
} else {
165+
if time.Now().UTC().Before(currentLeaseTimeout) && assignedTo != newAssignTo {
166+
return ErrLeaseNotAcquired{"current lease timeout not yet expired"}
167+
}
145168
}
146169

147170
checkpointer.log.Debugf("Attempting to get a lock for shard: %s, leaseTimeout: %s, assignedTo: %s, newAssignedTo: %s", shard.ID, currentLeaseTimeout, assignedTo, newAssignTo)
@@ -175,9 +198,21 @@ func (checkpointer *DynamoCheckpoint) GetLease(shard *par.ShardStatus, newAssign
175198
marshalledCheckpoint[ParentShardIdKey] = &dynamodb.AttributeValue{S: aws.String(shard.ParentShardId)}
176199
}
177200

178-
if shard.GetCheckpoint() != "" {
201+
if checkpoint := shard.GetCheckpoint(); checkpoint != "" {
179202
marshalledCheckpoint[SequenceNumberKey] = &dynamodb.AttributeValue{
180-
S: aws.String(shard.GetCheckpoint()),
203+
S: aws.String(checkpoint),
204+
}
205+
}
206+
207+
if checkpointer.kclConfig.EnableLeaseStealing {
208+
if claimRequest != "" && claimRequest == newAssignTo && !isClaimRequestExpired {
209+
if expressionAttributeValues == nil {
210+
expressionAttributeValues = make(map[string]*dynamodb.AttributeValue)
211+
}
212+
conditionalExpression = conditionalExpression + " AND ClaimRequest = :claim_request"
213+
expressionAttributeValues[":claim_request"] = &dynamodb.AttributeValue{
214+
S: &claimRequest,
215+
}
181216
}
182217
}
183218

@@ -199,7 +234,7 @@ func (checkpointer *DynamoCheckpoint) GetLease(shard *par.ShardStatus, newAssign
199234

200235
// CheckpointSequence writes a checkpoint at the designated sequence ID
201236
func (checkpointer *DynamoCheckpoint) CheckpointSequence(shard *par.ShardStatus) error {
202-
leaseTimeout := shard.LeaseTimeout.UTC().Format(time.RFC3339)
237+
leaseTimeout := shard.GetLeaseTimeout().UTC().Format(time.RFC3339)
203238
marshalledCheckpoint := map[string]*dynamodb.AttributeValue{
204239
LeaseKeyKey: {
205240
S: aws.String(shard.ID),
@@ -208,7 +243,7 @@ func (checkpointer *DynamoCheckpoint) CheckpointSequence(shard *par.ShardStatus)
208243
S: aws.String(shard.GetCheckpoint()),
209244
},
210245
LeaseOwnerKey: {
211-
S: aws.String(shard.AssignedTo),
246+
S: aws.String(shard.GetLeaseOwner()),
212247
},
213248
LeaseTimeoutKey: {
214249
S: aws.String(leaseTimeout),
@@ -239,6 +274,16 @@ func (checkpointer *DynamoCheckpoint) FetchCheckpoint(shard *par.ShardStatus) er
239274
if assignedTo, ok := checkpoint[LeaseOwnerKey]; ok {
240275
shard.SetLeaseOwner(aws.StringValue(assignedTo.S))
241276
}
277+
278+
// Use up-to-date leaseTimeout to avoid ConditionalCheckFailedException when claiming
279+
if leaseTimeout, ok := checkpoint[LeaseTimeoutKey]; ok && leaseTimeout.S != nil {
280+
currentLeaseTimeout, err := time.Parse(time.RFC3339, aws.StringValue(leaseTimeout.S))
281+
if err != nil {
282+
return err
283+
}
284+
shard.LeaseTimeout = currentLeaseTimeout
285+
}
286+
242287
return nil
243288
}
244289

@@ -265,13 +310,148 @@ func (checkpointer *DynamoCheckpoint) RemoveLeaseOwner(shardID string) error {
265310
},
266311
},
267312
UpdateExpression: aws.String("remove " + LeaseOwnerKey),
313+
ExpressionAttributeValues: map[string]*dynamodb.AttributeValue{
314+
":assigned_to": {
315+
S: aws.String(checkpointer.kclConfig.WorkerID),
316+
},
317+
},
318+
ConditionExpression: aws.String("AssignedTo = :assigned_to"),
268319
}
269320

270321
_, err := checkpointer.svc.UpdateItem(input)
271322

272323
return err
273324
}
274325

326+
// ListActiveWorkers returns a map of workers and their shards
327+
func (checkpointer *DynamoCheckpoint) ListActiveWorkers(shardStatus map[string]*par.ShardStatus) (map[string][]*par.ShardStatus, error) {
328+
err := checkpointer.syncLeases(shardStatus)
329+
if err != nil {
330+
return nil, err
331+
}
332+
333+
workers := map[string][]*par.ShardStatus{}
334+
for _, shard := range shardStatus {
335+
if shard.GetCheckpoint() == ShardEnd {
336+
continue
337+
}
338+
339+
leaseOwner := shard.GetLeaseOwner()
340+
if leaseOwner == "" {
341+
checkpointer.log.Debugf("Shard Not Assigned Error. ShardID: %s, WorkerID: %s", shard.ID, checkpointer.kclConfig.WorkerID)
342+
return nil, ErrShardNotAssigned
343+
}
344+
if w, ok := workers[leaseOwner]; ok {
345+
workers[leaseOwner] = append(w, shard)
346+
} else {
347+
workers[leaseOwner] = []*par.ShardStatus{shard}
348+
}
349+
}
350+
return workers, nil
351+
}
352+
353+
// ClaimShard places a claim request on a shard to signal a steal attempt
354+
func (checkpointer *DynamoCheckpoint) ClaimShard(shard *par.ShardStatus, claimID string) error {
355+
err := checkpointer.FetchCheckpoint(shard)
356+
if err != nil && err != ErrSequenceIDNotFound {
357+
return err
358+
}
359+
leaseTimeoutString := shard.GetLeaseTimeout().Format(time.RFC3339)
360+
361+
conditionalExpression := `ShardID = :id AND LeaseTimeout = :lease_timeout AND attribute_not_exists(ClaimRequest)`
362+
expressionAttributeValues := map[string]*dynamodb.AttributeValue{
363+
":id": {
364+
S: aws.String(shard.ID),
365+
},
366+
":lease_timeout": {
367+
S: aws.String(leaseTimeoutString),
368+
},
369+
}
370+
371+
marshalledCheckpoint := map[string]*dynamodb.AttributeValue{
372+
LeaseKeyKey: {
373+
S: &shard.ID,
374+
},
375+
LeaseTimeoutKey: {
376+
S: &leaseTimeoutString,
377+
},
378+
SequenceNumberKey: {
379+
S: &shard.Checkpoint,
380+
},
381+
ClaimRequestKey: {
382+
S: &claimID,
383+
},
384+
}
385+
386+
if leaseOwner := shard.GetLeaseOwner(); leaseOwner == "" {
387+
conditionalExpression += " AND attribute_not_exists(AssignedTo)"
388+
} else {
389+
marshalledCheckpoint[LeaseOwnerKey] = &dynamodb.AttributeValue{S: &leaseOwner}
390+
conditionalExpression += "AND AssignedTo = :assigned_to"
391+
expressionAttributeValues[":assigned_to"] = &dynamodb.AttributeValue{S: &leaseOwner}
392+
}
393+
394+
if checkpoint := shard.GetCheckpoint(); checkpoint == "" {
395+
conditionalExpression += " AND attribute_not_exists(Checkpoint)"
396+
} else if checkpoint == ShardEnd {
397+
conditionalExpression += " AND Checkpoint <> :checkpoint"
398+
expressionAttributeValues[":checkpoint"] = &dynamodb.AttributeValue{S: aws.String(ShardEnd)}
399+
} else {
400+
conditionalExpression += " AND Checkpoint = :checkpoint"
401+
expressionAttributeValues[":checkpoint"] = &dynamodb.AttributeValue{S: &checkpoint}
402+
}
403+
404+
if shard.ParentShardId == "" {
405+
conditionalExpression += " AND attribute_not_exists(ParentShardId)"
406+
} else {
407+
marshalledCheckpoint[ParentShardIdKey] = &dynamodb.AttributeValue{S: aws.String(shard.ParentShardId)}
408+
conditionalExpression += " AND ParentShardId = :parent_shard"
409+
expressionAttributeValues[":parent_shard"] = &dynamodb.AttributeValue{S: &shard.ParentShardId}
410+
}
411+
412+
return checkpointer.conditionalUpdate(conditionalExpression, expressionAttributeValues, marshalledCheckpoint)
413+
}
414+
415+
func (checkpointer *DynamoCheckpoint) syncLeases(shardStatus map[string]*par.ShardStatus) error {
416+
log := checkpointer.kclConfig.Logger
417+
418+
if (checkpointer.lastLeaseSync.Add(time.Duration(checkpointer.kclConfig.LeaseSyncingTimeIntervalMillis) * time.Millisecond)).After(time.Now()) {
419+
return nil
420+
}
421+
422+
checkpointer.lastLeaseSync = time.Now()
423+
input := &dynamodb.ScanInput{
424+
ProjectionExpression: aws.String(fmt.Sprintf("%s,%s,%s", LeaseKeyKey, LeaseOwnerKey, SequenceNumberKey)),
425+
Select: aws.String("SPECIFIC_ATTRIBUTES"),
426+
TableName: aws.String(checkpointer.kclConfig.TableName),
427+
}
428+
429+
err := checkpointer.svc.ScanPages(input,
430+
func(pages *dynamodb.ScanOutput, lastPage bool) bool {
431+
results := pages.Items
432+
for _, result := range results {
433+
shardId, foundShardId := result[LeaseKeyKey]
434+
assignedTo, foundAssignedTo := result[LeaseOwnerKey]
435+
checkpoint, foundCheckpoint := result[SequenceNumberKey]
436+
if !foundShardId || !foundAssignedTo || !foundCheckpoint {
437+
continue
438+
}
439+
if shard, ok := shardStatus[aws.StringValue(shardId.S)]; ok {
440+
shard.SetLeaseOwner(aws.StringValue(assignedTo.S))
441+
shard.SetCheckpoint(aws.StringValue(checkpoint.S))
442+
}
443+
}
444+
return !lastPage
445+
})
446+
447+
if err != nil {
448+
log.Debugf("Error performing SyncLeases. Error: %+v ", err)
449+
return err
450+
}
451+
log.Debugf("Lease sync completed. Next lease sync will occur in %s", time.Duration(checkpointer.kclConfig.LeaseSyncingTimeIntervalMillis)*time.Millisecond)
452+
return nil
453+
}
454+
275455
func (checkpointer *DynamoCheckpoint) createTable() error {
276456
input := &dynamodb.CreateTableInput{
277457
AttributeDefinitions: []*dynamodb.AttributeDefinition{

0 commit comments

Comments
 (0)