@@ -299,7 +299,64 @@ func (h *nosqlHistoryStore) ForkHistoryBranch(
299
299
return resp , nil
300
300
}
301
301
302
- // DeleteHistoryBranch removes a branch
302
+ // DeleteHistoryBranch removes a branch. This is responsible for:
303
+ //
304
+ // - Deleting entries from the history_node table
305
+ // - Deleting the tree entry in the history_tree table
306
+ // - Handling the problem of garbage-collecting branches which may mutually rely on history
307
+ //
308
+ // For normal workflows, with only a single branch, this is straightfoward, it'll just do a
309
+ // delete on the history_node and history_tree tables. However, in cases where history is
310
+ // branched, things are slightly more complex. History branches in at least two ways:
311
+ // - During the conflict resolution handling of events during failover (where each
312
+ // region is represented as a history branch at the point where they were operating concurrently.
313
+ // - At the point of a workflow reset, where history is 'forked' and the previous
314
+ // workflow events are discarded, but the original workflow run's history is referenced.
315
+ //
316
+ // For workflows with forked history, they'll reference any nodes they rely on ancestors in the history_tree
317
+ // table. These references are exclusive, ie, the following is true:
318
+ //
319
+ // --------------+--------------------------------------
320
+ // tree_id | X
321
+ // branch_id | B
322
+ // ancestors | [{branch_id: A, end_node_id: 4}]
323
+ // --------------+--------------------------------------
324
+ // tree_id | X
325
+ // branch_id | A
326
+ // ancestors | null
327
+ //
328
+ // Will have nodes arranged like this
329
+ //
330
+ // ┌────────────┐
331
+ // │ Branch: A │
332
+ // │ Node: 1 │
333
+ // └─────┬──────┘
334
+ // │
335
+ // ┌─────▼──────┐
336
+ // │ Branch: A │
337
+ // │ Node: 2 │
338
+ // └─────┬──────┘
339
+ // │
340
+ // ┌─────▼──────┐
341
+ // │ Branch: A │
342
+ // │ Node: 3 ┼────────────┐
343
+ // └─────┬──────┘ │
344
+ // │ │
345
+ // ┌─────▼──────┐ ┌──────▼─────┐
346
+ // │ Branch: A │ │ Branch: B │
347
+ // │ Node: 4 │ │ Node: 4 │
348
+ // └────────────┘ └────────────┘
349
+ //
350
+ // The method of cleanup for each branch therefore, is to just remove the nodes that are not in use,
351
+ // nearly-always starting from the oldest branch (the original run) and, later in subsequent invocations,
352
+ // the branches relying on this. These cleanups are done by a call to the lower persistence layer
353
+ // with HistoryNodeFilter references specifying the minimum nodes to be cleaned up (inclusive).
354
+ //
355
+ // While workflowIDs being enforced to be held by a single run to execute at any one time
356
+ // generally ensures that any ancestor is cleaned up first, there's a couple of exceptions
357
+ // to keep in mind. Firstly, deletion jitter makes it near-certain that there'll be some overlap and therefore
358
+ // no guarantee that the oldest branch gets cleaned up first, as well as timer tasks are likely to retry and therefore
359
+ // end up being out of order anyway. So any cleanup needs to be able to handle these cases.
303
360
func (h * nosqlHistoryStore ) DeleteHistoryBranch (
304
361
ctx context.Context ,
305
362
request * persistence.InternalDeleteHistoryBranchRequest ,
@@ -318,6 +375,7 @@ func (h *nosqlHistoryStore) DeleteHistoryBranch(
318
375
TreeID : treeID ,
319
376
ShardID : & request .ShardID ,
320
377
})
378
+
321
379
if err != nil {
322
380
return err
323
381
}
@@ -327,13 +385,33 @@ func (h *nosqlHistoryStore) DeleteHistoryBranch(
327
385
TreeID : treeID ,
328
386
BranchID : & branch .BranchID ,
329
387
}
388
+
389
+ // This is the selection of history_nodes that will be removed
390
+ // at this point it's not safe to delete any nodes, because we don't know
391
+ // if this branch has other branches that've taken a dependency on it's nodes
392
+ // so we start with an empty filter
330
393
var nodeFilters []* nosqlplugin.HistoryNodeFilter
331
394
332
- // validBRsMaxEndNode is to know each branch range that is being used, we want to know what is the max nodeID referred by other valid branch
395
+ // ... and then look at all the active / good history branches and see
396
+ // what they're referencing. The idea being, to trim any nodes that
397
+ // might not in use by going and finding the topmost nodes that are currently
398
+ // referenced, and trimming those.
399
+ //
400
+ // validBRsMaxEndNode represents each branch range that is being used,
401
+ // we want to know what is the max nodeID referred by other valid branches
333
402
validBRsMaxEndNode := persistenceutils .GetBranchesMaxReferredNodeIDs (rsp .Branches )
334
403
404
+ // Todo (david.porter) handle the case of the history nodes being left over
405
+ // in the last branch (see unit test TestDeleteHistoryBranch_usedBranchWithGarbageFullyCleanedUp).
406
+ //
407
+ // Todo (david.porter) handle the case of out of order deletions where the
408
+ // ancestor branch is still 'valid' or being deleted after, and this
409
+ // deletion shouldn't necessarily render it invalid. (see test skipped)
410
+
335
411
// for each branch range to delete, we iterate from bottom to up, and delete up to the point according to validBRsEndNode
412
+ // brsToDelete here includes both the current branch being operated on and its ancestors
336
413
for i := len (brsToDelete ) - 1 ; i >= 0 ; i -- {
414
+
337
415
br := brsToDelete [i ]
338
416
maxReferredEndNodeID , ok := validBRsMaxEndNode [br .BranchID ]
339
417
if ok {
0 commit comments