Skip to content

Commit 9da6a4e

Browse files
authored
Merge pull request #214 from rq-dbrady/rq-dbrady/redisLockSweeperRaceconditionFixes
Fix Issue: Ensure proper locking in WorkflowSweeper to prevent race conditions
2 parents 7e877cb + 42a708f commit 9da6a4e

File tree

4 files changed: 21 additions and 43 deletions.

core/src/main/java/com/netflix/conductor/core/execution/WorkflowExecutor.java

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -105,12 +105,6 @@ void restart(String workflowId, boolean useLatestDefinitions)
105105
*/
106106
WorkflowModel decide(String workflowId);
107107

108-
/**
109-
* @param workflow workflow to be evaluated
110-
* @return updated workflow
111-
*/
112-
WorkflowModel decideWithLock(WorkflowModel workflow);
113-
114108
/**
115109
* @param workflowId id of the workflow to be terminated
116110
* @param reason termination reason to be recorded

core/src/main/java/com/netflix/conductor/core/execution/WorkflowExecutorOps.java

Lines changed: 0 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1039,33 +1039,6 @@ public WorkflowModel decide(String workflowId) {
10391039
}
10401040
}
10411041

1042-
/**
1043-
* This method overloads the {@link #decide(String)}. It will acquire a lock and evaluate the
1044-
* state of the workflow.
1045-
*
1046-
* @param workflow the workflow to evaluate the state for
1047-
* @return the workflow
1048-
*/
1049-
@Override
1050-
public WorkflowModel decideWithLock(WorkflowModel workflow) {
1051-
if (workflow == null) {
1052-
return null;
1053-
}
1054-
StopWatch watch = new StopWatch();
1055-
watch.start();
1056-
if (!executionLockService.acquireLock(workflow.getWorkflowId())) {
1057-
return null;
1058-
}
1059-
try {
1060-
return decide(workflow);
1061-
1062-
} finally {
1063-
executionLockService.releaseLock(workflow.getWorkflowId());
1064-
watch.stop();
1065-
Monitors.recordWorkflowDecisionTime(watch.getTime());
1066-
}
1067-
}
1068-
10691042
/**
10701043
* @param workflow the workflow to evaluate the state for
10711044
* @return true if the workflow has completed (success or failed), false otherwise. Note: This

core/src/main/java/com/netflix/conductor/core/reconciliation/WorkflowSweeper.java

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@
3535
import com.netflix.conductor.model.TaskModel;
3636
import com.netflix.conductor.model.TaskModel.Status;
3737
import com.netflix.conductor.model.WorkflowModel;
38+
import com.netflix.conductor.service.ExecutionLockService;
3839

3940
import static com.netflix.conductor.core.config.SchedulerConfiguration.SWEEPER_EXECUTOR_NAME;
4041
import static com.netflix.conductor.core.utils.Utils.DECIDER_QUEUE;
@@ -49,6 +50,7 @@ public class WorkflowSweeper {
4950
private final WorkflowRepairService workflowRepairService;
5051
private final QueueDAO queueDAO;
5152
private final ExecutionDAOFacade executionDAOFacade;
53+
private final ExecutionLockService executionLockService;
5254

5355
private static final String CLASS_NAME = WorkflowSweeper.class.getSimpleName();
5456

@@ -57,12 +59,14 @@ public WorkflowSweeper(
5759
Optional<WorkflowRepairService> workflowRepairService,
5860
ConductorProperties properties,
5961
QueueDAO queueDAO,
60-
ExecutionDAOFacade executionDAOFacade) {
62+
ExecutionDAOFacade executionDAOFacade,
63+
ExecutionLockService executionLockService) {
6164
this.properties = properties;
6265
this.queueDAO = queueDAO;
6366
this.workflowExecutor = workflowExecutor;
6467
this.executionDAOFacade = executionDAOFacade;
6568
this.workflowRepairService = workflowRepairService.orElse(null);
69+
this.executionLockService = executionLockService;
6670
LOGGER.info("WorkflowSweeper initialized.");
6771
}
6872

@@ -73,25 +77,26 @@ public CompletableFuture<Void> sweepAsync(String workflowId) {
7377
}
7478

7579
public void sweep(String workflowId) {
80+
WorkflowContext workflowContext = new WorkflowContext(properties.getAppId());
81+
WorkflowContext.set(workflowContext);
7682
WorkflowModel workflow = null;
7783
try {
78-
WorkflowContext workflowContext = new WorkflowContext(properties.getAppId());
79-
WorkflowContext.set(workflowContext);
80-
LOGGER.debug("Running sweeper for workflow {}", workflowId);
81-
84+
if (!executionLockService.acquireLock(workflowId)) {
85+
return;
86+
}
8287
workflow = executionDAOFacade.getWorkflowModel(workflowId, true);
83-
88+
LOGGER.debug("Running sweeper for workflow {}", workflowId);
8489
if (workflowRepairService != null) {
8590
// Verify and repair tasks in the workflow.
8691
workflowRepairService.verifyAndRepairWorkflowTasks(workflow);
8792
}
88-
89-
workflow = workflowExecutor.decideWithLock(workflow);
93+
long decideStartTime = System.currentTimeMillis();
94+
workflow = workflowExecutor.decide(workflow.getWorkflowId());
95+
Monitors.recordWorkflowDecisionTime(System.currentTimeMillis() - decideStartTime);
9096
if (workflow != null && workflow.getStatus().isTerminal()) {
9197
queueDAO.remove(DECIDER_QUEUE, workflowId);
9298
return;
9399
}
94-
95100
} catch (NotFoundException nfe) {
96101
queueDAO.remove(DECIDER_QUEUE, workflowId);
97102
LOGGER.info(
@@ -100,6 +105,8 @@ public void sweep(String workflowId) {
100105
} catch (Exception e) {
101106
Monitors.error(CLASS_NAME, "sweep");
102107
LOGGER.error("Error running sweep for " + workflowId, e);
108+
} finally {
109+
executionLockService.releaseLock(workflowId);
103110
}
104111
long workflowOffsetTimeout =
105112
workflowOffsetWithJitter(properties.getWorkflowOffsetTimeout().getSeconds());

core/src/test/java/com/netflix/conductor/core/reconciliation/TestWorkflowSweeper.java

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@
2929
import com.netflix.conductor.model.TaskModel;
3030
import com.netflix.conductor.model.TaskModel.Status;
3131
import com.netflix.conductor.model.WorkflowModel;
32+
import com.netflix.conductor.service.ExecutionLockService;
3233

3334
import static com.netflix.conductor.core.utils.Utils.DECIDER_QUEUE;
3435

@@ -45,6 +46,7 @@ public class TestWorkflowSweeper {
4546
private QueueDAO queueDAO;
4647
private ExecutionDAOFacade executionDAOFacade;
4748
private WorkflowSweeper workflowSweeper;
49+
private ExecutionLockService executionLockService;
4850

4951
private int defaultPostPoneOffSetSeconds = 1800;
5052

@@ -55,13 +57,15 @@ public void setUp() {
5557
queueDAO = mock(QueueDAO.class);
5658
workflowRepairService = mock(WorkflowRepairService.class);
5759
executionDAOFacade = mock(ExecutionDAOFacade.class);
60+
executionLockService = mock(ExecutionLockService.class);
5861
workflowSweeper =
5962
new WorkflowSweeper(
6063
workflowExecutor,
6164
Optional.of(workflowRepairService),
6265
properties,
6366
queueDAO,
64-
executionDAOFacade);
67+
executionDAOFacade,
68+
executionLockService);
6569
}
6670

6771
@Test

0 commit comments

Comments
 (0)