Skip to content

Commit 9e78e8a

Browse files
authored
Fix erroneous closing of workflow tasks instead of heartbeating in some edge cases (#1295)
1 parent 47a91c8 commit 9e78e8a

4 files changed

Lines changed: 206 additions & 31 deletions

File tree

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Copyright (C) 2022 Temporal Technologies, Inc. All Rights Reserved.
3+
*
4+
* Copyright (C) 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
*
6+
* Modifications copyright (C) 2017 Uber Technologies, Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this material except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package io.temporal.internal;
22+
23+
public class Config {
24+
/** Force new workflow task after workflow task timeout multiplied by this coefficient. */
25+
public static final double WORKFLOW_TAK_HEARTBEAT_COEFFICIENT = 4d / 5d;
26+
}

temporal-sdk/src/main/java/io/temporal/internal/replay/ReplayWorkflowRunTaskHandler.java

Lines changed: 35 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import static io.temporal.internal.common.ProtobufTimeUtils.toJavaDuration;
2424
import static io.temporal.serviceclient.CheckedExceptionWrapper.wrap;
2525

26+
import com.google.common.base.Preconditions;
2627
import com.google.common.base.Throwables;
2728
import com.google.protobuf.util.Durations;
2829
import com.google.protobuf.util.Timestamps;
@@ -36,6 +37,7 @@
3637
import io.temporal.api.query.v1.WorkflowQuery;
3738
import io.temporal.api.query.v1.WorkflowQueryResult;
3839
import io.temporal.api.workflowservice.v1.PollWorkflowTaskQueueResponseOrBuilder;
40+
import io.temporal.internal.Config;
3941
import io.temporal.internal.statemachines.StatesMachinesCallback;
4042
import io.temporal.internal.statemachines.WorkflowStateMachines;
4143
import io.temporal.internal.worker.*;
@@ -61,9 +63,6 @@
6163
*/
6264
class ReplayWorkflowRunTaskHandler implements WorkflowRunTaskHandler {
6365

64-
/** Force new decision task after workflow task timeout multiplied by this coefficient. */
65-
public static final double FORCED_DECISION_TIME_COEFFICIENT = 4d / 5d;
66-
6766
private final WorkflowServiceStubs service;
6867

6968
private final String namespace;
@@ -86,6 +85,7 @@ class ReplayWorkflowRunTaskHandler implements WorkflowRunTaskHandler {
8685
private final WorkflowStateMachines workflowStateMachines;
8786

8887
/** Number of non completed local activity tasks */
88+
// TODO move and maintain this counter inside workflowStateMachines
8989
private int localActivityTaskCount;
9090

9191
private final ReplayWorkflowExecutor replayWorkflowExecutor;
@@ -252,54 +252,58 @@ private void processLocalActivityRequests(long startTimeNs) throws InterruptedEx
252252
long durationUntilWFTHeartbeatNs =
253253
(long)
254254
(Durations.toNanos(startedEvent.getWorkflowTaskTimeout())
255-
* FORCED_DECISION_TIME_COEFFICIENT);
255+
* Config.WORKFLOW_TAK_HEARTBEAT_COEFFICIENT);
256256

257257
long nextWFTHeartbeatTimeNs = startTimeNs + durationUntilWFTHeartbeatNs;
258+
258259
while (true) {
259260
List<ExecuteLocalActivityParameters> laRequests =
260261
workflowStateMachines.takeLocalActivityRequests();
262+
localActivityTaskCount += laRequests.size();
263+
261264
for (ExecuteLocalActivityParameters laRequest : laRequests) {
262265
// TODO(maxim): In the presence of workflow task heartbeat this timeout doesn't make
263266
// much sense. I believe we should add ScheduleToStart timeout for the local activities
264267
// as well.
265-
Duration maxWaitTime = Duration.ofNanos(nextWFTHeartbeatTimeNs - System.nanoTime());
266-
if (maxWaitTime.isNegative()) {
267-
maxWaitTime = Duration.ZERO;
268-
}
268+
long maxWaitTimeNs = Math.max(nextWFTHeartbeatTimeNs - System.nanoTime(), 0);
269269
boolean accepted =
270270
localActivityTaskPoller.apply(
271-
new LocalActivityTask(laRequest, localActivityCompletionSink), maxWaitTime);
272-
localActivityTaskCount++;
273-
if (!accepted) {
274-
throw new IllegalStateException(
275-
"Unable to schedule local activity for execution, no more slots available and local activity task queue is full");
276-
}
271+
new LocalActivityTask(laRequest, localActivityCompletionSink),
272+
Duration.ofNanos(maxWaitTimeNs));
273+
Preconditions.checkState(
274+
accepted,
275+
"Unable to schedule local activity for execution, "
276+
+ "no more slots available and local activity task queue is full");
277277
}
278+
278279
if (localActivityTaskCount == 0) {
279280
// No outstanding local activity requests
280281
break;
281282
}
282-
waitAndProcessLocalActivityCompletion(nextWFTHeartbeatTimeNs);
283-
if (nextWFTHeartbeatTimeNs <= System.nanoTime()) {
283+
284+
long maxWaitTimeTillHeartbeatNs = Math.max(nextWFTHeartbeatTimeNs - System.nanoTime(), 0);
285+
ActivityTaskHandler.Result laCompletion =
286+
localActivityCompletionQueue.poll(maxWaitTimeTillHeartbeatNs, TimeUnit.NANOSECONDS);
287+
if (laCompletion == null) {
288+
// Need to force a new task as we are out of time
284289
break;
285290
}
286-
}
287-
}
288291

289-
private void waitAndProcessLocalActivityCompletion(long nextForcedDecisionTimeNs)
290-
throws InterruptedException {
291-
long maxWaitTimeNs = nextForcedDecisionTimeNs - System.nanoTime();
292-
if (maxWaitTimeNs <= 0) {
293-
return;
294-
}
295-
ActivityTaskHandler.Result laCompletion;
296-
laCompletion = localActivityCompletionQueue.poll(maxWaitTimeNs, TimeUnit.NANOSECONDS);
297-
if (laCompletion == null) {
298-
// Need to force a new task as nextForcedDecisionTime has passed.
299-
return;
292+
localActivityTaskCount--;
293+
workflowStateMachines.handleLocalActivityCompletion(laCompletion);
294+
// handleLocalActivityCompletion triggers eventLoop.
295+
// After this call, there may be new local activity requests available in
296+
// workflowStateMachines.takeLocalActivityRequests()
297+
// These requests need to be processed and accounted for, otherwise we may end up not
298+
// heartbeating and completing workflow task instead. So we have to make another iteration.
300299
}
301-
localActivityTaskCount--;
302-
workflowStateMachines.handleLocalActivityCompletion(laCompletion);
300+
301+
// it's safe to call and discard the result of takeLocalActivityRequests() here, because if it's
302+
// not empty - we are in trouble anyway
303+
Preconditions.checkState(
304+
workflowStateMachines.takeLocalActivityRequests().isEmpty(),
305+
"[BUG] Local activities requests from the last event loop were not drained "
306+
+ "and accounted in the outstanding local activities counter");
303307
}
304308

305309
private class StatesMachinesCallbackImpl implements StatesMachinesCallback {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* Copyright (C) 2022 Temporal Technologies, Inc. All Rights Reserved.
3+
*
4+
* Copyright (C) 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
*
6+
* Modifications copyright (C) 2017 Uber Technologies, Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this material except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package io.temporal.workflow.activityTests;
22+
23+
import static org.junit.Assert.assertEquals;
24+
25+
import io.temporal.client.WorkflowOptions;
26+
import io.temporal.client.WorkflowStub;
27+
import io.temporal.internal.Config;
28+
import io.temporal.internal.Issue;
29+
import io.temporal.testing.internal.SDKTestOptions;
30+
import io.temporal.testing.internal.SDKTestWorkflowRule;
31+
import io.temporal.workflow.Workflow;
32+
import io.temporal.workflow.shared.TestActivities.TestActivitiesImpl;
33+
import io.temporal.workflow.shared.TestActivities.VariousTestActivities;
34+
import io.temporal.workflow.shared.TestWorkflows.TestWorkflow1;
35+
import java.time.Duration;
36+
import java.util.ArrayList;
37+
import java.util.List;
38+
import java.util.concurrent.TimeUnit;
39+
import org.junit.Rule;
40+
import org.junit.Test;
41+
42+
@Issue("https://github.com/temporalio/sdk-java/issues/1262")
43+
public class LocalActivityGettingScheduledRightBeforeWorkflowTaskHeartbeatTest {
44+
private static final Duration WORKFLOW_TASK_TIMEOUT = Duration.ofSeconds(5);
45+
private static final Duration SLEEP_DURATION =
46+
Duration.ofMillis(800); // << 1000 to avoid deadlock detection
47+
48+
private final TestActivitiesImpl activitiesImpl = new TestActivitiesImpl();
49+
50+
@Rule
51+
public SDKTestWorkflowRule testWorkflowRule =
52+
SDKTestWorkflowRule.newBuilder()
53+
.setWorkflowTypes(HeartbeatingWorkflowImpl.class)
54+
.setActivityImplementations(activitiesImpl)
55+
.build();
56+
57+
@Test(timeout = 15_000)
58+
public void testLocalActivitiesWorkflowTaskHeartbeat() {
59+
WorkflowOptions options =
60+
WorkflowOptions.newBuilder()
61+
.setWorkflowRunTimeout(WORKFLOW_TASK_TIMEOUT.multipliedBy(2))
62+
.setWorkflowTaskTimeout(WORKFLOW_TASK_TIMEOUT)
63+
.setTaskQueue(testWorkflowRule.getTaskQueue())
64+
.build();
65+
66+
List<WorkflowStub> stubs = new ArrayList<>();
67+
68+
// this test is actually pretty stable,
69+
// but run several instances to increase the chances of the right timing being hit
70+
for (int i = 0; i < 5; i++) {
71+
TestWorkflow1 workflow =
72+
testWorkflowRule.getWorkflowClient().newWorkflowStub(TestWorkflow1.class, options);
73+
WorkflowStub stub = WorkflowStub.fromTyped(workflow);
74+
stub.start(testWorkflowRule.getTaskQueue());
75+
stubs.add(stub);
76+
}
77+
78+
for (WorkflowStub stub : stubs) {
79+
assertEquals("done", stub.getResult(String.class));
80+
}
81+
}
82+
83+
public static class HeartbeatingWorkflowImpl implements TestWorkflow1 {
84+
@Override
85+
public String execute(String taskQueue) {
86+
VariousTestActivities localActivities =
87+
Workflow.newLocalActivityStub(
88+
VariousTestActivities.class, SDKTestOptions.newLocalActivityOptions());
89+
90+
long firstLocalActivityDurationMs =
91+
(long) (WORKFLOW_TASK_TIMEOUT.toMillis() * Config.WORKFLOW_TAK_HEARTBEAT_COEFFICIENT)
92+
- SLEEP_DURATION.toMillis() / 2;
93+
localActivities.sleepActivity(firstLocalActivityDurationMs, 0);
94+
95+
// It is very important for reproduction that the workflow heartbeat timeout is reached DURING
96+
// this sleep / workflow code execution.
97+
// So the first local activity is done, eventLoop is triggered and heartbeat timeout is
98+
// reached at the end of
99+
// this workflow code event loop call with the next activity scheduled.
100+
try {
101+
Thread.sleep(SLEEP_DURATION.toMillis());
102+
} catch (InterruptedException e) {
103+
Thread.currentThread().interrupt();
104+
throw new RuntimeException(e);
105+
}
106+
107+
localActivities.sleepActivity(TimeUnit.SECONDS.toMillis(1), 0);
108+
109+
return "done";
110+
}
111+
}
112+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright (C) 2022 Temporal Technologies, Inc. All Rights Reserved.
3+
*
4+
* Copyright (C) 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
*
6+
* Modifications copyright (C) 2017 Uber Technologies, Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this material except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package io.temporal.internal;
22+
23+
/**
24+
* Test annotated with {@link Issue} covers a specific problem or edge case and is usually crafted
25+
* carefully to hit the right conditions to reproduce the problem. Please specify a link to a ticket
26+
* that describes what this test intends to cover.
27+
*/
28+
public @interface Issue {
29+
/**
30+
* @return Link to a ticket that describes what this test is intended to cover.
31+
*/
32+
String value();
33+
}

0 commit comments

Comments
 (0)