Skip to content

Commit 8079433

Browse files
committed
syz-cluster: better handle SeriesProcessor restarts
If Loop() was restarted between the moment we marked the session as started in the DB and the moment we actually started the workflow, there was no way back to normal operation. That was the reason for the sporadic TestProcessor failures we've seen in the presubmit tests. Handle this case in the code by just continuing the non-finished calls. Closes #5776.
1 parent e347022 commit 8079433

File tree

2 files changed

+10
-3
lines changed

2 files changed

+10
-3
lines changed

syz-cluster/controller/processor.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,16 @@ func (sp *SeriesProcessor) handleSession(ctx context.Context, session *db.Sessio
154154
switch status {
155155
case workflow.StatusNotFound:
156156
log.Printf("scheduling a workflow for %q", session.ID)
157-
if err := sp.sessionRepo.Start(ctx, session.ID); err != nil {
157+
err := sp.sessionRepo.Start(ctx, session.ID)
158+
if err == db.ErrSessionAlreadyStarted {
159+
// It may happen if the service was restarted right between the moment we updated the DB
160+
// and actually started the workflow.
161+
log.Printf("session %q was already marked as started, but there's no actual workflow", session.ID)
162+
} else if err != nil {
158163
app.Errorf("failed to mark session started: %v", err)
159164
break
160165
}
161-
err := sp.workflows.Start(session.ID)
166+
err = sp.workflows.Start(session.ID)
162167
if err != nil {
163168
app.Errorf("failed to start a workflow: %v", err)
164169
}

syz-cluster/controller/processor_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ func TestProcessor(t *testing.T) {
5252

5353
awaitFinishedSessions(t, processor.seriesRepo, 2)
5454

55-
// Restart the loop.
55+
// Emulate the service restart by aborting the loop.
56+
// This may break the execution in arbitrary places, which actually resembles the environment in which the code
57+
// will actually work. The bugs it triggers may be difficult to reproduce though.
5658
cancel()
5759
wg.Wait()
5860

0 commit comments

Comments
 (0)