From 75d9fb60fa5838434fbf9731ef082e5a565358b9 Mon Sep 17 00:00:00 2001 From: Sascha Grunert Date: Thu, 4 Dec 2025 09:51:19 +0100 Subject: [PATCH] Fix race condition when re-triggering GitHub Actions When a GitHub Action is re-triggered, GitHub temporarily removes the old check status before the new run starts. This causes a race where Tide may merge the PR during the brief window when the check is missing. This fix tracks previously seen contexts per PR (not per commit, so it works across force pushes). When a required context disappears, it's treated as PENDING to prevent premature merging. The context history is automatically pruned each sync to prevent memory leaks, and duplicates are avoided when a context is both missing and disappeared. Signed-off-by: Sascha Grunert --- pkg/tide/status.go | 66 +++++++++++++- pkg/tide/tide.go | 69 ++++++++++++--- pkg/tide/tide_test.go | 195 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 318 insertions(+), 12 deletions(-) diff --git a/pkg/tide/status.go b/pkg/tide/status.go index 17901811e4..df36bfdc73 100644 --- a/pkg/tide/status.go +++ b/pkg/tide/status.go @@ -62,6 +62,66 @@ type storedState struct { PreviousQuery string } +// contextHistory tracks previously seen contexts for a PR/commit combination +// to detect when required contexts disappear (e.g., when GitHub Actions are re-triggered) +type contextHistory struct { + prCommitContexts map[string]sets.Set[string] + sync.Mutex +} + +func newContextHistory() *contextHistory { + return &contextHistory{ + prCommitContexts: make(map[string]sets.Set[string]), + } +} + +// checkAndUpdate returns contexts that disappeared since last check, then updates history. +// This detects the race condition when GitHub Actions are re-triggered and the check +// status temporarily disappears before the new run starts. +// +// Note: Uses PR number only (not commit SHA) to track across force pushes. This means +// if a force push genuinely removes a context from the workflow, it will be detected +// as "disappeared" until the next sync. This is acceptable as it errs on the side of +// preventing premature merges. +func (ch *contextHistory) checkAndUpdate(pr *CodeReviewCommon, currentContexts []string) []string { + key := fmt.Sprintf("%s/%s#%d", pr.Org, pr.Repo, pr.Number) + + ch.Lock() + defer ch.Unlock() + + var disappeared []string + if previous, exists := ch.prCommitContexts[key]; exists { + currentSet := sets.New[string](currentContexts...) + for ctx := range previous.Difference(currentSet) { + disappeared = append(disappeared, ctx) + } + } + + ch.prCommitContexts[key] = sets.New[string](currentContexts...) + return disappeared +} + +// prune removes entries for PRs that are no longer in the pool. +// This prevents unbounded memory growth by cleaning up old/merged PRs. +func (ch *contextHistory) prune(activePRs map[string]CodeReviewCommon) { + ch.Lock() + defer ch.Unlock() + + // Build set of active PR keys + active := sets.New[string]() + for _, pr := range activePRs { + key := fmt.Sprintf("%s/%s#%d", pr.Org, pr.Repo, pr.Number) + active.Insert(key) + } + + // Remove entries not in active set + for key := range ch.prCommitContexts { + if !active.Has(key) { + delete(ch.prCommitContexts, key) + } + } +} + // statusController is a goroutine runs in the background type statusController struct { pjClient ctrlruntimeclient.Client @@ -106,6 +166,9 @@ type statusUpdate struct { // newPoolPending is a size 1 chan that signals that the main Tide loop has // updated the 'poolPRs' field with a freshly updated pool. newPoolPending chan bool + // contextHistory tracks previously seen contexts to detect disappearing contexts + // when GitHub Actions are re-triggered (addresses issue #337) + contextHistory *contextHistory } func (sc *statusController) shutdown() { @@ -238,7 +301,8 @@ func requirementDiff(pr *PullRequest, q *config.TideQuery, cc contextChecker) (s log := logrus.WithFields(pr.logFields()) for _, commit := range pr.Commits.Nodes { if commit.Commit.OID == pr.HeadRefOID { - for _, ctx := range unsuccessfulContexts(append(commit.Commit.Status.Contexts, checkRunNodesToContexts(log, commit.Commit.StatusCheckRollup.Contexts.Nodes)...), cc, log) { + crc := CodeReviewCommonFromPullRequest(pr) + for _, ctx := range unsuccessfulContexts(append(commit.Commit.Status.Contexts, checkRunNodesToContexts(log, commit.Commit.StatusCheckRollup.Contexts.Nodes)...), cc, crc, nil, log) { contexts = append(contexts, string(ctx.Context)) } } diff --git a/pkg/tide/tide.go b/pkg/tide/tide.go index c4e9744120..5c30d85224 100644 --- a/pkg/tide/tide.go +++ b/pkg/tide/tide.go @@ -406,6 +406,7 @@ func NewController( statusUpdate := &statusUpdate{ dontUpdateStatus: &threadSafePRSet{}, newPoolPending: make(chan bool), + contextHistory: newContextHistory(), } sc, err := newStatusController(ctx, logger, ghcStatus, mgr, gc, cfg, opener, statusURI, mergeChecker, usesGitHubAppsAuth, statusUpdate) @@ -582,6 +583,10 @@ func (c *syncController) Sync() error { c.statusUpdate.poolPRs = poolPRMap(filteredPools) c.statusUpdate.baseSHAs = baseSHAMap(filteredPools) c.statusUpdate.requiredContexts = requiredContextsMap(filteredPools) + // Prune old PR entries from context history to prevent memory leak + if c.statusUpdate.contextHistory != nil { + c.statusUpdate.contextHistory.prune(c.statusUpdate.poolPRs) + } select { case c.statusUpdate.newPoolPending <- true: c.statusUpdate.dontUpdateStatus.reset() @@ -675,7 +680,11 @@ func (c *syncController) filterSubpools(mergeAllowed func(*CodeReviewCommon) (st return } key := poolKey(sp.org, sp.repo, sp.branch) - if spFiltered := filterSubpool(c.provider, mergeAllowed, sp); spFiltered != nil { + var ch *contextHistory + if c.statusUpdate != nil { + ch = c.statusUpdate.contextHistory + } + if spFiltered := filterSubpool(c.provider, mergeAllowed, sp, ch); spFiltered != nil { sp.log.WithField("key", key).WithField("pool", spFiltered).Debug("filtered sub-pool") lock.Lock() @@ -727,10 +736,10 @@ func (c *syncController) initSubpoolData(sp *subpool) error { // should be deleted. // // This function works for any source code provider. -func filterSubpool(provider provider, mergeAllowed func(*CodeReviewCommon) (string, error), sp *subpool) *subpool { +func filterSubpool(provider provider, mergeAllowed func(*CodeReviewCommon) (string, error), sp *subpool, ch *contextHistory) *subpool { var toKeep []CodeReviewCommon for _, pr := range sp.prs { - if !filterPR(provider, mergeAllowed, sp, &pr) { + if !filterPR(provider, mergeAllowed, sp, &pr, ch) { toKeep = append(toKeep, pr) } } @@ -752,7 +761,7 @@ func filterSubpool(provider provider, mergeAllowed func(*CodeReviewCommon) (stri // retesting them.) // // This function works for any source code provider. -func filterPR(provider provider, mergeAllowed func(*CodeReviewCommon) (string, error), sp *subpool, pr *CodeReviewCommon) bool { +func filterPR(provider provider, mergeAllowed func(*CodeReviewCommon) (string, error), sp *subpool, pr *CodeReviewCommon, ch *contextHistory) bool { log := sp.log.WithFields(pr.logFields()) // Skip PRs that are known to be unmergeable. if reason, err := mergeAllowed(pr); err != nil { @@ -778,7 +787,7 @@ func filterPR(provider provider, mergeAllowed func(*CodeReviewCommon) (string, e } return false } - for _, ctx := range unsuccessfulContexts(contexts, sp.cc[pr.Number], log) { + for _, ctx := range unsuccessfulContexts(contexts, sp.cc[pr.Number], pr, ch, log) { if ctx.State != githubql.StatusStatePending { log.WithField("context", ctx.Context).Debug("filtering out PR as unsuccessful context is not pending") return true @@ -853,7 +862,11 @@ func (c *syncController) isPassingTests(log *logrus.Entry, pr *CodeReviewCommon, // If we can't get the status of the commit, assume that it is failing. return false } - unsuccessful := unsuccessfulContexts(contexts, cc, log) + var ch *contextHistory + if c.statusUpdate != nil { + ch = c.statusUpdate.contextHistory + } + unsuccessful := unsuccessfulContexts(contexts, cc, pr, ch, log) return len(unsuccessful) == 0 } @@ -862,8 +875,12 @@ func (c *syncController) isPassingTests(log *logrus.Entry, pr *CodeReviewCommon, // If the branchProtection is set to only check for required checks, we will skip // all non-required tests. If required tests are missing from the list, they will be // added to the list of failed contexts. -func unsuccessfulContexts(contexts []Context, cc contextChecker, log *logrus.Entry) []Context { +// It also detects when required contexts disappear (e.g., when GitHub Actions are re-triggered) +// and treats them as PENDING to prevent premature merging (addresses issue #337). +func unsuccessfulContexts(contexts []Context, cc contextChecker, pr *CodeReviewCommon, ch *contextHistory, log *logrus.Entry) []Context { var failed []Context + contextNames := contextsToStrings(contexts) + for _, ctx := range contexts { if string(ctx.Context) == statusContext { continue @@ -875,13 +892,45 @@ func unsuccessfulContexts(contexts []Context, cc contextChecker, log *logrus.Ent failed = append(failed, ctx) } } - for _, c := range cc.MissingRequiredContexts(contextsToStrings(contexts)) { - failed = append(failed, newExpectedContext(c)) + + // Track which contexts are added as failed to avoid duplicates + failedSet := sets.New[string]() + for _, ctx := range failed { + failedSet.Insert(string(ctx.Context)) + } + + // Add missing required contexts + for _, c := range cc.MissingRequiredContexts(contextNames) { + if !failedSet.Has(c) { + failed = append(failed, newExpectedContext(c)) + failedSet.Insert(c) + } + } + + // Check for disappeared contexts (race condition fix) + // When a GitHub Action is re-triggered, the check may temporarily disappear + // from the status before the new run starts. Detect this and treat disappeared + // required contexts as non-passing to prevent premature merging. + if ch != nil && pr != nil { + disappeared := ch.checkAndUpdate(pr, contextNames) + if len(disappeared) > 0 { + log.WithField("disappeared_contexts", disappeared). + Info("Detected disappeared contexts - likely due to re-triggered checks") + } + for _, c := range disappeared { + if !cc.IsOptional(c) && !failedSet.Has(c) { + failed = append(failed, Context{ + Context: githubql.String(c), + State: githubql.StatusStatePending, + Description: githubql.String("Context disappeared - likely re-triggered"), + }) + } + } } log.WithFields(logrus.Fields{ "total_context_count": len(contexts), - "context_names": contextsToStrings(contexts), + "context_names": contextNames, "failed_context_count": len(failed), "failed_context_names": contextsToStrings(failed), }).Debug("Filtered out failed contexts") diff --git a/pkg/tide/tide_test.go b/pkg/tide/tide_test.go index d114dca551..4bceb8e666 100644 --- a/pkg/tide/tide_test.go +++ b/pkg/tide/tide_test.go @@ -2908,7 +2908,7 @@ func TestFilterSubpool(t *testing.T) { mergeChecker: mmc, logger: logrus.WithContext(context.Background()), } - filtered := filterSubpool(provider, mmc.isAllowedToMerge, sp) + filtered := filterSubpool(provider, mmc.isAllowedToMerge, sp, newContextHistory()) if len(tc.expectedPRs) == 0 { if filtered != nil { t.Fatalf("Expected subpool to be pruned, but got: %v", filtered) @@ -5482,3 +5482,196 @@ func TestSerialRetestingConsidersPRThatIsCurrentlyBeingSRetested(t *testing.T) { } } + +// TestUnsuccessfulContextsWithDisappearedContexts tests the fix for issue #337 +// where re-triggering GitHub Actions causes contexts to temporarily disappear +func TestUnsuccessfulContextsWithDisappearedContexts(t *testing.T) { + log := logrus.NewEntry(logrus.StandardLogger()) + + testCases := []struct { + name string + contexts []Context + previousContexts []string + requiredContexts []string + expectedFailed int + }{ + { + name: "no disappeared contexts", + contexts: []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + {Context: githubql.String("test-2"), State: githubql.StatusStateSuccess}, + }, + previousContexts: []string{"test-1", "test-2"}, + requiredContexts: []string{"test-1", "test-2"}, + expectedFailed: 0, + }, + { + name: "context disappeared - race condition", + contexts: []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + }, + previousContexts: []string{"test-1", "test-2"}, + requiredContexts: []string{"test-1", "test-2"}, + expectedFailed: 1, // test-2 is both missing and disappeared, counted once + }, + { + name: "unknown context disappeared", + contexts: []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + }, + previousContexts: []string{"test-1", "test-2"}, + requiredContexts: []string{"test-1"}, + // test-2 is unknown (not in required/optional), treated as required + // by default unless SkipUnknownContexts is set + expectedFailed: 1, + }, + { + name: "first time seeing PR - no history", + contexts: []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + }, + previousContexts: nil, + requiredContexts: []string{"test-1", "test-2"}, + expectedFailed: 1, // only missing required, no disappeared + }, + { + name: "context reappeared after disappearing", + contexts: []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + {Context: githubql.String("test-2"), State: githubql.StatusStatePending}, + }, + previousContexts: []string{"test-1"}, + requiredContexts: []string{"test-1", "test-2"}, + expectedFailed: 1, // test-2 is pending (not disappeared, it's present) + }, + { + name: "context disappears multiple times", + contexts: []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + }, + previousContexts: []string{"test-1", "test-2"}, // test-2 was seen before + requiredContexts: []string{"test-1", "test-2"}, + expectedFailed: 1, // test-2 disappeared, blocks merge + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ch := newContextHistory() + pr := &CodeReviewCommon{ + Org: "test-org", + Repo: "test-repo", + Number: 123, + HeadRefOID: "abc123", + } + + // Set up previous context history if provided + if tc.previousContexts != nil { + ch.checkAndUpdate(pr, tc.previousContexts) + } + + cc := &config.TideContextPolicy{ + RequiredContexts: tc.requiredContexts, + } + + failed := unsuccessfulContexts(tc.contexts, cc, pr, ch, log) + + if len(failed) != tc.expectedFailed { + t.Errorf("expected %d failed contexts, got %d: %+v", tc.expectedFailed, len(failed), failed) + } + }) + } +} + +// TestContextDisappearsMultipleTimes tests the scenario where a context +// appears, disappears, reappears, then disappears again +func TestContextDisappearsMultipleTimes(t *testing.T) { + log := logrus.NewEntry(logrus.StandardLogger()) + ch := newContextHistory() + pr := &CodeReviewCommon{ + Org: "test-org", + Repo: "test-repo", + Number: 123, + HeadRefOID: "abc123", + } + cc := &config.TideContextPolicy{ + RequiredContexts: []string{"test-1", "test-2"}, + } + + // T0: Both contexts present and successful + contexts1 := []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + {Context: githubql.String("test-2"), State: githubql.StatusStateSuccess}, + } + failed1 := unsuccessfulContexts(contexts1, cc, pr, ch, log) + if len(failed1) != 0 { + t.Errorf("T0: expected 0 failed, got %d", len(failed1)) + } + + // T1: test-2 disappears (re-triggered) + contexts2 := []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + } + failed2 := unsuccessfulContexts(contexts2, cc, pr, ch, log) + if len(failed2) != 1 { + t.Errorf("T1: expected 1 failed (disappeared test-2), got %d", len(failed2)) + } + + // T2: test-2 reappears as SUCCESS + contexts3 := []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + {Context: githubql.String("test-2"), State: githubql.StatusStateSuccess}, + } + failed3 := unsuccessfulContexts(contexts3, cc, pr, ch, log) + if len(failed3) != 0 { + t.Errorf("T2: expected 0 failed, got %d", len(failed3)) + } + + // T3: test-2 disappears AGAIN (re-triggered again) + contexts4 := []Context{ + {Context: githubql.String("test-1"), State: githubql.StatusStateSuccess}, + } + failed4 := unsuccessfulContexts(contexts4, cc, pr, ch, log) + if len(failed4) != 1 { + t.Errorf("T3: expected 1 failed (disappeared test-2 again), got %d", len(failed4)) + } +} + +// TestContextHistoryPrune tests that old PR entries are cleaned up +func TestContextHistoryPrune(t *testing.T) { + ch := newContextHistory() + + pr1 := &CodeReviewCommon{Org: "org", Repo: "repo", Number: 1, HeadRefOID: "sha1"} + pr2 := &CodeReviewCommon{Org: "org", Repo: "repo", Number: 2, HeadRefOID: "sha2"} + pr3 := &CodeReviewCommon{Org: "org", Repo: "repo", Number: 3, HeadRefOID: "sha3"} + + // Record contexts for multiple PRs + ch.checkAndUpdate(pr1, []string{"test-1", "test-2"}) + ch.checkAndUpdate(pr2, []string{"test-1", "test-2"}) + ch.checkAndUpdate(pr3, []string{"test-1", "test-2"}) + + if len(ch.prCommitContexts) != 3 { + t.Errorf("expected 3 entries, got %d", len(ch.prCommitContexts)) + } + + // Only PR 1 and 2 are still active + activePRs := map[string]CodeReviewCommon{ + "key1": *pr1, + "key2": *pr2, + } + ch.prune(activePRs) + + if len(ch.prCommitContexts) != 2 { + t.Errorf("expected 2 entries after prune, got %d", len(ch.prCommitContexts)) + } + + if !ch.prCommitContexts["org/repo#1"].Has("test-1") { + t.Error("PR 1 should still be tracked") + } + if !ch.prCommitContexts["org/repo#2"].Has("test-1") { + t.Error("PR 2 should still be tracked") + } + if _, exists := ch.prCommitContexts["org/repo#3"]; exists { + t.Error("PR 3 should have been pruned") + } +}