Skip to content

Commit fb072b3

Browse files
committed
Fix credential refresh race during worker activation
1 parent 5407fbe commit fb072b3

2 files changed

Lines changed: 53 additions & 6 deletions

File tree

controlplane/configstore/store.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -854,16 +854,17 @@ func (cs *ConfigStore) RetireOrphanWorker(workerID int, reason string) (bool, er
854854
// and pre-migration rows that existed before this column was introduced
855855
// (these get refreshed eagerly so we converge to the new state).
856856
//
857-
// Only active states are considered: retired/lost/draining(-out) rows
858-
// don't need creds. We include `idle` deliberately — an idle worker that
859-
// belongs to an org (post-activation) still has live creds in DuckDB and
860-
// will need them on the next session.
857+
// Only already-activated states are considered: retired/lost/draining rows
858+
// don't need creds, and reserved/activating rows must not be refreshed because
859+
// their first ActivateTenant RPC may still be in flight. Refreshing them would
860+
// bump owner_epoch underneath the activation path and make the original
861+
// owner_epoch look stale to the worker. We include `idle` deliberately — an
862+
// idle worker that belongs to an org (post-activation) still has live creds in
863+
// DuckDB and will need them on the next session.
861864
func (cs *ConfigStore) ListWorkersDueForCredentialRefresh(ownerCPInstanceID string, cutoff time.Time) ([]WorkerRecord, error) {
862865
var workers []WorkerRecord
863866
credEligibleStates := []WorkerState{
864867
WorkerStateIdle,
865-
WorkerStateReserved,
866-
WorkerStateActivating,
867868
WorkerStateHot,
868869
WorkerStateHotIdle,
869870
}

tests/configstore/runtime_store_postgres_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,6 +1005,52 @@ func TestListWorkersDueForCredentialRefreshTreatsNullAsDue(t *testing.T) {
10051005
}
10061006
}
10071007

1008+
// TestListWorkersDueForCredentialRefreshSkipsReservedAndActivatingRows
1009+
// protects the initial activation path. Reserved/activating rows are org-bound
1010+
// before the first ActivateTenant RPC has finished and before the activation
1011+
// path stamps s3_credentials_expires_at. If the refresh scheduler treats those
1012+
// NULL-expiry rows as due, it can bump owner_epoch under the in-flight
1013+
// activation and cause the worker to reject the original owner_epoch as stale.
1014+
func TestListWorkersDueForCredentialRefreshSkipsReservedAndActivatingRows(t *testing.T) {
1015+
store := newIsolatedConfigStore(t)
1016+
now := time.Date(2026, time.April, 30, 12, 0, 0, 0, time.UTC)
1017+
pastDue := now.Add(-1 * time.Minute)
1018+
1019+
rows := []*configstore.WorkerRecord{
1020+
{
1021+
WorkerID: 8, PodName: "duckgres-worker-8",
1022+
State: configstore.WorkerStateReserved, OrgID: "acme",
1023+
OwnerCPInstanceID: "cp-me:boot-a", OwnerEpoch: 1,
1024+
LastHeartbeatAt: now,
1025+
},
1026+
{
1027+
WorkerID: 9, PodName: "duckgres-worker-9",
1028+
State: configstore.WorkerStateActivating, OrgID: "acme",
1029+
OwnerCPInstanceID: "cp-me:boot-a", OwnerEpoch: 1,
1030+
LastHeartbeatAt: now, S3CredentialsExpiresAt: &pastDue,
1031+
},
1032+
{
1033+
WorkerID: 10, PodName: "duckgres-worker-10",
1034+
State: configstore.WorkerStateHot, OrgID: "acme",
1035+
OwnerCPInstanceID: "cp-me:boot-a", OwnerEpoch: 1,
1036+
LastHeartbeatAt: now, S3CredentialsExpiresAt: &pastDue,
1037+
},
1038+
}
1039+
for _, w := range rows {
1040+
if err := store.UpsertWorkerRecord(w); err != nil {
1041+
t.Fatalf("UpsertWorkerRecord(%d): %v", w.WorkerID, err)
1042+
}
1043+
}
1044+
1045+
due, err := store.ListWorkersDueForCredentialRefresh("cp-me:boot-a", now)
1046+
if err != nil {
1047+
t.Fatalf("ListWorkersDueForCredentialRefresh: %v", err)
1048+
}
1049+
if len(due) != 1 || due[0].WorkerID != 10 {
1050+
t.Fatalf("expected only activated hot worker 10 to be due, got %#v", due)
1051+
}
1052+
}
1053+
10081054
// TestListWorkersDueForCredentialRefreshSkipsHealthyAndNeutral:
10091055
// - Healthy row (expiry comfortably in the future): not returned.
10101056
// - Neutral warm row (org_id=”): not returned regardless of expiry.

0 commit comments

Comments
 (0)