Skip to content

Commit 2776f1d

Browse files
committed
fix(oplog-collections-transformer): warn+skip on empty ALL_SITE_IDS instead of error-retry
Per PR #363 review: an empty/blank ALL_SITE_IDS (misconfig, or a partial deployment that doesn't fan user status) made publishUserStatus return a transient error, so the status event Nak-stormed to MAX_DELIVER (1000) before a loud Term — wasteful, and broke partial deployments. Detect the no-destination case at the sent==0 point and skip cleanly: one WARN log + onSkipped("status_no_sites") metric + migration.ErrSkipped (Ack-skip, no retries). Detection stays at sent==0 so malformed multi-entry values (e.g. ALL_SITE_IDS=",") still warn rather than silently ack. A startup hard-fail on empty ALL_SITE_IDS is the planned permanent form (TODO). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_012X9qhQT4NwmCHjdndwNtFD
1 parent fbde695 commit 2776f1d

2 files changed

Lines changed: 12 additions & 10 deletions

File tree

data-migration/oplog-collections-transformer/users.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,14 @@ func (h *handler) publishUserStatus(ctx context.Context, account, statusText str
132132
sent++
133133
}
134134
if sent == 0 {
135-
// No destination sites: a statusText change would be lost. Surface a (transient) error
136-
// so the event Naks and is visible rather than silently dropped — ALL_SITE_IDS is empty
137-
// or misconfigured. (Future: hard-fail at startup instead of per-event.)
138-
return fmt.Errorf("no destination sites for user_status_updated (account %q): ALL_SITE_IDS empty", account)
135+
// No destination sites — ALL_SITE_IDS empty/misconfigured, or a partial deployment that
136+
// doesn't fan status. Skip cleanly with a logged + metered signal rather than Nak-storming
137+
// every status event. TODO: make empty ALL_SITE_IDS a startup hard-fail once the failure
138+
// modes are understood.
139+
slog.WarnContext(ctx, "ALL_SITE_IDS has no destinations — skipping user status fan-out",
140+
"account", account, "request_id", natsutil.RequestIDFromContext(ctx))
141+
h.metrics.onSkipped(ctx, "status_no_sites")
142+
return migration.ErrSkipped
139143
}
140144
return nil
141145
}

data-migration/oplog-collections-transformer/users_test.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -110,11 +110,11 @@ func TestHandleUser_StatusTextUpdate_FansToAllSites(t *testing.T) {
110110
assert.ElementsMatch(t, []string{testSiteID, "s2"}, dests)
111111
}
112112

113-
func TestHandleUser_StatusTextUpdate_NoSites_ReturnsError(t *testing.T) {
113+
func TestHandleUser_StatusTextUpdate_NoSites_WarnsAndSkips(t *testing.T) {
114114
pub := &fakePublisher{}
115115
doc := `{"_id":"u1","username":"alice","statusText":"in a meeting"}`
116116
h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: []byte(doc)})
117-
h.allSiteIDs = nil // misconfigured: ALL_SITE_IDS empty
117+
h.allSiteIDs = nil // ALL_SITE_IDS empty (misconfig, or a partial deployment without status fan-out)
118118

119119
ev := oplogEvent{
120120
Op: "update",
@@ -124,10 +124,8 @@ func TestHandleUser_StatusTextUpdate_NoSites_ReturnsError(t *testing.T) {
124124
UpdateDescription: json.RawMessage(`{"updatedFields":{"statusText":"in a meeting"}}`),
125125
}
126126
err := h.handleUser(context.Background(), ev)
127-
// Empty destinations must surface an error (Nak/visible), never a silent skip or drop.
128-
require.Error(t, err)
129-
assert.NotErrorIs(t, err, migration.ErrSkipped)
130-
assert.NotErrorIs(t, err, migration.ErrPoison)
127+
// No destinations: warn + Ack-skip — not a retry-storm error, not a silent drop.
128+
assert.ErrorIs(t, err, migration.ErrSkipped)
131129
assert.Empty(t, pub.events)
132130
}
133131

0 commit comments

Comments
 (0)