diff --git a/.gitignore b/.gitignore index 68f6e6068..42ff10942 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ tmp/ # stray local builds of the data-migration services /oplog-connector /oplog-transformer +/oplog-collections-transformer # Captured pprof profiles from `make profile`. profiles/ diff --git a/data-migration/CDC_COVERAGE.md b/data-migration/CDC_COVERAGE.md new file mode 100644 index 000000000..e9171a3ae --- /dev/null +++ b/data-migration/CDC_COVERAGE.md @@ -0,0 +1,75 @@ +# Collections CDC — coverage matrix + +> Companion to `README.md` (component overview) and `SOURCE_DATA.md` (source schema). +> This doc pins **exactly which source change events the collections migration covers, and which it does not** — the reference for the team building the `oplog-collections-transformer`. +> Design: `docs/superpowers/specs/2026-06-16-oplog-transformer-collections-design.md`. +> +> Scope: the **live CDC tail** of the operational collections (rooms, subscriptions, thread_subscriptions, users). The bulk/initial state sync ≤ checkpoint is a separate owner's job; we tail from the handed-off checkpoint. + +## CDC payload facts (all collections) + +The connector forwards raw change-stream events with **no `updateLookup`** and **no `fullDocumentBeforeChange`**: + +| Op | Payload carried | Source lookup by `_id` | +|---|---|---| +| `insert` | full `fullDocument` | in payload | +| `replace` | full `fullDocument` | in payload (lookup not needed) | +| `update` | only `updateDescription` (changed fields, no post-image) | **full current doc** (doc still exists) | +| `delete` | only `documentKey._id` | **nothing** — doc already gone | + +→ A source lookup resolves the full doc for any op **except `delete`**. + +## Event coverage matrix + +**Legend:** ✅ migrated · ❌ intentionally not migrated · ⚠️ deferred / later work. 
+ +| # | Source event | Op + payload | Source lookup (by `_id`) | Current-system facts | Handling / impact | +|---|---|---|---|---|---| +| **Rooms** | +| 1 | Room create | `insert` — full doc | in payload | `t` ∈ `c,p,d,l,v`; `prid`⇒discussion; `teamId`/`teamMain`; `d` can have >2 users | ✅ map → `room_sync` (skip `l`,`v`,group-DM) | +| 2 | Room replace | `replace` — full doc | not needed | whole-doc rewrite; can cross type/exclusion boundary; **no delta** to tell which fields changed | ✅ re-classify → `room_renamed` + `room_restricted` + `room_sync` (conservative — field events are idempotent + guarded; subs' denormalized name/visibility must not go stale) | +| 3 | Room change | `update` — changed fields only | full current doc | — | ✅ re-read doc → `room_renamed` / `room_restricted` / `room_sync` | +| 4 | Room delete | `delete` — `_id` only | nothing — doc gone | app has no room-delete operation | ❌ skip (no app deletion; un-actionable) | +| **Subscriptions** | +| 5 | Sub create | `insert` — full doc | in payload | `u`, `rid`, `roles[]`, `open`, `f`, `disableNotifications`, `ls`/`lr`, `alert` | ✅ `member_added` + state events | +| 6 | Sub replace | `replace` — full doc | not needed | whole-doc rewrite | ✅ re-classify → `member_added` + state | +| 7 | Sub change (incl. 
leave/rejoin) | `update` — changed fields only | full current doc | leaving sets `open:false` (not a row delete) | ✅ re-read doc → `open`-toggle → `member_added`/`member_removed`; mute/fav/role/read → matching event | +| 8 | Sub delete (true row removal) | `delete` — `_id` only | nothing — doc gone | destination subs key by generated `UUIDv7`, not source `_id`; removal needs `(roomID, account)` | ❌ skip (un-actionable; rare — leave is `open:false`) | +| **Thread subscriptions** | +| 9 | Follow / first reply | `insert` — full doc | in payload | keyed `(u._id, parentMessage._id)`; carries `rid`, `lastSeenAt`, `unreadMention` | ✅ resolve thread-room+user → `thread_subscription_upserted` | +| 10 | Thread-sub replace | `replace` — full doc | not needed | whole-doc rewrite | ✅ re-resolve → upsert | +| 11 | Thread read / mention change | `update` — changed fields only | full current doc | — | ✅ re-read doc → re-upsert | +| 12 | Thread unfollow | `delete` — `_id` only | nothing — doc gone | destination thread-subs key by `(threadRoomId, userId)`; inbox-worker has no thread-sub removal handler; live stack emits no thread-unfollow federation event | ❌ skip (un-actionable **and** no handler) → stale follow lingers | +| **Users** | +| 13 | User create | `insert` — full doc | in payload | `_id`, `username` (mutable), `type`, `customFields.*`, `roles[]`, `federation.origin` | ✅ insert-if-absent by account | +| 14 | User replace | `replace` — full doc | not needed | whole-doc rewrite | ✅ insert-if-absent (re-classify) | +| 15 | User **HR-field** change (engName, tsmcName, dept/sect, roles, …) after first seed | `update` — changed fields only | full current doc | company-wide user sync owns these; insert-if-absent leaves existing untouched | ❌ not propagated (other sync keeps it current) | +| 15a | User **`statusText`** change | `update` — changed fields only | full current doc | chat-originated (set by the user inside legacy chat), **not** in the HR dataset — no other sync 
carries it | ✅ fan `user_status_updated` to all sites (global-visibility) | +| 16 | User deactivate / delete | `update` (`active:false`) or `delete` | `update`: full doc · `delete`: nothing | source sets `active:false` (no row deletion); no destination apply-path wired | ❌ deferred (out of scope) | +| **All collections** | +| 17 | Collection drop / rename | collection-level (`drop`/`rename`/`invalidate`) | n/a | terminates/invalidates the per-collection change stream | ⚠️ out of scope, deferred — connector re-point, not migration logic | + +## inbox-worker handler coverage + +Every apply-handler the inbox-worker exposes is either produced by the migration or intentionally not: + +| Inbox handler | Emitted? | From | +|---|---|---| +| `member_added` | ✅ | sub `insert`/`replace`; `open` false→true | +| `member_removed` | ✅ | sub `open` true→false | +| `room_sync` | ✅ | room `insert`/`replace`/other-field `update` | +| `role_updated` | ✅ | sub `roles[]` | +| `subscription_read` | ✅ | sub `max(ls,lr)` + `alert` | +| `subscription_mute_toggled` | ✅ | sub `disableNotifications` | +| `subscription_favorite_toggled` | ✅ | sub `f` | +| `thread_subscription_upserted` | ✅ | thread-sub `insert`/`replace`/`update` | +| `room_renamed` | ✅ | room `name`/`fname` change | +| `room_restricted` | ✅ | room `restricted`/`externalAccess` change | +| `user_status_updated` | ✅ | user `statusText` change (chat-owned; fanned to all sites) | +| `thread_read` | ⚠️ not emitted | redundant — thread-sub `lastSeenAt` rides `thread_subscription_upserted`; `Subscription.ThreadUnread` is message-pipeline-owned | + +## Open confirmations (source engineers) + +- Which room field(s) back **`Restricted`** (read-only) and **`ExternalAccess`** — see `SOURCE_DATA.md`. +- Does the source emit whole-doc **`replace`** for these collections, or only field-level `update`? (If never, rows 2/6/10/14 are moot.) +- Where does a user **employee id** live (if at all). 
diff --git a/data-migration/SOURCE_DATA.md b/data-migration/SOURCE_DATA.md index a25382da7..41f1e74ac 100644 --- a/data-migration/SOURCE_DATA.md +++ b/data-migration/SOURCE_DATA.md @@ -82,3 +82,118 @@ Example document (values rotated/sanitized): - `updateDescription` — absent (no field-level diff). - `fullDocument` — the new version (with the `fullDocument` option). - `previousDocument` — the old version (with `showExpandedEvents:true`). + +--- + +# Collections migration — source schema (assumptions, for source-engineer cross-check) + +> This section is the migration team's **current understanding** of the operational +> source collections read by the collections path (`oplog-collections-transformer`, +> design: `docs/superpowers/specs/2026-06-16-oplog-transformer-collections-design.md`). +> **Every "Assumed" row drives a write into the new system — please correct anything wrong.** +> Legend: ✅ confirmed by source team · ❓ assumption awaiting confirmation · ⛔ deliberately ignored. + +## Conventions assumed across these collections +- ✅ **`federation.origin`** is authoritative. **Absent** ⇒ record is **local**. **Present** ⇒ a federated peer domain whose **first dotted label is the site code** (`0030204.tchat-test.test.company.com` ⇒ `0030204`). +- ❓ `federation.origin` is never the literal `"local"` (we treat absent ⇒ local). +- ✅ Each site's source DB already holds its **federated copies**; we migrate the full local source with **no** drop-filter. + +## 3. 
`rocketchat_rooms` + +| Source field | Type | Interpretation | Status | +|---|---|---|---| +| `_id` | string | Room id | ✅ | +| `t` | string | Room type — **only** `c`,`p`,`d`,`l`,`v` exist | ✅ | +| `prid` | string (opt) | Parent room id — **present ⇒ discussion** (`t` is `p`) | ✅ | +| `teamId` | string (opt) | Room belongs to a Team | ✅ | +| `teamMain` | bool (opt) | True only on a team's **primary** room | ✅ | +| `name` | string | Machine/handle name | ❓ | +| `fname` | string | Friendly display name | ❓ | +| `uids` / `usernames` | array | Members; for `t:d` length **can exceed 2** (group DM) | ✅ | +| `u` | object | Creator (`u._id`, `u.username`) | ❓ | +| `ts` / `_updatedAt` | date | Created / last-updated | ❓ | +| `restricted` | bool (opt) | **Authoritative restriction flag** (TSMC custom; absent ⇒ false). Confirmed on TKMS. RC's `ro` (read-only/announcement) is a **different concept** — deliberately ignored | ✅ | +| **external/federation access** | ? | **Which field is authoritative for "external access allowed"?** | ❓ | +| `federation.origin` | string (opt) | Origin site | ✅ | +| `federation.domains[]` | array | Member domains, service-synced, may be stale | ✅ ⛔ | + +Type mapping logic to sanity-check: `c`/`p` (no `prid`) → one channel type (no public/private split); `p`+`prid` → discussion; `d` (2 participants) → dm (botDM if a participant is a bot); `d` (>2) → **skip** (no group DM); `l`/`v` → **skip**; team rooms → plain channel (`teamId`/`teamMain` dropped). + +## 4. `rocketchat_subscriptions` + +One row per (user, room). ✅ Unique index `{ rid:1, 'u._id':1 }`. 
+ +| Source field | Type | Interpretation | Status | +|---|---|---|---| +| `u._id`, `u.username` | string | Member identity | ✅ | +| `rid` | string | Room id | ✅ | +| `open` | bool | **Membership active.** Leave ⇒ `open:false` (no delete); re-join ⇒ true | ✅ | +| `ts` | date | Join time (set once, stable across re-joins) | ✅ | +| `roles[]` | string[] | `owner`/`moderator`/`leader`/`user` (role-based ownership) | ✅ | +| `ls` | date | Last **seen** (scrolled cursor) | ✅ | +| `lr` | date | Last **read** (explicit mark) | ✅ | +| `alert` | bool | True if **any** unread content (not just mentions) | ✅ | +| `userMentions` / `groupMentions` | int | Unread `@user` / `@all`,`@here` counts | ✅ | +| `tunread[]` | string[] | Parent-message ids (`tmid`) of threads with any unread | ✅ | +| `tunreadGroup[]` / `tunreadUser[]` | string[] | …group-mention / direct-mention variants | ✅ | +| `disableNotifications` | bool | **TSMC custom — authoritative mute (all-off)** | ✅ | +| `muteGroupMentions` | bool | `@all`/`@here` only (**not** our mute flag) | ✅ | +| `f` | bool (opt) | Favorited (absent ⇒ false) | ✅ | +| `favoritedAt` | date (opt) | Last favorite time. Exists at source (TKMS) but **unused by CDC** — per the agreed guard mapping below, all guards derive from `_updatedAt` | ✅ ⛔ | +| `name` / `fname` | string | Machine name / friendly display name | ✅ | +| `federation.origin` | string (opt) | Origin site (assumed consistent with room) | ✅ ❓ | + +Derived: "has mention" = `userMentions>0 || groupMentions>0`; "muted" = `disableNotifications`; **read timestamp (`lastSeenAt`) = `max(ls, lr)`** (resolved per design D1 — the furthest point consumed by either the scrolled cursor or the explicit mark-read). 
+ +**Guard timestamps (agreed with source team, supersedes the earlier conditional/null mapping):** every +destination high-water guard derives uniformly from the source **`_updatedAt`** — `rolesUpdatedAt`, +`muteUpdatedAt`, `favoriteUpdatedAt` (subscriptions) and `nameUpdatedAt`, restricted-guard (rooms). +No null-when-false conditional, no `favoritedAt` source. Note: the canonical restricted guard field +(`restrictedUpdatedAt`) is not in the destination codebase yet; inbox-worker currently applies +`room_restricted` via `visibilityUpdatedAt` — accepted until the rename lands in main. + +## 5. `tsmc_thread_subscriptions` + +One row per (user, thread). ✅ Unique index `{ 'u._id':1, 'parentMessage._id':1 }`. + +| Source field | Type | Interpretation | Status | +|---|---|---|---| +| `_id` | string | Row id | ✅ | +| `u._id`, `u.username` | string | Follower identity | ✅ | +| `rid` | string | Room id (matches parent room) | ✅ | +| `parentMessage._id` | string | Thread root message id (`tmid`) — the thread key | ✅ | +| `lastMessage._id` / `._updatedAt` | string/date | Last message in thread | ✅ | +| `createdAt` | date | Row creation (lazy — on follow/first reply) | ✅ | +| `lastSeenAt` | date | Last-read timestamp for the thread | ✅ | +| `unreadMention` | int | Thread mention/unread count | ✅ | + +Lifecycle: created lazily; **unfollow deletes the row** (no soft-delete); no `federation.origin` (site inherited from room/user). **Open:** please share a redacted sample doc to confirm nothing is missed. + +## 6. 
`users` + +| Source field | Type | Interpretation | Status | +|---|---|---|---| +| `_id` | string (17-char base62) | Stable user id | ✅ | +| `username` | string | **Account id — unique but mutable** | ✅ | +| `type` | string | `user` or `bot` (bot has `appId`); no other non-human types | ✅ | +| `appId` | string (opt) | Present on bot/app accounts | ✅ | +| `name` | string | Display name | ✅ | +| `customFields.engName` / `tsmcName` | string | English / Chinese name | ✅ | +| `customFields.deptId` / `deptName` | string | Department id / name | ✅ | +| `customFields.sectId` / `sectName` | string | Section id / name | ✅ | +| `customFields.appId` / `appName` | string | App id / name | ✅ | +| `hrInfo` | `ITsmcUser[]` | HR directory records | ❓ (not consumed yet) | +| `statusText` / `status` | string | Status message / presence | ✅ | +| `roles[]` | string[] | Global roles (`admin` marker) | ✅ | +| `active` | bool | Deactivation ⇒ `active:false` (no deletion) | ✅ | +| `isRemote` | bool | True on local docs of **federated** users | ✅ | +| `federation.origin` | string (opt) | Origin site (absent ⇒ local) | ✅ | +| **employee id** | ? | **Where does an employee id live — is it `username`?** | ❓ | +| **Traditional-Chinese dept/sect names** | ? | Is there a TC variant of `deptName`/`sectName`? | ❓ | + +Seeded (insert-if-absent, keyed by account): `username`, `engName`, `tsmcName`, dept/sect ids+names, `roles`, `statusText`, site, bot flag. Everything else is owned by the company-wide user sync. + +Post-seed **updates**: HR fields are **not** re-propagated (the company-wide sync keeps them current). The **one exception is `statusText`** — it is chat-originated (not in the HR dataset), so a live `statusText` change fans a `user_status_updated` event to all sites (design §4.1a); without it, legacy status changes during the migration window would be lost. 
+ +## Explicitly **not** migrated +`federation.domains[]`; livechat (`l`) / voip (`v`) rooms; group DMs (`d`>2); team grouping (`teamId`/`teamMain`); user deactivation/deletion; thread-sub unfollows during cutover. Flag any of these you'd expect to matter. diff --git a/data-migration/oplog-collections-transformer/bootstrap.go b/data-migration/oplog-collections-transformer/bootstrap.go new file mode 100644 index 000000000..8495b9f8c --- /dev/null +++ b/data-migration/oplog-collections-transformer/bootstrap.go @@ -0,0 +1,33 @@ +package main + +import ( + "context" + "fmt" + + "github.com/nats-io/nats.go/jetstream" + + "github.com/Marz32onE/instrumentation-go/otel-nats/oteljetstream" + + "github.com/hmchangw/chat/pkg/stream" +) + +// streamManager is the minimal JetStream surface bootstrapStreams needs, service-local so tests can fake it without mockgen. +type streamManager interface { + CreateOrUpdateStream(ctx context.Context, cfg jetstream.StreamConfig) (oteljetstream.Stream, error) +} + +// bootstrapStreams is a no-op in production (this service owns no streams). When Enabled +// (dev/integration) it creates only the MIGRATION_OPLOG_{siteID} schema; inbox-worker owns INBOX. +func bootstrapStreams(ctx context.Context, js streamManager, siteID string, enabled bool) error { + if !enabled { + return nil + } + cfg := stream.MigrationOplog(siteID) + if _, err := js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{ + Name: cfg.Name, + Subjects: cfg.Subjects, + }); err != nil { + return fmt.Errorf("create MIGRATION_OPLOG stream: %w", err) + } + return nil +} diff --git a/data-migration/oplog-collections-transformer/classify.go b/data-migration/oplog-collections-transformer/classify.go new file mode 100644 index 000000000..a84ae1e94 --- /dev/null +++ b/data-migration/oplog-collections-transformer/classify.go @@ -0,0 +1,39 @@ +package main + +import "github.com/hmchangw/chat/pkg/model" + +// roomClass is the result of classifying a source room. 
+type roomClass struct { + Type model.RoomType // valid only when !Excluded + Excluded bool + Reason string // exclusion reason (for metrics), set only when Excluded +} + +// classifyRoom maps a source room type t (+ prid/teamId/participant/bot signals) to a destination +// RoomType or an exclusion (group_dm/livechat/voip/unknown_type). p+prid→discussion, team→channel. §4.2. +func classifyRoom(t string, hasPrid, hasTeamID bool, hasBot bool, participantCount int) roomClass { + // hasTeamID is accepted for caller clarity and future use; c/p branch already returns channel + // regardless of teamId, so no separate branch is needed here. + _ = hasTeamID + switch t { + case "c", "p": + if t == "p" && hasPrid { + return roomClass{Type: model.RoomTypeDiscussion} + } + return roomClass{Type: model.RoomTypeChannel} + case "d": + if participantCount > 2 { + return roomClass{Excluded: true, Reason: "group_dm"} + } + if hasBot { + return roomClass{Type: model.RoomTypeBotDM} + } + return roomClass{Type: model.RoomTypeDM} + case "l": + return roomClass{Excluded: true, Reason: "livechat"} + case "v": + return roomClass{Excluded: true, Reason: "voip"} + default: + return roomClass{Excluded: true, Reason: "unknown_type"} + } +} diff --git a/data-migration/oplog-collections-transformer/classify_test.go b/data-migration/oplog-collections-transformer/classify_test.go new file mode 100644 index 000000000..0390b4524 --- /dev/null +++ b/data-migration/oplog-collections-transformer/classify_test.go @@ -0,0 +1,137 @@ +package main + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/hmchangw/chat/pkg/model" +) + +func TestClassifyRoom(t *testing.T) { + tests := []struct { + name string + t string + hasPrid bool + hasTeamID bool + hasBot bool + participantCount int + wantType model.RoomType + wantExcluded bool + wantReason string + }{ + { + name: "c → channel", + t: "c", + hasPrid: false, + hasTeamID: false, + hasBot: false, + participantCount: 0, + wantType: 
model.RoomTypeChannel, + wantExcluded: false, + }, + { + name: "p no prid → channel", + t: "p", + hasPrid: false, + hasTeamID: false, + hasBot: false, + participantCount: 0, + wantType: model.RoomTypeChannel, + wantExcluded: false, + }, + { + name: "p with prid → discussion", + t: "p", + hasPrid: true, + hasTeamID: false, + hasBot: false, + participantCount: 0, + wantType: model.RoomTypeDiscussion, + wantExcluded: false, + }, + { + name: "d 2 participants no bot → dm", + t: "d", + hasPrid: false, + hasTeamID: false, + hasBot: false, + participantCount: 2, + wantType: model.RoomTypeDM, + wantExcluded: false, + }, + { + name: "d 2 participants with bot → botDM", + t: "d", + hasPrid: false, + hasTeamID: false, + hasBot: true, + participantCount: 2, + wantType: model.RoomTypeBotDM, + wantExcluded: false, + }, + { + name: "d 3 participants → excluded group_dm", + t: "d", + hasPrid: false, + hasTeamID: false, + hasBot: false, + participantCount: 3, + wantExcluded: true, + wantReason: "group_dm", + }, + { + name: "l → excluded livechat", + t: "l", + hasPrid: false, + hasTeamID: false, + hasBot: false, + participantCount: 0, + wantExcluded: true, + wantReason: "livechat", + }, + { + name: "v → excluded voip", + t: "v", + hasPrid: false, + hasTeamID: false, + hasBot: false, + participantCount: 0, + wantExcluded: true, + wantReason: "voip", + }, + { + name: "unknown type x → excluded unknown_type", + t: "x", + hasPrid: false, + hasTeamID: false, + hasBot: false, + participantCount: 0, + wantExcluded: true, + wantReason: "unknown_type", + }, + { + name: "c with teamId → channel (team rooms as plain channel)", + t: "c", + hasPrid: false, + hasTeamID: true, + hasBot: false, + participantCount: 0, + wantType: model.RoomTypeChannel, + wantExcluded: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := classifyRoom(tc.t, tc.hasPrid, tc.hasTeamID, tc.hasBot, tc.participantCount) + assert.Equal(t, tc.wantExcluded, got.Excluded) + if 
tc.wantExcluded { + assert.Equal(t, tc.wantReason, got.Reason) + } else { + assert.Equal(t, tc.wantType, got.Type) + assert.Empty(t, got.Reason) + } + }) + } +} diff --git a/data-migration/oplog-collections-transformer/config.go b/data-migration/oplog-collections-transformer/config.go new file mode 100644 index 000000000..1dd922fe3 --- /dev/null +++ b/data-migration/oplog-collections-transformer/config.go @@ -0,0 +1,94 @@ +package main + +import ( + "fmt" + "strings" + + "github.com/caarlos0/env/v11" +) + +// config holds every tunable, parsed from the environment via caarlos0/env. +// Required fields have no default and fail-fast at startup when absent. +type config struct { + SiteID string `env:"SITE_ID,required"` + + // AllSiteIDs is every federated site. A user statusText change fans to all of them (incl. ours) — + // status is global-visibility and chat-originated (no other sync carries it), unlike HR fields. + AllSiteIDs []string `env:"ALL_SITE_IDS" envDefault:"" envSeparator:","` + + NatsURL string `env:"NATS_URL,required"` + NatsCredsFile string `env:"NATS_CREDS_FILE" envDefault:""` + + // Source legacy Mongo (replica set): the connector tails it; this service re-reads + // the full current doc by _id on update events (the connector forwards only the delta). + SourceMongoURI string `env:"SOURCE_MONGO_URI,required"` + SourceUsername string `env:"SOURCE_MONGO_USERNAME" envDefault:""` + SourcePassword string `env:"SOURCE_MONGO_PASSWORD" envDefault:""` + SourceDB string `env:"SOURCE_DB" envDefault:"rocketchat"` + + // Target new-stack per-site Mongo: users insert-if-absent + thread_room/user FK resolution. + TargetMongoURI string `env:"TARGET_MONGO_URI,required"` + TargetUsername string `env:"TARGET_MONGO_USERNAME" envDefault:""` + TargetPassword string `env:"TARGET_MONGO_PASSWORD" envDefault:""` + TargetDB string `env:"TARGET_DB" envDefault:"chat"` + + // Source collection names (the connector's raw collection names). 
+ RoomsCollection string `env:"ROOMS_COLLECTION" envDefault:"rocketchat_rooms"` + SubscriptionsCollection string `env:"SUBSCRIPTIONS_COLLECTION" envDefault:"rocketchat_subscriptions"` + ThreadSubsCollection string `env:"THREAD_SUBS_COLLECTION" envDefault:"tsmc_thread_subscriptions"` + UsersCollection string `env:"USERS_COLLECTION" envDefault:"users"` + + SourceReadPreference string `env:"SOURCE_READ_PREFERENCE" envDefault:"primaryPreferred"` + + ConsumerDurable string `env:"CONSUMER_DURABLE" envDefault:"oplog-collections-transformer"` + MaxDeliver int `env:"MAX_DELIVER" envDefault:"1000"` + DeleteMaxDeliver int `env:"DELETE_MAX_DELIVER" envDefault:"60"` + + Bootstrap bootstrapConfig `envPrefix:"BOOTSTRAP_"` + + MetricsAddr string `env:"METRICS_ADDR" envDefault:":9090"` + LogLevel string `env:"LOG_LEVEL" envDefault:"info"` +} + +type bootstrapConfig struct { + Enabled bool `env:"STREAMS" envDefault:"false"` +} + +// parseConfig parses and validates the environment configuration. +func parseConfig() (config, error) { + cfg, err := env.ParseAs[config]() + if err != nil { + return config{}, fmt.Errorf("parse config: %w", err) + } + // caarlos0/env `required` only rejects an unset var, not a whitespace-only one. Trim and + // re-validate the required scalars too, so a value like " " fails here rather than breaking + // subject building / connections later at runtime. 
+ cfg.SiteID = strings.TrimSpace(cfg.SiteID) + cfg.NatsURL = strings.TrimSpace(cfg.NatsURL) + cfg.SourceMongoURI = strings.TrimSpace(cfg.SourceMongoURI) + cfg.TargetMongoURI = strings.TrimSpace(cfg.TargetMongoURI) + cfg.RoomsCollection = strings.TrimSpace(cfg.RoomsCollection) + cfg.SubscriptionsCollection = strings.TrimSpace(cfg.SubscriptionsCollection) + cfg.ThreadSubsCollection = strings.TrimSpace(cfg.ThreadSubsCollection) + cfg.UsersCollection = strings.TrimSpace(cfg.UsersCollection) + for name, v := range map[string]string{ + "SITE_ID": cfg.SiteID, + "NATS_URL": cfg.NatsURL, + "SOURCE_MONGO_URI": cfg.SourceMongoURI, + "TARGET_MONGO_URI": cfg.TargetMongoURI, + "ROOMS_COLLECTION": cfg.RoomsCollection, + "SUBSCRIPTIONS_COLLECTION": cfg.SubscriptionsCollection, + "THREAD_SUBS_COLLECTION": cfg.ThreadSubsCollection, + "USERS_COLLECTION": cfg.UsersCollection, + } { + if v == "" { + return config{}, fmt.Errorf("%s must be non-empty", name) + } + } + // DeleteMaxDeliver above MaxDeliver is a no-op footgun: the shorter cap would never trip first. + // Clamp it down when MaxDeliver is finite (0 = unlimited). + if cfg.MaxDeliver > 0 && cfg.DeleteMaxDeliver > cfg.MaxDeliver { + cfg.DeleteMaxDeliver = cfg.MaxDeliver + } + return cfg, nil +} diff --git a/data-migration/oplog-collections-transformer/config_test.go b/data-migration/oplog-collections-transformer/config_test.go new file mode 100644 index 000000000..33c4541a8 --- /dev/null +++ b/data-migration/oplog-collections-transformer/config_test.go @@ -0,0 +1,155 @@ +package main + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// setRequiredEnv sets the four required env vars that every test needs to succeed. 
+func setRequiredEnv(t *testing.T) { + t.Helper() + t.Setenv("SITE_ID", "site1") + t.Setenv("NATS_URL", "nats://localhost:4222") + t.Setenv("SOURCE_MONGO_URI", "mongodb://localhost:27017") + t.Setenv("TARGET_MONGO_URI", "mongodb://localhost:27018") +} + +func TestParseConfig_Defaults(t *testing.T) { + setRequiredEnv(t) + cfg, err := parseConfig() + require.NoError(t, err) + + assert.Equal(t, "site1", cfg.SiteID) + assert.Equal(t, "nats://localhost:4222", cfg.NatsURL) + assert.Equal(t, "", cfg.NatsCredsFile) + + assert.Equal(t, "mongodb://localhost:27017", cfg.SourceMongoURI) + assert.Equal(t, "", cfg.SourceUsername) + assert.Equal(t, "", cfg.SourcePassword) + assert.Equal(t, "rocketchat", cfg.SourceDB) + + assert.Equal(t, "mongodb://localhost:27018", cfg.TargetMongoURI) + assert.Equal(t, "", cfg.TargetUsername) + assert.Equal(t, "", cfg.TargetPassword) + assert.Equal(t, "chat", cfg.TargetDB) + + assert.Equal(t, "rocketchat_rooms", cfg.RoomsCollection) + assert.Equal(t, "rocketchat_subscriptions", cfg.SubscriptionsCollection) + assert.Equal(t, "tsmc_thread_subscriptions", cfg.ThreadSubsCollection) + assert.Equal(t, "users", cfg.UsersCollection) + + assert.Equal(t, "primaryPreferred", cfg.SourceReadPreference) + assert.Equal(t, "oplog-collections-transformer", cfg.ConsumerDurable) + assert.Equal(t, 1000, cfg.MaxDeliver) + assert.Equal(t, 60, cfg.DeleteMaxDeliver) + assert.Equal(t, false, cfg.Bootstrap.Enabled) + assert.Equal(t, ":9090", cfg.MetricsAddr) + assert.Equal(t, "info", cfg.LogLevel) +} + +func TestParseConfig_MissingRequired(t *testing.T) { + // Only SITE_ID set — NATS_URL, SOURCE_MONGO_URI, TARGET_MONGO_URI missing. 
+ t.Setenv("SITE_ID", "site1") + _, err := parseConfig() + require.Error(t, err) +} + +func TestParseConfig_EmptyCollectionNameFails(t *testing.T) { + setRequiredEnv(t) + t.Setenv("ROOMS_COLLECTION", " ") + _, err := parseConfig() + require.Error(t, err) + assert.Contains(t, err.Error(), "ROOMS_COLLECTION") +} + +func TestParseConfig_EmptySubscriptionsCollectionFails(t *testing.T) { + setRequiredEnv(t) + t.Setenv("SUBSCRIPTIONS_COLLECTION", " ") + _, err := parseConfig() + require.Error(t, err) + assert.Contains(t, err.Error(), "SUBSCRIPTIONS_COLLECTION") +} + +func TestParseConfig_EmptyThreadSubsCollectionFails(t *testing.T) { + setRequiredEnv(t) + t.Setenv("THREAD_SUBS_COLLECTION", " ") + _, err := parseConfig() + require.Error(t, err) + assert.Contains(t, err.Error(), "THREAD_SUBS_COLLECTION") +} + +func TestParseConfig_EmptyUsersCollectionFails(t *testing.T) { + setRequiredEnv(t) + t.Setenv("USERS_COLLECTION", " ") + _, err := parseConfig() + require.Error(t, err) + assert.Contains(t, err.Error(), "USERS_COLLECTION") +} + +func TestParseConfig_ClampsDeleteMaxDeliver(t *testing.T) { + setRequiredEnv(t) + t.Setenv("MAX_DELIVER", "100") + t.Setenv("DELETE_MAX_DELIVER", "500") // exceeds MaxDeliver → clamped + cfg, err := parseConfig() + require.NoError(t, err) + assert.Equal(t, 100, cfg.MaxDeliver) + assert.Equal(t, 100, cfg.DeleteMaxDeliver, "DELETE_MAX_DELIVER is clamped to MAX_DELIVER") +} + +func TestParseConfig_DeleteMaxDeliverStaysWhenMaxDeliverZero(t *testing.T) { + setRequiredEnv(t) + t.Setenv("MAX_DELIVER", "0") // 0 = unlimited; delete cap must NOT be clamped to unlimited + t.Setenv("DELETE_MAX_DELIVER", "60") + cfg, err := parseConfig() + require.NoError(t, err) + assert.Equal(t, 60, cfg.DeleteMaxDeliver, "delete cap stays finite when MAX_DELIVER is unlimited") +} + +func TestParseConfig_DeleteMaxDeliverNotClampedWhenBelow(t *testing.T) { + setRequiredEnv(t) + t.Setenv("MAX_DELIVER", "1000") + t.Setenv("DELETE_MAX_DELIVER", "60") + cfg, err := 
parseConfig() + require.NoError(t, err) + assert.Equal(t, 60, cfg.DeleteMaxDeliver, "a DELETE_MAX_DELIVER below MAX_DELIVER is left untouched") +} + +func TestParseConfig_BootstrapEnabled(t *testing.T) { + setRequiredEnv(t) + t.Setenv("BOOTSTRAP_STREAMS", "true") + cfg, err := parseConfig() + require.NoError(t, err) + assert.True(t, cfg.Bootstrap.Enabled) +} + +func TestParseConfig_CustomCollectionNames(t *testing.T) { + setRequiredEnv(t) + t.Setenv("ROOMS_COLLECTION", "my_rooms") + t.Setenv("SUBSCRIPTIONS_COLLECTION", "my_subs") + t.Setenv("THREAD_SUBS_COLLECTION", "my_thread_subs") + t.Setenv("USERS_COLLECTION", "my_users") + cfg, err := parseConfig() + require.NoError(t, err) + assert.Equal(t, "my_rooms", cfg.RoomsCollection) + assert.Equal(t, "my_subs", cfg.SubscriptionsCollection) + assert.Equal(t, "my_thread_subs", cfg.ThreadSubsCollection) + assert.Equal(t, "my_users", cfg.UsersCollection) +} + +func TestParseConfig_WhitespaceSiteID_Errors(t *testing.T) { + setRequiredEnv(t) + t.Setenv("SITE_ID", " ") + _, err := parseConfig() + require.Error(t, err) + assert.Contains(t, err.Error(), "SITE_ID must be non-empty") +} + +func TestParseConfig_WhitespaceSourceMongoURI_Errors(t *testing.T) { + setRequiredEnv(t) + t.Setenv("SOURCE_MONGO_URI", " ") + _, err := parseConfig() + require.Error(t, err) + assert.Contains(t, err.Error(), "SOURCE_MONGO_URI must be non-empty") +} diff --git a/data-migration/oplog-collections-transformer/deploy/Dockerfile b/data-migration/oplog-collections-transformer/deploy/Dockerfile new file mode 100644 index 000000000..18b9af60c --- /dev/null +++ b/data-migration/oplog-collections-transformer/deploy/Dockerfile @@ -0,0 +1,20 @@ +FROM golang:1.25.11-alpine AS builder + +WORKDIR /app + +COPY go.mod go.sum ./ +RUN go mod download + +COPY pkg/ pkg/ +COPY data-migration/oplog-collections-transformer/ data-migration/oplog-collections-transformer/ + +RUN CGO_ENABLED=0 go build -o /oplog-collections-transformer 
./data-migration/oplog-collections-transformer/ + +FROM alpine:3.21 + +RUN apk add --no-cache ca-certificates && adduser -D -u 10001 app + +COPY --from=builder /oplog-collections-transformer /oplog-collections-transformer + +USER app +ENTRYPOINT ["/oplog-collections-transformer"] diff --git a/data-migration/oplog-collections-transformer/deploy/azure-pipelines.yml b/data-migration/oplog-collections-transformer/deploy/azure-pipelines.yml new file mode 100644 index 000000000..e210eb31e --- /dev/null +++ b/data-migration/oplog-collections-transformer/deploy/azure-pipelines.yml @@ -0,0 +1,80 @@ +trigger: + branches: + include: + - main + - develop + paths: + include: + - data-migration/oplog-collections-transformer/ + - pkg/ + +pr: + branches: + include: + - main + paths: + include: + - data-migration/oplog-collections-transformer/ + - pkg/ + +variables: + GO_VERSION: '1.25.11' + SERVICE_PATH: data-migration/oplog-collections-transformer + IMAGE_NAME: oplog-collections-transformer + REGISTRY: '$(containerRegistry)' + +stages: + - stage: Validate + displayName: 'Lint & Test' + jobs: + - job: LintAndTest + pool: + vmImage: 'ubuntu-latest' + steps: + - task: GoTool@0 + inputs: + version: '$(GO_VERSION)' + displayName: 'Install Go $(GO_VERSION)' + + - script: go vet ./$(SERVICE_PATH)/... ./pkg/... + displayName: 'Go Vet' + + - script: go test ./pkg/... -v -race -coverprofile=coverage-pkg.out + displayName: 'Test shared packages' + + # Run unit + integration (-tags=integration) so the handler wiring — which + # is only reachable with real Mongo/NATS (testcontainers) — counts toward + # the coverage floor. Requires Docker on the agent. + - script: go test ./$(SERVICE_PATH)/... 
-tags=integration -v -race -coverprofile=coverage-$(IMAGE_NAME).out + displayName: 'Test $(IMAGE_NAME) (unit + integration)' + + - script: | + set -euo pipefail + pct=$(go tool cover -func=coverage-$(IMAGE_NAME).out | awk '/^total:/ {print substr($3, 1, length($3)-1)}') + echo "$(IMAGE_NAME) total coverage: ${pct}%" + awk -v p="$pct" 'BEGIN { exit (p+0 < 80) }' || { echo "##vso[task.logissue type=error]coverage ${pct}% is below the 80% floor (CLAUDE.md §4)"; exit 1; } + displayName: 'Enforce 80% coverage floor' + + - script: go build -o /dev/null ./$(SERVICE_PATH)/ + displayName: 'Build $(IMAGE_NAME)' + + - stage: Build + displayName: 'Build & Push Image' + dependsOn: Validate + condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + jobs: + - job: BuildImage + pool: + vmImage: 'ubuntu-latest' + steps: + - task: Docker@2 + inputs: + containerRegistry: '$(containerRegistry)' + repository: 'chat/$(IMAGE_NAME)' + command: 'buildAndPush' + Dockerfile: '$(SERVICE_PATH)/deploy/Dockerfile' + buildContext: '.' + tags: | + $(Build.BuildId) + latest + displayName: 'Build & push $(IMAGE_NAME)' diff --git a/data-migration/oplog-collections-transformer/deploy/docker-compose.yml b/data-migration/oplog-collections-transformer/deploy/docker-compose.yml new file mode 100644 index 000000000..85b225be7 --- /dev/null +++ b/data-migration/oplog-collections-transformer/deploy/docker-compose.yml @@ -0,0 +1,54 @@ +name: oplog-collections-transformer + +# Local dev only. Stands up a source Mongo (standalone — update lookups use FindOne, no replica +# set required), a target Mongo, a JetStream-enabled NATS, and the transformer. +# Production streams/manifests are ops/IaC-owned — the transformer owns no streams. 
+ +services: + source-mongo: + image: mongo:7 + networks: + - oplog-col-local + + target-mongo: + image: mongo:7 + networks: + - oplog-col-local + + nats: + image: nats:2.10-alpine + command: ["--jetstream", "--http_port", "8222"] + networks: + - oplog-col-local + + oplog-collections-transformer: + build: + context: ../../.. # repo root: the Dockerfile COPYs go.mod, pkg/ and the service dir from it + dockerfile: data-migration/oplog-collections-transformer/deploy/Dockerfile + depends_on: + source-mongo: + condition: service_started + target-mongo: + condition: service_started + nats: + condition: service_started + environment: + - SITE_ID=site-local + - NATS_URL=nats://nats:4222 + - SOURCE_MONGO_URI=mongodb://source-mongo:27017/ + - SOURCE_DB=rocketchat + - TARGET_MONGO_URI=mongodb://target-mongo:27017/ + - TARGET_DB=chat + - ROOMS_COLLECTION=rocketchat_rooms + - SUBSCRIPTIONS_COLLECTION=rocketchat_subscriptions + - THREAD_SUBS_COLLECTION=tsmc_thread_subscriptions + - USERS_COLLECTION=users + - SOURCE_READ_PREFERENCE=primaryPreferred + - BOOTSTRAP_STREAMS=true + - LOG_LEVEL=info + networks: + - oplog-col-local + +networks: + oplog-col-local: + driver: bridge diff --git a/data-migration/oplog-collections-transformer/handler.go b/data-migration/oplog-collections-transformer/handler.go new file mode 100644 index 000000000..27ea22778 --- /dev/null +++ b/data-migration/oplog-collections-transformer/handler.go @@ -0,0 +1,174 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "strings" + "time" + + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsutil" +) + +// oplogEvent mirrors model.OplogEvent's wire shape (decoded from the consumed message), +// matching the message transformer's struct so both decode the connector output identically. 
+type oplogEvent struct { + EventID string `json:"eventId"` + Op string `json:"op"` + Collection string `json:"coll"` + DocumentKey json.RawMessage `json:"documentKey"` + ClusterTime int64 `json:"clusterTime"` // source op time, unix ms. + FullDocument json.RawMessage `json:"fullDocument"` + UpdateDescription json.RawMessage `json:"updateDescription"` + // Degraded is true when the connector couldn't encode an opaque field (left nil) but still + // published the event. The transformer recovers the missing field via a source lookup, not poison. + Degraded bool `json:"degraded"` + DegradedReason string `json:"degradedReason"` +} + +// inboxPublisher publishes a model.InboxEvent into the local INBOX stream. +type inboxPublisher interface { + Publish(ctx context.Context, evt model.InboxEvent) error +} + +// targetStore is the new-stack per-site Mongo access the transformer needs. +type targetStore interface { + // UpsertUserIfAbsent inserts the user keyed by account only when absent, leaving an existing + // doc (owned by the company-wide sync) untouched. inserted is false when already present. + UpsertUserIfAbsent(ctx context.Context, u model.User) (inserted bool, err error) + // FindThreadRoom resolves the thread room for a parent message id, returning roomID, + // threadRoomID, and the thread room's home siteID (thread-subs inherit the room's site, §6). + FindThreadRoom(ctx context.Context, parentMessageID string) (roomID, threadRoomID, siteID string, found bool, err error) + FindUserID(ctx context.Context, account string) (userID string, found bool, err error) +} + +type handler struct { + siteID string + allSiteIDs []string + roomsColl string + subsColl string + threadSubsColl string + usersColl string + pub inboxPublisher + target targetStore + // lookups re-read the current source doc on update events (the connector forwards only the + // delta), keyed by source collection name — one SourceLookup per watched collection. 
+ lookups map[string]migration.SourceLookup + metrics *metrics // nil-safe + now func() int64 // injectable clock, defaults to time.Now().UTC().UnixMilli +} + +// nowMillis returns the handler's current time in unix ms, defaulting to wall-clock when unset. +func (h *handler) nowMillis() int64 { + if h.now != nil { + return h.now() + } + return time.Now().UTC().UnixMilli() +} + +// handle dispatches one decoded oplog event by collection. nil = ack+count; ErrSkipped = +// ack-without-count (already metered); ErrPoison => Term; any other error => Nak (transient). +// +//nolint:gocritic // ev passed by value: it's the decoded event the consume loop hands off, one per message off the hot path. +func (h *handler) handle(ctx context.Context, ev oplogEvent) error { + switch ev.Collection { + case h.roomsColl: + return h.handleRoom(ctx, ev) + case h.usersColl: + return h.handleUser(ctx, ev) + case h.subsColl: + return h.handleSubscription(ctx, ev) + case h.threadSubsColl: + return h.handleThreadSub(ctx, ev) + default: + slog.Debug("skip non-migrated collection", + "collection", ev.Collection, "request_id", natsutil.RequestIDFromContext(ctx)) + h.metrics.onSkipped(ctx, "other_collection") + return migration.ErrSkipped + } +} + +// resolveDoc returns the full current source doc for the event, or (nil, true, nil) to skip. +// insert/replace carry the doc inline; update re-reads by documentKey._id; delete is always skip. +// +//nolint:gocritic // ev passed by value to mirror handle's signature; off the hot path. +func (h *handler) resolveDoc(ctx context.Context, ev oplogEvent) (doc []byte, skip bool, err error) { + switch ev.Op { + case "insert", "replace": + if len(ev.FullDocument) == 0 || string(ev.FullDocument) == "null" { // a wire-level JSON null decodes to the literal "null", not an empty RawMessage + if !ev.Degraded { + // The connector always carries the doc for insert/replace — a non-degraded missing (or JSON + // null) one is a contract violation that can never succeed on redelivery. Poison. 
+ return nil, false, fmt.Errorf("%w: %s without fullDocument", migration.ErrPoison, ev.Op) + } + // Degraded: the connector couldn't encode fullDocument (left nil) but still published. + // Recover the live source doc by _id rather than drop it — mirrors oplog-transformer. + slog.Warn("recovering degraded insert/replace via source lookup", + "eventId", ev.EventID, "reason", ev.DegradedReason, "request_id", natsutil.RequestIDFromContext(ctx)) + return h.resolveBySourceLookup(ctx, ev) + } + return ev.FullDocument, false, nil + case "update": + return h.resolveBySourceLookup(ctx, ev) + case "delete": + // Un-actionable: only documentKey._id, and the destination doesn't key by source _id. + return nil, true, nil + default: + // Unknown op — caller meters "unknown_op". + return nil, true, nil + } +} + +// resolveBySourceLookup re-reads the full current source doc by documentKey._id — used for updates +// (the connector forwards only the delta) and degraded insert/replace (fullDocument couldn't encode). +// skip=true when the doc vanished from source between the event and our re-read. +// +//nolint:gocritic // ev passed by value to mirror resolveDoc's signature; off the hot path. +func (h *handler) resolveBySourceLookup(ctx context.Context, ev oplogEvent) (doc []byte, skip bool, err error) { + id, idErr := documentKeyID(ev.DocumentKey) + if idErr != nil { + return nil, false, idErr + } + lk := h.lookups[ev.Collection] + if lk == nil { + // No source lookup for this collection is a misconfiguration (filter subjects and the + // lookups map disagree) — it can never succeed. Poison. + return nil, false, fmt.Errorf("%w: no source lookup for collection %q", migration.ErrPoison, ev.Collection) + } + got, lookupErr := lk.FindByID(ctx, id) + if lookupErr != nil { + return nil, false, fmt.Errorf("lookup %q: %w", id, lookupErr) + } + if got == nil { + // Doc vanished from source between the change event and our re-read — nothing to apply. 
+ return nil, true, nil + } + return got, false, nil +} + +// documentKeyID decodes documentKey → _id (the common string case). Returns migration.ErrPoison +// when missing/malformed — mirrors the message transformer's documentKeyID. +func documentKeyID(documentKey json.RawMessage) (string, error) { + var key struct { + ID string `json:"_id"` + } + if err := json.Unmarshal(documentKey, &key); err != nil || key.ID == "" { + return "", fmt.Errorf("%w: bad documentKey", migration.ErrPoison) + } + return key.ID, nil +} + +// siteIDFromOrigin returns the record's home siteId: the deployment's siteID when origin is +// absent or "local", else the first dotted label of the origin domain ("0030204.tchat..." → "0030204"). +func siteIDFromOrigin(origin, deploymentSiteID string) string { + if origin == "" || origin == "local" { + return deploymentSiteID + } + if i := strings.IndexByte(origin, '.'); i >= 0 { + return origin[:i] + } + return origin +} diff --git a/data-migration/oplog-collections-transformer/handler_test.go b/data-migration/oplog-collections-transformer/handler_test.go new file mode 100644 index 000000000..1f28032f3 --- /dev/null +++ b/data-migration/oplog-collections-transformer/handler_test.go @@ -0,0 +1,243 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/model" +) + +// fakePublisher captures the InboxEvents the handler publishes. +type fakePublisher struct { + events []model.InboxEvent + err error +} + +//nolint:gocritic // signature pinned by the inboxPublisher interface. +func (f *fakePublisher) Publish(_ context.Context, evt model.InboxEvent) error { + if f.err != nil { + return f.err + } + f.events = append(f.events, evt) + return nil +} + +// fakeLookup returns a canned doc for FindByID (used by the update path). 
+type fakeLookup struct { + doc []byte + err error +} + +func (f *fakeLookup) FindByID(_ context.Context, _ string) ([]byte, error) { + return f.doc, f.err +} + +// fakeTarget records UpsertUserIfAbsent calls and answers the FK lookups. +type fakeTarget struct { + upserted []model.User + inserted bool + err error +} + +//nolint:gocritic // signature pinned by the targetStore interface. +func (f *fakeTarget) UpsertUserIfAbsent(_ context.Context, u model.User) (bool, error) { + if f.err != nil { + return false, f.err + } + f.upserted = append(f.upserted, u) + return f.inserted, nil +} + +func (f *fakeTarget) FindThreadRoom(_ context.Context, _ string) (string, string, string, bool, error) { + return "", "", "", false, nil +} + +func (f *fakeTarget) FindUserID(_ context.Context, _ string) (string, bool, error) { + return "", false, nil +} + +const ( + testSiteID = "s1" + roomsColl = "rocketchat_rooms" + subsColl = "rocketchat_subscriptions" + threadSubColl = "tsmc_thread_subscriptions" + usersColl = "users" +) + +func newTestHandler(pub inboxPublisher, target targetStore, lookup migration.SourceLookup) *handler { + return &handler{ + siteID: testSiteID, + allSiteIDs: []string{testSiteID, "s2"}, + roomsColl: roomsColl, + subsColl: subsColl, + threadSubsColl: threadSubColl, + usersColl: usersColl, + pub: pub, + target: target, + lookups: map[string]migration.SourceLookup{ + roomsColl: lookup, + subsColl: lookup, + threadSubColl: lookup, + usersColl: lookup, + "": lookup, // bare resolveDoc tests that don't set Collection + }, + now: func() int64 { return 1700000000000 }, + } +} + +func TestSiteIDFromOrigin(t *testing.T) { + tests := []struct { + name string + origin string + want string + }{ + {"absent → deployment site", "", testSiteID}, + {"literal local → deployment site", "local", testSiteID}, + {"federated domain → first label", "0030204.tchat-test.test.company.com", "0030204"}, + {"no dot → whole origin", "0030204", "0030204"}, + } + for _, tc := range tests { + 
t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, siteIDFromOrigin(tc.origin, testSiteID)) + }) + } +} + +func TestHandle_Dispatch(t *testing.T) { + t.Run("subscriptions collection routes to handleSubscription", func(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + doc := `{"_id":"sub1","u":{"_id":"u1","username":"alice"},"rid":"r1","t":"c","fname":"General","open":true}` + err := h.handle(context.Background(), subEv("insert", doc, "")) + require.NoError(t, err) + assert.NotEmpty(t, pub.events) + }) + + t.Run("thread-subs routes to handleThreadSub (poison is branch-specific)", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{}) + // Empty parentMessage._id poisons ONLY on the thread-sub branch; the default-skip would + // return ErrSkipped, so ErrPoison proves the event actually reached handleThreadSub. + err := h.handle(context.Background(), oplogEvent{Op: "insert", Collection: threadSubColl, + FullDocument: json.RawMessage(`{"_id":"ts1","u":{"username":"alice"},"parentMessage":{"_id":""}}`)}) + assert.ErrorIs(t, err, migration.ErrPoison) + }) + + t.Run("unknown collection skipped", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{}) + err := h.handle(context.Background(), oplogEvent{Op: "insert", Collection: "other"}) + assert.ErrorIs(t, err, migration.ErrSkipped) + }) + + t.Run("rooms collection routes to handleRoom", func(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + doc := `{"_id":"r1","t":"c","fname":"General","uids":["u1"]}` + err := h.handle(context.Background(), roomEv("insert", doc, "")) + require.NoError(t, err) + assert.Len(t, pub.events, 1) + }) + + t.Run("users collection routes to handleUser", func(t *testing.T) { + target := &fakeTarget{inserted: true} + h := newTestHandler(&fakePublisher{}, target, &fakeLookup{}) + err := h.handle(context.Background(), 
userEv("insert", `{"_id":"u1","username":"alice"}`)) + require.NoError(t, err) + assert.Len(t, target.upserted, 1) + }) +} + +func TestHandleRoom_PublishError(t *testing.T) { + pub := &fakePublisher{err: errors.New("inbox down")} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + doc := `{"_id":"r1","t":"c","fname":"General","uids":["u1"]}` + err := h.handleRoom(context.Background(), roomEv("insert", doc, "")) + require.Error(t, err) + assert.NotErrorIs(t, err, migration.ErrSkipped) + assert.NotErrorIs(t, err, migration.ErrPoison, "a transient publish failure must Nak, not poison") +} + +func TestHandleUser_UpsertError(t *testing.T) { + target := &fakeTarget{err: errors.New("mongo down")} + h := newTestHandler(&fakePublisher{}, target, &fakeLookup{}) + err := h.handleUser(context.Background(), userEv("insert", `{"_id":"u1","username":"alice"}`)) + require.Error(t, err) + assert.NotErrorIs(t, err, migration.ErrSkipped) + assert.NotErrorIs(t, err, migration.ErrPoison, "a transient upsert failure must Nak, not poison") +} + +func TestResolveDoc(t *testing.T) { + full := json.RawMessage(`{"_id":"r1"}`) + + t.Run("insert carries full doc inline", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{}) + doc, skip, err := h.resolveDoc(context.Background(), oplogEvent{Op: "insert", FullDocument: full}) + require.NoError(t, err) + assert.False(t, skip) + assert.JSONEq(t, string(full), string(doc)) + }) + + t.Run("insert without fullDocument is poison", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{}) + _, _, err := h.resolveDoc(context.Background(), oplogEvent{Op: "insert"}) + assert.ErrorIs(t, err, migration.ErrPoison) + }) + + t.Run("update re-reads via lookup", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{doc: full}) + doc, skip, err := h.resolveDoc(context.Background(), oplogEvent{ + Op: "update", DocumentKey: json.RawMessage(`{"_id":"r1"}`), 
+ }) + require.NoError(t, err) + assert.False(t, skip) + assert.JSONEq(t, string(full), string(doc)) + }) + + t.Run("update lookup miss → skip", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{doc: nil}) + _, skip, err := h.resolveDoc(context.Background(), oplogEvent{ + Op: "update", DocumentKey: json.RawMessage(`{"_id":"r1"}`), + }) + require.NoError(t, err) + assert.True(t, skip) + }) + + t.Run("update lookup error → transient (Nak)", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{err: errors.New("source down")}) + _, _, err := h.resolveDoc(context.Background(), oplogEvent{ + Op: "update", DocumentKey: json.RawMessage(`{"_id":"r1"}`), + }) + require.Error(t, err) + assert.NotErrorIs(t, err, migration.ErrPoison) + assert.NotErrorIs(t, err, migration.ErrSkipped) + }) + + t.Run("update for collection with no source lookup → poison", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{doc: full}) + // A collection the lookups map doesn't know about (filter subjects/map disagree). 
+ _, _, err := h.resolveDoc(context.Background(), oplogEvent{ + Op: "update", Collection: "unwatched", DocumentKey: json.RawMessage(`{"_id":"r1"}`), + }) + assert.ErrorIs(t, err, migration.ErrPoison) + }) + + t.Run("update with bad documentKey → poison", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{}) + _, _, err := h.resolveDoc(context.Background(), oplogEvent{ + Op: "update", DocumentKey: json.RawMessage(`{}`), + }) + assert.ErrorIs(t, err, migration.ErrPoison) + }) + + t.Run("delete → skip", func(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{}) + _, skip, err := h.resolveDoc(context.Background(), oplogEvent{Op: "delete"}) + require.NoError(t, err) + assert.True(t, skip) + }) +} diff --git a/data-migration/oplog-collections-transformer/inboxpublisher.go b/data-migration/oplog-collections-transformer/inboxpublisher.go new file mode 100644 index 000000000..4fba9c007 --- /dev/null +++ b/data-migration/oplog-collections-transformer/inboxpublisher.go @@ -0,0 +1,35 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/subject" +) + +// jetstreamPublisher publishes InboxEvents into the local INBOX stream. +type jetstreamPublisher struct { + publish func(ctx context.Context, msg *nats.Msg, opts ...jetstream.PublishOpt) (*jetstream.PubAck, error) +} + +// Publish emits one InboxEvent onto the INBOX external lane, blocking on the pub-ack. The request +// id flows from ctx into the message headers (natsutil.NewMsg) so transformer→inbox-worker shares it. +// +//nolint:gocritic // model.InboxEvent passed by value: one per migrated record, off the hot path. 
+func (p *jetstreamPublisher) Publish(ctx context.Context, evt model.InboxEvent) error { + data, err := json.Marshal(evt) + if err != nil { + return fmt.Errorf("marshal inbox event: %w", err) + } + m := natsutil.NewMsg(ctx, subject.InboxExternal(evt.DestSiteID, evt.Type), data) + if _, err := p.publish(ctx, m); err != nil { + return fmt.Errorf("publish inbox external: %w", err) + } + return nil +} diff --git a/data-migration/oplog-collections-transformer/inboxpublisher_test.go b/data-migration/oplog-collections-transformer/inboxpublisher_test.go new file mode 100644 index 000000000..f99ab394a --- /dev/null +++ b/data-migration/oplog-collections-transformer/inboxpublisher_test.go @@ -0,0 +1,75 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "testing" + + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/subject" +) + +func TestJetstreamPublisher_Publish(t *testing.T) { + const site = "s1" + + t.Run("publishes event to correct subject with correct data", func(t *testing.T) { + var captured *nats.Msg + pub := &jetstreamPublisher{ + publish: func(_ context.Context, msg *nats.Msg, _ ...jetstream.PublishOpt) (*jetstream.PubAck, error) { + captured = msg + return &jetstream.PubAck{}, nil + }, + } + + // Distinct SiteID (home) vs DestSiteID (routing) so a source/dest mix-up in subject + // routing would be caught — the subject must use DestSiteID. 
+ evt := model.InboxEvent{ + Type: "room_sync", + SiteID: "home-site", + DestSiteID: site, + Payload: []byte(`{"id":"r1"}`), + Timestamp: 1700000000000, + } + + err := pub.Publish(context.Background(), evt) + require.NoError(t, err) + require.NotNil(t, captured) + + wantSubject := subject.InboxExternal(evt.DestSiteID, "room_sync") + assert.Equal(t, wantSubject, captured.Subject, "subject must route on DestSiteID, not SiteID") + + var got model.InboxEvent + require.NoError(t, json.Unmarshal(captured.Data, &got)) + assert.Equal(t, evt.Type, got.Type) + assert.Equal(t, evt.SiteID, got.SiteID) + assert.Equal(t, evt.DestSiteID, got.DestSiteID) + assert.Equal(t, evt.Timestamp, got.Timestamp) + assert.JSONEq(t, string(evt.Payload), string(got.Payload)) + }) + + t.Run("publish error propagates as wrapped error", func(t *testing.T) { + publishErr := errors.New("nats down") + pub := &jetstreamPublisher{ + publish: func(_ context.Context, _ *nats.Msg, _ ...jetstream.PublishOpt) (*jetstream.PubAck, error) { + return nil, publishErr + }, + } + + evt := model.InboxEvent{ + Type: "room_sync", + SiteID: site, + Timestamp: 1700000000000, + } + + err := pub.Publish(context.Background(), evt) + require.Error(t, err) + assert.ErrorContains(t, err, "publish inbox external") + assert.ErrorIs(t, err, publishErr) + }) +} diff --git a/data-migration/oplog-collections-transformer/integration_test.go b/data-migration/oplog-collections-transformer/integration_test.go new file mode 100644 index 000000000..d8b9482c8 --- /dev/null +++ b/data-migration/oplog-collections-transformer/integration_test.go @@ -0,0 +1,488 @@ +//go:build integration + +package main + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/nats-io/nats.go/jetstream" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/Marz32onE/instrumentation-go/otel-nats/oteljetstream" + + "github.com/hmchangw/chat/pkg/migration" + 
"github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/stream" + "github.com/hmchangw/chat/pkg/subject" + "github.com/hmchangw/chat/pkg/testutil" +) + +func TestMain(m *testing.M) { testutil.RunTests(m) } + +// -------------------------------------------------------------------------- +// Target store tests (real Mongo via testutil.MongoDB) +// -------------------------------------------------------------------------- + +func TestTargetStore_UpsertUserIfAbsent(t *testing.T) { + ctx := context.Background() + db := testutil.MongoDB(t, "tgt") + s := NewMongoTargetStore(db) + require.NoError(t, s.EnsureIndexes(ctx)) + + u := model.User{ + ID: "userabc123", + Account: "alice", + EngName: "Alice A", + ChineseName: "愛麗絲", + SiteID: "site1", + } + + // First call — must insert. + inserted, err := s.UpsertUserIfAbsent(ctx, u) + require.NoError(t, err) + assert.True(t, inserted, "first upsert must create the doc") + + // Second call with a different user but same account — must NOT overwrite. + u2 := model.User{ + ID: "differentid", + Account: "alice", // same account → filter matches existing + EngName: "Someone Else", + ChineseName: "別人", + SiteID: "site2", + } + inserted2, err := s.UpsertUserIfAbsent(ctx, u2) + require.NoError(t, err) + assert.False(t, inserted2, "second upsert for same account must not insert") + + // Verify the stored doc is unchanged (first one wins). + storedID, found, err := s.FindUserID(ctx, "alice") + require.NoError(t, err) + require.True(t, found) + assert.Equal(t, u.ID, storedID, "stored user id must be the first-inserted one") + + // Confirm the full doc still has the original engName. 
+ var got model.User + require.NoError(t, db.Collection("users").FindOne(ctx, bson.M{"account": "alice"}).Decode(&got)) + assert.Equal(t, "Alice A", got.EngName, "$setOnInsert must not overwrite existing fields") +} + +func TestTargetStore_FindUserID(t *testing.T) { + ctx := context.Background() + db := testutil.MongoDB(t, "tgt") + s := NewMongoTargetStore(db) + require.NoError(t, s.EnsureIndexes(ctx)) + + // Missing → found==false, no error. + id, found, err := s.FindUserID(ctx, "nobody") + require.NoError(t, err) + assert.False(t, found) + assert.Empty(t, id) + + // Insert then find. + u := model.User{ID: "uid123abc", Account: "bob", SiteID: "site1"} + inserted, err := s.UpsertUserIfAbsent(ctx, u) + require.NoError(t, err) + require.True(t, inserted) + + id2, found2, err := s.FindUserID(ctx, "bob") + require.NoError(t, err) + assert.True(t, found2) + assert.Equal(t, "uid123abc", id2) +} + +func TestTargetStore_FindThreadRoom(t *testing.T) { + ctx := context.Background() + db := testutil.MongoDB(t, "tgt") + s := NewMongoTargetStore(db) + + // Missing → found==false, no error. + roomID, trID, siteID, found, err := s.FindThreadRoom(ctx, "nonexistent_parent") + require.NoError(t, err) + assert.False(t, found) + assert.Empty(t, roomID) + assert.Empty(t, trID) + assert.Empty(t, siteID) + + // Insert a thread room doc directly, then find it by parentMessageID. 
+ tr := model.ThreadRoom{ + ID: "tr1", + ParentMessageID: "msg1", + RoomID: "room1", + SiteID: "site1", + CreatedAt: time.Now().UTC(), + UpdatedAt: time.Now().UTC(), + } + _, err = db.Collection("thread_rooms").InsertOne(ctx, tr) + require.NoError(t, err) + + gotRoomID, gotTRID, gotSiteID, gotFound, err := s.FindThreadRoom(ctx, "msg1") + require.NoError(t, err) + assert.True(t, gotFound) + assert.Equal(t, "room1", gotRoomID) + assert.Equal(t, "tr1", gotTRID) + assert.Equal(t, "site1", gotSiteID) +} + +// -------------------------------------------------------------------------- +// Inbox publisher round-trip (real NATS via testutil.NATS) +// -------------------------------------------------------------------------- + +func TestInboxPublisher_RoundTrip(t *testing.T) { + const site = "site1rt" + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + nc, err := natsutil.Connect(testutil.NATS(t), "") + require.NoError(t, err) + defer func() { assert.NoError(t, nc.Drain()) }() + + js, err := oteljetstream.New(nc) + require.NoError(t, err) + + // Create the INBOX stream so publishes are captured. + inboxCfg := stream.Inbox(site) + _, err = js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{ + Name: inboxCfg.Name, + Subjects: inboxCfg.Subjects, + }) + require.NoError(t, err) + + pub := &jetstreamPublisher{publish: js.PublishMsg} + + evt := model.InboxEvent{ + Type: "room_sync", + SiteID: site, + DestSiteID: site, + Payload: []byte(`{"id":"r1","type":"channel"}`), + Timestamp: 1700000000000, + } + require.NoError(t, pub.Publish(ctx, evt)) + + // Create a consumer filtered to the INBOX external lane for room_sync — the lane Publish targets. 
+ cons, err := js.CreateOrUpdateConsumer(ctx, inboxCfg.Name, jetstream.ConsumerConfig{ + AckPolicy: jetstream.AckExplicitPolicy, + FilterSubjects: []string{subject.InboxExternal(site, "room_sync")}, + }) + require.NoError(t, err) + + var got jetstream.Msg + require.Eventually(t, func() bool { + batch, berr := cons.Fetch(1, jetstream.FetchMaxWait(500*time.Millisecond)) + if berr != nil { + return false + } + for msg := range batch.Messages() { + assert.NoError(t, msg.Ack()) + got = msg + return true + } + return false + }, 30*time.Second, 250*time.Millisecond, "room_sync event must land on INBOX") + + require.NotNil(t, got) + assert.Equal(t, subject.InboxExternal(site, "room_sync"), got.Subject()) + + var gotEvt model.InboxEvent + require.NoError(t, json.Unmarshal(got.Data(), &gotEvt)) + assert.Equal(t, evt.Type, gotEvt.Type) + assert.Equal(t, evt.SiteID, gotEvt.SiteID) + assert.Equal(t, evt.DestSiteID, gotEvt.DestSiteID) + assert.Equal(t, evt.Timestamp, gotEvt.Timestamp) + assert.JSONEq(t, string(evt.Payload), string(gotEvt.Payload)) +} + +// -------------------------------------------------------------------------- +// End-to-end handler tests (real Mongo source+target + real NATS) +// -------------------------------------------------------------------------- + +func TestEndToEnd_UserInsert(t *testing.T) { + const site = "site1ue" + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + srcDB := testutil.MongoDB(t, "src") + tgtDB := testutil.MongoDB(t, "tgt") + + // Source lookup via the real migration package. + srcColl := srcDB.Collection("users") + lookup := migration.NewMongoSourceLookup(srcColl) + + // Seed a source users doc. 
+ const srcID = "user001" + _, err := srcColl.InsertOne(ctx, bson.M{ + "_id": srcID, + "username": "charlie", + "type": "user", + "customFields": bson.M{ + "engName": "Charlie C", + "tsmcName": "查理", + "deptId": "dept1", + "deptName": "Engineering", + }, + }) + require.NoError(t, err) + + // Read back as relaxed extJSON — the shape the connector emits. + fullDoc, err := lookup.FindByID(ctx, srcID) + require.NoError(t, err) + require.NotEmpty(t, fullDoc) + + nc, err := natsutil.Connect(testutil.NATS(t), "") + require.NoError(t, err) + defer func() { assert.NoError(t, nc.Drain()) }() + js, err := oteljetstream.New(nc) + require.NoError(t, err) + + tgtStore := NewMongoTargetStore(tgtDB) + require.NoError(t, tgtStore.EnsureIndexes(ctx)) + + h := &handler{ + siteID: site, + roomsColl: "rocketchat_rooms", + subsColl: "rocketchat_subscriptions", + threadSubsColl: "tsmc_thread_subscriptions", + usersColl: "users", + pub: &jetstreamPublisher{publish: js.PublishMsg}, + target: tgtStore, + lookups: map[string]migration.SourceLookup{"users": lookup}, + now: func() int64 { return 1700000000000 }, + } + + require.NoError(t, h.handle(ctx, oplogEvent{ + Collection: "users", + Op: "insert", + FullDocument: fullDoc, + })) + + // Assert the user now exists in the target users collection. 
+ var got model.User + err = tgtDB.Collection("users").FindOne(ctx, bson.M{"account": "charlie"}).Decode(&got) + require.NoError(t, err, "user must be present in target users collection") + assert.Equal(t, "charlie", got.Account) + assert.Equal(t, "Charlie C", got.EngName) + assert.Equal(t, "查理", got.ChineseName) + assert.Equal(t, site, got.SiteID) +} + +func TestEndToEnd_RoomInsert_PublishesRoomSync(t *testing.T) { + const site = "site1ri" + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + srcDB := testutil.MongoDB(t, "src") + tgtDB := testutil.MongoDB(t, "tgt") + srcColl := srcDB.Collection("rocketchat_rooms") + lookup := migration.NewMongoSourceLookup(srcColl) + + const roomSrcID = "room001" + _, err := srcColl.InsertOne(ctx, bson.M{ + "_id": roomSrcID, + "t": "c", + "fname": "General", + "name": "general", + "uids": bson.A{"u1", "u2"}, + }) + require.NoError(t, err) + + fullDoc, err := lookup.FindByID(ctx, roomSrcID) + require.NoError(t, err) + require.NotEmpty(t, fullDoc) + + nc, err := natsutil.Connect(testutil.NATS(t), "") + require.NoError(t, err) + defer func() { assert.NoError(t, nc.Drain()) }() + js, err := oteljetstream.New(nc) + require.NoError(t, err) + + // Create the INBOX stream so the room_sync publish is captured. 
+ inboxCfg := stream.Inbox(site) + _, err = js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{ + Name: inboxCfg.Name, + Subjects: inboxCfg.Subjects, + }) + require.NoError(t, err) + + tgtStore := NewMongoTargetStore(tgtDB) + require.NoError(t, tgtStore.EnsureIndexes(ctx)) + + h := &handler{ + siteID: site, + roomsColl: "rocketchat_rooms", + subsColl: "rocketchat_subscriptions", + threadSubsColl: "tsmc_thread_subscriptions", + usersColl: "users", + pub: &jetstreamPublisher{publish: js.PublishMsg}, + target: tgtStore, + lookups: map[string]migration.SourceLookup{"rocketchat_rooms": lookup}, + now: func() int64 { return 1700000000000 }, + } + + require.NoError(t, h.handle(ctx, oplogEvent{ + Collection: "rocketchat_rooms", + Op: "insert", + FullDocument: fullDoc, + })) + + // Fetch the room_sync event off INBOX. + cons, err := js.CreateOrUpdateConsumer(ctx, inboxCfg.Name, jetstream.ConsumerConfig{ + AckPolicy: jetstream.AckExplicitPolicy, + FilterSubjects: []string{subject.InboxExternal(site, "room_sync")}, + }) + require.NoError(t, err) + + var got jetstream.Msg + require.Eventually(t, func() bool { + batch, berr := cons.Fetch(1, jetstream.FetchMaxWait(500*time.Millisecond)) + if berr != nil { + return false + } + for msg := range batch.Messages() { + assert.NoError(t, msg.Ack()) + got = msg + return true + } + return false + }, 30*time.Second, 250*time.Millisecond, "room_sync event must land on INBOX") + + require.NotNil(t, got) + + var evt model.InboxEvent + require.NoError(t, json.Unmarshal(got.Data(), &evt)) + assert.Equal(t, model.InboxEventType("room_sync"), evt.Type) + assert.Equal(t, site, evt.SiteID) + + var room model.Room + require.NoError(t, json.Unmarshal(evt.Payload, &room)) + assert.Equal(t, roomSrcID, room.ID) + assert.Equal(t, model.RoomTypeChannel, room.Type) +} + +func TestEndToEnd_ThreadSub_NakThenResolve(t *testing.T) { + const site = "site1ts" + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + srcDB 
:= testutil.MongoDB(t, "src") + tgtDB := testutil.MongoDB(t, "tgt") + srcColl := srcDB.Collection("tsmc_thread_subscriptions") + lookup := migration.NewMongoSourceLookup(srcColl) + + // Seed a source thread sub doc. + const tsubSrcID = "tsub001" + now := time.Now().UTC().Truncate(time.Millisecond) + _, err := srcColl.InsertOne(ctx, bson.M{ + "_id": tsubSrcID, + "u": bson.M{ + "_id": "user001", + "username": "dave", + }, + "rid": "room1", + "parentMessage": bson.M{ + "_id": "msg_parent_1", + }, + "lastSeenAt": now, + "unreadMention": 0, + "createdAt": now, + }) + require.NoError(t, err) + + fullDoc, err := lookup.FindByID(ctx, tsubSrcID) + require.NoError(t, err) + require.NotEmpty(t, fullDoc) + + nc, err := natsutil.Connect(testutil.NATS(t), "") + require.NoError(t, err) + defer func() { assert.NoError(t, nc.Drain()) }() + js, err := oteljetstream.New(nc) + require.NoError(t, err) + + // Create the INBOX stream. + inboxCfg := stream.Inbox(site) + _, err = js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{ + Name: inboxCfg.Name, + Subjects: inboxCfg.Subjects, + }) + require.NoError(t, err) + + tgtStore := NewMongoTargetStore(tgtDB) + require.NoError(t, tgtStore.EnsureIndexes(ctx)) + + h := &handler{ + siteID: site, + roomsColl: "rocketchat_rooms", + subsColl: "rocketchat_subscriptions", + threadSubsColl: "tsmc_thread_subscriptions", + usersColl: "users", + pub: &jetstreamPublisher{publish: js.PublishMsg}, + target: tgtStore, + lookups: map[string]migration.SourceLookup{"tsmc_thread_subscriptions": lookup}, + now: func() int64 { return 1700000000000 }, + } + + // Phase 1: thread_room and user both absent → transient error (Nak). 
+ err = h.handle(ctx, oplogEvent{ + Collection: "tsmc_thread_subscriptions", + Op: "insert", + FullDocument: fullDoc, + }) + require.Error(t, err, "missing FK must return a transient error") + assert.NotErrorIs(t, err, migration.ErrSkipped, "must not be skipped") + assert.NotErrorIs(t, err, migration.ErrPoison, "must not be poison") + + // Phase 2: seed the thread_room and user into the target, re-run → success + INBOX event. + tr := model.ThreadRoom{ + ID: "tr_parent_1", + ParentMessageID: "msg_parent_1", + RoomID: "room1", + SiteID: site, + CreatedAt: now, + UpdatedAt: now, + } + _, err = tgtDB.Collection("thread_rooms").InsertOne(ctx, tr) + require.NoError(t, err) + + u := model.User{ID: "dave_uid", Account: "dave", SiteID: site} + inserted, err := tgtStore.UpsertUserIfAbsent(ctx, u) + require.NoError(t, err) + require.True(t, inserted) + + err = h.handle(ctx, oplogEvent{ + Collection: "tsmc_thread_subscriptions", + Op: "insert", + FullDocument: fullDoc, + }) + require.NoError(t, err, "after FKs are seeded, handle must succeed") + + // Confirm the thread_subscription_upserted event landed on INBOX. 
+ cons, err := js.CreateOrUpdateConsumer(ctx, inboxCfg.Name, jetstream.ConsumerConfig{ + AckPolicy: jetstream.AckExplicitPolicy, + FilterSubjects: []string{subject.InboxExternal(site, string(model.InboxThreadSubscriptionUpserted))}, + }) + require.NoError(t, err) + + var got jetstream.Msg + require.Eventually(t, func() bool { + batch, berr := cons.Fetch(1, jetstream.FetchMaxWait(500*time.Millisecond)) + if berr != nil { + return false + } + for msg := range batch.Messages() { + assert.NoError(t, msg.Ack()) + got = msg + return true + } + return false + }, 30*time.Second, 250*time.Millisecond, "thread_subscription_upserted event must land on INBOX") + + require.NotNil(t, got) + var outboxEvt model.InboxEvent + require.NoError(t, json.Unmarshal(got.Data(), &outboxEvt)) + assert.Equal(t, model.InboxThreadSubscriptionUpserted, outboxEvt.Type) + assert.Equal(t, site, outboxEvt.SiteID) +} diff --git a/data-migration/oplog-collections-transformer/main.go b/data-migration/oplog-collections-transformer/main.go new file mode 100644 index 000000000..0bff680e4 --- /dev/null +++ b/data-migration/oplog-collections-transformer/main.go @@ -0,0 +1,343 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "net" + "net/http" + "os" + "strings" + "time" + + "github.com/nats-io/nats.go/jetstream" + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.mongodb.org/mongo-driver/v2/mongo/options" + "go.mongodb.org/mongo-driver/v2/mongo/readpref" + + "github.com/Marz32onE/instrumentation-go/otel-nats/oteljetstream" + + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/mongoutil" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/otelutil" + "github.com/hmchangw/chat/pkg/shutdown" + "github.com/hmchangw/chat/pkg/stream" + "github.com/hmchangw/chat/pkg/subject" +) + +func main() { + cfg, err := parseConfig() + if err != nil { + slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil))) + 
slog.Error("parse config", "error", err) + os.Exit(1) + } + slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: parseLevel(cfg.LogLevel)}))) + + // Surface an empty ALL_SITE_IDS once at startup: user statusText changes won't propagate + // (publishUserStatus skips with a per-event metric). Legitimate for a rooms/subs-only partial + // deployment; a misconfig otherwise. (Future: promote to a hard-fail once the modes are known.) + if !hasDestinationSite(cfg.AllSiteIDs) { + slog.Warn("ALL_SITE_IDS is empty — user status fan-out is disabled (intentional only for a partial deployment)") + } + + ctx := context.Background() + + tracerShutdown, err := otelutil.InitTracer(ctx, "oplog-collections-transformer") + if err != nil { + slog.Error("init tracer failed", "error", err) + os.Exit(1) + } + meterShutdown, err := otelutil.InitMeter("oplog-collections-transformer") + if err != nil { + slog.Error("init meter failed", "error", err) + os.Exit(1) + } + m, err := newMetrics() + if err != nil { + slog.Error("init metrics failed", "error", err) + os.Exit(1) + } + + // Bind synchronously so a port conflict fails startup loudly rather than + // running blind — observability is the stall signal for this single-replica pump. + metricsServer := newMetricsServer() + ln, err := net.Listen("tcp", cfg.MetricsAddr) + if err != nil { + slog.Error("metrics listen failed", "addr", cfg.MetricsAddr, "error", err) + os.Exit(1) + } + go func() { + slog.Info("metrics+health server listening", "addr", cfg.MetricsAddr) + if err := metricsServer.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) { + slog.Error("metrics server failed", "error", err) + } + }() + + // Source legacy Mongo: re-read full current docs by _id on update events. 
+ source, err := mongoutil.Connect(ctx, cfg.SourceMongoURI, cfg.SourceUsername, cfg.SourcePassword) + if err != nil { + slog.Error("source mongo connect failed", "error", err) + os.Exit(1) + } + rp, err := readPreference(cfg.SourceReadPreference) + if err != nil { + slog.Error("read preference invalid", "error", err) + mongoutil.Disconnect(ctx, source) + os.Exit(1) + } + sourceDB := source.Database(cfg.SourceDB) + lookups := map[string]migration.SourceLookup{ + cfg.RoomsCollection: migration.NewMongoSourceLookup(sourceDB.Collection(cfg.RoomsCollection, options.Collection().SetReadPreference(rp))), + cfg.SubscriptionsCollection: migration.NewMongoSourceLookup(sourceDB.Collection(cfg.SubscriptionsCollection, options.Collection().SetReadPreference(rp))), + cfg.ThreadSubsCollection: migration.NewMongoSourceLookup(sourceDB.Collection(cfg.ThreadSubsCollection, options.Collection().SetReadPreference(rp))), + cfg.UsersCollection: migration.NewMongoSourceLookup(sourceDB.Collection(cfg.UsersCollection, options.Collection().SetReadPreference(rp))), + } + + // Target new-stack per-site Mongo: user insert-if-absent + thread_room/user FK resolution. 
+ targetClient, err := mongoutil.Connect(ctx, cfg.TargetMongoURI, cfg.TargetUsername, cfg.TargetPassword) + if err != nil { + slog.Error("target mongo connect failed", "error", err) + mongoutil.Disconnect(ctx, source) + os.Exit(1) + } + target := NewMongoTargetStore(targetClient.Database(cfg.TargetDB)) + if err := target.EnsureIndexes(ctx); err != nil { + slog.Error("ensure target indexes failed", "error", err) + mongoutil.Disconnect(ctx, targetClient) + mongoutil.Disconnect(ctx, source) + os.Exit(1) + } + + nc, err := natsutil.Connect(cfg.NatsURL, cfg.NatsCredsFile) + if err != nil { + slog.Error("nats connect failed", "error", err) + mongoutil.Disconnect(ctx, targetClient) + mongoutil.Disconnect(ctx, source) + os.Exit(1) + } + js, err := oteljetstream.New(nc) + if err != nil { + slog.Error("jetstream init failed", "error", err) + _ = nc.Drain() + mongoutil.Disconnect(ctx, targetClient) + mongoutil.Disconnect(ctx, source) + os.Exit(1) + } + + if err := bootstrapStreams(ctx, js, cfg.SiteID, cfg.Bootstrap.Enabled); err != nil { + slog.Error("bootstrap streams failed", "error", err) + _ = nc.Drain() + mongoutil.Disconnect(ctx, targetClient) + mongoutil.Disconnect(ctx, source) + os.Exit(1) + } + + h := &handler{ + siteID: cfg.SiteID, + allSiteIDs: cfg.AllSiteIDs, + roomsColl: cfg.RoomsCollection, + subsColl: cfg.SubscriptionsCollection, + threadSubsColl: cfg.ThreadSubsCollection, + usersColl: cfg.UsersCollection, + pub: &jetstreamPublisher{publish: js.PublishMsg}, + target: target, + lookups: lookups, + metrics: m, + now: nowMs, + } + + streamName := stream.MigrationOplog(cfg.SiteID).Name + // The connector owns MIGRATION_OPLOG and may bootstrap it slightly after we start. + // Wait for the stream to appear rather than crash-loop on "stream not found". 
+ cons, err := createConsumerWithRetry(ctx, js, streamName, jetstream.ConsumerConfig{ + Durable: cfg.ConsumerDurable, + AckPolicy: jetstream.AckExplicitPolicy, + DeliverPolicy: jetstream.DeliverAllPolicy, + MaxDeliver: cfg.MaxDeliver, + FilterSubjects: []string{ + subject.MigrationOplog(cfg.SiteID, cfg.RoomsCollection, "*"), + subject.MigrationOplog(cfg.SiteID, cfg.SubscriptionsCollection, "*"), + subject.MigrationOplog(cfg.SiteID, cfg.ThreadSubsCollection, "*"), + subject.MigrationOplog(cfg.SiteID, cfg.UsersCollection, "*"), + }, + }) + if err != nil { + slog.Error("create consumer failed", "stream", streamName, "error", err) + _ = nc.Drain() + mongoutil.Disconnect(ctx, targetClient) + mongoutil.Disconnect(ctx, source) + os.Exit(1) + } + + cc, err := cons.Consume(func(msg oteljetstream.Msg) { + processOne(msg.Context(), h, msg, m, cfg.MaxDeliver, cfg.DeleteMaxDeliver) + }) + if err != nil { + slog.Error("consume failed", "stream", streamName, "error", err) + _ = nc.Drain() + mongoutil.Disconnect(ctx, targetClient) + mongoutil.Disconnect(ctx, source) + os.Exit(1) + } + + slog.Info("oplog-collections-transformer started", "site", cfg.SiteID, "stream", streamName) + + // Ordered, timeout-bounded cleanup: + // stop consume → metrics/health → observability → NATS drain → Mongo (target then source). 
+ shutdown.Wait(ctx, 25*time.Second, + func(context.Context) error { cc.Stop(); return nil }, + func(ctx context.Context) error { return metricsServer.Shutdown(ctx) }, + func(ctx context.Context) error { return tracerShutdown(ctx) }, + func(ctx context.Context) error { return meterShutdown(ctx) }, + func(context.Context) error { return nc.Drain() }, + func(ctx context.Context) error { mongoutil.Disconnect(ctx, targetClient); return nil }, + func(ctx context.Context) error { mongoutil.Disconnect(ctx, source); return nil }, + ) +} + +// processOne decodes one event and maps its outcome to a JetStream disposition: Ack on success, +// Term on poison, Nak-with-delay on transient up to maxDeliver, then Term-with-metric (not silent drop). +func processOne(ctx context.Context, h *handler, m jetstream.Msg, mtr *metrics, maxDeliver, deleteMaxDeliver int) { + // Stamp a correlation id once at entry; it flows via ctx into the inbox publish + // (read from ctx through natsutil.NewMsg), so transformer→inbox-worker shares one request_id. + ctx, reqID := natsutil.StampRequestID(ctx, m.Headers(), m.Subject()) + // dispose runs a JetStream ack/term/nak and logs (rather than silently drops) any failure — + // the message will redeliver, but a failing disposition signals a NATS-health problem worth seeing. + dispose := func(action string, fn func() error) { + if derr := fn(); derr != nil { + slog.Error("jetstream disposition failed", "action", action, "error", derr, "request_id", reqID) + } + } + var ev oplogEvent + if err := json.Unmarshal(m.Data(), &ev); err != nil { + slog.Error("decode oplog event — term", "error", err, "request_id", reqID) + mtr.onTerm(ctx, "unknown", "unknown") + dispose("term", m.Term) + return + } + // Hard deletes get a shorter cap: a foreign-origin one can't be recognised (no doc) and would + // otherwise churn to the global MaxDeliver; the local race needs only seconds to converge. 
+ deliverCap := maxDeliver + if ev.Op == "delete" { + deliverCap = deleteMaxDeliver + } + // Resolve delivery count; a Metadata error prefers Nak over a premature Term. + var numDelivered uint64 + if meta, err := m.Metadata(); err == nil { + numDelivered = meta.NumDelivered + } + isFinal := migration.IsFinalDelivery(numDelivered, deliverCap) + err := h.handle(ctx, ev) + switch migration.Classify(err, isFinal) { + case migration.ActionAck: + mtr.onProcessed(ctx, ev.Op, ev.Collection) + dispose("ack", m.Ack) + case migration.ActionTerm: + slog.Error("poison event — term (skipping)", "eventId", ev.EventID, "error", err, "request_id", reqID) + mtr.onTerm(ctx, ev.Op, ev.Collection) + dispose("term", m.Term) + case migration.ActionAckSkip: + // A deliberate skip — already metered via onSkipped by the handler. Ack but DON'T count + // it as processed (that would double-count the same event). + dispose("ack", m.Ack) + case migration.ActionTermExhausted: + // A further Nak would hit the cap and be silently dropped by JetStream. + // Term it explicitly so the give-up is logged + metered instead of vanishing. + slog.Error("delivery limit reached — terming (dropping)", "eventId", ev.EventID, "op", ev.Op, "cap", deliverCap, "error", err, "request_id", reqID) + mtr.onExhausted(ctx, ev.Op, ev.Collection) + dispose("term", m.Term) + default: + slog.Error("transient failure — nak", "eventId", ev.EventID, "error", err, "request_id", reqID) + mtr.onNak(ctx, ev.Op, ev.Collection) + dispose("nak", func() error { return m.NakWithDelay(2 * time.Second) }) + } +} + +func nowMs() int64 { return time.Now().UTC().UnixMilli() } + +// hasDestinationSite reports whether sites has at least one non-empty entry (a real fan-out target). +func hasDestinationSite(sites []string) bool { + for _, s := range sites { + if s != "" { + return true + } + } + return false +} + +// streamWaitTimeout bounds how long startup waits for the connector to bootstrap MIGRATION_OPLOG. 
+const streamWaitTimeout = 60 * time.Second + +// createConsumerWithRetry creates the durable consumer, retrying while the stream does not yet exist +// (the connector creates it independently); other errors and streamWaitTimeout are returned. +// +//nolint:gocritic // hugeParam: cfg is passed by value to match jetstream.CreateOrUpdateConsumer's signature. +func createConsumerWithRetry(ctx context.Context, js oteljetstream.JetStream, streamName string, cfg jetstream.ConsumerConfig) (oteljetstream.Consumer, error) { + deadline := time.Now().Add(streamWaitTimeout) + for { + cons, err := js.CreateOrUpdateConsumer(ctx, streamName, cfg) + if err == nil { + return cons, nil + } + if !errors.Is(err, jetstream.ErrStreamNotFound) || time.Now().After(deadline) { + return nil, err + } + slog.Warn("waiting for stream to be created by the connector", "stream", streamName) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(2 * time.Second): + } + } +} + +// newMetricsServer builds the /metrics + /healthz HTTP server with timeouts that guard against hung scrapers tying up goroutines. 
+func newMetricsServer() *http.Server { + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.Handler()) + mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) + }) + return &http.Server{ + Handler: mux, + ReadHeaderTimeout: 5 * time.Second, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + IdleTimeout: 60 * time.Second, + } +} + +func readPreference(s string) (*readpref.ReadPref, error) { + switch strings.ToLower(strings.TrimSpace(s)) { + case "primary": + return readpref.Primary(), nil + case "primarypreferred", "": + return readpref.PrimaryPreferred(), nil + case "secondary": + return readpref.Secondary(), nil + case "secondarypreferred": + return readpref.SecondaryPreferred(), nil + case "nearest": + return readpref.Nearest(), nil + default: + return nil, fmt.Errorf("invalid SOURCE_READ_PREFERENCE: %s", s) + } +} + +func parseLevel(s string) slog.Level { + switch strings.ToLower(strings.TrimSpace(s)) { + case "debug": + return slog.LevelDebug + case "warn": + return slog.LevelWarn + case "error": + return slog.LevelError + default: + return slog.LevelInfo + } +} diff --git a/data-migration/oplog-collections-transformer/metrics.go b/data-migration/oplog-collections-transformer/metrics.go new file mode 100644 index 000000000..cd793fe38 --- /dev/null +++ b/data-migration/oplog-collections-transformer/metrics.go @@ -0,0 +1,126 @@ +package main + +import ( + "context" + "fmt" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// metrics holds the transformer's instruments: processed throughput, nak/term/exhausted +// dispositions, user-seed outcome, and FK resolution misses. Nil-safe (tests run without a meter). 
+type metrics struct { + processed metric.Int64Counter + naks metric.Int64Counter + terms metric.Int64Counter + skipped metric.Int64Counter + exhausted metric.Int64Counter + userSeed metric.Int64Counter + resolveMiss metric.Int64Counter +} + +func newMetrics() (*metrics, error) { + m := otel.Meter("oplog-collections-transformer") + processed, err := m.Int64Counter("oplog_collections_transformer_events_processed_total", + metric.WithDescription("oplog events handled and acked, by op")) + if err != nil { + return nil, fmt.Errorf("processed counter: %w", err) + } + naks, err := m.Int64Counter("oplog_collections_transformer_naks_total", + metric.WithDescription("transient failures naked for redelivery, by op")) + if err != nil { + return nil, fmt.Errorf("naks counter: %w", err) + } + terms, err := m.Int64Counter("oplog_collections_transformer_terms_total", + metric.WithDescription("poison/undecodable events termed (never redelivered), by op")) + if err != nil { + return nil, fmt.Errorf("terms counter: %w", err) + } + skipped, err := m.Int64Counter("oplog_collections_transformer_events_skipped_total", + metric.WithDescription("events deliberately skipped (excluded room type, out-of-scope collection etc.), by reason")) + if err != nil { + return nil, fmt.Errorf("skipped counter: %w", err) + } + exhausted, err := m.Int64Counter("oplog_collections_transformer_exhausted_total", + metric.WithDescription("events termed after reaching MaxDeliver (would-be silent JetStream drops), by op")) + if err != nil { + return nil, fmt.Errorf("exhausted counter: %w", err) + } + userSeed, err := m.Int64Counter("oplog_collections_transformer_user_seed_total", + metric.WithDescription("user insert-if-absent seeds, by outcome (insert/present)")) + if err != nil { + return nil, fmt.Errorf("user seed counter: %w", err) + } + resolveMiss, err := m.Int64Counter("oplog_collections_transformer_resolve_miss_total", + metric.WithDescription("foreign-key resolution misses (thread-sub 
user/thread_room), by kind")) + if err != nil { + return nil, fmt.Errorf("resolve miss counter: %w", err) + } + return &metrics{ + processed: processed, naks: naks, terms: terms, skipped: skipped, + exhausted: exhausted, userSeed: userSeed, resolveMiss: resolveMiss, + }, nil +} + +// opCollAttr labels a disposition by op (insert/update/delete) and source collection, so ops can +// see which collection is stuck or poisoning — not just the aggregate op breakdown. +func opCollAttr(op, collection string) metric.MeasurementOption { + return metric.WithAttributes(attribute.String("op", op), attribute.String("collection", collection)) +} + +func (m *metrics) onProcessed(ctx context.Context, op, collection string) { + if m == nil { + return + } + m.processed.Add(ctx, 1, opCollAttr(op, collection)) +} + +func (m *metrics) onNak(ctx context.Context, op, collection string) { + if m == nil { + return + } + m.naks.Add(ctx, 1, opCollAttr(op, collection)) +} + +func (m *metrics) onTerm(ctx context.Context, op, collection string) { + if m == nil { + return + } + m.terms.Add(ctx, 1, opCollAttr(op, collection)) +} + +func (m *metrics) onSkipped(ctx context.Context, reason string) { + if m == nil { + return + } + m.skipped.Add(ctx, 1, metric.WithAttributes(attribute.String("reason", reason))) +} + +// onExhausted records an event termed because it reached MaxDeliver — what would otherwise be a +// silent JetStream-side drop after the redelivery cap. Distinct from poison terms; alert on it. +func (m *metrics) onExhausted(ctx context.Context, op, collection string) { + if m == nil { + return + } + m.exhausted.Add(ctx, 1, opCollAttr(op, collection)) +} + +// onUserSeed records a user insert-if-absent seed, labelled "insert" (a new doc was created) or +// "present" (another sync already owns the account, left untouched). 
+func (m *metrics) onUserSeed(ctx context.Context, outcome string) { + if m == nil { + return + } + m.userSeed.Add(ctx, 1, metric.WithAttributes(attribute.String("outcome", outcome))) +} + +// onResolveMiss records a foreign-key resolution miss for the thread-sub double dependency +// (kind "user" or "thread_room"); used by the later thread-sub mapper to flag Nak retries. +func (m *metrics) onResolveMiss(ctx context.Context, kind string) { + if m == nil { + return + } + m.resolveMiss.Add(ctx, 1, metric.WithAttributes(attribute.String("kind", kind))) +} diff --git a/data-migration/oplog-collections-transformer/metrics_test.go b/data-migration/oplog-collections-transformer/metrics_test.go new file mode 100644 index 000000000..098d06305 --- /dev/null +++ b/data-migration/oplog-collections-transformer/metrics_test.go @@ -0,0 +1,91 @@ +package main + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +func TestNewMetrics(t *testing.T) { + m, err := newMetrics() + require.NoError(t, err) + require.NotNil(t, m) + // Recording must not panic with a real (no-op exporter) meter. 
+ m.onProcessed(context.Background(), "insert", "rocketchat_room") + m.onNak(context.Background(), "update", "rocketchat_room") + m.onTerm(context.Background(), "insert", "rocketchat_room") + m.onSkipped(context.Background(), "other_collection") + m.onExhausted(context.Background(), "update", "rocketchat_room") + m.onUserSeed(context.Background(), "insert") + m.onResolveMiss(context.Background(), "user") +} + +func TestMetrics_NilSafe(t *testing.T) { + var m *metrics // the unit-test case: handler's metrics is nil + require.NotPanics(t, func() { + m.onProcessed(context.Background(), "insert", "rocketchat_room") + m.onNak(context.Background(), "update", "rocketchat_room") + m.onTerm(context.Background(), "insert", "rocketchat_room") + m.onSkipped(context.Background(), "other_collection") + m.onExhausted(context.Background(), "update", "rocketchat_room") + m.onUserSeed(context.Background(), "present") + m.onResolveMiss(context.Background(), "thread_room") + }) +} + +// TestMetrics_DispositionCountersCarryCollection verifies the disposition counters are labelled +// by both op and collection, so ops can see which source collection is stuck/poisoning. +func TestMetrics_DispositionCountersCarryCollection(t *testing.T) { + // Restore the global meter provider so this test doesn't leak its manual reader into siblings. 
+ prev := otel.GetMeterProvider() + t.Cleanup(func() { otel.SetMeterProvider(prev) }) + reader := sdkmetric.NewManualReader() + otel.SetMeterProvider(sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))) + + m, err := newMetrics() + require.NoError(t, err) + + ctx := context.Background() + m.onProcessed(ctx, "insert", "rocketchat_subscription") + m.onNak(ctx, "update", "rocketchat_subscription") + m.onTerm(ctx, "delete", "rocketchat_message") + m.onExhausted(ctx, "update", "tsmc_thread_subscriptions") + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(ctx, &rm)) + + want := map[string]map[string]string{ + "oplog_collections_transformer_events_processed_total": {"op": "insert", "collection": "rocketchat_subscription"}, + "oplog_collections_transformer_naks_total": {"op": "update", "collection": "rocketchat_subscription"}, + "oplog_collections_transformer_terms_total": {"op": "delete", "collection": "rocketchat_message"}, + "oplog_collections_transformer_exhausted_total": {"op": "update", "collection": "tsmc_thread_subscriptions"}, + } + + found := map[string]bool{} + for _, sm := range rm.ScopeMetrics { + for _, md := range sm.Metrics { + wantAttrs, ok := want[md.Name] + if !ok { + continue + } + sum, ok := md.Data.(metricdata.Sum[int64]) + require.True(t, ok, "%s should be an int64 sum", md.Name) + require.Len(t, sum.DataPoints, 1) + attrs := sum.DataPoints[0].Attributes + for k, v := range wantAttrs { + got, present := attrs.Value(attribute.Key(k)) + require.True(t, present, "%s missing attribute %q", md.Name, k) + assert.Equal(t, v, got.AsString(), "%s attribute %q", md.Name, k) + } + found[md.Name] = true + } + } + assert.Len(t, found, len(want), "all disposition counters recorded") +} diff --git a/data-migration/oplog-collections-transformer/rooms.go b/data-migration/oplog-collections-transformer/rooms.go new file mode 100644 index 000000000..dcd0a93ef --- /dev/null +++ b/data-migration/oplog-collections-transformer/rooms.go @@ -0,0 
+1,228 @@ +package main + +import ( + "context" + "fmt" + "log/slog" + "time" + + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsutil" +) + +// sourceRoom is the subset of a rocketchat_rooms doc the mapper decodes (relaxed extended JSON). +type sourceRoom struct { + ID string `bson:"_id"` + T string `bson:"t"` + Prid string `bson:"prid"` + TeamID string `bson:"teamId"` + Name string `bson:"name"` + FName string `bson:"fname"` + // Restricted is the TSMC-custom restriction flag (confirmed authoritative on TKMS; absent ⇒ + // false). RocketChat's separate `ro` (read-only/announcement mode) is a different concept + // with no destination equivalent and is deliberately NOT decoded. + Restricted bool `bson:"restricted"` + UIDs []string `bson:"uids"` + Usernames []string `bson:"usernames"` + UpdatedAt time.Time `bson:"_updatedAt"` + TS time.Time `bson:"ts"` + // Federation.Origin is the room's home site (absent ⇒ local); drives siteId stamping. + Federation struct { + Origin string `bson:"origin"` + } `bson:"federation"` +} + +// updateDescription is the connector's update delta; only changed field keys matter, values are opaque. +type updateDescription struct { + UpdatedFields map[string]any `bson:"updatedFields" json:"updatedFields"` + RemovedFields []string `bson:"removedFields" json:"removedFields"` +} + +// participantCount returns the member count, preferring uids and falling back to usernames. +func (r *sourceRoom) participantCount() int { + if len(r.UIDs) > 0 { + return len(r.UIDs) + } + return len(r.Usernames) +} + +// displayName returns the friendly display name (fname), falling back to the machine name. +func (r *sourceRoom) displayName() string { + if r.FName != "" { + return r.FName + } + return r.Name +} + +// handleRoom maps a rocketchat_rooms change event to an inbox InboxEvent (§4.2 / §4.0). 
+// Returns migration.ErrSkipped for deletes, excluded room types, and update lookup misses. +// +//nolint:gocritic // ev passed by value to mirror handle's signature; off the hot path. +func (h *handler) handleRoom(ctx context.Context, ev oplogEvent) error { + if ev.Op == "delete" { + // The app has no room deletion and the delete event is un-actionable (only the source _id). + slog.Debug("skip room delete (un-actionable, no app deletion)", + "eventId", ev.EventID, "request_id", natsutil.RequestIDFromContext(ctx)) + h.metrics.onSkipped(ctx, "room_delete") + return migration.ErrSkipped + } + + doc, skip, err := h.resolveDoc(ctx, ev) + if err != nil { + return fmt.Errorf("resolve room doc: %w", err) + } + if skip { + h.metrics.onSkipped(ctx, ev.Op+"_skip") + return migration.ErrSkipped + } + + var sr sourceRoom + if uerr := bson.UnmarshalExtJSON(doc, false, &sr); uerr != nil { + return fmt.Errorf("%w: decode source room: %v", migration.ErrPoison, uerr) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only + } + + // hasBot is unresolvable here without a user lookup (botDM detection deferred — see §4.2 / + // the design's botDM note); pass false so a 2-party bot DM classifies as a plain dm for now. + class := classifyRoom(sr.T, sr.Prid != "", sr.TeamID != "", false, sr.participantCount()) + if class.Excluded { + slog.Debug("skip excluded room type", + "t", sr.T, "reason", class.Reason, "eventId", ev.EventID, "request_id", natsutil.RequestIDFromContext(ctx)) + h.metrics.onSkipped(ctx, class.Reason) + return migration.ErrSkipped + } + + // Zero-guard an absent source timestamp with now() so the room doc never carries a year-0001 + // UpdatedAt, keeping the UpsertRoom high-water-mark guard functional. 
+ nowMillis := h.nowMillis() + updatedAt := sr.UpdatedAt.UTC() + if updatedAt.IsZero() { + updatedAt = time.UnixMilli(nowMillis).UTC() + } + createdAt := sr.TS.UTC() + if createdAt.IsZero() { + createdAt = updatedAt + } + + room := model.Room{ + ID: sr.ID, + Type: class.Type, + Name: sr.displayName(), + SiteID: siteIDFromOrigin(sr.Federation.Origin, h.siteID), + // ExternalAccess source field is unconfirmed (SOURCE_DATA.md §3) — default false per design. + ExternalAccess: false, + Restricted: sr.Restricted, + UIDs: sr.UIDs, + Accounts: sr.Usernames, + UserCount: sr.participantCount(), + UpdatedAt: updatedAt, + CreatedAt: createdAt, + } + + evts, err := h.roomEvents(ev, &room) + if err != nil { + return fmt.Errorf("build room events: %w", err) + } + for _, evt := range evts { + if err := h.pub.Publish(ctx, evt); err != nil { + return fmt.Errorf("publish room event %q: %w", evt.Type, err) + } + } + return nil +} + +// roomEvents builds the InboxEvents for a room change: always room_sync, preceded by room_renamed +// (name/fname changed) and/or room_restricted (restricted changed) — both when one update changes both. +// +//nolint:gocritic // ev passed by value to mirror handle's signature; off the hot path. +func (h *handler) roomEvents(ev oplogEvent, room *model.Room) ([]model.InboxEvent, error) { + if ev.Op == "insert" { + // A brand-new room has no subscriptions yet — nothing to rename/re-restrict, sync suffices. + return []model.InboxEvent{h.roomSyncEvent(room)}, nil + } + if ev.Op != "update" { + // replace: a whole-doc rewrite carries NO updateDescription delta, so there is no way to + // know which fields changed. Emit every field-level event conservatively — they are + // idempotent and guarded downstream — otherwise a rename/visibility change inside a + // replace would converge the rooms doc (room_sync) while every subscription kept the + // stale denormalized name/visibility forever. 
+ return []model.InboxEvent{ + h.roomRenamedEvent(room), + h.roomRestrictedEvent(room), + h.roomSyncEvent(room), + }, nil + } + + var desc updateDescription + if len(ev.UpdateDescription) > 0 { + if err := bson.UnmarshalExtJSON(ev.UpdateDescription, false, &desc); err != nil { + return nil, fmt.Errorf("%w: decode room updateDescription: %v", migration.ErrPoison, err) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only + } + } + + // A single update delta can change name/fname AND restricted together — emit every matching + // event, not just the first, so a combined rename+restrict doesn't drop the visibility change. + var evts []model.InboxEvent + if changed(desc, "name") || changed(desc, "fname") { + evts = append(evts, h.roomRenamedEvent(room)) + } + if changed(desc, "restricted") { + evts = append(evts, h.roomRestrictedEvent(room)) + } + // room_sync always trails so the room doc itself converges alongside the subscription-side events. + evts = append(evts, h.roomSyncEvent(room)) + return evts, nil +} + +// changed reports whether the named field appears in the update delta (set or removed). +func changed(desc updateDescription, field string) bool { + if _, ok := desc.UpdatedFields[field]; ok { + return true + } + for _, rf := range desc.RemovedFields { + if rf == field { + return true + } + } + return false +} + +func (h *handler) roomSyncEvent(room *model.Room) model.InboxEvent { + return h.inboxEvent(model.InboxEventType("room_sync"), room.SiteID, mustMarshal(room)) +} + +func (h *handler) roomRenamedEvent(room *model.Room) model.InboxEvent { + // Use the source _updatedAt millis (zero-guarded in handleRoom) as the nameUpdatedAt high-water + // mark so UpdateSubscriptionNamesForRoom matches the companion room_sync guard. 
+ return h.inboxEvent(model.InboxRoomRenamed, room.SiteID, mustMarshal(model.RoomRenamedInboxPayload{ + RoomID: room.ID, + NewName: room.Name, + Timestamp: room.UpdatedAt.UnixMilli(), + })) +} + +func (h *handler) roomRestrictedEvent(room *model.Room) model.InboxEvent { + // Use the source _updatedAt millis (zero-guarded in handleRoom) as the visibilityUpdatedAt + // high-water mark so ApplySubscriptionVisibility matches the companion room_sync guard. + return h.inboxEvent(model.InboxRoomRestricted, room.SiteID, mustMarshal(model.RoomRestrictedInboxPayload{ + RoomID: room.ID, + Restricted: room.Restricted, + ExternalAccess: room.ExternalAccess, + OwnerAccount: "", + Timestamp: room.UpdatedAt.UnixMilli(), + })) +} + +// inboxEvent wraps an inner payload in the local-INBOX InboxEvent envelope. SiteID is the +// record's home site; DestSiteID is this deployment (the local inbox-worker applies it). +func (h *handler) inboxEvent(t model.InboxEventType, siteID string, payload []byte) model.InboxEvent { + return model.InboxEvent{ + Type: t, + SiteID: siteID, + DestSiteID: h.siteID, + Payload: payload, + Timestamp: h.nowMillis(), + } +} diff --git a/data-migration/oplog-collections-transformer/rooms_test.go b/data-migration/oplog-collections-transformer/rooms_test.go new file mode 100644 index 000000000..46ebb37ef --- /dev/null +++ b/data-migration/oplog-collections-transformer/rooms_test.go @@ -0,0 +1,360 @@ +package main + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/model" +) + +func roomEv(op string, doc, updateDesc string) oplogEvent { + ev := oplogEvent{Op: op, Collection: roomsColl, EventID: "e1"} + if doc != "" { + ev.FullDocument = json.RawMessage(doc) + } + if updateDesc != "" { + ev.UpdateDescription = json.RawMessage(updateDesc) + } + ev.DocumentKey = json.RawMessage(`{"_id":"r1"}`) + 
return ev +} + +func TestHandleRoom_InsertChannel(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + doc := `{"_id":"r1","t":"c","fname":"General","name":"general","restricted":false,"uids":["u1","u2"],"usernames":["alice","bob"]}` + err := h.handleRoom(context.Background(), roomEv("insert", doc, "")) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + evt := pub.events[0] + assert.Equal(t, model.InboxEventType("room_sync"), evt.Type) + assert.Equal(t, testSiteID, evt.SiteID) + assert.Equal(t, testSiteID, evt.DestSiteID) + + var room model.Room + require.NoError(t, json.Unmarshal(evt.Payload, &room)) + assert.Equal(t, "r1", room.ID) + assert.Equal(t, model.RoomTypeChannel, room.Type) + assert.Equal(t, "General", room.Name) + assert.Equal(t, testSiteID, room.SiteID) + assert.False(t, room.Restricted) + assert.False(t, room.ExternalAccess) + assert.Equal(t, []string{"alice", "bob"}, room.Accounts) +} + +func TestHandleRoom_InsertDiscussion(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + doc := `{"_id":"r1","t":"p","prid":"parent1","fname":"Topic","uids":["u1"]}` + err := h.handleRoom(context.Background(), roomEv("insert", doc, "")) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + var room model.Room + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &room)) + assert.Equal(t, model.RoomTypeDiscussion, room.Type) +} + +func TestHandleRoom_InsertLivechatSkipped(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + doc := `{"_id":"r1","t":"l","fname":"Support"}` + err := h.handleRoom(context.Background(), roomEv("insert", doc, "")) + assert.ErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleRoom_InsertGroupDMSkipped(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + doc := 
`{"_id":"r1","t":"d","uids":["u1","u2","u3"]}` + err := h.handleRoom(context.Background(), roomEv("insert", doc, "")) + assert.ErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleRoom_UpdateNameChange(t *testing.T) { + pub := &fakePublisher{} + full := `{"_id":"r1","t":"c","fname":"New Name","uids":["u1"]}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(full)}) + + err := h.handleRoom(context.Background(), roomEv("update", "", `{"updatedFields":{"fname":"New Name"}}`)) + require.NoError(t, err) + + // name change → [room_renamed, room_sync] + require.Len(t, pub.events, 2) + evt := pub.events[0] + assert.Equal(t, model.InboxRoomRenamed, evt.Type) + + var p model.RoomRenamedInboxPayload + require.NoError(t, json.Unmarshal(evt.Payload, &p)) + assert.Equal(t, "r1", p.RoomID) + assert.Equal(t, "New Name", p.NewName) + // No source _updatedAt in this doc → falls back to h.now() = 1700000000000. + assert.Equal(t, int64(1700000000000), p.Timestamp) + assert.Equal(t, model.InboxEventType("room_sync"), pub.events[1].Type) +} + +func TestHandleRoom_UpdateRestrictedChange(t *testing.T) { + pub := &fakePublisher{} + full := `{"_id":"r1","t":"c","fname":"Room","restricted":true,"uids":["u1"]}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(full)}) + + err := h.handleRoom(context.Background(), roomEv("update", "", `{"updatedFields":{"restricted":true}}`)) + require.NoError(t, err) + + // restricted change → [room_restricted, room_sync] + require.Len(t, pub.events, 2) + evt := pub.events[0] + assert.Equal(t, model.InboxRoomRestricted, evt.Type) + + var p model.RoomRestrictedInboxPayload + require.NoError(t, json.Unmarshal(evt.Payload, &p)) + assert.Equal(t, "r1", p.RoomID) + assert.True(t, p.Restricted) + assert.False(t, p.ExternalAccess) + assert.Empty(t, p.OwnerAccount) + assert.Equal(t, model.InboxEventType("room_sync"), pub.events[1].Type) +} + +func 
TestHandleRoom_UpdateOtherFieldReSync(t *testing.T) { + pub := &fakePublisher{} + full := `{"_id":"r1","t":"c","fname":"Room","uids":["u1","u2"]}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(full)}) + + err := h.handleRoom(context.Background(), roomEv("update", "", `{"updatedFields":{"description":"hi"}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + assert.Equal(t, model.InboxEventType("room_sync"), pub.events[0].Type) +} + +func TestHandleRoom_Delete(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + err := h.handleRoom(context.Background(), roomEv("delete", "", "")) + assert.ErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleRoom_FederatedOriginSiteID(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + doc := `{"_id":"r1","t":"c","fname":"Remote","federation":{"origin":"0030204.tchat-test.test.company.com"},"uids":["u1"]}` + err := h.handleRoom(context.Background(), roomEv("insert", doc, "")) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + assert.Equal(t, "0030204", pub.events[0].SiteID) + + var room model.Room + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &room)) + assert.Equal(t, "0030204", room.SiteID) +} + +// Bug 1 tests — room_sync UpdatedAt/CreatedAt stamping. + +// TestHandleRoom_InsertUpdatedAtFromSource asserts that a room_sync payload carries +// UpdatedAt equal to the source _updatedAt field and is never the zero time. +func TestHandleRoom_InsertUpdatedAtFromSource(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + // Source _updatedAt in relaxed extended JSON ($date). 
+ sourceUpdatedAt := time.Date(2024, 3, 15, 10, 0, 0, 0, time.UTC) + sourceCreatedAt := time.Date(2024, 1, 5, 8, 30, 0, 0, time.UTC) + doc := `{"_id":"r1","t":"c","fname":"General","uids":["u1"],` + + `"_updatedAt":{"$date":"2024-03-15T10:00:00.000Z"},` + + `"ts":{"$date":"2024-01-05T08:30:00.000Z"}}` + + err := h.handleRoom(context.Background(), roomEv("insert", doc, "")) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + var room model.Room + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &room)) + + assert.False(t, room.UpdatedAt.IsZero(), "UpdatedAt must not be zero") + assert.Equal(t, sourceUpdatedAt, room.UpdatedAt.UTC()) + assert.Equal(t, sourceCreatedAt, room.CreatedAt.UTC()) +} + +// TestHandleRoom_InsertMissingUpdatedAtFallsBackToNow asserts that when a source doc +// has no _updatedAt, the room_sync carries the now-fallback (non-zero) timestamp. +func TestHandleRoom_InsertMissingUpdatedAtFallsBackToNow(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + // Doc deliberately missing _updatedAt and ts. + doc := `{"_id":"r1","t":"c","fname":"General","uids":["u1"]}` + + err := h.handleRoom(context.Background(), roomEv("insert", doc, "")) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + var room model.Room + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &room)) + + assert.False(t, room.UpdatedAt.IsZero(), "UpdatedAt must not be zero when source field is absent") + // The handler's now() returns 1700000000000 ms — verify the fallback matches. + assert.Equal(t, time.UnixMilli(1700000000000).UTC(), room.UpdatedAt.UTC()) + assert.Equal(t, time.UnixMilli(1700000000000).UTC(), room.CreatedAt.UTC()) +} + +// Bug 2 tests — dual-event emission on rename/restrict updates. + +// TestHandleRoom_UpdateNameChangeEmitsBothEvents asserts that a name/fname change +// publishes room_renamed (for subs) AND room_sync (for the room doc), in that order. 
+func TestHandleRoom_UpdateNameChangeEmitsBothEvents(t *testing.T) { + pub := &fakePublisher{} + // Source doc has _updatedAt so the room_sync carries the source timestamp. + full := `{"_id":"r1","t":"c","fname":"New Name","uids":["u1"],"_updatedAt":{"$date":"2024-03-15T10:00:00.000Z"}}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(full)}) + + err := h.handleRoom(context.Background(), roomEv("update", "", `{"updatedFields":{"fname":"New Name"}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 2, "update with name change must publish room_renamed AND room_sync") + + // First event: room_renamed for subscriptions. + assert.Equal(t, model.InboxRoomRenamed, pub.events[0].Type) + var rp model.RoomRenamedInboxPayload + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &rp)) + assert.Equal(t, "r1", rp.RoomID) + assert.Equal(t, "New Name", rp.NewName) + // Timestamp uses source _updatedAt millis (zero-guarded). + sourceMillis := time.Date(2024, 3, 15, 10, 0, 0, 0, time.UTC).UnixMilli() + assert.Equal(t, sourceMillis, rp.Timestamp) + + // Second event: room_sync for the room doc, carrying the new name. + assert.Equal(t, model.InboxEventType("room_sync"), pub.events[1].Type) + var room model.Room + require.NoError(t, json.Unmarshal(pub.events[1].Payload, &room)) + assert.Equal(t, "r1", room.ID) + assert.Equal(t, "New Name", room.Name) + assert.False(t, room.UpdatedAt.IsZero(), "room_sync UpdatedAt must not be zero") + assert.Equal(t, time.Date(2024, 3, 15, 10, 0, 0, 0, time.UTC), room.UpdatedAt.UTC()) +} + +// TestHandleRoom_UpdateRestrictedChangeEmitsBothEvents asserts that a ro change +// publishes room_restricted (for subs) AND room_sync (for the room doc). 
+func TestHandleRoom_UpdateRestrictedChangeEmitsBothEvents(t *testing.T) { + pub := &fakePublisher{} + full := `{"_id":"r1","t":"c","fname":"Room","restricted":true,"uids":["u1"],"_updatedAt":{"$date":"2024-05-01T12:00:00.000Z"}}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(full)}) + + err := h.handleRoom(context.Background(), roomEv("update", "", `{"updatedFields":{"restricted":true}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 2, "update with ro change must publish room_restricted AND room_sync") + + // First event: room_restricted for subscriptions. + assert.Equal(t, model.InboxRoomRestricted, pub.events[0].Type) + var rp model.RoomRestrictedInboxPayload + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &rp)) + assert.Equal(t, "r1", rp.RoomID) + assert.True(t, rp.Restricted) + sourceMillis := time.Date(2024, 5, 1, 12, 0, 0, 0, time.UTC).UnixMilli() + assert.Equal(t, sourceMillis, rp.Timestamp) + + // Second event: room_sync. + assert.Equal(t, model.InboxEventType("room_sync"), pub.events[1].Type) + var room model.Room + require.NoError(t, json.Unmarshal(pub.events[1].Payload, &room)) + assert.Equal(t, "r1", room.ID) + assert.True(t, room.Restricted) +} + +// TestHandleRoom_UpdateOtherFieldEmitsOnlyRoomSync asserts that an unrelated field +// change still emits exactly one room_sync event. 
+func TestHandleRoom_UpdateOtherFieldEmitsOnlyRoomSync(t *testing.T) { + pub := &fakePublisher{} + full := `{"_id":"r1","t":"c","fname":"Room","uids":["u1","u2"]}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(full)}) + + err := h.handleRoom(context.Background(), roomEv("update", "", `{"updatedFields":{"description":"hi"}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 1, "unrelated field update must emit exactly one room_sync") + assert.Equal(t, model.InboxEventType("room_sync"), pub.events[0].Type) +} + +func TestHandleRoom_UpdateNameAndRo_EmitsRenamedRestrictedAndSync(t *testing.T) { + pub := &fakePublisher{} + full := `{"_id":"r1","t":"c","fname":"New","restricted":true,"uids":["u1"],"_updatedAt":{"$date":"2024-05-01T12:00:00.000Z"}}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(full)}) + + err := h.handleRoom(context.Background(), roomEv("update", "", `{"updatedFields":{"fname":"New","restricted":true}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 3, "a combined name+ro update must emit room_renamed, room_restricted, and room_sync") + types := []model.InboxEventType{pub.events[0].Type, pub.events[1].Type, pub.events[2].Type} + assert.Contains(t, types, model.InboxRoomRenamed) + assert.Contains(t, types, model.InboxRoomRestricted) + assert.Contains(t, types, model.InboxEventType("room_sync")) +} + +func TestHandleRoom_Replace_EmitsAllFieldEventsConservatively(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + // replace carries the full doc but NO updateDescription delta — there is no way to know which + // fields changed, so all field-level events must be emitted conservatively (they're idempotent + // and guarded on the destination), or a rename/visibility change inside a whole-doc replace + // would update the rooms doc (room_sync) while every subscription kept the stale + // denormalized name/visibility forever. 
+ doc := `{"_id":"r1","t":"c","fname":"Replaced","name":"replaced","restricted":true,"uids":["u1"],"_updatedAt":{"$date":"2024-05-01T12:00:00.000Z"}}` + require.NoError(t, h.handleRoom(context.Background(), roomEv("replace", doc, ""))) + + require.Len(t, pub.events, 3, "replace must emit room_renamed, room_restricted, and room_sync") + types := []model.InboxEventType{pub.events[0].Type, pub.events[1].Type, pub.events[2].Type} + assert.Contains(t, types, model.InboxRoomRenamed) + assert.Contains(t, types, model.InboxRoomRestricted) + assert.Contains(t, types, model.InboxEventType("room_sync")) + + byType := eventsByType(pub.events) + var renamed model.RoomRenamedInboxPayload + require.NoError(t, json.Unmarshal(byType[model.InboxRoomRenamed].Payload, &renamed)) + assert.Equal(t, "Replaced", renamed.NewName) + assert.Equal(t, int64(1714564800000), renamed.Timestamp, "guard timestamp is the source _updatedAt millis") + + var restricted model.RoomRestrictedInboxPayload + require.NoError(t, json.Unmarshal(byType[model.InboxRoomRestricted].Payload, &restricted)) + assert.True(t, restricted.Restricted) + assert.Equal(t, int64(1714564800000), restricted.Timestamp) +} + +func TestHandleRoom_DegradedInsertRecoversViaSourceLookup(t *testing.T) { + pub := &fakePublisher{} + full := `{"_id":"r1","t":"c","fname":"Recovered","uids":["u1"]}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(full)}) + + // Degraded insert: the connector couldn't encode fullDocument (empty) but flagged Degraded. 
+ ev := oplogEvent{Op: "insert", Collection: roomsColl, EventID: "e1", + DocumentKey: json.RawMessage(`{"_id":"r1"}`), Degraded: true, DegradedReason: "fullDocument encode failed"} + require.NoError(t, h.handleRoom(context.Background(), ev)) + + require.Len(t, pub.events, 1) + var room model.Room + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &room)) + assert.Equal(t, "Recovered", room.Name) +} + +func TestHandleRoom_NonDegradedInsertWithoutFullDocument_Poisons(t *testing.T) { + h := newTestHandler(&fakePublisher{}, &fakeTarget{}, &fakeLookup{}) + ev := oplogEvent{Op: "insert", Collection: roomsColl, DocumentKey: json.RawMessage(`{"_id":"r1"}`)} + assert.ErrorIs(t, h.handleRoom(context.Background(), ev), migration.ErrPoison) +} diff --git a/data-migration/oplog-collections-transformer/subscriptions.go b/data-migration/oplog-collections-transformer/subscriptions.go new file mode 100644 index 000000000..19b713ebd --- /dev/null +++ b/data-migration/oplog-collections-transformer/subscriptions.go @@ -0,0 +1,321 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "time" + + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/natsutil" +) + +// sourceSubscription is the subset of a rocketchat_subscriptions doc the mapper decodes (handles $date). 
type sourceSubscription struct {
	// ID is the source row's own _id.
	ID string `bson:"_id"`
	// U identifies the subscribed user.
	U struct {
		ID       string `bson:"_id"`
		Username string `bson:"username"`
	} `bson:"u"`
	// RID is the id of the room the subscription belongs to; T is the room type letter.
	RID   string `bson:"rid"`
	T     string `bson:"t"`
	Name  string `bson:"name"`
	FName string `bson:"fname"`
	// Roles are the raw source role strings; they may be empty.
	Roles []string `bson:"roles"`
	// Open, F and DisableNotifications are the per-user state flags.
	Open                 bool `bson:"open"`
	F                    bool `bson:"f"`
	DisableNotifications bool `bson:"disableNotifications"`
	// LS and LR are the two read cursors folded together by lastSeenMillis.
	LS    time.Time `bson:"ls"`
	LR    time.Time `bson:"lr"`
	Alert bool      `bson:"alert"`
	// TS and UpdatedAt are the source timestamps; either may be the zero time when absent.
	TS        time.Time `bson:"ts"`
	UpdatedAt time.Time `bson:"_updatedAt"`
	// Federation.Origin names the subscription's home site (absent ⇒ local) and drives siteId stamping.
	Federation struct {
		Origin string `bson:"origin"`
	} `bson:"federation"`
}

// lastSeenMillis reports the later of ls and lr as unix milliseconds (spec §4.3 / D1): whichever
// of the scrolled cursor (ls) or the explicit mark-read (lr) reached further.
func (s *sourceSubscription) lastSeenMillis() int64 {
	// An absent cursor counts as 0 (never read). Converting the zero time.Time directly would
	// instead produce a huge negative year-0001 value and leak it into the inbox event.
	millisOrZero := func(ts time.Time) int64 {
		if ts.IsZero() {
			return 0
		}
		return ts.UTC().UnixMilli()
	}

	scrolled, marked := millisOrZero(s.LS), millisOrZero(s.LR)
	if scrolled >= marked {
		return scrolled
	}
	return marked
}

// subUpdateDescription mirrors the connector's update delta. Only the changed field names are
// consulted; the values are left opaque.
type subUpdateDescription struct {
	UpdatedFields map[string]any `bson:"updatedFields" json:"updatedFields"`
	RemovedFields []string       `bson:"removedFields" json:"removedFields"`
}

// subChanged reports whether the named field appears in the update delta (set or removed).
+func subChanged(desc subUpdateDescription, field string) bool { + if _, ok := desc.UpdatedFields[field]; ok { + return true + } + for _, rf := range desc.RemovedFields { + if rf == field { + return true + } + } + return false +} + +// handleSubscription maps a rocketchat_subscriptions change event to inbox InboxEvents (§4.3 / §4.0): +// insert/replace reproduce the source row, update emits the matching event(s), delete maps by _id. +// +//nolint:gocritic // ev passed by value to mirror handle's signature; off the hot path. +func (h *handler) handleSubscription(ctx context.Context, ev oplogEvent) error { + if ev.Op == "delete" { + // True row delete is un-actionable (spec §4.0/§4.3): the event carries only the source _id, + // which doesn't map to the destination sub (keyed by a generated UUIDv7). A genuine leave + // arrives as an open:false update (→ member_removed) instead, so true deletes are rare. + slog.Debug("skip subscription delete (un-actionable; leave is open:false)", + "eventId", ev.EventID, "request_id", natsutil.RequestIDFromContext(ctx)) + h.metrics.onSkipped(ctx, "subscription_delete") + return migration.ErrSkipped + } + + doc, skip, err := h.resolveDoc(ctx, ev) + if err != nil { + return fmt.Errorf("resolve subscription doc: %w", err) + } + if skip { + h.metrics.onSkipped(ctx, ev.Op+"_skip") + return migration.ErrSkipped + } + + var ss sourceSubscription + if uerr := bson.UnmarshalExtJSON(doc, false, &ss); uerr != nil { + return fmt.Errorf("%w: decode source subscription: %v", migration.ErrPoison, uerr) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only + } + + if ev.Op == "update" { + return h.handleSubscriptionUpdate(ctx, ev, &ss) + } + // insert / replace → rebuild the full source row. + return h.publishSubscriptionState(ctx, &ss, true) +} + +// handleSubscriptionUpdate emits the event(s) matching the changed fields. 
An open toggle is the +// dominant action (membership lifecycle); other recognized field deltas map to a single state event. +// +//nolint:gocritic // ev passed by value to mirror handle's signature; off the hot path. +func (h *handler) handleSubscriptionUpdate(ctx context.Context, ev oplogEvent, ss *sourceSubscription) error { + var desc subUpdateDescription + if len(ev.UpdateDescription) > 0 { + if err := bson.UnmarshalExtJSON(ev.UpdateDescription, false, &desc); err != nil { + return fmt.Errorf("%w: decode subscription updateDescription: %v", migration.ErrPoison, err) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only + } + } + + // Membership leave/rejoin is an open toggle and dominates: re-read decides the action. + if subChanged(desc, "open") { + if ss.Open { + // Re-subscribe: rebuild the full state (a rejoin starts fresh). + return h.publishSubscriptionState(ctx, ss, true) + } + return h.pub.Publish(ctx, h.memberRemovedEvent(ss)) + } + + siteID := siteIDFromOrigin(ss.Federation.Origin, h.siteID) + emitted := false + + // Emit even when roles became empty: a roles-cleared update (e.g. owner demoted) must propagate, + // otherwise the destination keeps stale roles. roleUpdatedEvent maps nil → nil cleanly. + if subChanged(desc, "roles") { + if err := h.pub.Publish(ctx, h.roleUpdatedEvent(ss, siteID)); err != nil { + return fmt.Errorf("publish role_updated: %w", err) + } + emitted = true + } + if subChanged(desc, "disableNotifications") { + if err := h.pub.Publish(ctx, h.muteEvent(ss, siteID)); err != nil { + return err + } + emitted = true + } + if subChanged(desc, "f") { + if err := h.pub.Publish(ctx, h.favoriteEvent(ss, siteID)); err != nil { + return err + } + emitted = true + } + // ls/lr/alert all map to a single subscription_read using the current max(ls,lr)+alert. 
+ if subChanged(desc, "ls") || subChanged(desc, "lr") || subChanged(desc, "alert") { + if err := h.pub.Publish(ctx, h.readEvent(ss, siteID)); err != nil { + return err + } + emitted = true + } + + if !emitted { + // name/fname changes are driven by the room rename path, not the sub; any other field is noise. + slog.Debug("skip subscription update (no recognized field changed)", + "eventId", ev.EventID, "request_id", natsutil.RequestIDFromContext(ctx)) + h.metrics.onSkipped(ctx, "subscription_update_noop") + return migration.ErrSkipped + } + return nil +} + +// publishSubscriptionState emits member_added followed by the state events that reproduce the +// source row (role/mute/favorite/read). Used by insert/replace and by an open false→true rejoin. +func (h *handler) publishSubscriptionState(ctx context.Context, ss *sourceSubscription, withMemberAdded bool) error { + siteID := siteIDFromOrigin(ss.Federation.Origin, h.siteID) + + if withMemberAdded { + if err := h.pub.Publish(ctx, h.memberAddedEvent(ss, siteID)); err != nil { + return err + } + } + if len(ss.Roles) > 0 { + if err := h.pub.Publish(ctx, h.roleUpdatedEvent(ss, siteID)); err != nil { + return err + } + } + if err := h.pub.Publish(ctx, h.muteEvent(ss, siteID)); err != nil { + return err + } + if err := h.pub.Publish(ctx, h.favoriteEvent(ss, siteID)); err != nil { + return err + } + return h.pub.Publish(ctx, h.readEvent(ss, siteID)) +} + +// memberAddedEvent builds the member_added InboxEvent. RoomType is classified from t alone +// (a sub can't see prid/teamId/bot, so discussion/botDM degrade to channel/dm); roles via role_updated. +func (h *handler) memberAddedEvent(ss *sourceSubscription, siteID string) model.InboxEvent { + class := classifyRoom(ss.T, false, false, false, 2) + // Zero-guard ts → now so an absent source ts never becomes a year-0001 JoinedAt. 
+ joinedAt := ss.TS.UTC().UnixMilli() + if ss.TS.IsZero() { + joinedAt = h.nowMillis() + } + payload := mustMarshal(model.MemberAddEvent{ + Type: "member_added", + RoomID: ss.RID, + Accounts: []string{ss.U.Username}, + RoomType: class.Type, + RoomName: ss.FName, + SiteID: siteID, + JoinedAt: joinedAt, + Timestamp: h.nowMillis(), + }) + return h.inboxEvent(model.InboxMemberAdded, siteID, payload) +} + +// memberRemovedEvent builds the member_removed InboxEvent (open true→false leave). +func (h *handler) memberRemovedEvent(ss *sourceSubscription) model.InboxEvent { + siteID := siteIDFromOrigin(ss.Federation.Origin, h.siteID) + payload := mustMarshal(model.MemberRemoveEvent{ + Type: "member_removed", + RoomID: ss.RID, + Accounts: []string{ss.U.Username}, + SiteID: siteID, + Timestamp: h.nowMillis(), + }) + return h.inboxEvent(model.InboxMemberRemoved, siteID, payload) +} + +// roleUpdatedEvent builds the role_updated InboxEvent. inbox-worker.handleRoleUpdated decodes a +// SubscriptionUpdateEvent and applies Subscription.{User.Account, RoomID, Roles}. +// subUpdatedAtMillis is the source subscription's _updatedAt — the high-water mark the field-update +// guards (mute/favorite/roles) stamp on their events, stable across redelivery so a re-delivered +// inline insert snapshot can't out-rank a newer update. Falls back to now() when the source omits it. 
+func (h *handler) subUpdatedAtMillis(ss *sourceSubscription) int64 { + if ss.UpdatedAt.IsZero() { + return h.nowMillis() + } + return ss.UpdatedAt.UTC().UnixMilli() +} + +func (h *handler) roleUpdatedEvent(ss *sourceSubscription, siteID string) model.InboxEvent { + payload := mustMarshal(model.SubscriptionUpdateEvent{ + Subscription: model.Subscription{ + User: model.SubscriptionUser{ID: ss.U.ID, Account: ss.U.Username}, + RoomID: ss.RID, + Roles: mapSubscriptionRoles(ss.Roles), + }, + Action: "role_updated", + Timestamp: h.subUpdatedAtMillis(ss), + }) + return h.inboxEvent(model.InboxEventType("role_updated"), siteID, payload) +} + +func (h *handler) muteEvent(ss *sourceSubscription, siteID string) model.InboxEvent { + payload := mustMarshal(model.SubscriptionMuteToggledEvent{ + Account: ss.U.Username, + RoomID: ss.RID, + Muted: ss.DisableNotifications, + Timestamp: h.subUpdatedAtMillis(ss), + }) + return h.inboxEvent(model.InboxSubscriptionMuteToggled, siteID, payload) +} + +func (h *handler) favoriteEvent(ss *sourceSubscription, siteID string) model.InboxEvent { + payload := mustMarshal(model.SubscriptionFavoriteToggledEvent{ + Account: ss.U.Username, + RoomID: ss.RID, + Favorite: ss.F, + Timestamp: h.subUpdatedAtMillis(ss), + }) + return h.inboxEvent(model.InboxSubscriptionFavoriteToggled, siteID, payload) +} + +func (h *handler) readEvent(ss *sourceSubscription, siteID string) model.InboxEvent { + payload := mustMarshal(model.SubscriptionReadEvent{ + Account: ss.U.Username, + RoomID: ss.RID, + LastSeenAt: ss.lastSeenMillis(), + Alert: ss.Alert, + Timestamp: h.nowMillis(), + }) + return h.inboxEvent(model.InboxSubscriptionRead, siteID, payload) +} + +// mapSubscriptionRoles maps RocketChat role strings to model.Role: "owner" → RoleOwner; everything +// else (RC "moderator"/"leader"/"user", which the new model lacks) → RoleMember. 
Empty source roles +// (a RocketChat demotion clears the array) map to the [member] floor — the new stack's invariant is +// roles are never empty (room-service writes ["member"] after a live demotion), and inbox-worker +// permanently drops a role_updated with no roles, so an empty mapping would silently lose demotions. +func mapSubscriptionRoles(roles []string) []model.Role { + if len(roles) == 0 { + return []model.Role{model.RoleMember} + } + out := make([]model.Role, 0, len(roles)) + for _, r := range roles { + if r == string(model.RoleOwner) { + out = append(out, model.RoleOwner) + } else { + out = append(out, model.RoleMember) + } + } + return out +} + +// mustMarshal JSON-encodes a fixed-shape model payload; json.Marshal cannot fail on these, +// so an error is a programmer error and panics. +func mustMarshal(v any) []byte { + b, err := json.Marshal(v) + if err != nil { + panic(fmt.Sprintf("marshal inbox payload: %v", err)) + } + return b +} diff --git a/data-migration/oplog-collections-transformer/subscriptions_test.go b/data-migration/oplog-collections-transformer/subscriptions_test.go new file mode 100644 index 000000000..4173d2f36 --- /dev/null +++ b/data-migration/oplog-collections-transformer/subscriptions_test.go @@ -0,0 +1,401 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/model" +) + +func subEv(op, doc, updateDesc string) oplogEvent { + ev := oplogEvent{Op: op, Collection: subsColl, EventID: "se1"} + if doc != "" { + ev.FullDocument = json.RawMessage(doc) + } + if updateDesc != "" { + ev.UpdateDescription = json.RawMessage(updateDesc) + } + ev.DocumentKey = json.RawMessage(`{"_id":"sub1"}`) + return ev +} + +// eventsByType groups the published events by their InboxEvent.Type. 
+func eventsByType(evts []model.InboxEvent) map[model.InboxEventType]model.InboxEvent { + m := make(map[model.InboxEventType]model.InboxEvent, len(evts)) + for _, e := range evts { + m[e.Type] = e + } + return m +} + +// A full source subscription doc: owner role, muted, favorited, alert, ls < lr. +const fullSubDoc = `{ + "_id":"sub1", + "u":{"_id":"u1","username":"alice"}, + "rid":"r1", + "t":"c", + "name":"general", + "fname":"General", + "roles":["owner"], + "open":true, + "f":true, + "disableNotifications":true, + "alert":true, + "ls":{"$date":"2024-01-15T09:00:00.000Z"}, + "lr":{"$date":"2024-01-15T10:00:00.000Z"}, + "ts":{"$date":"2024-01-01T00:00:00.000Z"}, + "_updatedAt":{"$date":"2024-01-20T00:00:00.000Z"} +}` + +// updatedAtMillis is the source _updatedAt for fullSubDoc (2024-01-20T00:00:00Z) — the high-water +// mark the mute/favorite/role guards must stamp, stable across redelivery. +const updatedAtMillis = int64(1705708800000) + +// lrMillis is max(ls,lr) for fullSubDoc — lr (10:00Z) is later than ls (09:00Z). +const lrMillis = int64(1705312800000) // 2024-01-15T10:00:00Z +// tsMillis is the JoinedAt for fullSubDoc. +const tsMillis = int64(1704067200000) // 2024-01-01T00:00:00Z + +func TestHandleSubscription_Insert_AllEvents(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + err := h.handleSubscription(context.Background(), subEv("insert", fullSubDoc, "")) + require.NoError(t, err) + + require.Len(t, pub.events, 5) + + // 1. member_added (first in order). 
+ assert.Equal(t, model.InboxMemberAdded, pub.events[0].Type) + var ma model.MemberAddEvent + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &ma)) + assert.Equal(t, "member_added", ma.Type) + assert.Equal(t, "r1", ma.RoomID) + assert.Equal(t, []string{"alice"}, ma.Accounts) + assert.Equal(t, model.RoomTypeChannel, ma.RoomType) + assert.Equal(t, "General", ma.RoomName) + assert.Equal(t, testSiteID, ma.SiteID) + assert.Equal(t, tsMillis, ma.JoinedAt) + + byType := eventsByType(pub.events) + + // 2. role_updated (owner → owner). + roleEvt, ok := byType[model.InboxEventType("role_updated")] + require.True(t, ok) + var su model.SubscriptionUpdateEvent + require.NoError(t, json.Unmarshal(roleEvt.Payload, &su)) + assert.Equal(t, "alice", su.Subscription.User.Account) + assert.Equal(t, "r1", su.Subscription.RoomID) + assert.Equal(t, []model.Role{model.RoleOwner}, su.Subscription.Roles) + + // 3. mute (from disableNotifications=true). + muteEvt, ok := byType[model.InboxSubscriptionMuteToggled] + require.True(t, ok) + var mute model.SubscriptionMuteToggledEvent + require.NoError(t, json.Unmarshal(muteEvt.Payload, &mute)) + assert.Equal(t, "alice", mute.Account) + assert.Equal(t, "r1", mute.RoomID) + assert.True(t, mute.Muted) + + // 4. favorite (f=true). + favEvt, ok := byType[model.InboxSubscriptionFavoriteToggled] + require.True(t, ok) + var fav model.SubscriptionFavoriteToggledEvent + require.NoError(t, json.Unmarshal(favEvt.Payload, &fav)) + assert.True(t, fav.Favorite) + + // 5. subscription_read (LastSeenAt = max(ls,lr), Alert). + readEvt, ok := byType[model.InboxSubscriptionRead] + require.True(t, ok) + var read model.SubscriptionReadEvent + require.NoError(t, json.Unmarshal(readEvt.Payload, &read)) + assert.Equal(t, "alice", read.Account) + assert.Equal(t, "r1", read.RoomID) + assert.Equal(t, lrMillis, read.LastSeenAt) + assert.True(t, read.Alert) + + // All envelopes carry the home site + local dest. 
+ for _, e := range pub.events { + assert.Equal(t, testSiteID, e.SiteID) + assert.Equal(t, testSiteID, e.DestSiteID) + } +} + +func TestHandleSubscription_FieldGuards_UseSourceUpdatedAt(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + // The field-update guards (mute/favorite/roles) must stamp the event Timestamp from the + // source _updatedAt — stable across redelivery — not from publish-time now(). Otherwise a + // redelivered insert (inline snapshot) could out-rank a newer update at the destination. + require.NoError(t, h.handleSubscription(context.Background(), subEv("insert", fullSubDoc, ""))) + byType := eventsByType(pub.events) + + var mute model.SubscriptionMuteToggledEvent + require.NoError(t, json.Unmarshal(byType[model.InboxSubscriptionMuteToggled].Payload, &mute)) + assert.Equal(t, updatedAtMillis, mute.Timestamp) + + var fav model.SubscriptionFavoriteToggledEvent + require.NoError(t, json.Unmarshal(byType[model.InboxSubscriptionFavoriteToggled].Payload, &fav)) + assert.Equal(t, updatedAtMillis, fav.Timestamp) + + var su model.SubscriptionUpdateEvent + require.NoError(t, json.Unmarshal(byType[model.InboxEventType("role_updated")].Payload, &su)) + assert.Equal(t, updatedAtMillis, su.Timestamp) +} + +func TestHandleSubscription_FieldGuards_NoSourceUpdatedAt_FallsBackToNow(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + // No _updatedAt in source → guards fall back to now() (the test handler's fixed clock). 
+ doc := `{"_id":"sub1","u":{"_id":"u1","username":"alice"},"rid":"r1","t":"c",` + + `"roles":["owner"],"open":true,"disableNotifications":true,"f":true}` + require.NoError(t, h.handleSubscription(context.Background(), subEv("insert", doc, ""))) + byType := eventsByType(pub.events) + + var mute model.SubscriptionMuteToggledEvent + require.NoError(t, json.Unmarshal(byType[model.InboxSubscriptionMuteToggled].Payload, &mute)) + assert.Equal(t, int64(1700000000000), mute.Timestamp) +} + +func TestHandleSubscription_Insert_EmptyRoles_NoRoleEvent(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + doc := `{"_id":"sub1","u":{"_id":"u1","username":"alice"},"rid":"r1","t":"c","fname":"General", + "open":true,"f":false,"disableNotifications":false,"alert":false, + "ls":{"$date":"2024-01-15T09:00:00.000Z"},"lr":{"$date":"2024-01-15T09:00:00.000Z"}, + "ts":{"$date":"2024-01-01T00:00:00.000Z"}}` + err := h.handleSubscription(context.Background(), subEv("insert", doc, "")) + require.NoError(t, err) + + require.Len(t, pub.events, 4) + byType := eventsByType(pub.events) + _, hasRole := byType[model.InboxEventType("role_updated")] + assert.False(t, hasRole) + _, hasAdd := byType[model.InboxMemberAdded] + assert.True(t, hasAdd) +} + +func TestHandleSubscription_Insert_FederatedSiteID(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + doc := `{"_id":"sub1","u":{"_id":"u1","username":"alice"},"rid":"r1","t":"c","fname":"General", + "roles":["owner"],"open":true, + "federation":{"origin":"0030204.tchat-test.test.company.com"}, + "ts":{"$date":"2024-01-01T00:00:00.000Z"}}` + err := h.handleSubscription(context.Background(), subEv("insert", doc, "")) + require.NoError(t, err) + + require.NotEmpty(t, pub.events) + for _, e := range pub.events { + assert.Equal(t, "0030204", e.SiteID) + } + var ma model.MemberAddEvent + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &ma)) + 
assert.Equal(t, "0030204", ma.SiteID) +} + +func TestHandleSubscription_Update_FavoriteOnly(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(fullSubDoc)}) + + err := h.handleSubscription(context.Background(), subEv("update", "", `{"updatedFields":{"f":true}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + assert.Equal(t, model.InboxSubscriptionFavoriteToggled, pub.events[0].Type) + var fav model.SubscriptionFavoriteToggledEvent + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &fav)) + assert.True(t, fav.Favorite) +} + +func TestHandleSubscription_Update_OpenFalse_MemberRemoved(t *testing.T) { + pub := &fakePublisher{} + closedDoc := `{"_id":"sub1","u":{"_id":"u1","username":"alice"},"rid":"r1","t":"c","fname":"General","open":false}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(closedDoc)}) + + err := h.handleSubscription(context.Background(), subEv("update", "", `{"updatedFields":{"open":false}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + assert.Equal(t, model.InboxMemberRemoved, pub.events[0].Type) + var mr model.MemberRemoveEvent + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &mr)) + assert.Equal(t, "member_removed", mr.Type) + assert.Equal(t, "r1", mr.RoomID) + assert.Equal(t, []string{"alice"}, mr.Accounts) + assert.Equal(t, testSiteID, mr.SiteID) +} + +func TestHandleSubscription_Update_OpenTrue_Resubscribe(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(fullSubDoc)}) + + err := h.handleSubscription(context.Background(), subEv("update", "", `{"updatedFields":{"open":true}}`)) + require.NoError(t, err) + + // Re-subscribe rebuilds full state: same 5 events as an insert. 
+ require.Len(t, pub.events, 5) + byType := eventsByType(pub.events) + _, hasAdd := byType[model.InboxMemberAdded] + assert.True(t, hasAdd) + _, hasRole := byType[model.InboxEventType("role_updated")] + assert.True(t, hasRole) + _, hasRead := byType[model.InboxSubscriptionRead] + assert.True(t, hasRead) +} + +func TestHandleSubscription_Update_ReadField(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(fullSubDoc)}) + + err := h.handleSubscription(context.Background(), subEv("update", "", `{"updatedFields":{"ls":{"$date":"2024-01-15T09:00:00.000Z"}}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + assert.Equal(t, model.InboxSubscriptionRead, pub.events[0].Type) + var read model.SubscriptionReadEvent + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &read)) + assert.Equal(t, lrMillis, read.LastSeenAt) // max(ls,lr) from current doc + assert.True(t, read.Alert) +} + +func TestHandleSubscription_Update_UnrecognizedField_Skip(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(fullSubDoc)}) + + err := h.handleSubscription(context.Background(), subEv("update", "", `{"updatedFields":{"_updatedAt":{"$date":"2024-01-15T09:00:00.000Z"}}}`)) + assert.ErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleSubscription_Delete_Skipped(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + // True row delete is un-actionable (spec §4.0/§4.3): only the source _id, which doesn't map to + // the destination sub. A genuine leave arrives as an open:false update. Skip, publish nothing. 
+ err := h.handleSubscription(context.Background(), subEv("delete", "", "")) + assert.ErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleSubscription_MaxLsLr(t *testing.T) { + tests := []struct { + name string + ls string + lr string + want int64 + }{ + { + name: "lr later than ls", + ls: "2024-01-15T09:00:00.000Z", + lr: "2024-01-15T10:00:00.000Z", + want: 1705312800000, // 10:00Z + }, + { + name: "ls later than lr", + ls: "2024-01-15T11:00:00.000Z", + lr: "2024-01-15T10:00:00.000Z", + want: 1705316400000, // 11:00Z + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + doc := `{"_id":"sub1","u":{"_id":"u1","username":"alice"},"rid":"r1","t":"c","fname":"General","open":true,` + + `"alert":true,"ls":{"$date":"` + tc.ls + `"},"lr":{"$date":"` + tc.lr + `"},` + + `"ts":{"$date":"2024-01-01T00:00:00.000Z"}}` + err := h.handleSubscription(context.Background(), subEv("insert", doc, "")) + require.NoError(t, err) + + byType := eventsByType(pub.events) + readEvt, ok := byType[model.InboxSubscriptionRead] + require.True(t, ok) + var read model.SubscriptionReadEvent + require.NoError(t, json.Unmarshal(readEvt.Payload, &read)) + assert.Equal(t, tc.want, read.LastSeenAt) + }) + } +} + +func TestHandleSubscription_Update_RolesAndMute(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: json.RawMessage(fullSubDoc)}) + + err := h.handleSubscription(context.Background(), subEv("update", "", + `{"updatedFields":{"roles":["owner"],"disableNotifications":true}}`)) + require.NoError(t, err) + + require.Len(t, pub.events, 2) + byType := eventsByType(pub.events) + _, hasRole := byType[model.InboxEventType("role_updated")] + assert.True(t, hasRole) + _, hasMute := byType[model.InboxSubscriptionMuteToggled] + assert.True(t, hasMute) +} + +func TestHandleSubscription_Replace_AllEvents(t *testing.T) 
{ + pub := &fakePublisher{} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + err := h.handleSubscription(context.Background(), subEv("replace", fullSubDoc, "")) + require.NoError(t, err) + assert.Len(t, pub.events, 5) +} + +func TestHandleSubscription_PublishError(t *testing.T) { + pub := &fakePublisher{err: errors.New("inbox down")} + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + + err := h.handleSubscription(context.Background(), subEv("insert", fullSubDoc, "")) + require.Error(t, err) + assert.NotErrorIs(t, err, migration.ErrSkipped) +} + +func TestHandleSubscription_ZeroTimestamps_GuardedNotYear0001(t *testing.T) { + pub := &fakePublisher{} + doc := `{"_id":"sub1","u":{"_id":"u1","username":"alice"},"rid":"r1","t":"c","disableNotifications":false}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{}) + require.NoError(t, h.handleSubscription(context.Background(), subEv("insert", doc, ""))) + byType := eventsByType(pub.events) + + var read model.SubscriptionReadEvent + require.NoError(t, json.Unmarshal(byType[model.InboxSubscriptionRead].Payload, &read)) + assert.Equal(t, int64(0), read.LastSeenAt, "absent ls/lr must yield 0, not a negative year-0001 millis") + + var ma model.MemberAddEvent + require.NoError(t, json.Unmarshal(byType[model.InboxMemberAdded].Payload, &ma)) + assert.Equal(t, int64(1700000000000), ma.JoinedAt, "absent ts must fall back to now(), not year-0001") +} + +func TestHandleSubscription_RolesCleared_EmitsRoleUpdated(t *testing.T) { + pub := &fakePublisher{} + doc := `{"_id":"sub1","u":{"_id":"u1","username":"alice"},"rid":"r1","t":"c","roles":[]}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: []byte(doc)}) + + ev := oplogEvent{Op: "update", Collection: subsColl, EventID: "e1", + DocumentKey: json.RawMessage(`{"_id":"sub1"}`), + UpdateDescription: json.RawMessage(`{"updatedFields":{"roles":[]}}`)} + require.NoError(t, h.handleSubscription(context.Background(), ev)) + + role, ok := 
eventsByType(pub.events)[model.InboxEventType("role_updated")] + require.True(t, ok, "a roles-cleared update must still emit role_updated") + var su model.SubscriptionUpdateEvent + require.NoError(t, json.Unmarshal(role.Payload, &su)) + // Cleared source roles must land as [member], never empty: inbox-worker permanently drops a + // role_updated with no roles (malformed-event guard), and the new-stack floor is [member] + // (room-service writes ["member"] after a live demotion — the migration must match). + assert.Equal(t, []model.Role{model.RoleMember}, su.Subscription.Roles, + "a demotion (cleared roles) must map to the [member] floor so it survives inbox-worker") +} diff --git a/data-migration/oplog-collections-transformer/targetstore.go b/data-migration/oplog-collections-transformer/targetstore.go new file mode 100644 index 000000000..bd27fc683 --- /dev/null +++ b/data-migration/oplog-collections-transformer/targetstore.go @@ -0,0 +1,91 @@ +package main + +import ( + "context" + "errors" + "fmt" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" + + "github.com/hmchangw/chat/pkg/model" +) + +// mongoTargetStore is the new-stack per-site Mongo access the transformer needs: +// user insert-if-absent plus thread_room / user FK resolution for thread-sub mapping. +type mongoTargetStore struct { + users *mongo.Collection // TargetDB.users + threadRooms *mongo.Collection // TargetDB.thread_rooms +} + +// Compile-time assertion that *mongoTargetStore satisfies targetStore. +var _ targetStore = (*mongoTargetStore)(nil) + +// NewMongoTargetStore binds the users and thread_rooms collections on the target DB. +func NewMongoTargetStore(db *mongo.Database) *mongoTargetStore { + return &mongoTargetStore{ + users: db.Collection("users"), + threadRooms: db.Collection("thread_rooms"), + } +} + +// EnsureIndexes creates the unique index on users.account — the insert-if-absent dedup key. 
+// thread_rooms indexes are owned by message-worker and intentionally not touched here.
+func (s *mongoTargetStore) EnsureIndexes(ctx context.Context) error {
+	// Unique single-field index on account, ascending.
+	accountIdx := mongo.IndexModel{
+		Keys:    bson.D{{Key: "account", Value: 1}},
+		Options: options.Index().SetUnique(true),
+	}
+	_, err := s.users.Indexes().CreateOne(ctx, accountIdx)
+	if err != nil {
+		return fmt.Errorf("ensure users account index: %w", err)
+	}
+	return nil
+}
+
+// UpsertUserIfAbsent writes u only when no users doc with the same account exists yet; a doc that
+// is already present (owned by the company-wide sync) is left as-is. The bool result is true when
+// this call created the doc.
+//
+//nolint:gocritic // model.User passed by value: one per migrated user record, off the hot path.
+func (s *mongoTargetStore) UpsertUserIfAbsent(ctx context.Context, u model.User) (bool, error) {
+	byAccount := bson.M{"account": u.Account}
+	// $setOnInsert applies only when the upsert creates the doc, so a match changes nothing.
+	onInsertOnly := bson.M{"$setOnInsert": u}
+	res, err := s.users.UpdateOne(ctx, byAccount, onInsertOnly, options.UpdateOne().SetUpsert(true))
+	if err != nil {
+		return false, fmt.Errorf("upsert user if absent: %w", err)
+	}
+	inserted := res.UpsertedCount > 0
+	return inserted, nil
+}
+
+// FindThreadRoom looks up the thread room whose parentMessageId equals parentMessageID and returns,
+// in order: room id, thread room id, home site id, found, error. A missing thread room is reported
+// as found=false with a nil error.
+func (s *mongoTargetStore) FindThreadRoom(ctx context.Context, parentMessageID string) (string, string, string, bool, error) {
+	byParent := bson.M{"parentMessageId": parentMessageID}
+	// _id is not named in the projection; MongoDB returns it unless explicitly excluded, which is
+	// what populates tr.ID below.
+	fields := options.FindOne().SetProjection(bson.M{"roomId": 1, "siteId": 1})
+	var tr model.ThreadRoom
+	err := s.threadRooms.FindOne(ctx, byParent, fields).Decode(&tr)
+	switch {
+	case errors.Is(err, mongo.ErrNoDocuments):
+		return "", "", "", false, nil
+	case err != nil:
+		return "", "", "", false, fmt.Errorf("find thread room by parent message: %w", err)
+	}
+	return tr.RoomID, tr.ID, tr.SiteID, true, nil
+}
+
+// FindUserID resolves the new-stack user _id for the given account. found=false (no error)
+// when the account has not been seeded yet.
+func (s *mongoTargetStore) FindUserID(ctx context.Context, account string) (string, bool, error) {
+	byAccount := bson.M{"account": account}
+	// Only the _id is needed, so project everything else away.
+	idOnly := options.FindOne().SetProjection(bson.M{"_id": 1})
+	var u model.User
+	err := s.users.FindOne(ctx, byAccount, idOnly).Decode(&u)
+	switch {
+	case errors.Is(err, mongo.ErrNoDocuments):
+		return "", false, nil
+	case err != nil:
+		return "", false, fmt.Errorf("find user id by account: %w", err)
+	}
+	return u.ID, true, nil
+}
diff --git a/data-migration/oplog-collections-transformer/threadsubs.go b/data-migration/oplog-collections-transformer/threadsubs.go
new file mode 100644
index 000000000..86d4a3f56
--- /dev/null
+++ b/data-migration/oplog-collections-transformer/threadsubs.go
@@ -0,0 +1,124 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"go.mongodb.org/mongo-driver/v2/bson"
+
+	"github.com/hmchangw/chat/pkg/idgen"
+	"github.com/hmchangw/chat/pkg/migration"
+	"github.com/hmchangw/chat/pkg/model"
+	"github.com/hmchangw/chat/pkg/natsutil"
+)
+
+// sourceThreadSub is the subset of a tsmc_thread_subscriptions doc the mapper decodes (handles $date).
+type sourceThreadSub struct {
+	ID string `bson:"_id"`
+	U  struct {
+		ID       string `bson:"_id"`
+		Username string `bson:"username"`
+	} `bson:"u"`
+	RID           string `bson:"rid"`
+	ParentMessage struct {
+		ID string `bson:"_id"`
+	} `bson:"parentMessage"`
+	LastSeenAt    *time.Time `bson:"lastSeenAt"`
+	UnreadMention int        `bson:"unreadMention"`
+	CreatedAt     time.Time  `bson:"createdAt"`
+}
+
+// handleThreadSub maps a tsmc_thread_subscriptions change event (§4.4 / §4.0): delete → skip;
+// insert/replace/update → resolve the thread_room + user FKs, then publish thread_subscription_upserted.
+//
+//nolint:gocritic // ev passed by value to mirror handle's signature; off the hot path.
+func (h *handler) handleThreadSub(ctx context.Context, ev oplogEvent) error {
+	reqID := natsutil.RequestIDFromContext(ctx)
+
+	if ev.Op == "delete" {
+		// Un-actionable: the delete event carries only the source _id, and there is no inbox
+		// removal handler for thread-sub unfollows (spec §4.4 / D2).
+		slog.Debug("skip thread_sub delete (un-actionable, no inbox removal handler)",
+			"eventId", ev.EventID, "request_id", reqID)
+		h.metrics.onSkipped(ctx, "thread_sub_delete")
+		return migration.ErrSkipped
+	}
+
+	doc, skip, err := h.resolveDoc(ctx, ev)
+	if err != nil {
+		return err
+	}
+	if skip {
+		h.metrics.onSkipped(ctx, ev.Op+"_skip")
+		return migration.ErrSkipped
+	}
+
+	var sts sourceThreadSub
+	if uerr := bson.UnmarshalExtJSON(doc, false, &sts); uerr != nil {
+		return fmt.Errorf("%w: decode source thread_sub: %v", migration.ErrPoison, uerr) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only
+	}
+
+	parentMessageID := sts.ParentMessage.ID
+	account := sts.U.Username
+
+	// A blank parentMessage._id or account is structurally invalid source data: the FK lookups
+	// below would always miss and Nak-storm to MAX_DELIVER. Poison instead — redelivery can't fix it.
+	if parentMessageID == "" {
+		return fmt.Errorf("%w: thread_sub has empty parentMessage._id", migration.ErrPoison)
+	}
+	if account == "" {
+		return fmt.Errorf("%w: thread_sub has empty u.username", migration.ErrPoison)
+	}
+
+	// Resolve thread_room by parent message id — yields roomID, threadRoomID, and the room's
+	// siteID (thread-subs inherit the room's site per spec §6).
+	roomID, threadRoomID, roomSiteID, found, err := h.target.FindThreadRoom(ctx, parentMessageID)
+	if err != nil {
+		return fmt.Errorf("find thread room for parentMessage %s: %w", parentMessageID, err)
+	}
+	if !found {
+		// Thread room hasn't been created yet by the message migration — Nak and retry.
+		h.metrics.onResolveMiss(ctx, "thread_room")
+		return fmt.Errorf("thread_room not found for parentMessage %s", parentMessageID)
+	}
+
+	// Cross-check: if the source rid is non-empty and differs from the resolved roomID,
+	// log a warning — the resolved room is authoritative.
+	if sts.RID != "" && sts.RID != roomID {
+		slog.Warn("thread_sub source rid differs from resolved roomID — using resolved",
+			"source_rid", sts.RID, "resolved_room_id", roomID,
+			"parentMessageId", parentMessageID, "request_id", reqID)
+	}
+
+	// Resolve user by account — needed for the UserID FK.
+	userID, uFound, err := h.target.FindUserID(ctx, account)
+	if err != nil {
+		return fmt.Errorf("find user for account %s: %w", account, err)
+	}
+	if !uFound {
+		// User hasn't been seeded yet — Nak and retry.
+		h.metrics.onResolveMiss(ctx, "user")
+		return fmt.Errorf("user not found for account %s", account)
+	}
+
+	now := time.UnixMilli(h.nowMillis()).UTC()
+
+	// An absent source createdAt decodes to the zero time.Time; publishing it as-is would stamp
+	// the destination row with year 0001. Fall back to the handler clock instead, the same
+	// treatment the subscription handler gives an absent ts.
+	createdAt := now
+	if !sts.CreatedAt.IsZero() {
+		createdAt = sts.CreatedAt.UTC()
+	}
+
+	sub := model.ThreadSubscription{
+		ID:              idgen.GenerateUUIDv7(),
+		ParentMessageID: parentMessageID,
+		RoomID:          roomID,
+		ThreadRoomID:    threadRoomID,
+		UserID:          userID,
+		UserAccount:     account,
+		SiteID:          roomSiteID,
+		LastSeenAt:      sts.LastSeenAt,
+		HasMention:      sts.UnreadMention > 0,
+		CreatedAt:       createdAt,
+		UpdatedAt:       now,
+	}
+
+	payload := mustMarshal(sub)
+	return h.pub.Publish(ctx, h.inboxEvent(model.InboxThreadSubscriptionUpserted, roomSiteID, payload))
+}
diff --git a/data-migration/oplog-collections-transformer/threadsubs_test.go b/data-migration/oplog-collections-transformer/threadsubs_test.go
new file mode 100644
index 000000000..c46da41db
--- /dev/null
+++ b/data-migration/oplog-collections-transformer/threadsubs_test.go
@@ -0,0 +1,302 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/hmchangw/chat/pkg/migration"
+	"github.com/hmchangw/chat/pkg/model"
+)
+
+// threadSubTarget is a targetStore fake
wired for thread-sub FK resolution tests. +// It returns configurable results for FindThreadRoom and FindUserID while delegating +// UpsertUserIfAbsent to the embedded fakeTarget so existing tests are unaffected. +type threadSubTarget struct { + fakeTarget + // FindThreadRoom return values. + threadRoomFound bool + threadRoomErr error + roomID string + threadRoomID string + roomSiteID string + + // FindUserID return values. + userFound bool + userErr error + userID string +} + +func (f *threadSubTarget) FindThreadRoom(_ context.Context, _ string) (string, string, string, bool, error) { + return f.roomID, f.threadRoomID, f.roomSiteID, f.threadRoomFound, f.threadRoomErr +} + +func (f *threadSubTarget) FindUserID(_ context.Context, _ string) (string, bool, error) { + return f.userID, f.userFound, f.userErr +} + +// newResolvedTarget builds a threadSubTarget where both FK lookups succeed. +func newResolvedTarget() *threadSubTarget { + return &threadSubTarget{ + threadRoomFound: true, + roomID: "room1", + threadRoomID: "thread1", + roomSiteID: testSiteID, + userFound: true, + userID: "user1", + } +} + +// threadSubEv builds an oplogEvent for the tsmc_thread_subscriptions collection. +func threadSubEv(op, doc string) oplogEvent { + ev := oplogEvent{Op: op, Collection: threadSubColl, EventID: "ts1"} + if doc != "" { + ev.FullDocument = json.RawMessage(doc) + } + ev.DocumentKey = json.RawMessage(`{"_id":"ts1"}`) + return ev +} + +// A full source thread_sub doc matching SOURCE_DATA.md §5. +const fullThreadSubDoc = `{ + "_id":"ts1", + "u":{"_id":"u1","username":"alice"}, + "rid":"room1", + "parentMessage":{"_id":"tmid1"}, + "lastSeenAt":{"$date":"2024-01-15T10:00:00.000Z"}, + "unreadMention":2, + "createdAt":{"$date":"2024-01-01T00:00:00.000Z"} +}` + +// A thread_sub doc with no lastSeenAt (optional field). 
+const threadSubNoLastSeen = `{ + "_id":"ts2", + "u":{"_id":"u1","username":"alice"}, + "rid":"room1", + "parentMessage":{"_id":"tmid1"}, + "unreadMention":0, + "createdAt":{"$date":"2024-01-01T00:00:00.000Z"} +}` + +func TestHandleThreadSub_Insert_BothFKsResolve(t *testing.T) { + pub := &fakePublisher{} + target := newResolvedTarget() + h := newTestHandler(pub, target, &fakeLookup{}) + + err := h.handleThreadSub(context.Background(), threadSubEv("insert", fullThreadSubDoc)) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + evt := pub.events[0] + assert.Equal(t, model.InboxThreadSubscriptionUpserted, evt.Type) + assert.Equal(t, testSiteID, evt.SiteID) + assert.Equal(t, testSiteID, evt.DestSiteID) + + var sub model.ThreadSubscription + require.NoError(t, json.Unmarshal(evt.Payload, &sub)) + assert.Equal(t, "tmid1", sub.ParentMessageID) + assert.Equal(t, "room1", sub.RoomID) + assert.Equal(t, "thread1", sub.ThreadRoomID) + assert.Equal(t, "user1", sub.UserID) + assert.Equal(t, "alice", sub.UserAccount) + assert.Equal(t, testSiteID, sub.SiteID) + assert.True(t, sub.HasMention, "unreadMention=2 should set HasMention=true") + require.NotNil(t, sub.LastSeenAt) + wantLastSeen := time.Date(2024, 1, 15, 10, 0, 0, 0, time.UTC) + assert.Equal(t, wantLastSeen, sub.LastSeenAt.UTC()) + wantCreatedAt := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) + assert.Equal(t, wantCreatedAt, sub.CreatedAt.UTC()) + // UpdatedAt is the handler's now time (1700000000000 ms in tests). + assert.Equal(t, time.UnixMilli(1700000000000).UTC(), sub.UpdatedAt.UTC()) + // ID must be non-empty (generated UUIDv7). 
+ assert.NotEmpty(t, sub.ID) +} + +func TestHandleThreadSub_Insert_ThreadRoomMissing_TransientError(t *testing.T) { + pub := &fakePublisher{} + target := &threadSubTarget{ + threadRoomFound: false, + userFound: true, + userID: "user1", + } + h := newTestHandler(pub, target, &fakeLookup{}) + + err := h.handleThreadSub(context.Background(), threadSubEv("insert", fullThreadSubDoc)) + require.Error(t, err) + assert.NotErrorIs(t, err, migration.ErrSkipped, "missing thread_room must Nak, not skip") + assert.NotErrorIs(t, err, migration.ErrPoison, "missing thread_room must Nak, not poison") + assert.Empty(t, pub.events, "no event published when thread_room missing") +} + +func TestHandleThreadSub_Insert_UserMissing_TransientError(t *testing.T) { + pub := &fakePublisher{} + target := &threadSubTarget{ + threadRoomFound: true, + roomID: "room1", + threadRoomID: "thread1", + roomSiteID: testSiteID, + userFound: false, + } + h := newTestHandler(pub, target, &fakeLookup{}) + + err := h.handleThreadSub(context.Background(), threadSubEv("insert", fullThreadSubDoc)) + require.Error(t, err) + assert.NotErrorIs(t, err, migration.ErrSkipped, "missing user must Nak, not skip") + assert.NotErrorIs(t, err, migration.ErrPoison, "missing user must Nak, not poison") + assert.Empty(t, pub.events, "no event published when user missing") +} + +func TestHandleThreadSub_Update_BothFKsResolve(t *testing.T) { + pub := &fakePublisher{} + target := newResolvedTarget() + // fakeLookup returns the full doc on update re-read. 
+ h := newTestHandler(pub, target, &fakeLookup{doc: json.RawMessage(fullThreadSubDoc)}) + + ev := oplogEvent{ + Op: "update", + Collection: threadSubColl, + EventID: "ts1", + DocumentKey: json.RawMessage(`{"_id":"ts1"}`), + UpdateDescription: json.RawMessage(`{"updatedFields":{"lastSeenAt":{"$date":"2024-01-15T10:00:00.000Z"}}}`), + } + err := h.handleThreadSub(context.Background(), ev) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + assert.Equal(t, model.InboxThreadSubscriptionUpserted, pub.events[0].Type) + var sub model.ThreadSubscription + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &sub)) + assert.Equal(t, "thread1", sub.ThreadRoomID) + assert.Equal(t, "room1", sub.RoomID) + assert.Equal(t, "user1", sub.UserID) +} + +func TestHandleThreadSub_Delete_Skip(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, newResolvedTarget(), &fakeLookup{}) + + err := h.handleThreadSub(context.Background(), threadSubEv("delete", "")) + assert.ErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleThreadSub_UnreadMention_HasMention(t *testing.T) { + tests := []struct { + name string + unreadMention int + wantMention bool + }{ + {"unreadMention=0 → HasMention=false", 0, false}, + {"unreadMention=3 → HasMention=true", 3, true}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + pub := &fakePublisher{} + target := newResolvedTarget() + h := newTestHandler(pub, target, &fakeLookup{}) + + docJSON, err := json.Marshal(map[string]any{ + "_id": "ts1", + "u": map[string]any{"_id": "u1", "username": "alice"}, + "rid": "room1", + "parentMessage": map[string]any{"_id": "tmid1"}, + "unreadMention": tc.unreadMention, + "createdAt": map[string]any{"$date": "2024-01-01T00:00:00.000Z"}, + }) + require.NoError(t, err) + + herr := h.handleThreadSub(context.Background(), threadSubEv("insert", string(docJSON))) + require.NoError(t, herr) + require.Len(t, pub.events, 1) + + var sub 
model.ThreadSubscription + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &sub)) + assert.Equal(t, tc.wantMention, sub.HasMention) + }) + } +} + +func TestHandleThreadSub_NoLastSeenAt_NilInPayload(t *testing.T) { + pub := &fakePublisher{} + target := newResolvedTarget() + h := newTestHandler(pub, target, &fakeLookup{}) + + err := h.handleThreadSub(context.Background(), threadSubEv("insert", threadSubNoLastSeen)) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + var sub model.ThreadSubscription + require.NoError(t, json.Unmarshal(pub.events[0].Payload, &sub)) + assert.Nil(t, sub.LastSeenAt, "absent lastSeenAt must decode as nil") + assert.False(t, sub.HasMention, "unreadMention=0 → HasMention=false") +} + +func TestHandleThreadSub_Insert_NoFullDocument_Poison(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, newResolvedTarget(), &fakeLookup{}) + + ev := oplogEvent{Op: "insert", Collection: threadSubColl, EventID: "ts1", + DocumentKey: json.RawMessage(`{"_id":"ts1"}`)} + err := h.handleThreadSub(context.Background(), ev) + assert.ErrorIs(t, err, migration.ErrPoison) + assert.Empty(t, pub.events) +} + +// emptyParentThreadSubDoc has a blank parentMessage._id — structurally invalid, can never +// resolve a thread_room, so it must poison rather than Nak-storm to MAX_DELIVER. +const emptyParentThreadSubDoc = `{ + "_id":"ts1", + "u":{"_id":"u1","username":"alice"}, + "rid":"room1", + "parentMessage":{"_id":""}, + "unreadMention":0, + "createdAt":{"$date":"2024-01-01T00:00:00.000Z"} +}` + +// emptyAccountThreadSubDoc has a blank u.username — can never resolve a user, must poison. 
+const emptyAccountThreadSubDoc = `{ + "_id":"ts1", + "u":{"_id":"u1","username":""}, + "rid":"room1", + "parentMessage":{"_id":"tmid1"}, + "unreadMention":0, + "createdAt":{"$date":"2024-01-01T00:00:00.000Z"} +}` + +func TestHandleThreadSub_Insert_EmptyParentMessageID_Poison(t *testing.T) { + pub := &fakePublisher{} + // A resolving target would otherwise succeed — the guard must trip before any FK lookup. + h := newTestHandler(pub, newResolvedTarget(), &fakeLookup{}) + + err := h.handleThreadSub(context.Background(), threadSubEv("insert", emptyParentThreadSubDoc)) + assert.ErrorIs(t, err, migration.ErrPoison) + assert.NotErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleThreadSub_Insert_EmptyAccount_Poison(t *testing.T) { + pub := &fakePublisher{} + h := newTestHandler(pub, newResolvedTarget(), &fakeLookup{}) + + err := h.handleThreadSub(context.Background(), threadSubEv("insert", emptyAccountThreadSubDoc)) + assert.ErrorIs(t, err, migration.ErrPoison) + assert.NotErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleThreadSub_Replace_BothFKsResolve(t *testing.T) { + pub := &fakePublisher{} + target := newResolvedTarget() + h := newTestHandler(pub, target, &fakeLookup{}) + + err := h.handleThreadSub(context.Background(), threadSubEv("replace", fullThreadSubDoc)) + require.NoError(t, err) + + require.Len(t, pub.events, 1) + assert.Equal(t, model.InboxThreadSubscriptionUpserted, pub.events[0].Type) +} diff --git a/data-migration/oplog-collections-transformer/users.go b/data-migration/oplog-collections-transformer/users.go new file mode 100644 index 000000000..93b44154a --- /dev/null +++ b/data-migration/oplog-collections-transformer/users.go @@ -0,0 +1,165 @@ +package main + +import ( + "context" + "fmt" + "log/slog" + + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/hmchangw/chat/pkg/idgen" + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/model" + 
"github.com/hmchangw/chat/pkg/natsutil" +) + +// sourceUser is the subset of a users doc the mapper consumes, decoded from relaxed extended JSON. +type sourceUser struct { + ID string `bson:"_id"` + Username string `bson:"username"` + Type string `bson:"type"` + StatusText string `bson:"statusText"` + Roles []string `bson:"roles"` + CustomFields struct { + EngName string `bson:"engName"` + TsmcName string `bson:"tsmcName"` + DeptID string `bson:"deptId"` + DeptName string `bson:"deptName"` + SectID string `bson:"sectId"` + SectName string `bson:"sectName"` + } `bson:"customFields"` + // Federation.Origin is the user's home site (absent ⇒ local); drives siteId stamping. + Federation struct { + Origin string `bson:"origin"` + } `bson:"federation"` +} + +// handleUser maps a users change event to an insert-if-absent direct write to the per-site +// users collection per spec §4.1. delete is skipped (deactivation is active:false, deferred). +// +//nolint:gocritic // ev passed by value to mirror handle's signature; off the hot path. +func (h *handler) handleUser(ctx context.Context, ev oplogEvent) error { + if ev.Op == "delete" { + // Deactivation is active:false (an update), not a row delete; the delete event is + // un-actionable anyway (only the source _id). Deferred — skip + metric. 
+ slog.Debug("skip user delete (deactivation is active:false, deferred)", + "eventId", ev.EventID, "request_id", natsutil.RequestIDFromContext(ctx)) + h.metrics.onSkipped(ctx, "user_delete") + return migration.ErrSkipped + } + + doc, skip, err := h.resolveDoc(ctx, ev) + if err != nil { + return err + } + if skip { + h.metrics.onSkipped(ctx, ev.Op+"_skip") + return migration.ErrSkipped + } + + var su sourceUser + if uerr := bson.UnmarshalExtJSON(doc, false, &su); uerr != nil { + return fmt.Errorf("%w: decode source user: %v", migration.ErrPoison, uerr) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only + } + + u := model.User{ + ID: idgen.GenerateUUIDv7(), + Account: su.Username, + EngName: su.CustomFields.EngName, + ChineseName: su.CustomFields.TsmcName, + SectID: su.CustomFields.SectID, + SectName: su.CustomFields.SectName, + DeptID: su.CustomFields.DeptID, + DeptName: su.CustomFields.DeptName, + StatusText: su.StatusText, + Roles: mapUserRoles(su.Roles), + SiteID: siteIDFromOrigin(su.Federation.Origin, h.siteID), + } + + inserted, err := h.target.UpsertUserIfAbsent(ctx, u) + if err != nil { + return fmt.Errorf("upsert user if absent (account %q): %w", u.Account, err) + } + if inserted { + h.metrics.onUserSeed(ctx, "insert") + } else { + h.metrics.onUserSeed(ctx, "present") + } + + // Post-seed HR fields (engName, deptId, sectId, …) are NOT propagated — the company-wide user + // sync owns those (spec §4.1/§9). statusText is the exception: it is chat-originated (set by the + // user inside the legacy chat, not part of the HR dataset), so no other sync carries it. A live + // statusText change must reach every site (global-visibility, not home-routed); fan it to all + // sites incl. ours — the migration has no synchronous local writer, so we learn via our own inbox. 
+ if ev.Op == "update" { + var desc updateDescription + if len(ev.UpdateDescription) > 0 { + if derr := bson.UnmarshalExtJSON(ev.UpdateDescription, false, &desc); derr != nil { + return fmt.Errorf("%w: decode user updateDescription: %v", migration.ErrPoison, derr) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only + } + } + if changed(desc, "statusText") { + if err := h.publishUserStatus(ctx, su.Username, su.StatusText); err != nil { + return err + } + } + } + + return nil +} + +// publishUserStatus fans a user_status_updated InboxEvent to every site incl. our own (statusIsShow +// stays nil — owned by the company-wide sync). A publish failure Naks the whole event; re-fan is idempotent. +func (h *handler) publishUserStatus(ctx context.Context, account, statusText string) error { + now := h.nowMillis() + payload := mustMarshal(model.UserStatusUpdated{ + Account: account, + StatusText: statusText, + Timestamp: now, + }) + sent := 0 + for _, dest := range h.allSiteIDs { + if dest == "" { + continue + } + evt := model.InboxEvent{ + Type: model.InboxUserStatusUpdated, + SiteID: h.siteID, + DestSiteID: dest, + Payload: payload, + Timestamp: now, + } + if err := h.pub.Publish(ctx, evt); err != nil { + return fmt.Errorf("publish user_status_updated to %q: %w", dest, err) + } + sent++ + } + if sent == 0 { + // No destination sites — ALL_SITE_IDS empty/misconfigured, or a partial deployment that + // doesn't fan status. Skip cleanly with a logged + metered signal rather than Nak-storming + // every status event. TODO: make empty ALL_SITE_IDS a startup hard-fail once the failure + // modes are understood. 
+ slog.WarnContext(ctx, "ALL_SITE_IDS has no destinations — skipping user status fan-out", + "account", account, "request_id", natsutil.RequestIDFromContext(ctx)) + h.metrics.onSkipped(ctx, "status_no_sites") + return migration.ErrSkipped + } + return nil +} + +// mapUserRoles maps source role strings to model.UserRole: "admin" → UserRoleAdmin, all else +// → UserRoleUser. Returns nil for no roles (an empty Roles reads as ["user"]). +func mapUserRoles(roles []string) []model.UserRole { + if len(roles) == 0 { + return nil + } + out := make([]model.UserRole, 0, len(roles)) + for _, r := range roles { + if r == string(model.UserRoleAdmin) { + out = append(out, model.UserRoleAdmin) + } else { + out = append(out, model.UserRoleUser) + } + } + return out +} diff --git a/data-migration/oplog-collections-transformer/users_test.go b/data-migration/oplog-collections-transformer/users_test.go new file mode 100644 index 000000000..83f47df9f --- /dev/null +++ b/data-migration/oplog-collections-transformer/users_test.go @@ -0,0 +1,145 @@ +package main + +import ( + "context" + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/migration" + "github.com/hmchangw/chat/pkg/model" +) + +func userEv(op, doc string) oplogEvent { + ev := oplogEvent{Op: op, Collection: usersColl, EventID: "e1"} + if doc != "" { + ev.FullDocument = json.RawMessage(doc) + } + ev.DocumentKey = json.RawMessage(`{"_id":"u1"}`) + return ev +} + +func TestHandleUser_InsertMapsFields(t *testing.T) { + target := &fakeTarget{inserted: true} + h := newTestHandler(&fakePublisher{}, target, &fakeLookup{}) + + doc := `{"_id":"u1","username":"alice","type":"user","statusText":"hi",` + + `"roles":["admin","user"],` + + `"customFields":{"engName":"Alice","tsmcName":"愛麗絲","deptId":"D1","deptName":"Dept","sectId":"S1","sectName":"Sect"}}` + err := h.handleUser(context.Background(), userEv("insert", doc)) + require.NoError(t, err) + 
+ require.Len(t, target.upserted, 1) + u := target.upserted[0] + assert.Equal(t, "alice", u.Account) + assert.Equal(t, "Alice", u.EngName) + assert.Equal(t, "愛麗絲", u.ChineseName) + assert.Equal(t, "D1", u.DeptID) + assert.Equal(t, "Dept", u.DeptName) + assert.Equal(t, "S1", u.SectID) + assert.Equal(t, "Sect", u.SectName) + assert.Equal(t, "hi", u.StatusText) + assert.Equal(t, testSiteID, u.SiteID) + assert.NotEmpty(t, u.ID) + require.Len(t, u.Roles, 2) + assert.Equal(t, model.UserRoleAdmin, u.Roles[0]) + assert.Equal(t, model.UserRoleUser, u.Roles[1]) +} + +func TestHandleUser_Delete(t *testing.T) { + target := &fakeTarget{} + h := newTestHandler(&fakePublisher{}, target, &fakeLookup{}) + + err := h.handleUser(context.Background(), userEv("delete", "")) + assert.ErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, target.upserted) +} + +func TestHandleUser_FederatedOriginSiteID(t *testing.T) { + target := &fakeTarget{inserted: true} + h := newTestHandler(&fakePublisher{}, target, &fakeLookup{}) + + doc := `{"_id":"u1","username":"bob","federation":{"origin":"0030204.tchat-test.test.company.com"}}` + err := h.handleUser(context.Background(), userEv("insert", doc)) + require.NoError(t, err) + + require.Len(t, target.upserted, 1) + assert.Equal(t, "0030204", target.upserted[0].SiteID) +} + +func TestHandleUser_AlreadyPresent(t *testing.T) { + target := &fakeTarget{inserted: false} + h := newTestHandler(&fakePublisher{}, target, &fakeLookup{}) + + doc := `{"_id":"u1","username":"carol"}` + err := h.handleUser(context.Background(), userEv("insert", doc)) + require.NoError(t, err) + require.Len(t, target.upserted, 1) + assert.Equal(t, "carol", target.upserted[0].Account) +} + +func TestHandleUser_StatusTextUpdate_FansToAllSites(t *testing.T) { + pub := &fakePublisher{} + doc := `{"_id":"u1","username":"alice","statusText":"in a meeting"}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: []byte(doc)}) + + ev := oplogEvent{ + Op: "update", + Collection: 
usersColl, + EventID: "e1", + DocumentKey: json.RawMessage(`{"_id":"u1"}`), + UpdateDescription: json.RawMessage(`{"updatedFields":{"statusText":"in a meeting"}}`), + } + require.NoError(t, h.handleUser(context.Background(), ev)) + + // statusText is chat-originated and global-visibility: fan to every site in allSiteIDs (s1 incl. self, s2). + require.Len(t, pub.events, 2) + var dests []string + for _, e := range pub.events { + assert.Equal(t, model.InboxUserStatusUpdated, e.Type) + assert.Equal(t, testSiteID, e.SiteID) + dests = append(dests, e.DestSiteID) + var p model.UserStatusUpdated + require.NoError(t, json.Unmarshal(e.Payload, &p)) + assert.Equal(t, "alice", p.Account) + assert.Equal(t, "in a meeting", p.StatusText) + assert.Nil(t, p.StatusIsShow, "statusIsShow is not sourced — left nil for the user sync to own") + } + assert.ElementsMatch(t, []string{testSiteID, "s2"}, dests) +} + +func TestHandleUser_StatusTextUpdate_NoSites_WarnsAndSkips(t *testing.T) { + pub := &fakePublisher{} + doc := `{"_id":"u1","username":"alice","statusText":"in a meeting"}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: []byte(doc)}) + h.allSiteIDs = nil // ALL_SITE_IDS empty (misconfig, or a partial deployment without status fan-out) + + ev := oplogEvent{ + Op: "update", + Collection: usersColl, + EventID: "e1", + DocumentKey: json.RawMessage(`{"_id":"u1"}`), + UpdateDescription: json.RawMessage(`{"updatedFields":{"statusText":"in a meeting"}}`), + } + err := h.handleUser(context.Background(), ev) + // No destinations: warn + Ack-skip — not a retry-storm error, not a silent drop. 
+ assert.ErrorIs(t, err, migration.ErrSkipped) + assert.Empty(t, pub.events) +} + +func TestHandleUser_NonStatusUpdate_NoFanout(t *testing.T) { + pub := &fakePublisher{} + doc := `{"_id":"u1","username":"alice","customFields":{"deptName":"NewDept"}}` + h := newTestHandler(pub, &fakeTarget{}, &fakeLookup{doc: []byte(doc)}) + + ev := oplogEvent{ + Op: "update", + Collection: usersColl, + DocumentKey: json.RawMessage(`{"_id":"u1"}`), + UpdateDescription: json.RawMessage(`{"updatedFields":{"customFields.deptName":"NewDept"}}`), + } + require.NoError(t, h.handleUser(context.Background(), ev)) + assert.Empty(t, pub.events, "an HR-field update must not fan a status event") +} diff --git a/data-migration/oplog-transformer/errors.go b/data-migration/oplog-transformer/errors.go deleted file mode 100644 index e1a73f71a..000000000 --- a/data-migration/oplog-transformer/errors.go +++ /dev/null @@ -1,11 +0,0 @@ -package main - -import "errors" - -// errPoison marks an event that can never succeed (unmappable doc). The consume loop Terms -// these instead of redelivering, so one bad event never wedges the stream. -var errPoison = errors.New("poison event") - -// errSkipped marks an event the handler deliberately dropped. The consume loop Acks these but does -// NOT count them as processed — the skip is already metered via onSkipped, so counting it double-counts. -var errSkipped = errors.New("event skipped") diff --git a/data-migration/oplog-transformer/handler.go b/data-migration/oplog-transformer/handler.go index f008245de..14937c4b9 100644 --- a/data-migration/oplog-transformer/handler.go +++ b/data-migration/oplog-transformer/handler.go @@ -7,10 +7,16 @@ import ( "log/slog" "time" + "github.com/hmchangw/chat/pkg/migration" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" ) +// sourceLookup fetches the current full message doc from the source by _id. 
+type sourceLookup interface { + FindByID(ctx context.Context, id string) ([]byte, error) +} + // oplogEvent mirrors model.OplogEvent's wire shape (decoded from the consumed message). type oplogEvent struct { EventID string `json:"eventId"` @@ -41,30 +47,30 @@ type handler struct { } // skipSystem skips a system/event message (not user content, deferred from migration), records a -// skip metric, and returns errSkipped so the caller Acks without counting it as processed. +// skip metric, and returns migration.ErrSkipped so the caller Acks without counting it as processed. func (h *handler) skipSystem(ctx context.Context, t, eventID string) error { slog.Info("skipping system message", "t", t, "eventId", eventID, "request_id", natsutil.RequestIDFromContext(ctx)) h.metrics.onSkipped(ctx, "system_message") - return errSkipped + return migration.ErrSkipped } // skipForeign skips a message authored at a remote site (federation.origin set) — foreign copies -// arrive via the new app's own federation. origin is a site id, safe to log. Returns errSkipped. +// arrive via the new app's own federation. origin is a site id, safe to log. Returns migration.ErrSkipped. func (h *handler) skipForeign(ctx context.Context, origin, eventID string) error { slog.Info("skipping foreign-origin message", "origin", origin, "eventId", eventID, "request_id", natsutil.RequestIDFromContext(ctx)) h.metrics.onSkipped(ctx, "foreign_origin") - return errSkipped + return migration.ErrSkipped } -// handle processes one decoded oplog event. nil = ack+count; errSkipped = ack-without-counting -// (deliberate drop, already metered); errPoison => Term; any other error => Nak (transient). +// handle processes one decoded oplog event. nil = ack+count; migration.ErrSkipped = ack-without-counting +// (deliberate drop, already metered); migration.ErrPoison => Term; any other error => Nak (transient). 
// //nolint:gocritic // ev passed by value: it's the decoded event the consume loop hands off, one per message off the hot path. func (h *handler) handle(ctx context.Context, ev oplogEvent) error { if ev.Collection != h.collection { slog.Debug("skip non-message collection", "collection", ev.Collection, "request_id", natsutil.RequestIDFromContext(ctx)) h.metrics.onSkipped(ctx, "other_collection") - return errSkipped + return migration.ErrSkipped } switch ev.Op { case "insert": @@ -78,7 +84,7 @@ func (h *handler) handle(ctx context.Context, ev oplogEvent) error { default: slog.Warn("unknown op skipped", "op", ev.Op, "eventId", ev.EventID, "request_id", natsutil.RequestIDFromContext(ctx)) h.metrics.onSkipped(ctx, "unknown_op") - return errSkipped + return migration.ErrSkipped } } @@ -89,7 +95,7 @@ func (h *handler) handleInsert(ctx context.Context, ev oplogEvent) error { if !ev.Degraded { // A non-degraded insert with no fullDocument is a contract violation — the connector // always carries the doc for inserts unless it degraded it. Poison. - return fmt.Errorf("%w: insert without fullDocument", errPoison) + return fmt.Errorf("%w: insert without fullDocument", migration.ErrPoison) } recovered, err := h.recoverDegradedDoc(ctx, ev) if err != nil { @@ -99,9 +105,9 @@ func (h *handler) handleInsert(ctx context.Context, ev oplogEvent) error { } rc, err := decodeRocketchatMessage(doc) if err != nil { - // Single %w keeps errPoison matchable; the decode error is folded in with %v (nothing checks + // Single %w keeps migration.ErrPoison matchable; the decode error is folded in with %v (nothing checks // errors.Is on it, and one sentinel per chain satisfies the semgrep multi-wrap guard). 
- return fmt.Errorf("%w: %v", errPoison, err) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only + return fmt.Errorf("%w: %v", migration.ErrPoison, err) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only } // Foreign-origin messages are migrated by their home site, not here — the connector's $match // drops them at the source; this is defense-in-depth in case one slips through. @@ -151,13 +157,13 @@ func (h *handler) resolveParentCreatedAt(ctx context.Context, parentID, eventID return &ts, nil } -// documentKeyID decodes documentKey → _id. Returns errPoison when missing/malformed. +// documentKeyID decodes documentKey → _id. Returns migration.ErrPoison when missing/malformed. func documentKeyID(documentKey json.RawMessage) (string, error) { var key struct { ID string `json:"_id"` } if err := json.Unmarshal(documentKey, &key); err != nil || key.ID == "" { - return "", fmt.Errorf("%w: bad documentKey", errPoison) + return "", fmt.Errorf("%w: bad documentKey", migration.ErrPoison) } return key.ID, nil } @@ -212,7 +218,7 @@ func (h *handler) handleUpdate(ctx context.Context, ev oplogEvent) error { // from the recover Nak (own live doc must exist) and thread-parent miss (best-effort publish). 
slog.Warn("update lookup miss — skipping", "id", id, "request_id", natsutil.RequestIDFromContext(ctx)) h.metrics.onSkipped(ctx, "update_lookup_miss") - return errSkipped + return migration.ErrSkipped } return h.applyUpdate(ctx, ev, id, doc) } @@ -226,7 +232,7 @@ func (h *handler) handleReplace(ctx context.Context, ev oplogEvent) error { doc := ev.FullDocument if len(doc) == 0 { if !ev.Degraded { - return fmt.Errorf("%w: replace without fullDocument", errPoison) + return fmt.Errorf("%w: replace without fullDocument", migration.ErrPoison) } recovered, rerr := h.recoverDegradedDoc(ctx, ev) if rerr != nil { @@ -269,7 +275,7 @@ func deleteTime(clusterTime int64) time.Time { func (h *handler) applyUpdate(ctx context.Context, ev oplogEvent, id string, doc []byte) error { rc, err := decodeRocketchatMessage(doc) if err != nil { - return fmt.Errorf("%w: %v", errPoison, err) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only + return fmt.Errorf("%w: %v", migration.ErrPoison, err) //nolint:errorlint // intentional single-%w sentinel wrap; decode err is informational only } // Foreign-origin filter for update/replace: the connector's $match can't drop these (no // fullDocument on update events), so the resolved doc is where we catch them, before classifying. diff --git a/data-migration/oplog-transformer/handler_test.go b/data-migration/oplog-transformer/handler_test.go index befcdb631..1566791f0 100644 --- a/data-migration/oplog-transformer/handler_test.go +++ b/data-migration/oplog-transformer/handler_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/migration" "github.com/hmchangw/chat/pkg/model" ) @@ -92,7 +93,7 @@ func TestHandle_NonDegradedReplaceNilDocumentKeyPoison(t *testing.T) { // no DocumentKey on a non-degraded event = contract violation = poison. 
}) require.Error(t, err) - assert.True(t, errors.Is(err, errPoison), "a non-degraded replace without a documentKey is poison") + assert.True(t, errors.Is(err, migration.ErrPoison), "a non-degraded replace without a documentKey is poison") } func TestHandle_DeleteOpRoutesByID(t *testing.T) { @@ -150,14 +151,14 @@ func TestHandle_InsertThreadReplyParentLookupErrorNaks(t *testing.T) { h := newTestHandler(&recordPublisher{}, &recordHistory{}, errLookup{err: errors.New("source down")}) err := h.handle(context.Background(), oplogEvent{Collection: "rocketchat_message", Op: "insert", FullDocument: loadDoc(t, "threadreply.json")}) require.Error(t, err) - assert.False(t, errors.Is(err, errPoison), "a transient parent-lookup error must Nak (retry), not Term") + assert.False(t, errors.Is(err, migration.ErrPoison), "a transient parent-lookup error must Nak (retry), not Term") } func TestHandle_InsertForeignOriginSkipped(t *testing.T) { pub := &recordPublisher{} h := newTestHandler(pub, &recordHistory{}, fakeLookup{}) err := h.handle(context.Background(), oplogEvent{Collection: "rocketchat_message", Op: "insert", FullDocument: loadDoc(t, "foreign.json")}) - require.ErrorIs(t, err, errSkipped, "a deliberate skip returns errSkipped (Acked, not counted as processed)") + require.ErrorIs(t, err, migration.ErrSkipped, "a deliberate skip returns migration.ErrSkipped (Acked, not counted as processed)") assert.Empty(t, pub.inserts, "a foreign-origin insert must not be published (defense-in-depth behind the connector $match)") } @@ -167,7 +168,7 @@ func TestHandle_UpdateForeignOriginSkipped(t *testing.T) { look := fakeLookup{"fgn456abc789def01": loadDoc(t, "foreign.json")} h := newTestHandler(&recordPublisher{}, hist, look) err := h.handle(context.Background(), oplogEvent{Collection: "rocketchat_message", Op: "update", DocumentKey: []byte(`{"_id":"fgn456abc789def01"}`)}) - require.ErrorIs(t, err, errSkipped) + require.ErrorIs(t, err, migration.ErrSkipped) assert.Empty(t, hist.edits, 
"foreign-origin update must not edit") assert.Empty(t, hist.deletes, "foreign-origin update must not delete") } @@ -175,14 +176,14 @@ func TestHandle_UpdateForeignOriginSkipped(t *testing.T) { func TestHandle_UnknownCollectionSkipped(t *testing.T) { h := newTestHandler(&recordPublisher{}, &recordHistory{}, fakeLookup{}) err := h.handle(context.Background(), oplogEvent{Collection: "users", Op: "insert", FullDocument: []byte(`{}`)}) - require.ErrorIs(t, err, errSkipped, "a non-message collection is skipped (Acked, not counted)") + require.ErrorIs(t, err, migration.ErrSkipped, "a non-message collection is skipped (Acked, not counted)") } func TestHandle_InsertSystemMessageSkipped(t *testing.T) { pub := &recordPublisher{} h := newTestHandler(pub, &recordHistory{}, fakeLookup{}) err := h.handle(context.Background(), oplogEvent{Collection: "rocketchat_message", Op: "insert", FullDocument: loadDoc(t, "system.json")}) - require.ErrorIs(t, err, errSkipped) + require.ErrorIs(t, err, migration.ErrSkipped) assert.Empty(t, pub.inserts, "system messages (t set) must not be published as inserts") } @@ -191,7 +192,7 @@ func TestHandle_UpdateSystemMessageSkipped(t *testing.T) { look := fakeLookup{"sysMsg00000000001": loadDoc(t, "system.json")} h := newTestHandler(&recordPublisher{}, hist, look) err := h.handle(context.Background(), oplogEvent{Collection: "rocketchat_message", Op: "update", DocumentKey: []byte(`{"_id":"sysMsg00000000001"}`), ClusterTime: 1700000000000}) - require.ErrorIs(t, err, errSkipped) + require.ErrorIs(t, err, migration.ErrSkipped) assert.Empty(t, hist.edits, "system message update must not edit") assert.Empty(t, hist.deletes, "system message update must not delete") } @@ -199,7 +200,7 @@ func TestHandle_UpdateSystemMessageSkipped(t *testing.T) { func TestHandle_LookupMissSkipped(t *testing.T) { h := newTestHandler(&recordPublisher{}, &recordHistory{}, fakeLookup{}) err := h.handle(context.Background(), oplogEvent{Collection: "rocketchat_message", Op: 
"update", DocumentKey: []byte(`{"_id":"gone"}`)}) - require.ErrorIs(t, err, errSkipped, "update lookup miss is ack-skipped (the doc is gone, nothing to apply)") + require.ErrorIs(t, err, migration.ErrSkipped, "update lookup miss is ack-skipped (the doc is gone, nothing to apply)") } // errLookup is a sourceLookup that always returns an error (a transient source failure). @@ -235,7 +236,7 @@ func TestHandle_DegradedInsertLookupMissNaks(t *testing.T) { DocumentKey: []byte(`{"_id":"abc123def456ghi78"}`), }) require.Error(t, err) - assert.False(t, errors.Is(err, errPoison), "a degraded-insert lookup miss must Nak (retry), not Term") + assert.False(t, errors.Is(err, migration.ErrPoison), "a degraded-insert lookup miss must Nak (retry), not Term") assert.Empty(t, pub.inserts) } @@ -249,7 +250,7 @@ func TestHandle_DegradedInsertLookupErrorNaks(t *testing.T) { DocumentKey: []byte(`{"_id":"abc123def456ghi78"}`), }) require.Error(t, err) - assert.False(t, errors.Is(err, errPoison), "a degraded-insert lookup error must Nak (retry), not Term") + assert.False(t, errors.Is(err, migration.ErrPoison), "a degraded-insert lookup error must Nak (retry), not Term") } func TestHandle_DegradedInsertNilDocumentKeyNaks(t *testing.T) { @@ -262,7 +263,7 @@ func TestHandle_DegradedInsertNilDocumentKeyNaks(t *testing.T) { DocumentKey: nil, }) require.Error(t, err) - assert.False(t, errors.Is(err, errPoison), "a degraded event with nil documentKey must Nak, not Term") + assert.False(t, errors.Is(err, migration.ErrPoison), "a degraded event with nil documentKey must Nak, not Term") } func TestHandle_NonDegradedInsertEmptyDocPoison(t *testing.T) { @@ -274,7 +275,7 @@ func TestHandle_NonDegradedInsertEmptyDocPoison(t *testing.T) { Degraded: false, }) require.Error(t, err) - assert.True(t, errors.Is(err, errPoison), "a non-degraded insert without fullDocument is a contract violation = poison") + assert.True(t, errors.Is(err, migration.ErrPoison), "a non-degraded insert without fullDocument is a 
contract violation = poison") } func TestHandle_DegradedUpdateNilDocumentKeyNaks(t *testing.T) { @@ -286,7 +287,7 @@ func TestHandle_DegradedUpdateNilDocumentKeyNaks(t *testing.T) { DocumentKey: nil, }) require.Error(t, err) - assert.False(t, errors.Is(err, errPoison), "a degraded update with nil documentKey must Nak, not Term") + assert.False(t, errors.Is(err, migration.ErrPoison), "a degraded update with nil documentKey must Nak, not Term") } func TestHandle_NonDegradedUpdateBadDocumentKeyPoison(t *testing.T) { @@ -297,7 +298,7 @@ func TestHandle_NonDegradedUpdateBadDocumentKeyPoison(t *testing.T) { DocumentKey: []byte(`{bad`), }) require.Error(t, err) - assert.True(t, errors.Is(err, errPoison), "a malformed documentKey on a non-degraded event is poison") + assert.True(t, errors.Is(err, migration.ErrPoison), "a malformed documentKey on a non-degraded event is poison") } func TestHandle_DegradedReplaceRecovered(t *testing.T) { @@ -326,7 +327,7 @@ func TestHandle_DegradedReplaceLookupMissNaks(t *testing.T) { DocumentKey: []byte(`{"_id":"abc123def456ghi78"}`), }) require.Error(t, err) - assert.False(t, errors.Is(err, errPoison), "a degraded-replace lookup miss must Nak (retry), not Term") + assert.False(t, errors.Is(err, migration.ErrPoison), "a degraded-replace lookup miss must Nak (retry), not Term") } func TestHandle_NonDegradedReplaceEmptyDocPoison(t *testing.T) { @@ -338,7 +339,7 @@ func TestHandle_NonDegradedReplaceEmptyDocPoison(t *testing.T) { Degraded: false, }) require.Error(t, err) - assert.True(t, errors.Is(err, errPoison), "a non-degraded replace without fullDocument is poison") + assert.True(t, errors.Is(err, migration.ErrPoison), "a non-degraded replace without fullDocument is poison") } func TestHandle_UnknownOpSkipped(t *testing.T) { @@ -347,7 +348,7 @@ func TestHandle_UnknownOpSkipped(t *testing.T) { Collection: "rocketchat_message", Op: "rename", // not one of insert/update/replace/delete }) - require.ErrorIs(t, err, errSkipped, "an unknown op is 
skipped (Acked, not counted)") + require.ErrorIs(t, err, migration.ErrSkipped, "an unknown op is skipped (Acked, not counted)") } func TestHandle_UpdateLookupErrorNaks(t *testing.T) { @@ -358,7 +359,7 @@ func TestHandle_UpdateLookupErrorNaks(t *testing.T) { DocumentKey: []byte(`{"_id":"abc123def456ghi78"}`), }) require.Error(t, err) - assert.False(t, errors.Is(err, errPoison), "a transient source lookup error must Nak (retry), not Term") + assert.False(t, errors.Is(err, migration.ErrPoison), "a transient source lookup error must Nak (retry), not Term") } func TestHandle_UpdateMalformedDocPoison(t *testing.T) { @@ -371,7 +372,7 @@ func TestHandle_UpdateMalformedDocPoison(t *testing.T) { DocumentKey: []byte(`{"_id":"abc123def456ghi78"}`), }) require.Error(t, err) - assert.True(t, errors.Is(err, errPoison), "a present-but-corrupt looked-up doc is poison") + assert.True(t, errors.Is(err, migration.ErrPoison), "a present-but-corrupt looked-up doc is poison") } func TestHandle_DeleteNilDocumentKeyDegradedNaks(t *testing.T) { @@ -383,7 +384,7 @@ func TestHandle_DeleteNilDocumentKeyDegradedNaks(t *testing.T) { DocumentKey: nil, }) require.Error(t, err) - assert.False(t, errors.Is(err, errPoison), "a degraded delete with nil documentKey must Nak, not Term") + assert.False(t, errors.Is(err, migration.ErrPoison), "a degraded delete with nil documentKey must Nak, not Term") } func TestHandle_DeleteBadDocumentKeyPoison(t *testing.T) { @@ -394,5 +395,5 @@ func TestHandle_DeleteBadDocumentKeyPoison(t *testing.T) { DocumentKey: []byte(`{bad`), }) require.Error(t, err) - assert.True(t, errors.Is(err, errPoison), "a malformed documentKey on a non-degraded delete is poison") + assert.True(t, errors.Is(err, migration.ErrPoison), "a malformed documentKey on a non-degraded delete is poison") } diff --git a/data-migration/oplog-transformer/historyclient.go b/data-migration/oplog-transformer/historyclient.go index 034072e63..8dea3cf0d 100644 --- 
a/data-migration/oplog-transformer/historyclient.go +++ b/data-migration/oplog-transformer/historyclient.go @@ -11,6 +11,7 @@ import ( "go.opentelemetry.io/otel/codes" "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/migration" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/subject" @@ -60,7 +61,7 @@ func (c *natsHistoryClient) request(ctx context.Context, subj string, payload an } termCode, derr := classifyHistoryReply(subj, reply.Data) if termCode != "" { - // A permanent history rejection — the transformer will Term (errPoison). Record the + // A permanent history rejection — the transformer will Term (migration.ErrPoison). Record the // rejecting category so a genuine permanent failure is visible, not buried in generic terms. c.metrics.onHistoryRejected(ctx, string(termCode)) } @@ -78,13 +79,13 @@ func permanentHistoryRejection(code errcode.Code) bool { } } -// classifyHistoryReply maps a history reply to a disposition error: nil → Ack; errPoison-wrapped +// classifyHistoryReply maps a history reply to a disposition error: nil → Ack; migration.ErrPoison-wrapped // (permanent rejection) → Term; plain error (retryable/unknown/not-ok ack/undecodable) → Nak. // termCode is the rejecting category only for a permanent rejection (Term metric), else "". 
func classifyHistoryReply(subj string, data []byte) (termCode errcode.Code, err error) { if ec, ok := errcode.Parse(data); ok { if permanentHistoryRejection(ec.Code) { - return ec.Code, fmt.Errorf("%w: history permanently rejected %q (%s): %s", errPoison, subj, ec.Code, ec.Message) + return ec.Code, fmt.Errorf("%w: history permanently rejected %q (%s): %s", migration.ErrPoison, subj, ec.Code, ec.Message) } return "", fmt.Errorf("history rejected %q (retryable %s): %s", subj, ec.Code, ec.Message) } diff --git a/data-migration/oplog-transformer/historyclient_test.go b/data-migration/oplog-transformer/historyclient_test.go index 3922ce208..231fed557 100644 --- a/data-migration/oplog-transformer/historyclient_test.go +++ b/data-migration/oplog-transformer/historyclient_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/require" "github.com/hmchangw/chat/pkg/errcode" + "github.com/hmchangw/chat/pkg/migration" "github.com/hmchangw/chat/pkg/model" ) @@ -113,8 +114,8 @@ func TestClassifyHistoryReply(t *testing.T) { return } require.Error(t, err) - assert.Equal(t, tc.wantPoison, errors.Is(err, errPoison), - "errPoison membership drives Term vs Nak") + assert.Equal(t, tc.wantPoison, errors.Is(err, migration.ErrPoison), + "migration.ErrPoison membership drives Term vs Nak") assert.Equal(t, tc.wantTermCode, termCode) }) } diff --git a/data-migration/oplog-transformer/integration_test.go b/data-migration/oplog-transformer/integration_test.go index 8155ad585..da70fd8e5 100644 --- a/data-migration/oplog-transformer/integration_test.go +++ b/data-migration/oplog-transformer/integration_test.go @@ -20,6 +20,7 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/oteljetstream" + "github.com/hmchangw/chat/pkg/migration" "github.com/hmchangw/chat/pkg/model" "github.com/hmchangw/chat/pkg/mongoutil" "github.com/hmchangw/chat/pkg/natsutil" @@ -82,7 +83,7 @@ func TestTransformer_InsertToCanonical(t *testing.T) { }) require.NoError(t, err) - lookup := 
newMongoSourceLookup(source) + lookup := migration.NewMongoSourceLookup(source) fullDoc, err := lookup.FindByID(ctx, msgID) require.NoError(t, err) require.NotEmpty(t, fullDoc) @@ -211,7 +212,7 @@ func TestTransformer_SoftDeleteToHistory(t *testing.T) { softDeleteType: "rm", publisher: &canonicalPublisher{siteID: site, publish: nil, now: nowMs}, // unused on delete path history: &natsHistoryClient{nc: nc.NatsConn(), siteID: site, timeout: 5 * time.Second}, - lookup: newMongoSourceLookup(source), + lookup: migration.NewMongoSourceLookup(source), } require.NoError(t, h.handle(ctx, oplogEvent{ diff --git a/data-migration/oplog-transformer/main.go b/data-migration/oplog-transformer/main.go index 8fda83946..49db3905e 100644 --- a/data-migration/oplog-transformer/main.go +++ b/data-migration/oplog-transformer/main.go @@ -19,6 +19,7 @@ import ( "github.com/Marz32onE/instrumentation-go/otel-nats/oteljetstream" + "github.com/hmchangw/chat/pkg/migration" "github.com/hmchangw/chat/pkg/mongoutil" "github.com/hmchangw/chat/pkg/natsutil" "github.com/hmchangw/chat/pkg/otelutil" @@ -103,7 +104,7 @@ func main() { softDeleteType: cfg.SoftDeleteType, publisher: &canonicalPublisher{siteID: cfg.SiteID, publish: js.PublishMsg, now: nowMs}, history: &natsHistoryClient{nc: nc.NatsConn(), siteID: cfg.SiteID, timeout: cfg.HistoryRequestTimeout, metrics: m}, - lookup: newMongoSourceLookup(sourceColl), + lookup: migration.NewMongoSourceLookup(sourceColl), metrics: m, } @@ -151,7 +152,7 @@ func main() { // processOne decodes one event and dispatches it, mapping the outcome to a JetStream disposition: // Ack on success, Term on poison (never redelivered), Nak-with-delay on transient up to maxDeliver -// — then Termed with a distinct metric instead of JetStream's silent drop (see isFinalDelivery). +// — then Termed with a distinct metric instead of JetStream's silent drop (see migration.IsFinalDelivery). 
func processOne(ctx context.Context, h *handler, m jetstream.Msg, mtr *metrics, maxDeliver, deleteMaxDeliver int) { // Stamp a correlation id once at entry; it flows via ctx into the history RPC and canonical publish // (both read it from ctx through natsutil.NewMsg), so transformer→history→worker shares one request_id. @@ -176,19 +177,26 @@ func processOne(ctx context.Context, h *handler, m jetstream.Msg, mtr *metrics, if ev.Op == "delete" { deliverCap = deleteMaxDeliver } - switch err := h.handle(ctx, ev); { - case err == nil: + // Resolve delivery count; a Metadata error prefers Nak over a premature Term. + var numDelivered uint64 + if meta, err := m.Metadata(); err == nil { + numDelivered = meta.NumDelivered + } + isFinal := migration.IsFinalDelivery(numDelivered, deliverCap) + err := h.handle(ctx, ev) + switch migration.Classify(err, isFinal) { + case migration.ActionAck: mtr.onProcessed(ctx, ev.Op) dispose("ack", m.Ack) - case errors.Is(err, errPoison): + case migration.ActionTerm: slog.Error("poison event — term (skipping)", "eventId", ev.EventID, "error", err, "request_id", reqID) mtr.onTerm(ctx, ev.Op) dispose("term", m.Term) - case errors.Is(err, errSkipped): + case migration.ActionAckSkip: // A deliberate skip — already metered via onSkipped by the handler. Ack but DON'T count // it as processed (that would double-count the same event). dispose("ack", m.Ack) - case isFinalDelivery(m, deliverCap): + case migration.ActionTermExhausted: // A further Nak would hit the cap and be silently dropped by JetStream. // Term it explicitly so the give-up is logged + metered instead of vanishing. 
slog.Error("delivery limit reached — terming (dropping)", "eventId", ev.EventID, "op", ev.Op, "cap", deliverCap, "error", err, "request_id", reqID) @@ -201,19 +209,6 @@ func processOne(ctx context.Context, h *handler, m jetstream.Msg, mtr *metrics, } } -// isFinalDelivery reports whether this is the last delivery (NumDelivered ≥ maxDeliver), so a further -// Nak would be a silent drop. maxDeliver ≤ 0 means unlimited; a Metadata error prefers Nak over a premature Term. -func isFinalDelivery(m jetstream.Msg, maxDeliver int) bool { - if maxDeliver <= 0 { - return false - } - meta, err := m.Metadata() - if err != nil { - return false - } - return meta.NumDelivered >= uint64(maxDeliver) -} - func nowMs() int64 { return time.Now().UTC().UnixMilli() } // streamWaitTimeout bounds how long startup waits for the connector to bootstrap MIGRATION_OPLOG. diff --git a/data-migration/oplog-transformer/processone_test.go b/data-migration/oplog-transformer/processone_test.go index 3c124c5f2..cf8ab833a 100644 --- a/data-migration/oplog-transformer/processone_test.go +++ b/data-migration/oplog-transformer/processone_test.go @@ -12,6 +12,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/hmchangw/chat/pkg/migration" "github.com/hmchangw/chat/pkg/model" ) @@ -204,9 +205,12 @@ func TestProcessOne_Dispositions(t *testing.T) { } func TestIsFinalDelivery(t *testing.T) { - assert.True(t, isFinalDelivery(&fakeJSMsg{numDelivered: 1000}, 1000), "at the cap") - assert.True(t, isFinalDelivery(&fakeJSMsg{numDelivered: 1001}, 1000), "past the cap") - assert.False(t, isFinalDelivery(&fakeJSMsg{numDelivered: 999}, 1000), "below the cap") - assert.False(t, isFinalDelivery(&fakeJSMsg{numDelivered: 5000}, 0), "maxDeliver<=0 means unlimited") - assert.False(t, isFinalDelivery(&fakeJSMsg{metaErr: errors.New("no metadata")}, 1000), "metadata error → prefer Nak") + // migration.IsFinalDelivery takes (numDelivered uint64, maxDeliver int) directly; + // 
metadata extraction (and the error-tolerant fallback to 0) now lives in processOne. + assert.True(t, migration.IsFinalDelivery(1000, 1000), "at the cap") + assert.True(t, migration.IsFinalDelivery(1001, 1000), "past the cap") + assert.False(t, migration.IsFinalDelivery(999, 1000), "below the cap") + assert.False(t, migration.IsFinalDelivery(5000, 0), "maxDeliver<=0 means unlimited") + // A metadata error makes processOne pass numDelivered=0, which is never ≥ maxDeliver > 0 → not final. + assert.False(t, migration.IsFinalDelivery(0, 1000), "metadata error → processOne passes 0 → prefer Nak") } diff --git a/docs/superpowers/plans/2026-06-18-inbox-worker-member-added-wait-for-user.md b/docs/superpowers/plans/2026-06-18-inbox-worker-member-added-wait-for-user.md new file mode 100644 index 000000000..bb8bf3297 --- /dev/null +++ b/docs/superpowers/plans/2026-06-18-inbox-worker-member-added-wait-for-user.md @@ -0,0 +1,257 @@ +# inbox-worker `member_added` wait-for-user Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make `inbox-worker`'s `handleMemberAdded` return an error (→ Nak/redeliver) instead of silently skipping a subscription whose referenced user isn't present yet, so a `member_added` for a not-yet-synced user waits rather than being lost. + +**Architecture:** This is §7 of the collections-migration design (`docs/superpowers/specs/2026-06-16-oplog-transformer-collections-design.md`) — the single, live-safe `inbox-worker` change the collections apply-path depends on. It is correct steady-state cross-site behavior (a `member_added` for an unknown user should wait, not drop), so it is retained after the source is sunset. 
The consumer already bounds retries via `MaxDeliver` (default 5, env `MAX_DELIVER`) and Naks non-permanent errors (`inbox-worker/main.go:478-493`), so no consumer-config change is needed. + +**Tech Stack:** Go 1.25, `go.uber.org/mock`-free in-memory stub store (`stubInboxStore` in `inbox-worker/handler_test.go`), `stretchr/testify`. Tests run via `make test SERVICE=inbox-worker` (race detector on). + +--- + +## File Structure + +- `inbox-worker/handler.go` — `handleMemberAdded` (the loop at lines ~137-171). One responsibility change: collect accounts with no resolved user and, after creating the resolvable ones, return an error naming the missing accounts. +- `inbox-worker/handler_test.go` — add unit tests for the unknown-user and mixed (some-present, some-missing) cases. The existing `TestHandleEvent_MemberAdded` already covers the all-present happy path and stays green as a regression guard. + +No new files, no interface changes, no new dependencies. + +--- + +## Context the engineer needs + +Current `handleMemberAdded` (in `inbox-worker/handler.go`) resolves users with `FindUsersByAccounts`, builds a `userMap`, then loops over `event.Accounts`. 
Today the loop **skips** an account with no resolved user: + +```go +subs := make([]*model.Subscription, 0, len(event.Accounts)) +for _, account := range event.Accounts { + user, ok := userMap[account] + if !ok { + slog.Warn("user not found for account", "account", account) + continue + } + sub := &model.Subscription{ + ID: idgen.GenerateUUIDv7(), + User: model.SubscriptionUser{ID: user.ID, Account: user.Account}, + RoomID: event.RoomID, + RoomType: roomType, + SiteID: event.SiteID, + Roles: rolesForType(roomType), + Name: subscriptionName(roomType, event.RoomName, event.RequesterAccount), + IsSubscribed: subscriptionIsSubscribed(roomType, &user), + HistorySharedSince: historySharedSince, + JoinedAt: joinedAt, + } + subs = append(subs, sub) +} + +if len(subs) == 0 { + return nil +} +if err := h.store.BulkCreateSubscriptions(ctx, subs); err != nil { + if !mongo.IsDuplicateKeyError(err) { + return fmt.Errorf("bulk create subscriptions: %w", err) + } +} + +// No SubscriptionUpdateEvent is published here — room-worker already publishes +// to the user's subject and the NATS supercluster routes it to the user's +// home site. +return nil +``` + +Disposition (`inbox-worker/main.go:478-493`): a returned **non-permanent** error → `msg.Nak()` (redeliver up to `MaxDeliver`); a permanent error → `Ack` (drop). A plain `fmt.Errorf` is non-permanent, so returning one makes the event redeliver — exactly the wait-for-user behavior we want. `fmt` and `mongo` are already imported in `handler.go`. + +`stubInboxStore` (`handler_test.go`): `FindUsersByAccounts` returns only the users present in its `users` slice; `BulkCreateSubscriptions` appends to `subscriptions`; `getSubscriptions()` returns a copy. An empty `users` slice means every account resolves as missing. 
+ +--- + +## Task 1: `member_added` returns an error when a referenced user is unknown + +**Files:** +- Modify: `inbox-worker/handler.go` (the `handleMemberAdded` loop + post-loop block, ~lines 137-171) +- Test: `inbox-worker/handler_test.go` (add two tests) + +- [ ] **Step 1: Write the failing tests** + +Add these two tests to `inbox-worker/handler_test.go` (after `TestHandleEvent_MemberAdded`, around line 400): + +```go +func TestHandleEvent_MemberAdded_UnknownUser_ReturnsError(t *testing.T) { + // Store has NO users, so the referenced account cannot resolve. + store := &stubInboxStore{} + h := NewHandler(store) + + change := model.MemberAddEvent{ + Type: "member_added", + RoomID: "room-1", + Accounts: []string{"ghost"}, + SiteID: "site-b", + JoinedAt: time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC).UnixMilli(), + } + changeData, err := json.Marshal(change) + require.NoError(t, err) + + evt := model.OutboxEvent{Type: "member_added", SiteID: "site-b", DestSiteID: "site-a", Payload: changeData} + evtData, err := json.Marshal(evt) + require.NoError(t, err) + + err = h.HandleEvent(context.Background(), evtData) + + // Returns an error (→ Nak/redeliver) naming the missing account. + require.Error(t, err) + assert.Contains(t, err.Error(), "ghost") + // Not classified as permanent — a permanent error would Ack-drop instead of redeliver. + _, isPermanent := errcode.IsPermanent(err) + assert.False(t, isPermanent, "missing-user error must be transient so the event redelivers") + // No subscription was created. + assert.Empty(t, store.getSubscriptions()) +} + +func TestHandleEvent_MemberAdded_PartialUsers_CreatesPresentAndErrors(t *testing.T) { + // "bob" resolves; "ghost" does not. 
+ store := &stubInboxStore{users: []model.User{{ID: "uid-bob", Account: "bob", SiteID: "site-a"}}} + h := NewHandler(store) + + change := model.MemberAddEvent{ + Type: "member_added", + RoomID: "room-1", + Accounts: []string{"bob", "ghost"}, + SiteID: "site-b", + JoinedAt: time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC).UnixMilli(), + } + changeData, err := json.Marshal(change) + require.NoError(t, err) + + evt := model.OutboxEvent{Type: "member_added", SiteID: "site-b", DestSiteID: "site-a", Payload: changeData} + evtData, err := json.Marshal(evt) + require.NoError(t, err) + + err = h.HandleEvent(context.Background(), evtData) + + // Errors so the whole event redelivers... + require.Error(t, err) + assert.Contains(t, err.Error(), "ghost") + // ...but the resolvable subscription was still created (progress; redelivery re-upserts idempotently). + subs := store.getSubscriptions() + require.Len(t, subs, 1) + assert.Equal(t, "bob", subs[0].User.Account) +} +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `make test SERVICE=inbox-worker` +Expected: FAIL — `TestHandleEvent_MemberAdded_UnknownUser_ReturnsError` fails because `HandleEvent` returns `nil` (today's skip), and `TestHandleEvent_MemberAdded_PartialUsers_CreatesPresentAndErrors` fails for the same reason (no error returned). 
+ +- [ ] **Step 3: Make the change in `handleMemberAdded`** + +In `inbox-worker/handler.go`, replace the loop + post-loop block (the code quoted in "Context", ~lines 137-171) with: + +```go + subs := make([]*model.Subscription, 0, len(event.Accounts)) + var missing []string + for _, account := range event.Accounts { + user, ok := userMap[account] + if !ok { + missing = append(missing, account) + continue + } + sub := &model.Subscription{ + ID: idgen.GenerateUUIDv7(), + User: model.SubscriptionUser{ID: user.ID, Account: user.Account}, + RoomID: event.RoomID, + RoomType: roomType, + SiteID: event.SiteID, + Roles: rolesForType(roomType), + Name: subscriptionName(roomType, event.RoomName, event.RequesterAccount), + IsSubscribed: subscriptionIsSubscribed(roomType, &user), + HistorySharedSince: historySharedSince, + JoinedAt: joinedAt, + } + subs = append(subs, sub) + } + + if len(subs) > 0 { + if err := h.store.BulkCreateSubscriptions(ctx, subs); err != nil { + if !mongo.IsDuplicateKeyError(err) { + return fmt.Errorf("bulk create subscriptions: %w", err) + } + } + } + + // A referenced user that isn't present yet is a federation/migration race, not a + // permanent failure: return a (transient) error so JetStream redelivers the event + // until the user lands. The resolvable subscriptions above are created first to make + // progress; redelivery re-upserts them idempotently (guarded by the unique index). + if len(missing) > 0 { + return fmt.Errorf("member_added references unknown users %v in room %s", missing, event.RoomID) + } + + // No SubscriptionUpdateEvent is published here — room-worker already publishes + // to the user's subject and the NATS supercluster routes it to the user's + // home site. + return nil +``` + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `make test SERVICE=inbox-worker` +Expected: PASS — the two new tests pass, and the existing `TestHandleEvent_MemberAdded*` tests remain green (all-present path unchanged). 
+ +- [ ] **Step 5: Lint** + +Run: `make lint` +Expected: no findings in `inbox-worker`. + +- [ ] **Step 6: Commit** + +```bash +git add inbox-worker/handler.go inbox-worker/handler_test.go +git commit -m "feat(inbox-worker): member_added waits for unknown user instead of skipping" +``` + +--- + +## Task 2: Document the behavior change in `docs/client-api.md` (only if member_added is documented there) + +**Files:** +- Check: `docs/client-api.md` + +- [ ] **Step 1: Determine whether this is a client-facing change** + +Run: `grep -n "member_added" docs/client-api.md` +Expected: `member_added` is a cross-site OUTBOX/INBOX event, **not** a `chat.user.*` client RPC. If grep returns no client-RPC request/response schema for it, **no doc change is required** (per CLAUDE.md, only `chat.user.*` handlers and `auth-service` HTTP routes require `docs/client-api.md` updates). Record "n/a — internal federation event" and skip to done. + +- [ ] **Step 2: If (and only if) a client-facing schema references it, update it** + +If grep shows a documented client contract whose behavior changed, edit that section to note that a `member_added` referencing an unsynced user is retried (not dropped). Then: + +```bash +git add docs/client-api.md +git commit -m "docs(client-api): note member_added retry-on-unknown-user" +``` + +Otherwise no commit for this task. + +--- + +## Self-Review + +- **Spec coverage:** Implements §7's sole `inbox-worker` change (skip→error on unknown user) and confirms the §5 "bounded MaxDeliver" requirement is already met by the existing consumer config (default 5, `MAX_DELIVER`) — noted in Architecture, no task needed. The dropped `MemberAddEvent` extension (§7, post-correction) is correctly absent. ✓ +- **Placeholder scan:** No TBD/placeholder steps; every code step shows complete code; every run step has an exact command + expected outcome. 
✓ +- **Type consistency:** Uses `model.MemberAddEvent`, `model.OutboxEvent`, `model.SubscriptionUser`, `errcode.IsPermanent`, `stubInboxStore.getSubscriptions()`, `NewHandler` — all matching existing signatures in `inbox-worker/handler.go` and `handler_test.go`. The `missing []string` variable and the `fmt.Errorf` message are self-contained. ✓ + +--- + +## Notes for the broader effort (not part of this plan) + +This is plan 1 of the collections-migration sequence. The full sequence (plans 2–4 are still to be written): + +1. **(this plan) inbox-worker member_added wait-for-user** — §7. +2. **oplog-connector checkpoint start-mode** — add a start mode that begins the change stream at a supplied resume token/checkpoint (today: `now`/`time` only); §0/§4.0 (N1). +3. **`pkg/migration` shared extraction** — lift `sourceLookup`, disposition (`errPoison`/`errSkipped` + the processOne loop), base metrics, consume loop, and shared config out of `oplog-transformer` into `pkg/migration`; refactor `oplog-transformer` to import it, keeping its tests green. +4. **`oplog-collections-transformer`** — the new consumer (router + rooms/subscriptions/threadsubs/users mappers + inbox publisher + target-Mongo store + classify), built on `pkg/migration`. Likely split per collection. + +Correctness hardening flagged in the design review (C1 non-monotonic `member_removed`; C3 strict `siteId` parse) should be scheduled as their own plan before the collections-transformer goes live. 
diff --git a/docs/superpowers/specs/2026-06-16-oplog-transformer-collections-design.md b/docs/superpowers/specs/2026-06-16-oplog-transformer-collections-design.md new file mode 100644 index 000000000..8b3f56417 --- /dev/null +++ b/docs/superpowers/specs/2026-06-16-oplog-transformer-collections-design.md @@ -0,0 +1,337 @@ +# oplog-transformer (collections) — rooms / subscriptions / thread-subs / users migration (Design) + +> **Status:** DESIGN — next increment of the data-migration suite, after the message path (`oplog-transformer`, PR #331). Migrates the **operational collections** from the legacy ("source") RocketChat MongoDB into the new-stack **per-site MongoDB**, reusing the existing cross-site `inbox-worker` apply machinery where one exists and writing Mongo directly where it doesn't. Built on branch `claude/oplog-transformer-collections`. + +*The message path wrote message history to **global Cassandra**. This path writes **operational state** — rooms, subscriptions, thread subscriptions, users — to **per-site MongoDB**. That difference drives every decision below.* + +--- + +## 0. Where this sits + +```text + ┌─ OUT OF SCOPE (separate owner): bulk/initial state sync of all + │ existing data ≤ checkpoint (history + the existing rooms/subs/ + │ thread_subs/users snapshot). Produces the checkpoint we tail from + │ and guarantees context is fully synced before our tail applies. 
+ ▼ + ┌──────────── checkpoint C (resume token) handed to us ───────────┐ + │ │ + (per site) source Mongo ─▶ oplog-connector ─▶ MIGRATION_OPLOG_{site} (chat.oplog.{site}.{coll}.{op}) + startAfter(C): live CDC tail only │ + ▼ this service (name OPEN — §10) + collections-transformer ─┬─▶ INBOX_{site} (OutboxEvent) ─▶ inbox-worker ─▶ per-site Mongo + │ rooms · subscriptions · thread_subscriptions + └─▶ direct Mongo write + users (insert-if-absent) +``` + +**Scope boundary — we are the live CDC tail, not the bulk migration.** A separate owner migrates all pre-existing state (message history *and* the existing operational-collection rows) up to a **checkpoint** and hands it to us; by the time we apply a tailed change, its **context and contents are fully synced** in the destination (the referenced user, room, and thread_room already exist). We do **not** snapshot/backfill, compute the cut, or derive thread_rooms. We **only** consume change events **at and after the given checkpoint** and apply them to the new-stack per-site Mongo. + +The connector already tails these collections (they're in `WATCH_COLLECTIONS`) and publishes raw `model.OplogEvent`s; for collections it is configured to **start the change stream at the supplied checkpoint** (not `now`). This service is a **new consumer** on `MIGRATION_OPLOG_{site}` filtered to the operational-collection subjects — the counterpart of the message transformer, for non-message data. + +**Deployment is per-site.** Each site runs its own connector + this transformer against **that site's** source DB; `SITE_ID` is the site code. There is no cross-record site derivation for routing — a site's pump produces that site's data into that site's new stack (see §6 for the one place origin matters: `siteId` stamping). + +--- + +## 1. 
Scope + +Migrate **4** source collections: + +| Source | → Destination (per-site Mongo) | Path | +|---|---|---| +| `users` | `users` | **direct write** | +| `rocketchat_rooms` | `rooms` | **inbox publish** | +| `rocketchat_subscriptions` | `subscriptions` | **inbox publish** | +| `tsmc_thread_subscriptions` | `thread_subscriptions` | **inbox publish** | + +**`threadRooms` is explicitly out of scope** — `thread_rooms` are *derived* by `message-worker` from the message canonical stream (created on the first thread reply, accumulating `LastMsgAt`/`LastMsgID`/`ReplyAccounts`). We do not track them here. + +**The decision rule** (validated against the live writers): a destination gets the **inbox-publish** path iff `inbox-worker` already has an apply-handler for it (rooms, subscriptions, thread_subscriptions). Two destinations have no apply-path: `users` (nothing in the new stack writes it — it's populated externally) gets a **direct write**, while `thread_rooms` (only message-worker writes it) is left to the message path and stays out of scope here. + +**No new collections, no new event types** — reuse the existing `model.OutboxEvent` set and existing Mongo collections. + +--- + +## 2. Two output paths + +### 2.1 Inbox publish (rooms, subscriptions, thread-subs) + +Emit reused `model.OutboxEvent`s to the **local `INBOX_{site}`** stream; the existing `inbox-worker` applies them to per-site Mongo via its `InboxStore`. This reuses `UpsertRoom` / `BulkCreateSubscriptions` / `UpdateSubscriptionRoles` / `UpsertThreadSubscription` / etc. rather than reimplementing those writes. + +Publishing to the **local** INBOX (not OUTBOX) is deliberate: INBOX is normally sourced from *remote* OUTBOX, but here we want the *local* inbox-worker to apply *this* site's migrated data to *this* site's Mongo. 
The publish is a **fire-and-forget JetStream publish** (no apply-result back to the transformer) — order-healing therefore lives either in inbox-worker (users, §5) or in the transformer's pre-publish resolution (thread-subs, §4.4). + +### 2.2 Direct write (users) + +`users` has **no** inbox apply-path and is otherwise populated by external company-wide user syncs. The transformer writes the per-site `users` collection **directly**, **insert-if-absent keyed by account** — a pure seed that unblocks subscriptions/thread-subs without ever clobbering what another sync owns. Requires a **unique index on account** so the seed and other syncs converge on one doc. + +--- + +## 3. Cross-site model (the "cross-site CDC") + +Each site's source DB **already contains its federated copies** — site B's RocketChat already holds the rooms its B-users joined and those users' subscriptions. So the new stack reproduces the cross-site replication **by construction**: each pump migrates its **full source locally**, and the same room/subscription that was replicated across sites in the source ends up replicated across sites in the new stack. + +Consequences: +- **No active fan-out** — no `OUTBOX` routing, no dest-site derivation. N independent per-site pumps. +- **No federation drop-filter** — unlike the message path (global Cassandra ⇒ drop foreign-origin copies to avoid duplicates), per-site Mongo *wants* the federated copies. We migrate everything in the source. + +--- + +## 4. 
Per-collection mapping + +### 4.0 CDC op handling (cross-collection) + +The connector forwards raw change events with no `updateLookup` and no `fullDocumentBeforeChange` (`source_mongo.go`), so each op carries different data and the transformer treats them uniformly across all four collections: + +| Op | Payload | Handling | +|---|---|---| +| `insert` | full `fullDocument` | classify + map from the doc | +| `replace` | full `fullDocument` | **same path as `insert`** — re-classify from scratch (exclusions, type, `siteId`) then upsert, since a whole-doc rewrite can cross a classification boundary (e.g. `c`→`l`, a `d` gaining a 3rd participant, or `federation.origin` changing). Decided. | +| `update` | only `updateDescription` (changed fields, no post-image) | **re-read the full current source doc by `documentKey._id`** (the doc still exists), then map — a partial delta can't reconstruct full state (e.g. `max(ls,lr)` needs both fields; positional `roles.N` carries no array). Mirrors the message transformer's `FindByID`-on-update. | +| `delete` | only `documentKey._id` (no pre-image, doc gone) | **un-actionable for subs/thread-subs** — the destination keys by generated `UUIDv7` / `(threadRoomId,userId)`, not the source `_id`, and the removal keys live inside the deleted doc. Disposition: **skip + metric**, never an attempted apply. (Rooms/users delete is skipped by policy anyway; subscription *leave* arrives as an `open:false` **update**, which is handled.) | + +**Collection-level ops (`drop` / `rename` / `dropDatabase` / `invalidate`) — OUT OF SCOPE (deferred).** A single-collection change stream terminates/invalidates on these; they signal a watched source collection moving out from under the migration. Recovery is an operator action (re-point the connector), not transformer logic. Connector behavior on a terminal event (halt-and-alert vs. resume) is to be decided in that later work. Recorded here so it isn't silently dropped. 
+ + +### 4.1 users — direct write +- **insert / replace / update** → **insert-if-absent by account**. If the account already exists (another sync got there first), **leave it untouched** — so post-seed **HR-field** `update`s (engName, tsmcName, dept/sect, roles, …) are intentionally **not** propagated (the company-wide user sync owns those, §9). Field mapping (confirmed): +- **`statusText` is the one exception.** It is **chat-originated** (set by the user inside the legacy chat, e.g. "In a meeting") and is **not** part of the HR dataset, so no other sync carries it. On a `statusText` `update` the transformer fans a `user_status_updated` event to **all** sites (global-visibility; see §4.1a) — otherwise a legacy status change during the migration window would be silently lost. + +| Destination | Source | +|---|---| +| `ID` | generate `UUIDv7` on create (joins are by account, not id — source `_id` not preserved) | +| `Account` | `username` (**unique but mutable** — see §9) | +| `EngName` | `customFields.engName` | +| `ChineseName` | `customFields.tsmcName` | +| `SectID` / `SectName` | `customFields.sectId` / `customFields.sectName` | +| `DeptID` / `DeptName` | `customFields.deptId` / `customFields.deptName` | +| `StatusText` | `statusText` | +| `Roles` | `roles[]` (global; `admin` marker) | +| `SiteID` | `federation.origin` first label (absent ⇒ local; federated users **do** get local docs flagged `isRemote:true`) | +| `SubscriptionUser.IsBot` (consumer side) | `type == "bot"` (bot has `appId`); feeds room-type `botDM` and bot-DM detection | + +- **No clean source** (set to zero-value, documented §9): `SectTCName`, `DeptTCName`, `EmployeeID`, `StatusIsShow`. These are owned by the external company-wide user sync, not this seed. +- **delete** → **skip for now**; deactivation is `active:false` (not a deletion) — that's the later signal. 
+ +#### 4.1a `statusText` fan-out (the one propagated user field) +A `statusText` `update` fans a **reused** `user_status_updated` `InboxEvent` (no new event type) to **every** site in `ALL_SITE_IDS`, including our own — a user's status is globally visible, and each site holds its own (federated) copy of the user doc, so all must converge. `StatusIsShow` stays nil (the company-wide sync owns it). inbox-worker's `UpdateUserStatus` applies it keyed by **account** (not siteId) under a **`statusUpdatedAt` high-water guard**, so an out-of-order or duplicate fan-out delivery can't regress the status; a missing user on a site is a logged no-op. An empty `ALL_SITE_IDS` (partial deployment / misconfig) → warn + Ack-skip per event (a startup hard-fail is the eventual form — deferred until the failure modes are known). This is the **only** post-seed user field the migration propagates and the **only** additional inbox-worker apply-path beyond §7's `handleMemberAdded` change. + +### 4.2 rocketchat_rooms — inbox publish +- **insert / replace** → `room_sync` (full `model.Room` upsert). `Room.UpdatedAt` maps from source `_updatedAt` (`CreatedAt` from `ts`), zero-guarded to `now` when absent — this value is the **high-water mark** `inbox-worker`'s `UpsertRoom` guards on (`$lt`), so a zero would freeze the room after first sync. +- **update** (re-read full doc per §4.0, then diff): + - name change → `room_renamed` **+** `room_sync` (the rename event updates subscriptions' denormalized name; `room_sync` converges the `rooms` doc — this transformer is the doc's only writer). + - restricted / externalAccess change → `room_restricted` **+** `room_sync` (same split: subs' denormalized visibility + the room doc). + - other field deltas → `room_sync`. + - The companion `room_renamed`/`room_restricted` carry the source `_updatedAt` millis as their guard timestamp, matching the `room_sync` high-water mark. 
+- **delete** → **skip** — the app has no room deletion (no `DeleteRoom` anywhere; `rooms` is only ever upserted), and the delete event is un-actionable anyway (§4.0). A deleted source room's members are cleaned up via their subscription leaves; the room remains. **Deliberate constraint, not an omission.** +- **Type mapping** (source `t` is one of exactly `c,p,d,l,v` — confirmed by source-data team; new-stack `RoomType` is the closed set `channel,dm,botDM,discussion` per `pkg/model/room.go`): + - `c` (public channel) / `p` (private group) → `channel` — **the new model has a single `channel` type with no public/private distinction**; both collapse, by design (not a dropped attribute). + - **`p` with `prid` set** → `discussion` (a discussion is a `p` room carrying a parent-room id — check `prid` **before** the plain `p→channel` rule) + - `d` (2 participants) → `dm`, or `botDM` if a participant's `users.type == "bot"` (requires a user-type lookup) + - **Team rooms** (`teamId` present, with/without `teamMain`) → `channel`. The new model has **no team concept**: `teamId`/`teamMain` are dropped, team sub-rooms migrate as ordinary channels (documented gap, §9). +- **Type exclusion** (skip + metric + documented gap, §9) — these have **no** `RoomType` equivalent: + - `l` (livechat/omnichannel) and `v` (voip) — non-conversational. + - **Group DMs** — `d` rooms with **>2 participants**. The new stack's DM is strictly two-party (`idgen.BuildDMRoomID` / `model.BuildDMParticipants` require exactly two users); a 3+-party DM has no representable id. Skip. + - We deliberately do **not** adopt the source team's fuzzier "rooms to ignore" heuristics (name-prefix `_`, message-count, etc.) — type-based exclusion only, deterministic for CDC. + +### 4.3 rocketchat_subscriptions — inbox publish (full fidelity) +`member_added` creates a subscription with **defaults** (`rolesForType`, computed `Name`/`IsSubscribed`); real state comes from follow-up events. 
Source field mapping (confirmed by source-data team): + +| Destination | Source field | +|---|---| +| `User.ID`, `User.Account` | `u._id`, `u.username` (unique index `{rid:1,'u._id':1}`) | +| `RoomID` | `rid` | +| `Roles` | `roles[]` (`owner`/`moderator`/`leader`/`user` — role-based ownership, no separate pointer) | +| `Muted` | `disableNotifications` (TSMC custom — authoritative all-off; **not** `muteGroupMentions`, which is @all/@here-only) | +| `Favorite` | `f` (absent ⇒ false) | +| `Alert` | `alert` (any unread content, not just mentions) | +| `LastSeenAt` | **`max(ls, lr)`** — the furthest point consumed by either path (`ls` scrolled cursor, `lr` explicit mark); minimizes false-unread, consistent with the advance-only (`$lt`) apply guard | +| `JoinedAt` | `ts` (set once on first join; re-join just flips `open` back to true) | +| `Name` | `fname` (friendly display name); `name` is the machine handle | +| `SiteID` | `federation.origin` first label (absent ⇒ local, §6) | +| **`IsSubscribed`** | **NOT from source** — inbox-worker computes it (`subscriptionIsSubscribed`: botDM-human ⇒ true, else false). Never mapped from `open`. | +| **`HasMention`, `ThreadUnread`** | **NOT from subscription CDC** — these are unread-state owned by the message pipeline (broadcast/notification workers). Their value at the checkpoint is set by the bulk-sync owner; ongoing changes during the tail come from the migrated message flow. A tail-created sub starts with no unread (correct default). | + +> **Membership is binary in the new backend.** A subscription row exists ⟺ the user is a member; leaving **deletes** the row (`removeMember` → `member_removed` → `DeleteSubscriptionsByAccounts`). There is **no soft "hidden/unsubscribed-but-member" state** — `IsSubscribed` is a botDM marker, *not* a membership flag. 
So the source `open` toggle maps to the membership lifecycle, and the roles/read/favorite reset on leave **is** the correct new-backend semantics (rejoin starts fresh), not a fidelity loss. + +- **insert / replace** → `member_added` **+** the state events that reproduce the source row: + - `role_updated` ← `roles[]` (overrides the default) + - `subscription_mute_toggled` ← `disableNotifications` + - `subscription_favorite_toggled` ← `f` + - `subscription_read` ← `max(ls, lr)` + `alert` +- **update** (re-read full doc per §4.0, then diff) → emit the matching event(s). **Membership leave/rejoin is an `open` toggle:** + - `open` true→false → **`member_removed`** (deletes the row — correct: binary membership) + - `open` false→true → **`member_added`** (re-subscribe; idempotent upsert) + - mute/favorite/role/read changes → the single matching state event. +- **delete** (true row delete) → **un-actionable, skip + metric** (§4.0) — the event carries only the source `_id`, which doesn't map to the destination sub. Rare: leaving flips `open:false` (an update, handled above), so true deletes are the uncommon case. + +The destination `Subscription` fields and how each is set are pinned in §8. + +**D1 (decided):** `LastSeenAt = max(ls, lr)` — neither source field alone is correct (`ls`-only shows false unread after a mark-read-without-scroll; `lr`-only after a scroll-without-mark), so take the later of the two. Paired with source `alert` → `Alert`. + +### 4.4 tsmc_thread_subscriptions — inbox publish, transformer-resolved +Confirmed source schema: `_id`, `u` (`u._id`, `u.username`), `rid`, `parentMessage._id` (`tmid`), `lastMessage` (`_id`,`_updatedAt`), `createdAt`, `lastSeenAt`, `unreadMention`. Unique index `{'u._id':1, 'parentMessage._id':1}` — **one row per (user, thread)**. + +The `thread_subscription_upserted` payload is the **full `model.ThreadSubscription`** and inbox-worker upserts it **verbatim** (keyed by `threadRoomId`+`userId`) — it resolves nothing. 
Field mapping + two foreign keys the source row lacks in new-stack form (resolved **in the transformer** before publishing): + +| Destination | Source | +|---|---| +| `ParentMessageID` | `parentMessage._id` (`tmid`) | +| `UserAccount` | `u.username` | +| `LastSeenAt` | `lastSeenAt` | +| `HasMention` | `unreadMention > 0` | +| `CreatedAt` | `createdAt` | +| **`ThreadRoomID` (+ `RoomID`)** | resolve: lookup target `thread_rooms` by `parentMessage._id` (1:1). One lookup yields both IDs. `rid` cross-checks `RoomID`. | +| **`UserID`** | resolve: lookup target `users` by `u.username`. | +| `SiteID` | inherits the room's site (resolved thread_room), §6 (source row has no `federation.origin`). | + +**Double dependency:** thread-subs need **users** *and* the **message migration's thread rooms**. If either lookup misses → **Nak** (bounded retry) until both land. (Unlike subscriptions, this resolution can't be pushed to inbox-worker, since in live federation the origin site already knows both IDs — resolving them is migration-specific.) + +- **insert / replace** (created lazily on follow/first reply) → `thread_subscription_upserted` (resolved as above). +- **update** (re-read full doc per §4.0) → re-`thread_subscription_upserted` (idempotent upsert). +- **delete** (unfollow deletes the row — no flag) → **un-actionable, skip + metric (D2).** Two independent reasons: the delete event carries only the source `_id` (can't resolve `(threadRoomId,userId)`, §4.0), **and** there is no `thread_subscription_removed` inbox handler — the live stack doesn't federate unfollows either. A stale follow may linger during the cutover window and self-corrects once live. Recommended skip. + +--- + +## 5. Ordering & retry — a safety net for the live-tail race + +Upstream guarantees context is fully synced ≤ checkpoint, so a tailed subscription's user already exists for anything that predates the cut. 
The residual risk is the **live tail racing across collections**: a brand-new user and their subscription, both created *after* the checkpoint, stream on the connector's **own concurrent watchers** with no cross-collection ordering, so the subscription change can arrive first. Today `handleMemberAdded` **silently skips** (and Acks) a subscription whose user isn't present — a swallowed loss. + +**Change:** `handleMemberAdded` returns an **error (→ Nak/redeliver)** instead of `continue`-skipping when a referenced user is missing. This is **the correct steady-state behavior**, not a migration hack — a cross-site `member_added` for an unknown user *should* wait, not drop — so it persists after the source is sunset. The existence gate is inbox-worker's *existing* `FindUsersByAccounts` (no new lookup, no transformer-side user check for subscriptions). With context synced upstream this rarely fires; it's cheap insurance for the post-checkpoint race window. + +Requirements for that change: +- **Bounded `MaxDeliver`** on inbox-worker's consumer so a member_added for a user who never arrives eventually gives up + alerts (the exhaustion-signal lesson from the message path) rather than looping forever. +- **Idempotent on redelivery** — `BulkCreateSubscriptions` must be a guarded upsert (re-applying a partially-done batch can't create duplicates). Verify. +- **Partial-account batches** — live `member_added` can carry several accounts; "error if any missing" retries the whole event and re-upserts the present ones (fine given idempotency). Migration emits single-account events. Audit sibling handlers (`member_removed`, thread-sub) for the same skip-on-missing pattern and make them consistent. + +Thread-subs heal order via the **transformer-side** Nak-retry in §4.4. + +--- + +## 6.
`siteId` stamping (documented rule) + +`Subscription.SiteID` / `Room.SiteID` is the record's **home/origin site, invariant across replicas** — every replica of a subscription carries the same `siteId` regardless of where the doc is stored. So a federated copy must carry its **origin** site, not the local deployment. + +`federation.origin` is authoritative and has **three** cases. The origin domain embeds the **site code** as its first label (e.g. `0030204.tchat-test.test.company.com` → `0030204`), and `SITE_ID` *is* that bare code: + +``` +origin = doc.federation.origin +siteId = (origin absent || origin == "local") → SITE_ID // this deployment's code + else → firstLabel(origin) // e.g. "0030204" +``` + +Applies to users, rooms, subscriptions (thread-subs inherit the room's site via the resolved thread_room). `"local"` is a sentinel synonym for absent — never used as a literal value. **`federation.domains[]` is ignored** (no target field on `model.Room`; the new stack re-derives cross-site reach from members). This rule is **documented and followed as-is**; any future change is a deliberate edit. + +--- + +## 7. Required `inbox-worker` changes + +**One** minimal change to the shared apply path: + +1. **`handleMemberAdded` skip→error** on unknown user (§5) — currently `continue`-skips (handler.go:140-143); change to return an error so the event Naks/redelivers until the user lands. Live-safe (the correct cross-site behavior anyway). + +No `MemberAddEvent` extension is needed: `IsSubscribed` is already computed correctly by inbox-worker (`subscriptionIsSubscribed`), and `HasMention`/`ThreadUnread` are unread-state owned by the message pipeline (not subscription federation) — see §4.3/§8. Earlier drafts proposed carrying these on an extended `member_added`; that's dropped. + +No new event types; no new collections. + +--- + +## 8. 
Destination `Subscription` field coverage + +| Field | Set by | +|---|---| +| `Roles` | `member_added` (default) → **`role_updated`** (source roles) | +| `Muted` | **`subscription_mute_toggled`** | +| `Favorite` | **`subscription_favorite_toggled`** | +| `LastSeenAt`, `Alert` | **`subscription_read`** | +| `IsSubscribed` | inbox-worker computes (`subscriptionIsSubscribed`: botDM-human ⇒ true, else false) — **not** from source | +| `Name` | `member_added` (default) → **`room_renamed`** (channel rename) | +| `RoomType`, `SiteID`, `JoinedAt`, `HistorySharedSince` | `member_added` | +| `Restricted`, `ExternalAccess` | **rooms** migration (`room_restricted`), not per-sub | +| `HasMention`, `ThreadUnread` | **message pipeline** — unread-state, not subscription federation. Initial value at checkpoint owned by the bulk-sync owner; ongoing changes from the migrated message flow. A tail-created sub starts with no unread (correct). | + +Every destination field has a faithful path. + +### 8.1 inbox-worker handler coverage + +Every inbox-worker apply-handler is either produced by the migration or intentionally not: + +| Inbox handler | Emitted? 
| From | +|---|---|---| +| `member_added` | ✅ | sub `insert`/`replace`; `open` false→true | +| `member_removed` | ✅ | sub `open` true→false | +| `room_sync` | ✅ | room `insert`/`replace`/other-field `update` | +| `role_updated` | ✅ | sub `roles[]` | +| `subscription_read` | ✅ | sub `max(ls,lr)` + `alert` | +| `subscription_mute_toggled` | ✅ | sub `disableNotifications` | +| `subscription_favorite_toggled` | ✅ | sub `f` | +| `thread_subscription_upserted` | ✅ | thread-sub `insert`/`replace`/`update` | +| `room_renamed` | ✅ | room `name`/`fname` change | +| `room_restricted` | ✅ | room `restricted`/`externalAccess` change | +| `user_status_updated` | ✅ | user `statusText` change — chat-owned, fanned to all sites (§4.1a) | +| `thread_read` | ⚠️ **not emitted** | redundant: the thread-sub `lastSeenAt` is carried by `thread_subscription_upserted`; `Subscription.ThreadUnread` is message-pipeline-owned (§8) | + +No handler is left silently unaddressed. + +--- + +## 9. Not faithfully migrated (documented gaps — "no hidden gimmick") + +The spec carries this list explicitly; nothing is silently defaulted: +- **`federation.domains[]`** — ignored; new model re-derives cross-site reach from members. +- **User delete / deactivation** — skipped for now. +- **Livechat (`l`) / VoIP (`v`) rooms** — skipped (no `RoomType` equivalent); metric on skip. +- **Group DMs** (`d` with >2 participants) — skipped (new stack has no group-DM type, §4.2); metric on skip. +- **Team grouping** (`teamId`/`teamMain`) — team rooms migrate as plain `channel`; the team relationship is dropped (no team concept in the new model). +- **Subscription `Restricted`/`ExternalAccess`** — sourced from the **rooms** migration, not the per-sub row (dependency, not a gap). +- **Subscription unread-state (`HasMention`, `Alert`, `ThreadUnread`)** — owned by the message pipeline, not subscription CDC; initial value at checkpoint owned by the bulk-sync owner (not a gap; §4.3/§8). 
+- **Thread-sub unfollow** (D2) — `delete` is un-actionable (only the source `_id`, §4.0) **and** there's no inbox removal handler (the live stack never federates unfollows either); skip + metric. **The only true "can't push" item.** +- **Subscription / room / user true `delete`** — un-actionable (only the source `_id`, no pre-image, destination doesn't key by source `_id`; §4.0). Rooms/users skipped by policy anyway; subscription leave arrives as an `open:false` update (handled). +- **User post-seed `update`s** — **HR fields** (engName, tsmcName, dept/sect, roles, …) not propagated (insert-if-absent; the company-wide user sync owns those). **Exception: `statusText`** is chat-owned and IS propagated (fan `user_status_updated` to all sites, §4.1a) — no other sync carries it, so dropping it would lose legacy status changes during the migration window. +- **Collection-level ops** (`drop`/`rename`/`invalidate`) — **out of scope, deferred** (§4.0); operator re-points the connector, not transformer logic. +- **User fields `SectTCName`/`DeptTCName`/`EmployeeID`/`StatusIsShow`** — no clean source in `users`; owned by the external user sync, left zero-valued by the seed. +- **`username`/account mutability** — the entire new stack joins by account; a source username rename during cutover would orphan rows until the next sync. Low-risk, documented. +- **DM/botDM subscription `Name`** — `member_added`'s `subscriptionName` derives a DM label from `RequesterAccount`, which the migration doesn't set, so migrated DM/botDM subscriptions land with an empty `Name` (channels use the room name and are fine). The DM display label is re-derived client-side from the counterpart's HR info, so impact is cosmetic; recorded as a known minor gap. + +--- + +## 10. 
Service structure (decided) + +A **separate sibling service** `data-migration/oplog-collections-transformer` consumes the operational-collection subjects on `MIGRATION_OPLOG_{site}` — distinct from the Cassandra/canonical message transformer. Shared infrastructure (oplogEvent decode, source lookup, disposition mapping, request-id, metrics) is **factored into a common pkg under `data-migration/`** and imported by both transformers rather than duplicated. (Rejected: extending the existing `oplog-transformer` to route the extra collections — the inbox/Mongo path is different enough that a single service would tangle two unrelated output mechanisms.) + +--- + +## 11. Resolved items (was OPEN) +- **Room-type exclusion** — skip `l`, `v`, and group DMs (`d` >2 participants); §4.2 / §9. +- **Room-type mapping wrinkles** — discussions (`p`+`prid`), teams (`teamId`→`channel`), bot DMs (`users.type=="bot"`); §4.2. +- **D1 — read timestamp** — `LastSeenAt = max(ls, lr)`; §4.3. +- **D3 — `IsSubscribed`/`HasMention`/`ThreadUnread`** — *not* from subscription CDC. `IsSubscribed` is inbox-worker-computed (botDM marker); unread-state is message-pipeline-owned. Drops the `member_added` extension; §4.3/§7/§8. +- **"Can't push" audit** — vs inbox-worker's handler set, the only source change with no apply-path is **thread-sub unfollow** (D2 — also un-actionable per §4.0); room delete is moot. §9. +- **CDC op handling** — `replace` → same path as `insert` (re-classify + upsert); `update` → re-read full source doc; `delete` → un-actionable skip + metric; §4.0. +- **D2 — thread-sub unfollow** — skip + metric (un-actionable **and** no handler); §4.4/§9. +- **Collection-level ops** (`drop`/`rename`/`invalidate`) — out of scope, deferred; §4.0. +- **Service structure** — separate sibling + shared `data-migration/` pkg; §10. +- **Federation filters** for these collections resolve to **no drop** (§3) — recorded, not deferred. 
+ +**No open design items remain** — the spec is decision-complete pending source-engineer confirmation of the `SOURCE_DATA.md` assumptions (notably the room `Restricted`/`ExternalAccess` fields). + +--- + +## 12. Disposition, idempotency, observability + +Mirror the message transformer's conventions: +- **Disposition:** decode/contract-violation → `Term`; transient (lookup miss, target unavailable) → `Nak`; deliberate skip (out-of-scope collection/op, excluded room type) → Ack-without-count (`onSkipped{reason}`); `MaxDeliver` exhaustion → `Term` + metric (no silent JetStream drop). +- **Idempotency:** the path must tolerate JetStream redelivery and reprocessing across the checkpoint boundary (the bulk-sync owner may have applied the same row's state ≤ checkpoint). Leans on inbox-worker's monotonic guard fields (`UpsertRoom` UpdatedAt high-water mark, `rolesUpdatedAt`, `$lt` lastSeenAt, `$setOnInsert` on thread-subs) and the users insert-if-absent. Verify each. +- **Correlation:** stamp `request_id` at consume; propagate into the published `OutboxEvent` headers and the direct-write context. +- **Metrics:** processed/skipped/nak/term/exhausted by op + collection; thread-sub resolution misses; user-seed insert vs already-present. + +--- + +## 13. Testing +- **Unit** (mocked publisher + target store + source lookup): the op→event mapping per collection (rooms create/rename/restrict/skip-delete; subscription full-fidelity insert + delta-classified updates + member_removed; thread-sub FK resolution + double-dependency Nak; users insert-if-absent vs already-present); `siteId` stamping (absent / `"local"` / remote-domain first-label); disposition mapping. +- **inbox-worker** unit: `member_added` error-on-unknown-user (the one apply-path change); idempotent redelivery. 
+- **Integration** (testcontainers Mongo + NATS): source CDC → INBOX → inbox-worker → per-site Mongo for a room + its subscriptions; users-before-subscriptions ordering heals via retry; thread-sub resolves against a message-migration-created thread room. + +--- + +## 14. Footprint / teardown +Same blast-radius discipline as the message path: the whole `data-migration/` folder is deletable at source retirement. The single `inbox-worker` change (`member_added` error-on-missing-user) is **retained** — it's correct live-federation behavior, not migration scaffolding. Document it as such so the cleanup PR doesn't revert it. diff --git a/inbox-worker/handler.go b/inbox-worker/handler.go index 1298164b6..b22d769cd 100644 --- a/inbox-worker/handler.go +++ b/inbox-worker/handler.go @@ -34,19 +34,19 @@ type InboxStore interface { // UpdateSubscriptionRead sets lastSeenAt and alert on the subscription // keyed by (roomID, account). Idempotent and order-safe: the write // only applies when the stored lastSeenAt is missing or strictly - // earlier than the supplied value. Older or duplicate events are - // silent no-ops. Missing-subscription is also a silent no-op. + // earlier than the supplied value. Older or duplicate events are silent no-ops. + // A genuinely missing sub returns an error (Nak) so the event redelivers until member_added lands. UpdateSubscriptionRead(ctx context.Context, roomID, account string, lastSeenAt time.Time, alert bool) error UpsertThreadSubscription(ctx context.Context, sub *model.ThreadSubscription) error // ApplyThreadRead writes ThreadSubscription under a $lt lastSeenAt guard, then the Subscription only if the guard accepted. ApplyThreadRead(ctx context.Context, roomID, threadRoomID, account string, newThreadUnread []string, alert bool, lastSeenAt time.Time) error // UpdateSubscriptionMute sets muted by (roomID, account), guarded by // muteUpdatedAt (the source event's publish time): older/duplicate events - // are silent no-ops. 
Missing-sub is also a silent no-op for federation races. + // are silent no-ops. A genuinely missing sub returns an error (Nak) so the event redelivers until member_added lands. UpdateSubscriptionMute(ctx context.Context, roomID, account string, muted bool, muteUpdatedAt time.Time) error // UpdateSubscriptionFavorite sets favorite by (roomID, account), guarded by // favoriteUpdatedAt (the source event's publish time): older/duplicate events - // are silent no-ops. Missing-sub is also a silent no-op for federation races. + // are silent no-ops. A genuinely missing sub returns an error (Nak) so the event redelivers until member_added lands. UpdateSubscriptionFavorite(ctx context.Context, roomID, account string, favorite bool, favoriteUpdatedAt time.Time) error // UpdateSubscriptionNamesForRoom sets name on every subscription in the room, // each guarded by its own nameUpdatedAt so an out-of-order rename cannot regress @@ -59,11 +59,11 @@ type InboxStore interface { // ownerAccount is non-empty, a $cond pipeline demotes all accounts except // ownerAccount to RoleMember. ApplySubscriptionVisibility(ctx context.Context, roomID string, restricted, externalAccess bool, ownerAccount string, visibilityUpdatedAt time.Time) error - // UpdateUserStatus replicates a cross-site status change onto the local - // users doc keyed by account. statusIsShow is written only when non-nil so a - // text-only update cannot clobber the stored flag. A missing user (no doc on - // this site) is a silent no-op. - UpdateUserStatus(ctx context.Context, account, statusText string, statusIsShow *bool) error + // UpdateUserStatus replicates a cross-site status change onto the local users doc keyed by + // account, guarded by statusUpdatedAt (the event publish time): an older/equal high-water + // mark is a no-op so out-of-order multi-site delivery can't regress the status. statusIsShow + // is written only when non-nil. A missing user (no doc on this site) is a logged no-op. 
+ UpdateUserStatus(ctx context.Context, account, statusText string, statusIsShow *bool, statusUpdatedAt time.Time) error } // Handler processes cross-site InboxEvent messages; replicates only subscription/room metadata, never room keys. @@ -142,10 +142,11 @@ func (h *Handler) handleMemberAdded(ctx context.Context, evt *model.InboxEvent) } subs := make([]*model.Subscription, 0, len(event.Accounts)) + var missing []string for _, account := range event.Accounts { user, ok := userMap[account] if !ok { - slog.Warn("user not found for account", "account", account) + missing = append(missing, account) continue } sub := &model.Subscription{ @@ -163,15 +164,22 @@ func (h *Handler) handleMemberAdded(ctx context.Context, evt *model.InboxEvent) subs = append(subs, sub) } - if len(subs) == 0 { - return nil - } - if err := h.store.BulkCreateSubscriptions(ctx, subs); err != nil { - if !mongo.IsDuplicateKeyError(err) { - return fmt.Errorf("bulk create subscriptions: %w", err) + if len(subs) > 0 { + if err := h.store.BulkCreateSubscriptions(ctx, subs); err != nil { + if !mongo.IsDuplicateKeyError(err) { + return fmt.Errorf("bulk create subscriptions: %w", err) + } } } + // A referenced user that isn't present yet is a federation/migration race, not a + // permanent failure: return a (transient) error so JetStream redelivers the event + // until the user lands. The resolvable subscriptions above are created first to make + // progress; redelivery re-upserts them idempotently (guarded by the unique index). + if len(missing) > 0 { + return fmt.Errorf("member_added references unknown users %v in room %s", missing, event.RoomID) + } + // No SubscriptionUpdateEvent is published here — room-worker already publishes // to the user's subject and the NATS supercluster routes it to the user's // home site. 
@@ -325,14 +333,14 @@ func (h *Handler) handleRoomVisibilityChanged(ctx context.Context, evt *model.In return nil } -// handleUserStatusUpdated mirrors a cross-site status change onto the local -// users doc. Status is last-write-wins, so no timestamp guard is applied. +// handleUserStatusUpdated mirrors a cross-site status change onto the local users doc, guarded by +// the event Timestamp so an out-of-order or duplicate fan-out delivery can't regress the status. func (h *Handler) handleUserStatusUpdated(ctx context.Context, evt *model.InboxEvent) error { var e model.UserStatusUpdated if err := json.Unmarshal(evt.Payload, &e); err != nil { return fmt.Errorf("unmarshal user_status_updated payload: %w", err) } - if err := h.store.UpdateUserStatus(ctx, e.Account, e.StatusText, e.StatusIsShow); err != nil { + if err := h.store.UpdateUserStatus(ctx, e.Account, e.StatusText, e.StatusIsShow, time.UnixMilli(e.Timestamp).UTC()); err != nil { return fmt.Errorf("update user status for %q: %w", e.Account, err) } return nil diff --git a/inbox-worker/handler_test.go b/inbox-worker/handler_test.go index ed99923e8..ed9be5e74 100644 --- a/inbox-worker/handler_test.go +++ b/inbox-worker/handler_test.go @@ -95,6 +95,7 @@ type userStatusUpdate struct { account string statusText string isShow *bool + updatedAt time.Time } func (s *stubInboxStore) CreateSubscription(ctx context.Context, sub *model.Subscription) error { @@ -336,14 +337,14 @@ func (s *stubInboxStore) getThreadSubs() []model.ThreadSubscription { return cp } -func (s *stubInboxStore) UpdateUserStatus(_ context.Context, account, statusText string, statusIsShow *bool) error { +func (s *stubInboxStore) UpdateUserStatus(_ context.Context, account, statusText string, statusIsShow *bool, statusUpdatedAt time.Time) error { s.mu.Lock() defer s.mu.Unlock() if s.userStatusErr != nil { return s.userStatusErr } s.userStatusUpdates = append(s.userStatusUpdates, userStatusUpdate{ - account: account, statusText: statusText, isShow: 
statusIsShow, + account: account, statusText: statusText, isShow: statusIsShow, updatedAt: statusUpdatedAt, }) return nil } @@ -1191,6 +1192,68 @@ func (s *errorThreadSubStore) UpsertThreadSubscription(_ context.Context, _ *mod return fmt.Errorf("boom") } +func TestHandleEvent_MemberAdded_UnknownUser_ReturnsError(t *testing.T) { + // Store has NO users, so the referenced account cannot resolve. + store := &stubInboxStore{} + h := NewHandler(store) + + change := model.MemberAddEvent{ + Type: "member_added", + RoomID: "room-1", + Accounts: []string{"ghost"}, + SiteID: "site-b", + JoinedAt: time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC).UnixMilli(), + } + changeData, err := json.Marshal(change) + require.NoError(t, err) + + evt := model.InboxEvent{Type: "member_added", SiteID: "site-b", DestSiteID: "site-a", Payload: changeData} + evtData, err := json.Marshal(evt) + require.NoError(t, err) + + err = h.HandleEvent(context.Background(), evtData) + + // Returns an error (→ Nak/redeliver) naming the missing account. + require.Error(t, err) + assert.Contains(t, err.Error(), "ghost") + // A plain fmt.Errorf can never be permanent (IsPermanent requires *errcode.Error wrapping), + // but assert explicitly so a future refactor that adds wrapping trips this guard. + _, isPermanent := errcode.IsPermanent(err) + assert.False(t, isPermanent, "missing-user error must be transient so the event redelivers") + // No subscription was created. + assert.Empty(t, store.getSubscriptions()) +} + +func TestHandleEvent_MemberAdded_PartialUsers_CreatesPresentAndErrors(t *testing.T) { + // "bob" resolves; "ghost" does not. 
+ store := &stubInboxStore{users: []model.User{{ID: "uid-bob", Account: "bob", SiteID: "site-a"}}} + h := NewHandler(store) + + change := model.MemberAddEvent{ + Type: "member_added", + RoomID: "room-1", + Accounts: []string{"bob", "ghost"}, + SiteID: "site-b", + JoinedAt: time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC).UnixMilli(), + } + changeData, err := json.Marshal(change) + require.NoError(t, err) + + evt := model.InboxEvent{Type: "member_added", SiteID: "site-b", DestSiteID: "site-a", Payload: changeData} + evtData, err := json.Marshal(evt) + require.NoError(t, err) + + err = h.HandleEvent(context.Background(), evtData) + + // Errors so the whole event redelivers... + require.Error(t, err) + assert.Contains(t, err.Error(), "ghost") + // ...but the resolvable subscription was still created (progress; redelivery re-upserts idempotently). + subs := store.getSubscriptions() + require.Len(t, subs, 1) + assert.Equal(t, "bob", subs[0].User.Account) +} + func TestRolesForType(t *testing.T) { assert.Equal(t, []model.Role{model.RoleMember}, rolesForType(model.RoomTypeChannel)) assert.Nil(t, rolesForType(model.RoomTypeDM)) @@ -1481,12 +1544,12 @@ func TestHandler_SubscriptionMuteToggled(t *testing.T) { assert.Equal(t, int64(12345), updates[0].updatedAt.UnixMilli()) } -func TestHandler_SubscriptionMuteToggled_MissingSubscriptionNoOp(t *testing.T) { +func TestHandler_SubscriptionMuteToggled_Forwarded(t *testing.T) { store := &stubInboxStore{} h := NewHandler(store) payload, err := json.Marshal(model.SubscriptionMuteToggledEvent{ - Account: "ghost", RoomID: "r1", Muted: true, Timestamp: 12345, + Account: "alice", RoomID: "r1", Muted: true, Timestamp: 12345, }) require.NoError(t, err) evt, err := json.Marshal(model.InboxEvent{ @@ -1496,6 +1559,10 @@ func TestHandler_SubscriptionMuteToggled_MissingSubscriptionNoOp(t *testing.T) { require.NoError(t, err) require.NoError(t, h.HandleEvent(context.Background(), evt)) + updates := store.getMuteUpdates() + require.Len(t, updates, 1) 
+ assert.Equal(t, "alice", updates[0].account) + assert.True(t, updates[0].muted) } func TestHandler_SubscriptionMuteToggled_MalformedPayload(t *testing.T) { @@ -1546,12 +1613,12 @@ func TestHandler_SubscriptionFavoriteToggled(t *testing.T) { assert.Equal(t, int64(12345), updates[0].updatedAt.UnixMilli()) } -func TestHandler_SubscriptionFavoriteToggled_MissingSubscriptionNoOp(t *testing.T) { +func TestHandler_SubscriptionFavoriteToggled_Forwarded(t *testing.T) { store := &stubInboxStore{} h := NewHandler(store) payload, err := json.Marshal(model.SubscriptionFavoriteToggledEvent{ - Account: "ghost", RoomID: "r1", Favorite: true, Timestamp: 12345, + Account: "alice", RoomID: "r1", Favorite: true, Timestamp: 12345, }) require.NoError(t, err) evt, err := json.Marshal(model.InboxEvent{ @@ -1561,6 +1628,10 @@ func TestHandler_SubscriptionFavoriteToggled_MissingSubscriptionNoOp(t *testing. require.NoError(t, err) require.NoError(t, h.HandleEvent(context.Background(), evt)) + updates := store.getFavoriteUpdates() + require.Len(t, updates, 1) + assert.Equal(t, "alice", updates[0].account) + assert.True(t, updates[0].favorite) } func TestHandler_SubscriptionFavoriteToggled_MalformedPayload(t *testing.T) { @@ -1645,6 +1716,8 @@ func TestHandler_UserStatusUpdated(t *testing.T) { assert.Equal(t, "out to lunch", updates[0].statusText) require.NotNil(t, updates[0].isShow) assert.True(t, *updates[0].isShow) + // The handler threads the event Timestamp through as the order-guard high-water mark. 
+ assert.Equal(t, time.UnixMilli(12345).UTC(), updates[0].updatedAt) } func TestHandler_UserStatusUpdated_IsShowOmittedStaysNil(t *testing.T) { diff --git a/inbox-worker/integration_test.go b/inbox-worker/integration_test.go index 299685d71..a48c89279 100644 --- a/inbox-worker/integration_test.go +++ b/inbox-worker/integration_test.go @@ -337,18 +337,33 @@ func TestInbox_UpdateSubscriptionRead_EqualTimestampSkipped(t *testing.T) { assert.True(t, got.Alert) // unchanged } -func TestInbox_UpdateSubscriptionRead_MissingSubscriptionNoOp(t *testing.T) { +func TestInbox_UpdateSubscriptionRead_MissingSubscriptionErrors(t *testing.T) { ctx := context.Background() - db := setupMongo(t) - store := &mongoInboxStore{ - subCol: db.Collection("subscriptions"), - roomCol: db.Collection("rooms"), - userCol: db.Collection("users"), - threadSubCol: db.Collection("thread_subscriptions"), - } + store := newGuardStore(setupMongo(t)) + + // No subscription seeded — a genuinely missing sub must error so the event redelivers until + // member_added lands (field events can race ahead of member_added on the worker pool). 
+ err := store.UpdateSubscriptionRead(ctx, "missing-room", "ghost", time.Now().UTC(), false) + require.Error(t, err) + assert.Contains(t, err.Error(), "subscription not found") +} + +func TestInbox_UpdateSubscriptionMute_MissingSubscriptionErrors(t *testing.T) { + ctx := context.Background() + store := newGuardStore(setupMongo(t)) - now := time.Now().UTC() - require.NoError(t, store.UpdateSubscriptionRead(ctx, "missing-room", "ghost", now, false)) + err := store.UpdateSubscriptionMute(ctx, "missing-room", "ghost", true, time.UnixMilli(100).UTC()) + require.Error(t, err) + assert.Contains(t, err.Error(), "subscription not found") +} + +func TestInbox_UpdateSubscriptionFavorite_MissingSubscriptionErrors(t *testing.T) { + ctx := context.Background() + store := newGuardStore(setupMongo(t)) + + err := store.UpdateSubscriptionFavorite(ctx, "missing-room", "ghost", true, time.UnixMilli(100).UTC()) + require.Error(t, err) + assert.Contains(t, err.Error(), "subscription not found") } func TestInboxWorker_ThreadSubscriptionUpserted_Insert_Integration(t *testing.T) { @@ -395,6 +410,48 @@ func TestInboxWorker_ThreadSubscriptionUpserted_Insert_Integration(t *testing.T) assert.True(t, got.UpdatedAt.Equal(now)) } +// TestInboxWorker_ThreadSubscription_DedupByUserAccount_Integration pins the natural key to +// (threadRoomId, userAccount) — matching message-worker's threadStoreMongo. Two upserts for the same +// (threadRoomId, userAccount) must converge on ONE row even if userId differs, so inbox-worker and +// message-worker never disagree about which row is "the" subscription. (Keyed by userId, the old +// code left two rows here.) 
+func TestInboxWorker_ThreadSubscription_DedupByUserAccount_Integration(t *testing.T) { + db := setupMongo(t) + ctx := context.Background() + + store := &mongoInboxStore{ + subCol: db.Collection("subscriptions"), + roomCol: db.Collection("rooms"), + userCol: db.Collection("users"), + threadSubCol: db.Collection("thread_subscriptions"), + } + require.NoError(t, store.ensureIndexes(ctx)) + + now := time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC) + first := model.ThreadSubscription{ + ID: "ts-first", ParentMessageID: "pm-1", RoomID: "r1", ThreadRoomID: "tr-1", + UserID: "u-bob", UserAccount: "bob", SiteID: "site-a", CreatedAt: now, UpdatedAt: now, + } + // Same (threadRoomId, userAccount) but a different userId — keyed by userAccount this is the + // same subscription, so it must NOT create a second row. + second := model.ThreadSubscription{ + ID: "ts-second", ParentMessageID: "pm-1", RoomID: "r1", ThreadRoomID: "tr-1", + UserID: "u-bob-other", UserAccount: "bob", SiteID: "site-a", CreatedAt: now, UpdatedAt: now.Add(time.Minute), + } + require.NoError(t, store.UpsertThreadSubscription(ctx, &first)) + require.NoError(t, store.UpsertThreadSubscription(ctx, &second)) + + count, err := db.Collection("thread_subscriptions"). + CountDocuments(ctx, bson.M{"threadRoomId": "tr-1", "userAccount": "bob"}) + require.NoError(t, err) + assert.Equal(t, int64(1), count, "thread-sub dedups by (threadRoomId, userAccount), matching message-worker") + + var got model.ThreadSubscription + require.NoError(t, db.Collection("thread_subscriptions"). 
+ FindOne(ctx, bson.M{"threadRoomId": "tr-1", "userAccount": "bob"}).Decode(&got)) + assert.Equal(t, "ts-first", got.ID, "$setOnInsert keeps the first row; the second upsert is a no-op insert") +} + func TestInboxWorker_ThreadSubscriptionUpserted_MonotonicMention_Integration(t *testing.T) { db := setupMongo(t) ctx := context.Background() @@ -1374,9 +1431,12 @@ func TestInboxWorker_UpdateUserStatus_Integration(t *testing.T) { }) require.NoError(t, err) + t1 := time.UnixMilli(1000).UTC() + t2 := time.UnixMilli(2000).UTC() + t.Run("updates text and isShow when both supplied", func(t *testing.T) { hide := false - require.NoError(t, store.UpdateUserStatus(ctx, "alice", "out to lunch", &hide)) + require.NoError(t, store.UpdateUserStatus(ctx, "alice", "out to lunch", &hide, t1)) var got model.User require.NoError(t, store.userCol.FindOne(ctx, bson.M{"account": "alice"}).Decode(&got)) @@ -1387,7 +1447,7 @@ func TestInboxWorker_UpdateUserStatus_Integration(t *testing.T) { t.Run("text-only update leaves stored isShow untouched", func(t *testing.T) { // Stored isShow is currently false from the previous subtest; a nil // isShow must not clobber it. - require.NoError(t, store.UpdateUserStatus(ctx, "alice", "heads down", nil)) + require.NoError(t, store.UpdateUserStatus(ctx, "alice", "heads down", nil, t2)) var got model.User require.NoError(t, store.userCol.FindOne(ctx, bson.M{"account": "alice"}).Decode(&got)) @@ -1395,8 +1455,17 @@ func TestInboxWorker_UpdateUserStatus_Integration(t *testing.T) { assert.False(t, got.StatusIsShow) }) - t.Run("unknown account is a silent no-op", func(t *testing.T) { - require.NoError(t, store.UpdateUserStatus(ctx, "ghost", "nope", nil)) + t.Run("stale event is rejected by the statusUpdatedAt high-water guard", func(t *testing.T) { + // statusUpdatedAt is t2; a t1 (older) event must be a no-op, not regress the text. 
+ require.NoError(t, store.UpdateUserStatus(ctx, "alice", "STALE", nil, t1)) + + var got model.User + require.NoError(t, store.userCol.FindOne(ctx, bson.M{"account": "alice"}).Decode(&got)) + assert.Equal(t, "heads down", got.StatusText, "stale status must not overwrite a newer one") + }) + + t.Run("unknown account is a no-op", func(t *testing.T) { + require.NoError(t, store.UpdateUserStatus(ctx, "ghost", "nope", nil, t2)) count, err := store.userCol.CountDocuments(ctx, bson.M{"account": "ghost"}) require.NoError(t, err) diff --git a/inbox-worker/main.go b/inbox-worker/main.go index cc8ee3f77..7fdcdfe3b 100644 --- a/inbox-worker/main.go +++ b/inbox-worker/main.go @@ -105,9 +105,17 @@ func (s *mongoInboxStore) UpdateSubscriptionRoles(ctx context.Context, account, if err != nil { return fmt.Errorf("update subscription roles for %q in room %q: %w", account, roomID, err) } - if res.MatchedCount > 0 { - return nil + if res.MatchedCount == 0 { + return s.naksIfSubscriptionMissing(ctx, account, roomID) } + return nil +} + +// naksIfSubscriptionMissing disambiguates a MatchedCount==0 guarded subscription write. A genuinely +// missing subscription returns an error (Nak → redelivered until member_added lands, the +// federation/migration race where field events can race ahead of member_added); a stale event the +// high-water guard rejected is a silent no-op (the sub exists with a newer-or-equal value). +func (s *mongoInboxStore) naksIfSubscriptionMissing(ctx context.Context, account, roomID string) error { exists, err := s.subscriptionExists(ctx, account, roomID) if err != nil { return fmt.Errorf("check subscription exists for %q in room %q: %w", account, roomID, err) @@ -161,17 +169,32 @@ func (s *mongoInboxStore) FindUsersByAccounts(ctx context.Context, accounts []st // keyed by account. statusIsShow is written only when non-nil so a text-only // update cannot clobber the stored flag. 
A missing user (no doc on this site) // is a silent no-op — the event is for an account that doesn't live here. -func (s *mongoInboxStore) UpdateUserStatus(ctx context.Context, account, statusText string, statusIsShow *bool) error { - set := bson.M{"statusText": statusText} +func (s *mongoInboxStore) UpdateUserStatus(ctx context.Context, account, statusText string, statusIsShow *bool, statusUpdatedAt time.Time) error { + set := bson.M{"statusText": statusText, "statusUpdatedAt": statusUpdatedAt} if statusIsShow != nil { set["statusIsShow"] = *statusIsShow } - res, err := s.userCol.UpdateOne(ctx, bson.M{"account": account}, bson.M{"$set": set}) + // Guard on the statusUpdatedAt high-water mark so an out-of-order or duplicate event + // (the status fans to all sites) can't regress to an older status. + filter := bson.M{"account": account, "$or": bson.A{ + bson.M{"statusUpdatedAt": bson.M{"$exists": false}}, + bson.M{"statusUpdatedAt": bson.M{"$lt": statusUpdatedAt}}, + }} + res, err := s.userCol.UpdateOne(ctx, filter, bson.M{"$set": set}) if err != nil { return fmt.Errorf("update user status for %q: %w", account, err) } if res.MatchedCount == 0 { - slog.WarnContext(ctx, "user_status_updated for unknown account, skipping", "account", account) + // The UpdateOne above already committed; MatchedCount==0 is a correct no-op (account not on + // this site, or a stale event the guard rejected). CountDocuments only picks the warn + // message, so a failure here must not Nak/retry an already-applied write — log and move on. 
+ count, cerr := s.userCol.CountDocuments(ctx, bson.M{"account": account}) + switch { + case cerr != nil: + slog.WarnContext(ctx, "user existence check failed, skipping unknown-account log", "account", account, "error", cerr) + case count == 0: + slog.WarnContext(ctx, "user_status_updated for unknown account, skipping", "account", account) + } } return nil } @@ -212,9 +235,13 @@ func (s *mongoInboxStore) UpdateSubscriptionMute(ctx context.Context, roomID, ac }, } update := bson.M{"$set": bson.M{"muted": muted, "muteUpdatedAt": muteUpdatedAt}} - if _, err := s.subCol.UpdateOne(ctx, filter, update); err != nil { + res, err := s.subCol.UpdateOne(ctx, filter, update) + if err != nil { return fmt.Errorf("update subscription mute for %q in room %q: %w", account, roomID, err) } + if res.MatchedCount == 0 { + return s.naksIfSubscriptionMissing(ctx, account, roomID) + } return nil } @@ -232,9 +259,13 @@ func (s *mongoInboxStore) UpdateSubscriptionFavorite(ctx context.Context, roomID }, } update := bson.M{"$set": bson.M{"favorite": favorite, "favoriteUpdatedAt": favoriteUpdatedAt}} - if _, err := s.subCol.UpdateOne(ctx, filter, update); err != nil { + res, err := s.subCol.UpdateOne(ctx, filter, update) + if err != nil { return fmt.Errorf("update subscription favorite for %q in room %q: %w", account, roomID, err) } + if res.MatchedCount == 0 { + return s.naksIfSubscriptionMissing(ctx, account, roomID) + } return nil } @@ -248,9 +279,13 @@ func (s *mongoInboxStore) UpdateSubscriptionRead(ctx context.Context, roomID, ac }, } update := bson.M{"$set": bson.M{"lastSeenAt": lastSeenAt, "alert": alert}} - if _, err := s.subCol.UpdateOne(ctx, filter, update); err != nil { + res, err := s.subCol.UpdateOne(ctx, filter, update) + if err != nil { return fmt.Errorf("update subscription read for %q in room %q: %w", account, roomID, err) } + if res.MatchedCount == 0 { + return s.naksIfSubscriptionMissing(ctx, account, roomID) + } return nil } diff --git a/inbox-worker/mock_store_test.go 
b/inbox-worker/mock_store_test.go index 4807b57bc..31146d3ef 100644 --- a/inbox-worker/mock_store_test.go +++ b/inbox-worker/mock_store_test.go @@ -3,7 +3,7 @@ // // Generated by this command: // -// mockgen -destination=mock_store_test.go -package=main . InboxStore +// mockgen -destination=mock_store_test.go -package=main github.com/hmchangw/chat/inbox-worker InboxStore // // Package main is a generated GoMock package. @@ -198,17 +198,17 @@ func (mr *MockInboxStoreMockRecorder) UpdateSubscriptionRoles(ctx, account, room } // UpdateUserStatus mocks base method. -func (m *MockInboxStore) UpdateUserStatus(ctx context.Context, account, statusText string, statusIsShow *bool) error { +func (m *MockInboxStore) UpdateUserStatus(ctx context.Context, account, statusText string, statusIsShow *bool, statusUpdatedAt time.Time) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "UpdateUserStatus", ctx, account, statusText, statusIsShow) + ret := m.ctrl.Call(m, "UpdateUserStatus", ctx, account, statusText, statusIsShow, statusUpdatedAt) ret0, _ := ret[0].(error) return ret0 } // UpdateUserStatus indicates an expected call of UpdateUserStatus. -func (mr *MockInboxStoreMockRecorder) UpdateUserStatus(ctx, account, statusText, statusIsShow any) *gomock.Call { +func (mr *MockInboxStoreMockRecorder) UpdateUserStatus(ctx, account, statusText, statusIsShow, statusUpdatedAt any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateUserStatus", reflect.TypeOf((*MockInboxStore)(nil).UpdateUserStatus), ctx, account, statusText, statusIsShow) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateUserStatus", reflect.TypeOf((*MockInboxStore)(nil).UpdateUserStatus), ctx, account, statusText, statusIsShow, statusUpdatedAt) } // UpsertRoom mocks base method. 
diff --git a/pkg/migration/disposition.go b/pkg/migration/disposition.go new file mode 100644 index 000000000..06cd6b15a --- /dev/null +++ b/pkg/migration/disposition.go @@ -0,0 +1,59 @@ +// Package migration holds the shared consume/disposition machinery for the +// data-migration transformer services (messages and collections). Each service +// keeps its own event mapping and metrics; this package owns the policy that +// turns a handler result into a JetStream disposition. +package migration + +import "errors" + +// ErrPoison marks an event that can never succeed (unmappable doc). The consume +// loop Terms these instead of redelivering, so one bad event never wedges the stream. +var ErrPoison = errors.New("poison event") + +// ErrSkipped marks an event the handler deliberately dropped. The consume loop Acks +// these but does NOT count them as processed — the skip is already metered by the handler. +var ErrSkipped = errors.New("event skipped") + +// Action is the JetStream disposition a consume loop should apply to a message. +type Action int + +const ( + // ActionAck: handler succeeded — Ack and count as processed. + ActionAck Action = iota + // ActionTerm: poison — Term, never redeliver. + ActionTerm + // ActionAckSkip: deliberate skip — Ack, do NOT count as processed. + ActionAckSkip + // ActionNak: transient failure — Nak for redelivery. + ActionNak + // ActionTermExhausted: transient failure that has hit the delivery cap — Term + // explicitly (with an exhaustion metric) instead of letting JetStream silently drop it. + ActionTermExhausted +) + +// Classify maps a handler result to a disposition Action. isFinal reports whether this +// is the last delivery (a further Nak would be silently dropped). Poison and skip take +// precedence over isFinal — a poison/skip is terminal regardless of delivery count. 
+func Classify(err error, isFinal bool) Action { + switch { + case err == nil: + return ActionAck + case errors.Is(err, ErrPoison): + return ActionTerm + case errors.Is(err, ErrSkipped): + return ActionAckSkip + case isFinal: + return ActionTermExhausted + default: + return ActionNak + } +} + +// IsFinalDelivery reports whether numDelivered has reached maxDeliver, so a further Nak +// would be a silent drop. maxDeliver <= 0 means unlimited (never final). +func IsFinalDelivery(numDelivered uint64, maxDeliver int) bool { + if maxDeliver <= 0 { + return false + } + return numDelivered >= uint64(maxDeliver) +} diff --git a/pkg/migration/disposition_test.go b/pkg/migration/disposition_test.go new file mode 100644 index 000000000..ce67bd664 --- /dev/null +++ b/pkg/migration/disposition_test.go @@ -0,0 +1,41 @@ +package migration + +import ( + "errors" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestClassify(t *testing.T) { + tests := []struct { + name string + err error + isFinal bool + want Action + }{ + {"success", nil, false, ActionAck}, + {"poison", ErrPoison, false, ActionTerm}, + {"poison wrapped", fmt.Errorf("map doc: %w", ErrPoison), false, ActionTerm}, + {"skipped", ErrSkipped, false, ActionAckSkip}, + {"skipped wrapped", fmt.Errorf("other collection: %w", ErrSkipped), false, ActionAckSkip}, + {"transient not final", errors.New("source down"), false, ActionNak}, + {"transient final", errors.New("source down"), true, ActionTermExhausted}, + {"poison takes precedence over final", ErrPoison, true, ActionTerm}, + {"skipped takes precedence over final", ErrSkipped, true, ActionAckSkip}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, Classify(tc.err, tc.isFinal)) + }) + } +} + +func TestIsFinalDelivery(t *testing.T) { + assert.False(t, IsFinalDelivery(1, 0), "maxDeliver<=0 means unlimited — never final") + assert.False(t, IsFinalDelivery(100, 0), "unlimited") + assert.False(t, 
IsFinalDelivery(4, 5), "below cap") + assert.True(t, IsFinalDelivery(5, 5), "at cap is final") + assert.True(t, IsFinalDelivery(6, 5), "past cap is final") +} diff --git a/data-migration/oplog-transformer/sourcelookup.go b/pkg/migration/sourcelookup.go similarity index 52% rename from data-migration/oplog-transformer/sourcelookup.go rename to pkg/migration/sourcelookup.go index c9a99aaeb..fb5ed29a0 100644 --- a/data-migration/oplog-transformer/sourcelookup.go +++ b/pkg/migration/sourcelookup.go @@ -1,4 +1,4 @@ -package main +package migration import ( "context" @@ -11,24 +11,28 @@ import ( "go.opentelemetry.io/otel/codes" ) -// sourceLookup fetches the current full message doc from the source by _id. -type sourceLookup interface { - // FindByID returns the raw BSON-extended-JSON document, or (nil, nil) if absent. +// SourceLookup fetches the current full source doc by _id, as relaxed extended JSON +// (the same shape the connector emits). Used on the update path, where the connector +// forwards only the change delta and the full current doc must be re-read. +type SourceLookup interface { + // FindByID returns the raw relaxed-extJSON document, or (nil, nil) if absent. FindByID(ctx context.Context, id string) ([]byte, error) } -type mongoSourceLookup struct { +// MongoSourceLookup is a SourceLookup backed by a source Mongo collection. +type MongoSourceLookup struct { coll *mongo.Collection } -func newMongoSourceLookup(coll *mongo.Collection) *mongoSourceLookup { - return &mongoSourceLookup{coll: coll} +// NewMongoSourceLookup returns a MongoSourceLookup over the given source collection. +func NewMongoSourceLookup(coll *mongo.Collection) *MongoSourceLookup { + return &MongoSourceLookup{coll: coll} } // FindByID reads the doc and re-encodes it as relaxed extended JSON, matching the shape -// messagemap expects (same as the connector emits). 
-func (m *mongoSourceLookup) FindByID(ctx context.Context, id string) (out []byte, err error) { - ctx, span := otel.Tracer("oplog-transformer").Start(ctx, "source.findByID") +// the connector emits. A missing document returns (nil, nil), not an error. +func (m *MongoSourceLookup) FindByID(ctx context.Context, id string) (out []byte, err error) { + ctx, span := otel.Tracer("migration").Start(ctx, "source.findByID") defer func() { if err != nil { span.SetStatus(codes.Error, err.Error())