diff --git a/docs/superpowers/plans/2026-04-21-load-test-messaging-workers.md b/docs/superpowers/plans/2026-04-21-load-test-messaging-workers.md new file mode 100644 index 00000000..fc34dae3 --- /dev/null +++ b/docs/superpowers/plans/2026-04-21-load-test-messaging-workers.md @@ -0,0 +1,2780 @@ +# Load Test Messaging Workers Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a Go-based load generator (`tools/loadgen`) plus a docker-compose harness that sustains and measures messaging pipeline capacity (message-gatekeeper → MESSAGES_CANONICAL → message-worker + broadcast-worker) on a single site. + +**Architecture:** One Go binary with three subcommands (`seed`, `run`, `teardown`), open-loop publishing via `time.Ticker`, two wildcard subscriptions for reply-correlation (E1) and broadcast-correlation (E2), periodic `ConsumerInfo` sampling for backlog (E4), Prometheus gauges + terminal summary for reporting, optional Grafana profile for dashboards. Docker-compose file at `tools/loadgen/deploy/docker-compose.loadtest.yml` brings up the full single-site pipeline plus the loadgen container. + +**Tech Stack:** Go 1.25, `nats.go` + `nats.go/jetstream`, `go.mongodb.org/mongo-driver/v2`, `caarlos0/env/v11`, `google/uuid`, `prometheus/client_golang`, `stretchr/testify`, `testcontainers-go`, stdlib `log/slog` / `math/rand` / `time.Ticker` / `text/tabwriter`. + +**Spec:** `docs/superpowers/specs/2026-04-21-load-test-messaging-workers-design.md`. + +--- + +## File Structure + +### New Go source files (all under `tools/loadgen/`) + +| File | Responsibility | +|---|---| +| `main.go` | Parse env config, dispatch subcommand (`seed`/`run`/`teardown`), wire dependencies, graceful shutdown. | +| `preset.go` | `Preset`, `Distribution`, `Range` types; built-in presets map; deterministic `(user, room, content)` generators. | +| `seed.go` | MongoDB seeding: drop + populate `users`/`rooms`/`subscriptions` collections from a preset. | +| `generator.go` | Open-loop publisher driven by `time.Ticker`; publishes `SendMessageRequest` to front-door subject (or `MessageEvent` to canonical). | +| `collector.go` | Reply-subject and broadcast-subject subscribers; two `sync.Map`s for E1 / E2 correlation; sample buffers. | +| `consumerlag.go` | Polls `ConsumerInfo` every 1s for both durables; exposes Prometheus gauges; records min/peak/final. | +| `report.go` | Terminal summary (`text/tabwriter`), CSV export, exit-code logic, percentile computation. | +| `metrics.go` | Prometheus registry + histograms/counters/gauges used by generator/collector/consumerlag. | +| `preset_test.go` | Determinism tests for preset generation. | +| `generator_test.go` | Rate-pacing tests with stubbed publish. | +| `collector_test.go` | Reply / broadcast correlation tests with synthesized messages. | +| `report_test.go` | Percentile math, CSV format, exit-code tolerance tests. | +| `integration_test.go` | `//go:build integration` — spins up real NATS+Mongo+Cassandra+workers, runs `small` preset, asserts end-to-end wiring. | + +### New deploy files (all under `tools/loadgen/deploy/`) + +| File | Responsibility | +|---|---| +| `Dockerfile` | Multi-stage build, `golang:1.25.8-alpine` builder, `alpine:3.21` runtime. | +| `Makefile` | Scoped `up`, `seed`, `run`, `run-dashboards`, `down` targets. 
| +| `docker-compose.loadtest.yml` | NATS+Mongo+Cassandra+gatekeeper+workers+loadgen+(optional) prometheus+grafana. | +| `prometheus/prometheus.yml` | Prometheus scrape config for loadgen and NATS. | +| `grafana/provisioning/datasources/prometheus.yaml` | Grafana datasource provisioning. | +| `grafana/provisioning/dashboards/loadtest.yaml` | Grafana dashboard provisioning. | +| `grafana/dashboards/loadtest.json` | The load-test dashboard JSON. | +| `README.md` | Operator reference: what it is, how to run, how to read output, what's out of scope. | + +### Modified files + +None. Root `Makefile` stays untouched (per broadcast-worker harness precedent). + +--- + +## Task 1: Scaffold `tools/loadgen/` directory and stub `main.go` + +**Files:** +- Create: `tools/loadgen/main.go` + +- [ ] **Step 1: Create directory and write stub `main.go`** + +Create the file `tools/loadgen/main.go`: + +```go +package main + +import ( + "fmt" + "log/slog" + "os" + + "github.com/caarlos0/env/v11" +) + +type config struct { + NatsURL string `env:"NATS_URL,required"` + NatsCredsFile string `env:"NATS_CREDS_FILE" envDefault:""` + SiteID string `env:"SITE_ID" envDefault:"site-local"` + MongoURI string `env:"MONGO_URI,required"` + MongoDB string `env:"MONGO_DB" envDefault:"chat"` + MetricsAddr string `env:"METRICS_ADDR" envDefault:":9099"` +} + +func main() { + slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil))) + + if len(os.Args) < 2 { + fmt.Fprintln(os.Stderr, "usage: loadgen [flags]") + os.Exit(2) + } + cfg, err := env.ParseAs[config]() + if err != nil { + slog.Error("parse config", "error", err) + os.Exit(1) + } + _ = cfg + switch os.Args[1] { + case "seed", "run", "teardown": + slog.Info("subcommand not yet implemented", "subcommand", os.Args[1]) + os.Exit(0) + default: + fmt.Fprintf(os.Stderr, "unknown subcommand: %s\n", os.Args[1]) + os.Exit(2) + } +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd /home/user/chat && go build ./tools/loadgen/` +Expected: Succeeds; no output. 
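+
+Before wiring real subcommands, it can help to see how `env.ParseAs` resolves this config. A minimal sketch, assuming a scratch test alongside `main.go` (hypothetical, not one of the planned files):
+
+```go
+func TestConfigDefaults(t *testing.T) {
+	t.Setenv("NATS_URL", "nats://localhost:4222")
+	t.Setenv("MONGO_URI", "mongodb://localhost:27017")
+
+	cfg, err := env.ParseAs[config]()
+	require.NoError(t, err)
+	// Only the two `required` variables were set; everything else comes
+	// from the envDefault tags.
+	assert.Equal(t, "site-local", cfg.SiteID)
+	assert.Equal(t, ":9099", cfg.MetricsAddr)
+	assert.Equal(t, "chat", cfg.MongoDB)
+}
+```
+
+Unsetting `NATS_URL` makes `ParseAs` return an error, which is why `main` exits 1 before dispatching any subcommand.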
+ +- [ ] **Step 3: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/main.go +git commit -m "feat(loadgen): scaffold main.go with subcommand dispatch" +``` + +--- + +## Task 2: Define `Preset`, `Distribution`, `Range` types and built-in preset map + +**Files:** +- Create: `tools/loadgen/preset.go` +- Test: `tools/loadgen/preset_test.go` + +- [ ] **Step 1: Write failing test** + +Create `tools/loadgen/preset_test.go`: + +```go +package main + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBuiltinPresets_ContainsAllFour(t *testing.T) { + names := []string{"small", "medium", "large", "realistic"} + for _, name := range names { + t.Run(name, func(t *testing.T) { + p, ok := BuiltinPreset(name) + require.True(t, ok, "preset %q must exist", name) + assert.Equal(t, name, p.Name) + assert.Greater(t, p.Users, 0) + assert.Greater(t, p.Rooms, 0) + }) + } +} + +func TestBuiltinPresets_UnknownReturnsFalse(t *testing.T) { + _, ok := BuiltinPreset("nonexistent") + assert.False(t, ok) +} + +func TestBuiltinPresets_UniformShape(t *testing.T) { + for _, name := range []string{"small", "medium", "large"} { + t.Run(name, func(t *testing.T) { + p, _ := BuiltinPreset(name) + assert.Equal(t, DistUniform, p.RoomSizeDist) + assert.Equal(t, DistUniform, p.SenderDist) + assert.InDelta(t, 0.0, p.MentionRate, 1e-9) + assert.InDelta(t, 0.0, p.ThreadRate, 1e-9) + }) + } +} + +func TestBuiltinPresets_RealisticShape(t *testing.T) { + p, _ := BuiltinPreset("realistic") + assert.Equal(t, DistMixed, p.RoomSizeDist) + assert.Equal(t, DistZipf, p.SenderDist) + assert.Greater(t, p.MentionRate, 0.0) + assert.Greater(t, p.ThreadRate, 0.0) + assert.Greater(t, p.ContentBytes.Max, p.ContentBytes.Min) +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd /home/user/chat && go test ./tools/loadgen/ -run TestBuiltinPresets -v` +Expected: FAIL — `BuiltinPreset`, `DistUniform`, `DistMixed`, `DistZipf`, `Preset` undefined. + +- [ ] **Step 3: Write the preset definitions** + +Create `tools/loadgen/preset.go`: + +```go +package main + +// Distribution names the shape of a per-preset random selection. +type Distribution string + +const ( + DistUniform Distribution = "uniform" + DistMixed Distribution = "mixed" + DistZipf Distribution = "zipf" +) + +// Range holds an inclusive min/max for integer quantities like content size. +type Range struct { + Min int + Max int +} + +// Preset is a named, fully deterministic workload specification. +type Preset struct { + Name string + Users int + Rooms int + RoomSizeDist Distribution + SenderDist Distribution + ContentBytes Range + MentionRate float64 + ThreadRate float64 +} + +var builtinPresets = map[string]Preset{ + "small": { + Name: "small", Users: 10, Rooms: 5, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "medium": { + Name: "medium", Users: 1000, Rooms: 100, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "large": { + Name: "large", Users: 10000, Rooms: 1000, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "realistic": { + Name: "realistic", Users: 1000, Rooms: 100, + RoomSizeDist: DistMixed, SenderDist: DistZipf, + ContentBytes: Range{Min: 50, Max: 2000}, + MentionRate: 0.10, + ThreadRate: 0.05, + }, +} + +// BuiltinPreset looks up a preset by name. 
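+// The result is a value copy of the table entry, so callers may tweak
+// fields (say, a custom ContentBytes for an experiment) without mutating
+// the shared builtinPresets map.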
+func BuiltinPreset(name string) (Preset, bool) { + p, ok := builtinPresets[name] + return p, ok +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd /home/user/chat && go test ./tools/loadgen/ -run TestBuiltinPresets -v` +Expected: PASS for all four subtests plus the two standalone tests. + +- [ ] **Step 5: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/preset.go tools/loadgen/preset_test.go +git commit -m "feat(loadgen): add Preset type and four built-in presets" +``` + +--- + +## Task 3: Deterministic fixture generation (users, rooms, subscriptions) + +Pure functions that turn `(Preset, seed)` into `[]model.User`, `[]model.Room`, `[]model.Subscription`. No I/O — those slices are the seeding input for Task 4. + +**Files:** +- Modify: `tools/loadgen/preset.go` +- Modify: `tools/loadgen/preset_test.go` + +- [ ] **Step 1: Add fixture-generation tests (failing)** + +Append to `tools/loadgen/preset_test.go`: + +```go +func TestBuildFixtures_DeterministicAcrossCalls(t *testing.T) { + p, _ := BuiltinPreset("small") + a := BuildFixtures(p, 42, "site-local") + b := BuildFixtures(p, 42, "site-local") + assert.Equal(t, a.Users, b.Users) + assert.Equal(t, a.Rooms, b.Rooms) + assert.Equal(t, a.Subscriptions, b.Subscriptions) +} + +func TestBuildFixtures_SmallCountsAndShape(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(p, 42, "site-local") + assert.Len(t, f.Users, 10) + assert.Len(t, f.Rooms, 5) + // uniform: every user is in at least one room + users := make(map[string]bool) + for _, s := range f.Subscriptions { + users[s.User.ID] = true + assert.Equal(t, "site-local", s.SiteID) + } + assert.Len(t, users, 10) + for _, r := range f.Rooms { + assert.Equal(t, "group", string(r.Type)) + assert.Equal(t, "site-local", r.SiteID) + } +} + +func TestBuildFixtures_RealisticMixesGroupAndDM(t *testing.T) { + p, _ := BuiltinPreset("realistic") + f := BuildFixtures(p, 42, "site-local") + var groups, dms int + for _, r := range f.Rooms { + switch r.Type { + case "group": + groups++ + case "dm": + dms++ + } + } + assert.Greater(t, groups, 0) + assert.Greater(t, dms, 0) + // DM rooms must have exactly 2 members + dmMembers := make(map[string]int) + for _, s := range f.Subscriptions { + for _, r := range f.Rooms { + if r.ID == s.RoomID && r.Type == "dm" { + dmMembers[r.ID]++ + } + } + } + for id, n := range dmMembers { + assert.Equal(t, 2, n, "dm room %s must have 2 members", id) + } +} +``` + +- [ ] **Step 2: Run tests to verify failure** + +Run: `cd /home/user/chat && go test ./tools/loadgen/ -run TestBuildFixtures -v` +Expected: FAIL — `BuildFixtures` undefined. + +- [ ] **Step 3: Implement `BuildFixtures`** + +Replace the entire contents of `tools/loadgen/preset.go` with the Task-2 content plus the fixture generator below. The full file should look like: + +```go +package main + +import ( + "fmt" + "math/rand" + "time" + + "github.com/hmchangw/chat/pkg/model" +) + +// Distribution names the shape of a per-preset random selection. +type Distribution string + +const ( + DistUniform Distribution = "uniform" + DistMixed Distribution = "mixed" + DistZipf Distribution = "zipf" +) + +// Range holds an inclusive min/max for integer quantities like content size. +type Range struct { + Min int + Max int +} + +// Preset is a named, fully deterministic workload specification. 
+type Preset struct { + Name string + Users int + Rooms int + RoomSizeDist Distribution + SenderDist Distribution + ContentBytes Range + MentionRate float64 + ThreadRate float64 +} + +var builtinPresets = map[string]Preset{ + "small": { + Name: "small", Users: 10, Rooms: 5, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "medium": { + Name: "medium", Users: 1000, Rooms: 100, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "large": { + Name: "large", Users: 10000, Rooms: 1000, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "realistic": { + Name: "realistic", Users: 1000, Rooms: 100, + RoomSizeDist: DistMixed, SenderDist: DistZipf, + ContentBytes: Range{Min: 50, Max: 2000}, + MentionRate: 0.10, + ThreadRate: 0.05, + }, +} + +// BuiltinPreset looks up a preset by name. +func BuiltinPreset(name string) (Preset, bool) { + p, ok := builtinPresets[name] + return p, ok +} + +// Fixtures is the full seed data for a preset run. +type Fixtures struct { + Users []model.User + Rooms []model.Room + Subscriptions []model.Subscription +} + +var ( + engNameBank = []string{"Alice Wang", "Bob Chen", "Carol Lee", "Dave Liu", "Eve Zhang"} + chineseNameBank = []string{"愛麗絲", "鮑勃", "卡蘿", "戴夫", "伊芙"} +) + +// BuildFixtures is a pure function of (preset, seed, siteID) producing the +// full fixture set. Two calls with equal inputs produce equal outputs. +func BuildFixtures(p Preset, seed int64, siteID string) Fixtures { + r := rand.New(rand.NewSource(seed)) + now := time.Unix(0, 0).UTC() // fixed so output is deterministic + + users := make([]model.User, p.Users) + for i := 0; i < p.Users; i++ { + users[i] = model.User{ + ID: fmt.Sprintf("u-%06d", i), + Account: fmt.Sprintf("user-%d", i), + SiteID: siteID, + EngName: engNameBank[i%len(engNameBank)], + ChineseName: chineseNameBank[i%len(chineseNameBank)], + } + } + + rooms := make([]model.Room, p.Rooms) + // realistic: last 10% of rooms are DMs + dmStart := p.Rooms + if p.RoomSizeDist == DistMixed { + dmStart = p.Rooms - p.Rooms/10 + } + for i := 0; i < p.Rooms; i++ { + rtype := model.RoomTypeGroup + if i >= dmStart { + rtype = model.RoomTypeDM + } + rooms[i] = model.Room{ + ID: fmt.Sprintf("room-%06d", i), + Name: fmt.Sprintf("room-%d", i), + Type: rtype, + SiteID: siteID, + UserCount: 0, // filled after membership + CreatedAt: now, + UpdatedAt: now, + } + } + + var subs []model.Subscription + for i := range rooms { + members := pickMembers(r, p, &rooms[i], users) + rooms[i].UserCount = len(members) + for _, u := range members { + subs = append(subs, model.Subscription{ + ID: fmt.Sprintf("sub-%s-%s", rooms[i].ID, u.ID), + User: model.SubscriptionUser{ID: u.ID, Account: u.Account}, + RoomID: rooms[i].ID, + SiteID: siteID, + Roles: []model.Role{model.RoleMember}, + JoinedAt: now, + }) + } + } + return Fixtures{Users: users, Rooms: rooms, Subscriptions: subs} +} + +func pickMembers(r *rand.Rand, p Preset, room *model.Room, users []model.User) []model.User { + if room.Type == model.RoomTypeDM { + // Two distinct users. + i := r.Intn(len(users)) + j := r.Intn(len(users) - 1) + if j >= i { + j++ + } + return []model.User{users[i], users[j]} + } + switch p.RoomSizeDist { + case DistMixed: + // 10% of rooms get up to 500 members; rest get 2-20. 
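+		// (r.Intn(19) yields 0..18, so group sizes span 2..20;
+		// r.Intn(499) yields 0..498, so the large rooms span 2..500.)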
+ size := 2 + r.Intn(19) + if r.Intn(10) == 0 { + size = 2 + r.Intn(499) + } + return sampleWithoutReplacement(r, users, size) + default: + size := (len(users) + p.Rooms - 1) / p.Rooms + if size < 2 { + size = 2 + } + return sampleWithoutReplacement(r, users, size) + } +} + +func sampleWithoutReplacement(r *rand.Rand, users []model.User, n int) []model.User { + if n > len(users) { + n = len(users) + } + idx := r.Perm(len(users))[:n] + out := make([]model.User, n) + for i, k := range idx { + out[i] = users[k] + } + return out +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /home/user/chat && go test ./tools/loadgen/ -run TestBuildFixtures -v` +Expected: PASS for all three subtests. + +- [ ] **Step 5: Run whole package to confirm no regressions** + +Run: `cd /home/user/chat && make test SERVICE=tools/loadgen` +Expected: PASS. + +- [ ] **Step 6: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/preset.go tools/loadgen/preset_test.go +git commit -m "feat(loadgen): deterministic fixture generation from (preset, seed)" +``` + +--- + +## Task 4: Seeding MongoDB with fixtures + +**Files:** +- Create: `tools/loadgen/seed.go` + +- [ ] **Step 1: Write `seed.go`** + +Create `tools/loadgen/seed.go`: + +```go +package main + +import ( + "context" + "fmt" + + "go.mongodb.org/mongo-driver/v2/mongo" +) + +// Seed drops and repopulates users/rooms/subscriptions in db from fixtures. +// Idempotent: safe to rerun. +func Seed(ctx context.Context, db *mongo.Database, f Fixtures) error { + if err := db.Collection("users").Drop(ctx); err != nil { + return fmt.Errorf("drop users: %w", err) + } + if err := db.Collection("rooms").Drop(ctx); err != nil { + return fmt.Errorf("drop rooms: %w", err) + } + if err := db.Collection("subscriptions").Drop(ctx); err != nil { + return fmt.Errorf("drop subscriptions: %w", err) + } + + if len(f.Users) > 0 { + docs := make([]interface{}, len(f.Users)) + for i := range f.Users { + docs[i] = f.Users[i] + } + if _, err := db.Collection("users").InsertMany(ctx, docs); err != nil { + return fmt.Errorf("insert users: %w", err) + } + } + if len(f.Rooms) > 0 { + docs := make([]interface{}, len(f.Rooms)) + for i := range f.Rooms { + docs[i] = f.Rooms[i] + } + if _, err := db.Collection("rooms").InsertMany(ctx, docs); err != nil { + return fmt.Errorf("insert rooms: %w", err) + } + } + if len(f.Subscriptions) > 0 { + docs := make([]interface{}, len(f.Subscriptions)) + for i := range f.Subscriptions { + docs[i] = f.Subscriptions[i] + } + if _, err := db.Collection("subscriptions").InsertMany(ctx, docs); err != nil { + return fmt.Errorf("insert subscriptions: %w", err) + } + } + return nil +} + +// Teardown drops the three seeded collections without repopulating. +func Teardown(ctx context.Context, db *mongo.Database) error { + for _, c := range []string{"users", "rooms", "subscriptions"} { + if err := db.Collection(c).Drop(ctx); err != nil { + return fmt.Errorf("drop %s: %w", c, err) + } + } + return nil +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd /home/user/chat && go build ./tools/loadgen/` +Expected: Succeeds. + +- [ ] **Step 3: Commit** + +`Seed`/`Teardown` are exercised by the integration test (Task 12). Unit-level test value is low because this is a straight drop + InsertMany against the real Mongo driver. 
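+
+When the Task 12 integration test lands, a cheap assertion of the drop-before-insert contract could look like this hedged fragment (inside the integration test, with a live `db` handle; the `bson` import is the v2 driver's):
+
+```go
+p, _ := BuiltinPreset("small")
+f := BuildFixtures(p, 42, "site-local")
+for i := 0; i < 2; i++ {
+	require.NoError(t, Seed(ctx, db, f), "seed pass %d", i+1)
+}
+n, err := db.Collection("users").CountDocuments(ctx, bson.D{})
+require.NoError(t, err)
+require.EqualValues(t, len(f.Users), n) // rerun did not duplicate documents
+```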
+ +```bash +cd /home/user/chat +git add tools/loadgen/seed.go +git commit -m "feat(loadgen): Seed and Teardown mongo collections from fixtures" +``` + +--- + +## Task 5: Prometheus metrics registry + +**Files:** +- Create: `tools/loadgen/metrics.go` + +- [ ] **Step 1: Write `metrics.go`** + +Create `tools/loadgen/metrics.go`: + +```go +package main + +import ( + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +// Metrics holds the Prometheus collectors used across loadgen components. +type Metrics struct { + Registry *prometheus.Registry + Published *prometheus.CounterVec + PublishErrors *prometheus.CounterVec + E1Latency *prometheus.HistogramVec + E2Latency *prometheus.HistogramVec + ConsumerPending *prometheus.GaugeVec + ConsumerAckPending *prometheus.GaugeVec + ConsumerRedelivered *prometheus.GaugeVec +} + +// NewMetrics constructs a dedicated Prometheus registry with all loadgen +// collectors registered. A dedicated registry avoids colliding with default +// Go/process collectors. +func NewMetrics() *Metrics { + r := prometheus.NewRegistry() + buckets := []float64{ + 0.001, 0.002, 0.005, 0.010, 0.025, 0.050, 0.100, 0.250, 0.500, 1.000, 2.500, 5.000, + } + m := &Metrics{ + Registry: r, + Published: prometheus.NewCounterVec( + prometheus.CounterOpts{Name: "loadgen_published_total", Help: "Messages published."}, + []string{"preset"}, + ), + PublishErrors: prometheus.NewCounterVec( + prometheus.CounterOpts{Name: "loadgen_publish_errors_total", Help: "Publish-side errors."}, + []string{"preset", "reason"}, + ), + E1Latency: prometheus.NewHistogramVec( + prometheus.HistogramOpts{Name: "loadgen_e1_latency_seconds", Help: "Gatekeeper ack latency.", Buckets: buckets}, + []string{"preset"}, + ), + E2Latency: prometheus.NewHistogramVec( + prometheus.HistogramOpts{Name: "loadgen_e2_latency_seconds", Help: "Broadcast-visible latency.", Buckets: buckets}, + []string{"preset"}, + ), + ConsumerPending: prometheus.NewGaugeVec( + prometheus.GaugeOpts{Name: "loadgen_consumer_pending", Help: "JetStream consumer num_pending."}, + []string{"stream", "durable"}, + ), + ConsumerAckPending: prometheus.NewGaugeVec( + prometheus.GaugeOpts{Name: "loadgen_consumer_ack_pending", Help: "JetStream consumer num_ack_pending."}, + []string{"stream", "durable"}, + ), + ConsumerRedelivered: prometheus.NewGaugeVec( + prometheus.GaugeOpts{Name: "loadgen_consumer_redelivered", Help: "JetStream consumer num_redelivered."}, + []string{"stream", "durable"}, + ), + } + r.MustRegister( + m.Published, m.PublishErrors, + m.E1Latency, m.E2Latency, + m.ConsumerPending, m.ConsumerAckPending, m.ConsumerRedelivered, + ) + return m +} + +// Handler returns an http.Handler serving this metrics registry. +func (m *Metrics) Handler() http.Handler { + return promhttp.HandlerFor(m.Registry, promhttp.HandlerOpts{}) +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd /home/user/chat && go build ./tools/loadgen/` +Expected: Succeeds. 
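+
+A quick wiring check for the dedicated registry, as a hedged test sketch (hypothetical, not one of the planned files):
+
+```go
+func TestMetricsHandler_ServesLoadgenFamilies(t *testing.T) {
+	m := NewMetrics()
+	m.Published.WithLabelValues("small").Inc()
+
+	srv := httptest.NewServer(m.Handler())
+	defer srv.Close()
+	resp, err := http.Get(srv.URL)
+	require.NoError(t, err)
+	defer resp.Body.Close()
+	body, err := io.ReadAll(resp.Body)
+	require.NoError(t, err)
+
+	assert.Contains(t, string(body), "loadgen_published_total")
+	// Dedicated registry: no default go_* / process_* collectors.
+	assert.NotContains(t, string(body), "go_goroutines")
+}
+```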
+ +- [ ] **Step 3: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/metrics.go +git commit -m "feat(loadgen): Prometheus registry with loadgen collectors" +``` + +--- + +## Task 6: Collector — reply + broadcast correlation + +**Files:** +- Create: `tools/loadgen/collector.go` +- Create: `tools/loadgen/collector_test.go` + +- [ ] **Step 1: Write failing tests** + +Create `tools/loadgen/collector_test.go`: + +```go +package main + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCollector_E1ReplyMatches(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("req-1", "msg-1", now) + c.RecordReply("req-1", now.Add(5*time.Millisecond)) + assert.Equal(t, 1, c.E1Count()) + assert.Equal(t, []time.Duration{5 * time.Millisecond}, c.E1Samples()) +} + +func TestCollector_E1UnknownIgnored(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + c.RecordReply("unknown", time.Unix(0, 0)) + assert.Equal(t, 0, c.E1Count()) +} + +func TestCollector_E2BroadcastMatches(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("req-1", "msg-1", now) + c.RecordBroadcast("msg-1", now.Add(8*time.Millisecond)) + assert.Equal(t, 1, c.E2Count()) + assert.Equal(t, []time.Duration{8 * time.Millisecond}, c.E2Samples()) +} + +func TestCollector_E1AndE2Independent(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("req-1", "msg-1", now) + c.RecordReply("req-1", now.Add(5*time.Millisecond)) + c.RecordBroadcast("msg-1", now.Add(8*time.Millisecond)) + assert.Equal(t, 1, c.E1Count()) + assert.Equal(t, 1, c.E2Count()) +} + +func TestCollector_MissingCountsAtFinalize(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("req-1", "msg-1", now) + c.RecordPublish("req-2", "msg-2", now) + c.RecordReply("req-1", now.Add(5*time.Millisecond)) + // req-2 reply never arrives; msg-1 and msg-2 broadcasts never arrive + missingReplies, missingBroadcasts := c.Finalize() + assert.Equal(t, 1, missingReplies) + assert.Equal(t, 2, missingBroadcasts) +} + +func TestCollector_WarmupDiscards(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + start := time.Unix(0, 0) + warmupEnd := start.Add(1 * time.Second) + // In warmup window: + c.RecordPublish("req-warm", "msg-warm", start) + c.RecordReply("req-warm", start.Add(10*time.Millisecond)) + // Past warmup: + c.RecordPublish("req-real", "msg-real", warmupEnd.Add(100*time.Millisecond)) + c.RecordReply("req-real", warmupEnd.Add(105*time.Millisecond)) + + c.DiscardBefore(warmupEnd) + require.Equal(t, 1, c.E1Count()) + assert.Equal(t, []time.Duration{5 * time.Millisecond}, c.E1Samples()) +} +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `cd /home/user/chat && go test ./tools/loadgen/ -run TestCollector -v` +Expected: FAIL — `NewCollector`, `Collector` undefined. + +- [ ] **Step 3: Implement the collector** + +Create `tools/loadgen/collector.go`: + +```go +package main + +import ( + "sort" + "sync" + "time" +) + +type publishEntry struct { + publishedAt time.Time +} + +// sample pairs a latency with its publish timestamp so warmup can discard by time. +type sample struct { + publishedAt time.Time + latency time.Duration +} + +// Collector correlates publishes with replies (E1) and broadcasts (E2). 
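+// E1 entries are keyed by requestID (recovered from the reply subject's
+// last token); E2 entries are keyed by messageID (carried in the broadcast
+// payload). A key is deleted on first match, so duplicate deliveries are
+// ignored and whatever remains at Finalize counts as missing.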
+type Collector struct {
+	m       *Metrics
+	preset  string
+	mu      sync.Mutex
+	byReqID map[string]publishEntry
+	byMsgID map[string]publishEntry
+	e1      []sample
+	e2      []sample
+}
+
+// NewCollector returns a ready-to-use Collector.
+func NewCollector(m *Metrics, preset string) *Collector {
+	return &Collector{
+		m: m, preset: preset,
+		byReqID: make(map[string]publishEntry),
+		byMsgID: make(map[string]publishEntry),
+	}
+}
+
+// RecordPublish stores the publish time under both correlation keys. An
+// empty requestID records only the messageID key: canonical injection
+// bypasses the gatekeeper, so no reply will ever arrive, and tracking one
+// would count a missing reply per message at Finalize.
+func (c *Collector) RecordPublish(requestID, messageID string, t time.Time) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if requestID != "" {
+		c.byReqID[requestID] = publishEntry{publishedAt: t}
+	}
+	c.byMsgID[messageID] = publishEntry{publishedAt: t}
+}
+
+// RecordReply consumes one pending publish keyed by requestID.
+func (c *Collector) RecordReply(requestID string, at time.Time) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	e, ok := c.byReqID[requestID]
+	if !ok {
+		return
+	}
+	delete(c.byReqID, requestID)
+	d := at.Sub(e.publishedAt)
+	c.e1 = append(c.e1, sample{publishedAt: e.publishedAt, latency: d})
+	c.m.E1Latency.WithLabelValues(c.preset).Observe(d.Seconds())
+}
+
+// RecordBroadcast consumes one pending publish keyed by messageID.
+func (c *Collector) RecordBroadcast(messageID string, at time.Time) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	e, ok := c.byMsgID[messageID]
+	if !ok {
+		return
+	}
+	delete(c.byMsgID, messageID)
+	d := at.Sub(e.publishedAt)
+	c.e2 = append(c.e2, sample{publishedAt: e.publishedAt, latency: d})
+	c.m.E2Latency.WithLabelValues(c.preset).Observe(d.Seconds())
+}
+
+// DiscardBefore drops any samples whose publish time is before cutoff (warmup).
+func (c *Collector) DiscardBefore(cutoff time.Time) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.e1 = filterAtOrAfter(c.e1, cutoff)
+	c.e2 = filterAtOrAfter(c.e2, cutoff)
+}
+
+func filterAtOrAfter(in []sample, cutoff time.Time) []sample {
+	out := in[:0]
+	for _, s := range in {
+		if !s.publishedAt.Before(cutoff) {
+			out = append(out, s)
+		}
+	}
+	return out
+}
+
+// Finalize returns the count of unmatched publishes as missing replies and broadcasts.
+func (c *Collector) Finalize() (missingReplies int, missingBroadcasts int) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return len(c.byReqID), len(c.byMsgID)
+}
+
+// E1Count returns the number of matched E1 samples.
+func (c *Collector) E1Count() int {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return len(c.e1)
+}
+
+// E2Count returns the number of matched E2 samples.
+func (c *Collector) E2Count() int {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return len(c.e2)
+}
+
+// E1Samples returns a sorted copy of E1 latencies for tests/reporting.
+func (c *Collector) E1Samples() []time.Duration {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return sortedLatencies(c.e1)
+}
+
+// E2Samples returns a sorted copy of E2 latencies for tests/reporting.
+func (c *Collector) E2Samples() []time.Duration {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return sortedLatencies(c.e2)
+}
+
+// sortedLatencies copies the latencies out of in so callers never retain a
+// reference to the live slice. It must be called with c.mu held: reading the
+// slice header outside the lock (for example, evaluating c.e1 as a method
+// argument) would race with the appends in RecordReply/RecordBroadcast.
+func sortedLatencies(in []sample) []time.Duration {
+	out := make([]time.Duration, len(in))
+	for i, s := range in {
+		out[i] = s.latency
+	}
+	sort.Slice(out, func(i, j int) bool { return out[i] < out[j] })
+	return out
+}
+```
+
+- [ ] **Step 4: Run to verify tests pass**
+
+Run: `cd /home/user/chat && go test ./tools/loadgen/ -run TestCollector -v`
+Expected: PASS for all six subtests.
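+
+Replies and broadcasts arrive on NATS callback goroutines while the generator keeps publishing, so the locking is worth pinning under the race detector. A hedged sketch (run with `go test -race`; hypothetical, not one of the planned files):
+
+```go
+func TestCollector_ConcurrentAccess(t *testing.T) {
+	c := NewCollector(NewMetrics(), "small")
+	var wg sync.WaitGroup
+	for i := 0; i < 4; i++ {
+		wg.Add(1)
+		go func(n int) {
+			defer wg.Done()
+			for j := 0; j < 1000; j++ {
+				reqID := fmt.Sprintf("r-%d-%d", n, j)
+				msgID := fmt.Sprintf("m-%d-%d", n, j)
+				c.RecordPublish(reqID, msgID, time.Now())
+				c.RecordReply(reqID, time.Now())
+				c.RecordBroadcast(msgID, time.Now())
+				_ = c.E1Samples() // snapshot while writers are active
+			}
+		}(i)
+	}
+	wg.Wait()
+	assert.Equal(t, 4000, c.E1Count())
+	assert.Equal(t, 4000, c.E2Count())
+}
+```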
+ +- [ ] **Step 5: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/collector.go tools/loadgen/collector_test.go +git commit -m "feat(loadgen): collector correlates publishes with replies and broadcasts" +``` + +--- + +## Task 7: Percentile math and report formatting + +**Files:** +- Create: `tools/loadgen/report.go` +- Create: `tools/loadgen/report_test.go` + +- [ ] **Step 1: Write failing tests** + +Create `tools/loadgen/report_test.go`: + +```go +package main + +import ( + "bytes" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestPercentiles_FixedSet(t *testing.T) { + // 100 sorted values: 1ms..100ms + samples := make([]time.Duration, 100) + for i := range samples { + samples[i] = time.Duration(i+1) * time.Millisecond + } + p := ComputePercentiles(samples) + assert.Equal(t, 50*time.Millisecond, p.P50) + assert.Equal(t, 95*time.Millisecond, p.P95) + assert.Equal(t, 99*time.Millisecond, p.P99) + assert.Equal(t, 100*time.Millisecond, p.Max) +} + +func TestPercentiles_Empty(t *testing.T) { + p := ComputePercentiles(nil) + assert.Zero(t, p.P50) + assert.Zero(t, p.P95) + assert.Zero(t, p.P99) + assert.Zero(t, p.Max) +} + +func TestPrintSummary_ContainsKeyFields(t *testing.T) { + var buf bytes.Buffer + s := Summary{ + Preset: "medium", Seed: 42, Site: "site-local", + TargetRate: 500, ActualRate: 499.8, + Duration: 60 * time.Second, Warmup: 10 * time.Second, + Inject: "frontdoor", Sent: 25000, + } + PrintSummary(&buf, s) + out := buf.String() + for _, want := range []string{ + "preset: medium", "seed: 42", "site: site-local", + "sent:", "25000", "inject: frontdoor", + } { + assert.True(t, strings.Contains(out, want), "summary missing %q; got:\n%s", want, out) + } +} + +func TestWriteCSV_OneRowPerSample(t *testing.T) { + var buf bytes.Buffer + rows := []CSVSample{ + {TimestampNs: 1, RequestID: "r1", Metric: "E1", LatencyNs: 2_100_000}, + {TimestampNs: 2, RequestID: "r1", Metric: "E2", LatencyNs: 8_700_000}, + } + require.NoError(t, WriteCSV(&buf, rows)) + lines := strings.Split(strings.TrimSpace(buf.String()), "\n") + require.Len(t, lines, 3) // header + 2 rows + assert.Equal(t, "timestamp_ns,request_id,metric,latency_ns", lines[0]) + assert.Equal(t, "1,r1,E1,2100000", lines[1]) + assert.Equal(t, "2,r1,E2,8700000", lines[2]) +} + +func TestDetermineExitCode(t *testing.T) { + cases := []struct { + name string + sent int + errs int + wantExitCode int + }{ + {"zero errors", 10000, 0, 0}, + {"under tolerance", 10000, 9, 0}, // 0.09% < 0.1% + {"at tolerance boundary", 10000, 10, 0}, // exactly 0.1%: pass + {"over tolerance", 10000, 11, 1}, // 0.11% > 0.1% + {"no sends - any error fails", 0, 1, 1}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.wantExitCode, DetermineExitCode(tc.sent, tc.errs)) + }) + } +} +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `cd /home/user/chat && go test ./tools/loadgen/ -run 'TestPercentiles|TestPrintSummary|TestWriteCSV|TestDetermineExitCode' -v` +Expected: FAIL — undefined identifiers. + +- [ ] **Step 3: Implement `report.go`** + +Create `tools/loadgen/report.go`: + +```go +package main + +import ( + "encoding/csv" + "fmt" + "io" + "sort" + "strconv" + "text/tabwriter" + "time" +) + +// Percentiles holds summary latency percentiles. +type Percentiles struct { + P50, P95, P99, Max time.Duration +} + +// ComputePercentiles returns P50/P95/P99/max of samples. Empty input -> zeros. +// Input does not need to be sorted on entry. 
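+// Selection is nearest-rank-below: index floor((n-1)*q), which for 100
+// samples maps P50/P95/P99 to elements 49/94/98 (the 50th/95th/99th values).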
+func ComputePercentiles(samples []time.Duration) Percentiles { + if len(samples) == 0 { + return Percentiles{} + } + sorted := make([]time.Duration, len(samples)) + copy(sorted, samples) + sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] }) + pick := func(q float64) time.Duration { + idx := int(float64(len(sorted)-1) * q) + return sorted[idx] + } + return Percentiles{ + P50: pick(0.50), + P95: pick(0.95), + P99: pick(0.99), + Max: sorted[len(sorted)-1], + } +} + +// ConsumerStat captures the min/peak/final snapshot of a single durable. +type ConsumerStat struct { + Stream string + Durable string + MinPending uint64 + PeakPending uint64 + FinalPending uint64 + PeakAckPending uint64 + Redelivered uint64 +} + +// Summary is the full end-of-run report. +type Summary struct { + Preset, Site, Inject string + Seed int64 + TargetRate int + ActualRate float64 + Duration, Warmup time.Duration + Sent int + PublishErrors int + GatekeeperErrors int + MissingReplies int + MissingBroadcasts int + E1 Percentiles + E2 Percentiles + E1Count, E2Count int + Consumers []ConsumerStat +} + +// PrintSummary writes the terminal summary to w using text/tabwriter. +func PrintSummary(w io.Writer, s Summary) { + fmt.Fprintln(w, "=== loadgen run complete ===") + fmt.Fprintf(w, "preset: %s seed: %d site: %s\n", s.Preset, s.Seed, s.Site) + fmt.Fprintf(w, "duration: %s (warmup: %s, measured: %s) inject: %s\n", + s.Duration, s.Warmup, s.Duration-s.Warmup, s.Inject) + fmt.Fprintf(w, "target rate: %d msg/s actual rate: %.1f msg/s\n\n", s.TargetRate, s.ActualRate) + + fmt.Fprintln(w, "publish results") + fmt.Fprintf(w, " sent: %d\n", s.Sent) + fmt.Fprintf(w, " publish errors: %d\n", s.PublishErrors) + fmt.Fprintf(w, " gatekeeper errors: %d\n", s.GatekeeperErrors) + fmt.Fprintf(w, " missing replies: %d\n", s.MissingReplies) + fmt.Fprintf(w, " missing broadcasts:%d\n\n", s.MissingBroadcasts) + + tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0) + fmt.Fprintln(tw, "latency (measured window only)") + fmt.Fprintln(tw, "metric\tcount\tp50\tp95\tp99\tmax") + fmt.Fprintf(tw, "E1 gatekeeper\t%d\t%s\t%s\t%s\t%s\n", s.E1Count, s.E1.P50, s.E1.P95, s.E1.P99, s.E1.Max) + fmt.Fprintf(tw, "E2 broadcast\t%d\t%s\t%s\t%s\t%s\n", s.E2Count, s.E2.P50, s.E2.P95, s.E2.P99, s.E2.Max) + tw.Flush() + + fmt.Fprintln(w) + if len(s.Consumers) > 0 { + fmt.Fprintf(w, "consumer lag (%s)\n", s.Consumers[0].Stream) + tw2 := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0) + fmt.Fprintln(tw2, "durable\tmin_pending\tpeak_pending\tfinal_pending\tpeak_ack_pending\tredelivered") + for _, c := range s.Consumers { + fmt.Fprintf(tw2, "%s\t%d\t%d\t%d\t%d\t%d\n", + c.Durable, c.MinPending, c.PeakPending, c.FinalPending, c.PeakAckPending, c.Redelivered) + } + tw2.Flush() + } +} + +// CSVSample is one row in the per-sample CSV dump. +type CSVSample struct { + TimestampNs int64 + RequestID string + Metric string + LatencyNs int64 +} + +// WriteCSV writes a header and one row per sample. +func WriteCSV(w io.Writer, rows []CSVSample) error { + cw := csv.NewWriter(w) + if err := cw.Write([]string{"timestamp_ns", "request_id", "metric", "latency_ns"}); err != nil { + return fmt.Errorf("write header: %w", err) + } + for _, r := range rows { + if err := cw.Write([]string{ + strconv.FormatInt(r.TimestampNs, 10), + r.RequestID, r.Metric, + strconv.FormatInt(r.LatencyNs, 10), + }); err != nil { + return fmt.Errorf("write row: %w", err) + } + } + cw.Flush() + return cw.Error() +} + +// DetermineExitCode returns 0 if error count is within 0.1% of sent. 
+// With sent == 0, any error is a failure. +func DetermineExitCode(sent, errs int) int { + if sent == 0 { + if errs == 0 { + return 0 + } + return 1 + } + // 0.1% tolerance inclusive: errs * 1000 <= sent + if errs*1000 <= sent { + return 0 + } + return 1 +} +``` + +- [ ] **Step 4: Run to verify tests pass** + +Run: `cd /home/user/chat && go test ./tools/loadgen/ -run 'TestPercentiles|TestPrintSummary|TestWriteCSV|TestDetermineExitCode' -v` +Expected: PASS for all tests. + +- [ ] **Step 5: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/report.go tools/loadgen/report_test.go +git commit -m "feat(loadgen): percentiles, summary printer, CSV export, exit code" +``` + +--- + +## Task 8: Open-loop generator with injected publish function + +**Files:** +- Create: `tools/loadgen/generator.go` +- Create: `tools/loadgen/generator_test.go` + +- [ ] **Step 1: Write failing tests** + +Create `tools/loadgen/generator_test.go`: + +```go +package main + +import ( + "context" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type recordingPublisher struct { + mu sync.Mutex + calls []publishCall +} + +type publishCall struct { + subject string + data []byte +} + +func (r *recordingPublisher) Publish(_ context.Context, subject string, data []byte) error { + r.mu.Lock() + defer r.mu.Unlock() + r.calls = append(r.calls, publishCall{subject: subject, data: append([]byte(nil), data...)}) + return nil +} + +func (r *recordingPublisher) count() int { + r.mu.Lock() + defer r.mu.Unlock() + return len(r.calls) +} + +func TestGenerator_SendsExpectedCount(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + c := NewCollector(m, p.Name) + g := NewGenerator(GeneratorConfig{ + Preset: p, + Fixtures: f, + SiteID: "site-local", + Rate: 200, + Inject: InjectFrontdoor, + Publisher: rp, + Metrics: m, + Collector: c, + }, 1) + + ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond) + defer cancel() + require.NoError(t, g.Run(ctx)) + + count := rp.count() + // 200 msg/s for ~250ms: expect 40-60 publishes (wide tolerance for scheduler). 
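+	// The asserted bounds are wider still (30-70) so scheduler jitter on a
+	// slow CI runner cannot flake the test.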
+ assert.GreaterOrEqual(t, count, 30) + assert.LessOrEqual(t, count, 70) +} + +func TestGenerator_UsesFrontdoorSubject(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + g := NewGenerator(GeneratorConfig{ + Preset: p, Fixtures: f, SiteID: "site-local", + Rate: 100, Inject: InjectFrontdoor, + Publisher: rp, Metrics: m, + Collector: NewCollector(m, p.Name), + }, 1) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + require.NotEmpty(t, rp.calls) + for _, c := range rp.calls { + assert.Contains(t, c.subject, ".msg.send") + assert.Contains(t, c.subject, "site-local") + } +} + +func TestGenerator_UsesCanonicalSubjectWhenInjectCanonical(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + g := NewGenerator(GeneratorConfig{ + Preset: p, Fixtures: f, SiteID: "site-local", + Rate: 100, Inject: InjectCanonical, + Publisher: rp, Metrics: m, + Collector: NewCollector(m, p.Name), + }, 1) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + require.NotEmpty(t, rp.calls) + for _, c := range rp.calls { + assert.Contains(t, c.subject, "chat.msg.canonical.site-local.created") + } +} + +func TestGenerator_IncrementsPublishedMetric(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + g := NewGenerator(GeneratorConfig{ + Preset: p, Fixtures: f, SiteID: "site-local", + Rate: 100, Inject: InjectFrontdoor, + Publisher: rp, Metrics: m, + Collector: NewCollector(m, p.Name), + }, 1) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + + // Gather the counter value via the default prometheus export mechanism. + var got int64 + metrics, err := m.Registry.Gather() + require.NoError(t, err) + for _, mf := range metrics { + if mf.GetName() == "loadgen_published_total" { + for _, metric := range mf.GetMetric() { + got += int64(metric.GetCounter().GetValue()) + } + } + } + assert.Greater(t, atomic.LoadInt64(&got), int64(0)) +} +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `cd /home/user/chat && go test ./tools/loadgen/ -run TestGenerator -v` +Expected: FAIL — undefined identifiers (`NewGenerator`, `GeneratorConfig`, `InjectFrontdoor`, `InjectCanonical`, `Publisher`). + +- [ ] **Step 3: Implement the generator** + +Create `tools/loadgen/generator.go`: + +```go +package main + +import ( + "context" + "encoding/json" + "fmt" + "math/rand" + "strings" + "time" + + "github.com/google/uuid" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/subject" +) + +// InjectMode selects which subject the generator publishes onto. +type InjectMode string + +const ( + InjectFrontdoor InjectMode = "frontdoor" + InjectCanonical InjectMode = "canonical" +) + +// Publisher abstracts NATS publishing so tests can inject a recorder. +type Publisher interface { + Publish(ctx context.Context, subject string, data []byte) error +} + +// GeneratorConfig is the parameter bundle for a Generator. +type GeneratorConfig struct { + Preset Preset + Fixtures Fixtures + SiteID string + Rate int + Inject InjectMode + Publisher Publisher + Metrics *Metrics + Collector *Collector +} + +// Generator is the open-loop publisher. 
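+// "Open-loop" means the offered load is fixed by the ticker and does not
+// back off when the system under test lags; overload therefore shows up as
+// consumer backlog and E2 latency instead of a quietly reduced publish rate.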
+type Generator struct {
+	cfg GeneratorConfig
+	rng *rand.Rand
+}
+
+// NewGenerator returns a Generator seeded from `seed`.
+func NewGenerator(cfg GeneratorConfig, seed int64) *Generator {
+	return &Generator{cfg: cfg, rng: rand.New(rand.NewSource(seed))}
+}
+
+// Run publishes at the configured rate until ctx is cancelled.
+func (g *Generator) Run(ctx context.Context) error {
+	if g.cfg.Rate <= 0 {
+		return fmt.Errorf("rate must be > 0")
+	}
+	interval := time.Second / time.Duration(g.cfg.Rate)
+	if interval <= 0 {
+		interval = time.Nanosecond
+	}
+	tick := time.NewTicker(interval)
+	defer tick.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return nil
+		case <-tick.C:
+			g.publishOne(ctx)
+		}
+	}
+}
+
+func (g *Generator) publishOne(ctx context.Context) {
+	if len(g.cfg.Fixtures.Subscriptions) == 0 {
+		return
+	}
+	// Pick (user, room) from any subscription. This respects uniform and
+	// mixed-distribution seeding because those are encoded in which
+	// subscriptions exist.
+	subIdx := g.rng.Intn(len(g.cfg.Fixtures.Subscriptions))
+	sub := g.cfg.Fixtures.Subscriptions[subIdx]
+	content := g.content(subIdx)
+	msgID := uuid.NewString()
+	reqID := uuid.NewString()
+
+	var (
+		subj string
+		data []byte
+		err  error
+	)
+	switch g.cfg.Inject {
+	case InjectCanonical:
+		// Canonical injection bypasses the gatekeeper, so no reply will
+		// ever arrive. Blank the request ID so the collector tracks only
+		// the broadcast key; otherwise every message would be reported as
+		// a missing reply and the run could never exit 0.
+		reqID = ""
+		now := time.Now().UTC()
+		evt := model.MessageEvent{
+			Message: model.Message{
+				ID: msgID, RoomID: sub.RoomID,
+				UserID: sub.User.ID, UserAccount: sub.User.Account,
+				Content: content, CreatedAt: now,
+			},
+			SiteID:    g.cfg.SiteID,
+			Timestamp: now.UnixMilli(),
+		}
+		data, err = json.Marshal(evt)
+		subj = subject.MsgCanonicalCreated(g.cfg.SiteID)
+	default:
+		req := model.SendMessageRequest{ID: msgID, Content: content, RequestID: reqID}
+		data, err = json.Marshal(req)
+		subj = subject.MsgSend(sub.User.Account, sub.RoomID, g.cfg.SiteID)
+	}
+	if err != nil {
+		g.cfg.Metrics.PublishErrors.WithLabelValues(g.cfg.Preset.Name, "marshal").Inc()
+		return
+	}
+	publishTime := time.Now()
+	g.cfg.Collector.RecordPublish(reqID, msgID, publishTime)
+	if perr := g.cfg.Publisher.Publish(ctx, subj, data); perr != nil {
+		g.cfg.Metrics.PublishErrors.WithLabelValues(g.cfg.Preset.Name, "publish").Inc()
+		return
+	}
+	g.cfg.Metrics.Published.WithLabelValues(g.cfg.Preset.Name).Inc()
+}
+
+func (g *Generator) content(subUserIdx int) string {
+	r := g.cfg.Preset.ContentBytes
+	size := r.Min
+	if r.Max > r.Min {
+		size = r.Min + g.rng.Intn(r.Max-r.Min+1)
+	}
+	if size <= 0 {
+		size = 1
+	}
+	body := strings.Repeat("x", size)
+	if g.cfg.Preset.MentionRate > 0 && g.rng.Float64() < g.cfg.Preset.MentionRate {
+		// Prefix with a valid-looking mention token. The target user need
+		// not exist for capacity measurement; the gatekeeper does not
+		// validate mention targets.
+		target := g.rng.Intn(g.cfg.Preset.Users)
+		body = fmt.Sprintf("@user-%d %s", target, body)
+	}
+	// ThreadRate handling is deferred: fabricating thread-parent fields that
+	// pass gatekeeper validation requires tracking previously-published
+	// messages, which is not needed for the capacity signal. The preset's
+	// ThreadRate is read but unused until thread workloads are exercised.
+	_ = subUserIdx
+	return body
+}
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `cd /home/user/chat && go test ./tools/loadgen/ -run TestGenerator -v`
+Expected: PASS.
+
+- [ ] **Step 5: Run the full unit suite to make sure nothing else broke**
+
+Run: `cd /home/user/chat && make test SERVICE=tools/loadgen`
+Expected: PASS.
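+
+One `time.Ticker` property worth knowing when reading achieved-rate numbers: a ticker drops ticks when its receiver falls behind rather than buffering them, so a slow publish path lowers the measured rate instead of causing a burst later. A standalone hedged illustration (the numbers are illustrative):
+
+```go
+package main
+
+import (
+	"fmt"
+	"time"
+)
+
+func main() {
+	tick := time.NewTicker(time.Millisecond)
+	defer tick.Stop()
+	deadline := time.After(100 * time.Millisecond)
+	n := 0
+	for {
+		select {
+		case <-tick.C:
+			n++
+			time.Sleep(5 * time.Millisecond) // simulate a slow publish path
+		case <-deadline:
+			// Missed ticks were dropped, not queued: roughly 20 ticks
+			// seen, not 100.
+			fmt.Println("ticks seen:", n)
+			return
+		}
+	}
+}
+```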
+ +- [ ] **Step 6: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/generator.go tools/loadgen/generator_test.go +git commit -m "feat(loadgen): open-loop generator with injected publisher" +``` + +--- + +## Task 9: Consumer-lag sampler + +**Files:** +- Create: `tools/loadgen/consumerlag.go` + +- [ ] **Step 1: Write `consumerlag.go`** + +This is I/O against live JetStream, covered end-to-end by the integration test in Task 12. A unit test would just re-test the JetStream client. + +Create `tools/loadgen/consumerlag.go`: + +```go +package main + +import ( + "context" + "log/slog" + "time" + + "github.com/nats-io/nats.go/jetstream" +) + +// ConsumerSampler polls a single durable consumer's info every interval and +// records min/peak/final samples. Start with Run(ctx); stop by cancelling ctx. +type ConsumerSampler struct { + js jetstream.JetStream + stream string + durable string + metrics *Metrics + interval time.Duration + + hasSample bool + minPending uint64 + peakPending uint64 + finalPending uint64 + peakAckPending uint64 + finalRedelivered uint64 +} + +// NewConsumerSampler constructs a sampler. +func NewConsumerSampler(js jetstream.JetStream, stream, durable string, m *Metrics, interval time.Duration) *ConsumerSampler { + return &ConsumerSampler{js: js, stream: stream, durable: durable, metrics: m, interval: interval} +} + +// Run polls ConsumerInfo until ctx is cancelled. +func (s *ConsumerSampler) Run(ctx context.Context) { + t := time.NewTicker(s.interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + s.sampleOnce(ctx) + } + } +} + +func (s *ConsumerSampler) sampleOnce(ctx context.Context) { + cons, err := s.js.Consumer(ctx, s.stream, s.durable) + if err != nil { + slog.Debug("consumer lookup failed", "stream", s.stream, "durable", s.durable, "error", err) + return + } + info, err := cons.Info(ctx) + if err != nil { + slog.Debug("consumer info failed", "stream", s.stream, "durable", s.durable, "error", err) + return + } + pending := info.NumPending + ack := uint64(info.NumAckPending) + redel := uint64(info.NumRedelivered) + + s.metrics.ConsumerPending.WithLabelValues(s.stream, s.durable).Set(float64(pending)) + s.metrics.ConsumerAckPending.WithLabelValues(s.stream, s.durable).Set(float64(ack)) + s.metrics.ConsumerRedelivered.WithLabelValues(s.stream, s.durable).Set(float64(redel)) + + if !s.hasSample { + s.hasSample = true + s.minPending = pending + s.peakPending = pending + s.peakAckPending = ack + } else { + if pending < s.minPending { + s.minPending = pending + } + if pending > s.peakPending { + s.peakPending = pending + } + if ack > s.peakAckPending { + s.peakAckPending = ack + } + } + s.finalPending = pending + s.finalRedelivered = redel +} + +// Snapshot returns a ConsumerStat from what has been observed so far. +func (s *ConsumerSampler) Snapshot() ConsumerStat { + return ConsumerStat{ + Stream: s.stream, + Durable: s.durable, + MinPending: s.minPending, + PeakPending: s.peakPending, + FinalPending: s.finalPending, + PeakAckPending: s.peakAckPending, + Redelivered: s.finalRedelivered, + } +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd /home/user/chat && go build ./tools/loadgen/` +Expected: Succeeds. 
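+
+For readers of the final report, the three counters relate like this; a hedged helper sketch (hypothetical, not one of the planned files):
+
+```go
+// steadyState reads a ConsumerStat snapshot: FinalPending counts stream
+// messages not yet delivered to the durable, PeakAckPending peaked while
+// messages sat delivered-but-unacked in the worker, and Redelivered only
+// grows when ack waits expire (a worker that is stuck or too slow).
+func steadyState(s ConsumerStat) bool {
+	return s.FinalPending == 0 && s.Redelivered == 0
+}
+```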
+ +- [ ] **Step 3: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/consumerlag.go +git commit -m "feat(loadgen): JetStream consumer-lag sampler" +``` + +--- + +## Task 10: Wire subcommands in `main.go` + +Replace the stub in `main.go` with full wiring: each subcommand parses flags, opens connections, and dispatches. + +**Files:** +- Modify: `tools/loadgen/main.go` + +- [ ] **Step 1: Rewrite `main.go`** + +Replace the entire contents of `tools/loadgen/main.go` with: + +```go +package main + +import ( + "context" + "encoding/json" + "errors" + "flag" + "fmt" + "log/slog" + "net/http" + "os" + "os/signal" + "strings" + "sync" + "syscall" + "time" + + "github.com/caarlos0/env/v11" + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/mongoutil" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/stream" +) + +type config struct { + NatsURL string `env:"NATS_URL,required"` + NatsCredsFile string `env:"NATS_CREDS_FILE" envDefault:""` + SiteID string `env:"SITE_ID" envDefault:"site-local"` + MongoURI string `env:"MONGO_URI,required"` + MongoDB string `env:"MONGO_DB" envDefault:"chat"` + MetricsAddr string `env:"METRICS_ADDR" envDefault:":9099"` +} + +func main() { + slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil))) + if len(os.Args) < 2 { + fmt.Fprintln(os.Stderr, "usage: loadgen [flags]") + os.Exit(2) + } + cfg, err := env.ParseAs[config]() + if err != nil { + slog.Error("parse config", "error", err) + os.Exit(1) + } + // SIGINT / SIGTERM cancel the base context. Each subcommand treats ctx + // cancellation as "stop early but still run the end-of-run finalizers + // (print summary, drain NATS, disconnect Mongo)". + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + switch os.Args[1] { + case "seed": + os.Exit(runSeed(ctx, cfg, os.Args[2:])) + case "run": + os.Exit(runRun(ctx, cfg, os.Args[2:])) + case "teardown": + os.Exit(runTeardown(ctx, cfg)) + default: + fmt.Fprintf(os.Stderr, "unknown subcommand: %s\n", os.Args[1]) + os.Exit(2) + } +} + +func runSeed(ctx context.Context, cfg config, args []string) int { + fs := flag.NewFlagSet("seed", flag.ExitOnError) + preset := fs.String("preset", "", "preset name") + seed := fs.Int64("seed", 42, "RNG seed") + _ = fs.Parse(args) + if *preset == "" { + fmt.Fprintln(os.Stderr, "--preset required") + return 2 + } + p, ok := BuiltinPreset(*preset) + if !ok { + fmt.Fprintf(os.Stderr, "unknown preset: %s\n", *preset) + return 2 + } + client, err := mongoutil.Connect(ctx, cfg.MongoURI) + if err != nil { + slog.Error("mongo connect", "error", err) + return 1 + } + defer mongoutil.Disconnect(ctx, client) + db := client.Database(cfg.MongoDB) + fixtures := BuildFixtures(p, *seed, cfg.SiteID) + if err := Seed(ctx, db, fixtures); err != nil { + slog.Error("seed", "error", err) + return 1 + } + slog.Info("seed complete", "preset", p.Name, "users", len(fixtures.Users), "rooms", len(fixtures.Rooms), "subs", len(fixtures.Subscriptions)) + return 0 +} + +func runTeardown(ctx context.Context, cfg config) int { + client, err := mongoutil.Connect(ctx, cfg.MongoURI) + if err != nil { + slog.Error("mongo connect", "error", err) + return 1 + } + defer mongoutil.Disconnect(ctx, client) + db := client.Database(cfg.MongoDB) + if err := Teardown(ctx, db); err != nil { + slog.Error("teardown", "error", err) + return 1 + } + slog.Info("teardown complete") + return 0 +} + +func runRun(ctx 
context.Context, cfg config, args []string) int { + fs := flag.NewFlagSet("run", flag.ExitOnError) + preset := fs.String("preset", "", "preset name") + seed := fs.Int64("seed", 42, "RNG seed") + duration := fs.Duration("duration", 60*time.Second, "run duration") + rate := fs.Int("rate", 500, "target msgs/sec") + warmup := fs.Duration("warmup", 10*time.Second, "warmup window (samples discarded)") + inject := fs.String("inject", "frontdoor", "injection point: frontdoor|canonical") + csvPath := fs.String("csv", "", "optional csv output path") + _ = fs.Parse(args) + if *preset == "" { + fmt.Fprintln(os.Stderr, "--preset required") + return 2 + } + p, ok := BuiltinPreset(*preset) + if !ok { + fmt.Fprintf(os.Stderr, "unknown preset: %s\n", *preset) + return 2 + } + injectMode := InjectFrontdoor + if *inject == "canonical" { + injectMode = InjectCanonical + } else if *inject != "frontdoor" { + fmt.Fprintf(os.Stderr, "unknown inject mode: %s\n", *inject) + return 2 + } + + nc, err := natsutil.Connect(cfg.NatsURL, cfg.NatsCredsFile) + if err != nil { + slog.Error("nats connect", "error", err) + return 1 + } + js, err := jetstream.New(nc.Conn()) + if err != nil { + slog.Error("jetstream init", "error", err) + return 1 + } + + metrics := NewMetrics() + metricsSrv := &http.Server{Addr: cfg.MetricsAddr, Handler: metrics.Handler(), ReadHeaderTimeout: 5 * time.Second} + go func() { + if err := metricsSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + slog.Warn("metrics server stopped", "error", err) + } + }() + + fixtures := BuildFixtures(p, *seed, cfg.SiteID) + collector := NewCollector(metrics, p.Name) + + // E1 subscription: gatekeeper replies. + e1Sub, err := nc.Conn().Subscribe("chat.user.*.response.>", func(msg *nats.Msg) { + reqID := lastToken(msg.Subject) + // Non-empty "error" field counts as a gatekeeper error. + var payload struct { + Error string `json:"error"` + } + _ = json.Unmarshal(msg.Data, &payload) + if payload.Error != "" { + metrics.PublishErrors.WithLabelValues(p.Name, "gatekeeper").Inc() + } + collector.RecordReply(reqID, time.Now()) + }) + if err != nil { + slog.Error("subscribe e1", "error", err) + return 1 + } + defer func() { _ = e1Sub.Unsubscribe() }() + + // E2 subscription: broadcast events. 
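+	// One wildcard subscription observes every room's fan-out on this
+	// site; correlation is by Message.ID, so neither the room nor the
+	// arrival order matters.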
+ e2Sub, err := nc.Conn().Subscribe("chat.room.*.event", func(msg *nats.Msg) { + var evt model.RoomEvent + if err := json.Unmarshal(msg.Data, &evt); err != nil { + return + } + if evt.Message == nil || evt.Message.ID == "" { + return + } + collector.RecordBroadcast(evt.Message.ID, time.Now()) + }) + if err != nil { + slog.Error("subscribe e2", "error", err) + return 1 + } + defer func() { _ = e2Sub.Unsubscribe() }() + + canonical := stream.MessagesCanonical(cfg.SiteID) + samplerCtx, cancelSamplers := context.WithCancel(ctx) + defer cancelSamplers() + mwSampler := NewConsumerSampler(js, canonical.Name, "message-worker", metrics, 1*time.Second) + bwSampler := NewConsumerSampler(js, canonical.Name, "broadcast-worker", metrics, 1*time.Second) + var samplerWG sync.WaitGroup + samplerWG.Add(2) + go func() { defer samplerWG.Done(); mwSampler.Run(samplerCtx) }() + go func() { defer samplerWG.Done(); bwSampler.Run(samplerCtx) }() + + publisher := &natsCorePublisher{nc: nc.Conn()} + if injectMode == InjectCanonical { + publisher = &natsCorePublisher{nc: nc.Conn(), useJetStream: true, js: js} + } + + gen := NewGenerator(GeneratorConfig{ + Preset: p, Fixtures: fixtures, SiteID: cfg.SiteID, + Rate: *rate, Inject: injectMode, + Publisher: publisher, Metrics: metrics, Collector: collector, + }, *seed) + + runCtx, cancelRun := context.WithTimeout(ctx, *duration) + defer cancelRun() + warmupDeadline := time.Now().Add(*warmup) + genErr := gen.Run(runCtx) + // Wait up to 2 seconds for trailing replies and broadcasts to arrive. + time.Sleep(2 * time.Second) + collector.DiscardBefore(warmupDeadline) + missingReplies, missingBroadcasts := collector.Finalize() + + cancelSamplers() + samplerWG.Wait() + + shutCtx, cancelShut := context.WithTimeout(context.Background(), 5*time.Second) + _ = metricsSrv.Shutdown(shutCtx) + cancelShut() + _ = nc.Drain() + + if genErr != nil { + slog.Error("generator error", "error", genErr) + } + + publishErrs := counterValue(metrics, "loadgen_publish_errors_total") + gkErrs := counterValueLabeled(metrics, "loadgen_publish_errors_total", "reason", "gatekeeper") + sent := int(counterValueLabeled(metrics, "loadgen_published_total", "preset", p.Name)) + measured := *duration - *warmup + actualRate := 0.0 + if measured > 0 { + actualRate = float64(collector.E1Count()+missingReplies) / measured.Seconds() + } + + summary := Summary{ + Preset: p.Name, Seed: *seed, Site: cfg.SiteID, + TargetRate: *rate, ActualRate: actualRate, + Duration: *duration, Warmup: *warmup, Inject: *inject, + Sent: sent, + PublishErrors: int(publishErrs - gkErrs), + GatekeeperErrors: int(gkErrs), + MissingReplies: missingReplies, + MissingBroadcasts: missingBroadcasts, + E1: ComputePercentiles(collector.E1Samples()), + E2: ComputePercentiles(collector.E2Samples()), + E1Count: collector.E1Count(), + E2Count: collector.E2Count(), + Consumers: []ConsumerStat{mwSampler.Snapshot(), bwSampler.Snapshot()}, + } + PrintSummary(os.Stdout, summary) + + if *csvPath != "" { + if err := writeCSVFile(*csvPath, collector); err != nil { + slog.Error("csv export", "error", err) + } + } + + totalErrs := summary.PublishErrors + summary.GatekeeperErrors + summary.MissingReplies + summary.MissingBroadcasts + return DetermineExitCode(summary.Sent, totalErrs) +} + +type natsCorePublisher struct { + nc *nats.Conn + useJetStream bool + js jetstream.JetStream +} + +func (p *natsCorePublisher) Publish(ctx context.Context, subject string, data []byte) error { + if p.useJetStream { + _, err := p.js.Publish(ctx, subject, data) + if err != nil { + 
			return fmt.Errorf("jetstream publish: %w", err)
		}
		return nil
	}
	if err := p.nc.Publish(subject, data); err != nil {
		return fmt.Errorf("core publish: %w", err)
	}
	return nil
}

func lastToken(subj string) string {
	i := strings.LastIndex(subj, ".")
	if i < 0 {
		return subj
	}
	return subj[i+1:]
}

func writeCSVFile(path string, c *Collector) error {
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("create csv: %w", err)
	}
	defer func() { _ = f.Close() }()
	var rows []CSVSample
	// The collector exposes sorted latencies without their original publish
	// timestamps, so the row index is written as a stable ordinal in the
	// timestamp column and RequestID is left empty.
	for i, d := range c.E1Samples() {
		rows = append(rows, CSVSample{TimestampNs: int64(i), RequestID: "", Metric: "E1", LatencyNs: d.Nanoseconds()})
	}
	for i, d := range c.E2Samples() {
		rows = append(rows, CSVSample{TimestampNs: int64(i), RequestID: "", Metric: "E2", LatencyNs: d.Nanoseconds()})
	}
	return WriteCSV(f, rows)
}

func counterValue(m *Metrics, name string) float64 {
	fams, err := m.Registry.Gather()
	if err != nil {
		return 0
	}
	var total float64
	for _, mf := range fams {
		if mf.GetName() != name {
			continue
		}
		for _, metric := range mf.GetMetric() {
			total += metric.GetCounter().GetValue()
		}
	}
	return total
}

func counterValueLabeled(m *Metrics, name, labelName, labelValue string) float64 {
	fams, err := m.Registry.Gather()
	if err != nil {
		return 0
	}
	var total float64
	for _, mf := range fams {
		if mf.GetName() != name {
			continue
		}
		for _, metric := range mf.GetMetric() {
			for _, l := range metric.GetLabel() {
				if l.GetName() == labelName && l.GetValue() == labelValue {
					total += metric.GetCounter().GetValue()
				}
			}
		}
	}
	return total
}
```

Note: `model.RoomEvent.Message` is a `*ClientMessage` per `pkg/model/event.go`. Accessing `evt.Message.ID` through the embedded `Message` works because `ClientMessage` embeds `Message`.

- [ ] **Step 2: Build to confirm**

Run: `cd /home/user/chat && go build ./tools/loadgen/`
Expected: Succeeds.

- [ ] **Step 3: Run full unit suite**

Run: `cd /home/user/chat && make test SERVICE=tools/loadgen`
Expected: PASS.
+ +- [ ] **Step 4: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/main.go +git commit -m "feat(loadgen): wire seed/run/teardown subcommands in main.go" +``` + +--- + +## Task 11: Dockerfile and docker-compose for the harness + +**Files:** +- Create: `tools/loadgen/deploy/Dockerfile` +- Create: `tools/loadgen/deploy/docker-compose.loadtest.yml` +- Create: `tools/loadgen/deploy/prometheus/prometheus.yml` +- Create: `tools/loadgen/deploy/grafana/provisioning/datasources/prometheus.yaml` +- Create: `tools/loadgen/deploy/grafana/provisioning/dashboards/loadtest.yaml` +- Create: `tools/loadgen/deploy/grafana/dashboards/loadtest.json` + +- [ ] **Step 1: Write the Dockerfile** + +Create `tools/loadgen/deploy/Dockerfile`: + +```dockerfile +FROM golang:1.25.8-alpine AS builder + +WORKDIR /app + +COPY go.mod go.sum ./ +RUN go mod download + +COPY pkg/ pkg/ +COPY tools/loadgen/ tools/loadgen/ + +RUN CGO_ENABLED=0 go build -o /loadgen ./tools/loadgen/ + +FROM alpine:3.21 +RUN apk add --no-cache ca-certificates +COPY --from=builder /loadgen /loadgen +ENTRYPOINT ["/loadgen"] +``` + +- [ ] **Step 2: Write the docker-compose file** + +Create `tools/loadgen/deploy/docker-compose.loadtest.yml`: + +```yaml +name: loadgen + +services: + nats: + image: nats:2.11-alpine + command: ["-js", "-m", "8222"] + ports: + - "4222:4222" + - "8222:8222" + networks: [loadtest] + + mongodb: + image: mongo:8 + ports: + - "27017:27017" + networks: [loadtest] + + cassandra: + image: cassandra:4.1 + environment: + - CASSANDRA_CLUSTER_NAME=loadtest + ports: + - "9042:9042" + networks: [loadtest] + healthcheck: + test: ["CMD-SHELL", "nodetool status | grep -q '^UN'"] + interval: 10s + timeout: 5s + retries: 30 + + cassandra-init: + image: cassandra:4.1 + depends_on: + cassandra: + condition: service_healthy + entrypoint: + - sh + - -c + - | + cqlsh cassandra -e "CREATE KEYSPACE IF NOT EXISTS chat WITH replication = {'class':'SimpleStrategy','replication_factor':1};" + networks: [loadtest] + restart: "no" + + message-gatekeeper: + build: + context: ../../.. + dockerfile: message-gatekeeper/deploy/Dockerfile + environment: + - NATS_URL=nats://nats:4222 + - SITE_ID=site-local + - MONGO_URI=mongodb://mongodb:27017 + - MONGO_DB=chat + depends_on: [nats, mongodb] + networks: [loadtest] + + message-worker: + build: + context: ../../.. + dockerfile: message-worker/deploy/Dockerfile + environment: + - NATS_URL=nats://nats:4222 + - SITE_ID=site-local + - MONGO_URI=mongodb://mongodb:27017 + - MONGO_DB=chat + - CASSANDRA_HOSTS=cassandra + - CASSANDRA_KEYSPACE=chat + depends_on: + nats: + condition: service_started + mongodb: + condition: service_started + cassandra-init: + condition: service_completed_successfully + networks: [loadtest] + + broadcast-worker: + build: + context: ../../.. + dockerfile: broadcast-worker/deploy/Dockerfile + environment: + - NATS_URL=nats://nats:4222 + - SITE_ID=site-local + - MONGO_URI=mongodb://mongodb:27017 + - MONGO_DB=chat + depends_on: [nats, mongodb] + networks: [loadtest] + + loadgen: + build: + context: ../../.. 
+	dockerfile: tools/loadgen/deploy/Dockerfile
    environment:
      - NATS_URL=nats://nats:4222
      - SITE_ID=site-local
      - MONGO_URI=mongodb://mongodb:27017
      - MONGO_DB=chat
      - METRICS_ADDR=:9099
    ports:
      - "9099:9099"
    depends_on: [nats, mongodb, message-gatekeeper, message-worker, broadcast-worker]
    entrypoint: ["sleep", "infinity"]
    networks: [loadtest]

  prometheus:
    image: prom/prometheus:v2.55.0
    profiles: [dashboards]
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    ports:
      - "9090:9090"
    networks: [loadtest]

  grafana:
    image: grafana/grafana:11.2.2
    profiles: [dashboards]
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    ports:
      - "3000:3000"
    networks: [loadtest]

networks:
  loadtest:
```

- [ ] **Step 3: Write Prometheus scrape config**

Create `tools/loadgen/deploy/prometheus/prometheus.yml`:

```yaml
global:
  scrape_interval: 5s
  evaluation_interval: 5s

scrape_configs:
  - job_name: loadgen
    static_configs:
      - targets: ["loadgen:9099"]
  # NATS's monitoring port (8222) serves JSON (/varz, /jsz), not the
  # Prometheus text format, so it cannot be scraped directly. To chart NATS
  # health, run prometheus-nats-exporter against http://nats:8222 and add a
  # scrape job for the exporter here.
```

- [ ] **Step 4: Write Grafana provisioning**

Create `tools/loadgen/deploy/grafana/provisioning/datasources/prometheus.yaml`:

```yaml
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
```

Create `tools/loadgen/deploy/grafana/provisioning/dashboards/loadtest.yaml`:

```yaml
apiVersion: 1
providers:
  - name: loadtest
    folder: ""
    type: file
    options:
      path: /var/lib/grafana/dashboards
```

- [ ] **Step 5: Write a minimal dashboard JSON**

Create `tools/loadgen/deploy/grafana/dashboards/loadtest.json`:

```json
{
  "title": "Loadgen",
  "schemaVersion": 39,
  "version": 1,
  "refresh": "5s",
  "time": {"from": "now-15m", "to": "now"},
  "panels": [
    {
      "type": "timeseries",
      "title": "Throughput (msg/s)",
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
      "targets": [{"expr": "rate(loadgen_published_total[10s])", "refId": "A"}]
    },
    {
      "type": "timeseries",
      "title": "E1 gatekeeper latency (P50/P95/P99)",
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
      "targets": [
        {"expr": "histogram_quantile(0.50, sum(rate(loadgen_e1_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p50", "refId": "A"},
        {"expr": "histogram_quantile(0.95, sum(rate(loadgen_e1_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p95", "refId": "B"},
        {"expr": "histogram_quantile(0.99, sum(rate(loadgen_e1_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p99", "refId": "C"}
      ]
    },
    {
      "type": "timeseries",
      "title": "E2 broadcast latency (P50/P95/P99)",
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
      "targets": [
        {"expr": "histogram_quantile(0.50, sum(rate(loadgen_e2_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p50", "refId": "A"},
        {"expr": "histogram_quantile(0.95, sum(rate(loadgen_e2_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p95", "refId": "B"},
        {"expr": "histogram_quantile(0.99, sum(rate(loadgen_e2_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p99", "refId": "C"}
      ]
    },
    {
      "type": "timeseries",
      "title": "Consumer pending",
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
      "targets": [{"expr": "loadgen_consumer_pending", "legendFormat": "{{durable}}", "refId": "A"}]
"legendFormat": "{{durable}}", "refId": "A"}] + }, + { + "type": "timeseries", + "title": "Consumer ack pending", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "targets": [{"expr": "loadgen_consumer_ack_pending", "legendFormat": "{{durable}}", "refId": "A"}] + }, + { + "type": "timeseries", + "title": "Publish errors/sec", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "targets": [{"expr": "rate(loadgen_publish_errors_total[10s])", "legendFormat": "{{reason}}", "refId": "A"}] + } + ] +} +``` + +- [ ] **Step 6: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/deploy/ +git commit -m "feat(loadgen): docker-compose harness, Dockerfile, grafana dashboard" +``` + +--- + +## Task 12: Scoped Makefile + +**Files:** +- Create: `tools/loadgen/deploy/Makefile` + +- [ ] **Step 1: Write the Makefile** + +Create `tools/loadgen/deploy/Makefile`: + +```make +COMPOSE ?= docker compose -f docker-compose.loadtest.yml + +.PHONY: up seed run run-dashboards down logs + +up: + $(COMPOSE) up -d --build + +seed: + @test -n "$(PRESET)" || (echo "PRESET= required" && exit 1) + $(COMPOSE) exec -T loadgen /loadgen seed --preset=$(PRESET) + +run: + @test -n "$(PRESET)" || (echo "PRESET= required" && exit 1) + $(COMPOSE) exec -T loadgen /loadgen run \ + --preset=$(PRESET) \ + --rate=$(or $(RATE),500) \ + --duration=$(or $(DURATION),60s) + +run-dashboards: + $(COMPOSE) --profile dashboards up -d + $(MAKE) run PRESET=$(PRESET) RATE=$(RATE) DURATION=$(DURATION) + +down: + $(COMPOSE) --profile dashboards down -v + +logs: + $(COMPOSE) logs -f loadgen +``` + +- [ ] **Step 2: Commit** + +```bash +cd /home/user/chat +git add tools/loadgen/deploy/Makefile +git commit -m "feat(loadgen): scoped Makefile for harness" +``` + +--- + +## Task 13: Integration test — end-to-end wiring + +**Files:** +- Create: `tools/loadgen/integration_test.go` + +- [ ] **Step 1: Write the integration test** + +Create `tools/loadgen/integration_test.go`: + +```go +//go:build integration + +package main + +import ( + "context" + "encoding/json" + "fmt" + "testing" + "time" + + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" + "github.com/stretchr/testify/require" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/modules/mongodb" + "github.com/testcontainers/testcontainers-go/wait" + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/mongoutil" + "github.com/hmchangw/chat/pkg/stream" +) + +// setupNATS starts a JetStream-enabled NATS container via the generic +// testcontainers interface (no dedicated NATS module is required here). 
+func setupNATS(t *testing.T) (string, func()) {
	t.Helper()
	ctx := context.Background()
	c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
		ContainerRequest: testcontainers.ContainerRequest{
			Image:        "nats:2.11-alpine",
			Cmd:          []string{"-js"},
			ExposedPorts: []string{"4222/tcp"},
			WaitingFor:   wait.ForLog("Server is ready").WithStartupTimeout(30 * time.Second),
		},
		Started: true,
	})
	require.NoError(t, err)
	host, err := c.Host(ctx)
	require.NoError(t, err)
	port, err := c.MappedPort(ctx, "4222")
	require.NoError(t, err)
	return fmt.Sprintf("nats://%s:%s", host, port.Port()), func() { _ = c.Terminate(ctx) }
}

func setupMongo(t *testing.T) (string, func()) {
	t.Helper()
	ctx := context.Background()
	c, err := mongodb.Run(ctx, "mongo:8")
	require.NoError(t, err)
	uri, err := c.ConnectionString(ctx)
	require.NoError(t, err)
	return uri, func() { _ = c.Terminate(ctx) }
}

// TestLoadgenSmallPreset_EndToEnd verifies the generator publishes messages,
// the canonical stream receives them, both durables drain, and the seeded
// fixtures are readable from MongoDB. It stands in for the gatekeeper/worker
// services by running a minimal in-process equivalent: it creates the
// canonical stream and consumes from MESSAGES_CANONICAL to ack messages so
// num_pending drops to 0.
func TestLoadgenSmallPreset_EndToEnd(t *testing.T) {
	ctx := context.Background()
	natsURI, stopNATS := setupNATS(t)
	defer stopNATS()
	mongoURI, stopMongo := setupMongo(t)
	defer stopMongo()

	nc, err := nats.Connect(natsURI)
	require.NoError(t, err)
	defer func() { _ = nc.Drain() }()

	js, err := jetstream.New(nc)
	require.NoError(t, err)

	siteID := "site-test"
	canonical := stream.MessagesCanonical(siteID)
	_, err = js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{Name: canonical.Name, Subjects: canonical.Subjects})
	require.NoError(t, err)

	for _, durable := range []string{"message-worker", "broadcast-worker"} {
		cons, err := js.CreateOrUpdateConsumer(ctx, canonical.Name, jetstream.ConsumerConfig{
			Durable:   durable,
			AckPolicy: jetstream.AckExplicitPolicy,
		})
		require.NoError(t, err)
		go func(c jetstream.Consumer) {
			_, _ = c.Consume(func(msg jetstream.Msg) { _ = msg.Ack() })
		}(cons)
	}

	client, err := mongoutil.Connect(ctx, mongoURI)
	require.NoError(t, err)
	defer mongoutil.Disconnect(ctx, client)
	db := client.Database("chat")

	preset, _ := BuiltinPreset("small")
	fixtures := BuildFixtures(preset, 42, siteID)
	require.NoError(t, Seed(ctx, db, fixtures))

	metrics := NewMetrics()
	collector := NewCollector(metrics, preset.Name)

	// Fake gatekeeper: subscribe to the front-door subject and republish each
	// request as a MessageEvent on MESSAGES_CANONICAL so the downstream
	// consumers see it. It sends no E1 replies, so the missing-replies count
	// is deliberately ignored below.
	gkSub, err := nc.Subscribe("chat.user.*.room.*."+siteID+".msg.send", func(m *nats.Msg) {
		var req model.SendMessageRequest
		if err := json.Unmarshal(m.Data, &req); err != nil {
			return
		}
		_, _, gotSiteID, ok := parseUserRoomSiteSubject(m.Subject)
		if !ok || gotSiteID != siteID {
			return
		}
		evt := model.MessageEvent{
			Message: model.Message{ID: req.ID, Content: req.Content, CreatedAt: time.Now()},
			SiteID:  siteID, Timestamp: time.Now().UnixMilli(),
		}
		data, _ := json.Marshal(evt)
		_, _ = js.Publish(ctx, "chat.msg.canonical."+siteID+".created", data)
	})
	require.NoError(t, err)
	defer func() { _ = gkSub.Unsubscribe() }()

	// Also broadcast a matching room event so E2 correlation has something to consume.
	bwSub, err := nc.Subscribe("chat.msg.canonical."+siteID+".created", func(m *nats.Msg) {
		var evt model.MessageEvent
		if err := json.Unmarshal(m.Data, &evt); err != nil {
			return
		}
		roomEvt := model.RoomEvent{
			Type: model.RoomEventNewMessage, RoomID: "r",
			Message: &model.ClientMessage{Message: evt.Message},
		}
		data, _ := json.Marshal(roomEvt)
		_ = nc.Publish("chat.room.r.event", data)
	})
	require.NoError(t, err)
	defer func() { _ = bwSub.Unsubscribe() }()

	// Mirror the run subcommand's E2 subscription: without it nothing feeds
	// RecordBroadcast and every publish would count as a missing broadcast.
	e2Sub, err := nc.Subscribe("chat.room.*.event", func(m *nats.Msg) {
		var evt model.RoomEvent
		if err := json.Unmarshal(m.Data, &evt); err != nil {
			return
		}
		if evt.Message == nil || evt.Message.ID == "" {
			return
		}
		collector.RecordBroadcast(evt.Message.ID, time.Now())
	})
	require.NoError(t, err)
	defer func() { _ = e2Sub.Unsubscribe() }()

	publisher := &natsCorePublisher{nc: nc}
	gen := NewGenerator(GeneratorConfig{
		Preset: preset, Fixtures: fixtures, SiteID: siteID,
		Rate: 50, Inject: InjectFrontdoor,
		Publisher: publisher, Metrics: metrics, Collector: collector,
	}, 42)

	runCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	require.NoError(t, gen.Run(runCtx))

	// Allow trailing events to flow.
	time.Sleep(2 * time.Second)

	missingReplies, missingBroadcasts := collector.Finalize()
	require.Equal(t, 0, missingBroadcasts, "missing broadcasts")
	_ = missingReplies // the fake gatekeeper above does not send replies; ignore the E1 assertion in this test.

	// Assert canonical stream pending is 0 for both durables.
	for _, durable := range []string{"message-worker", "broadcast-worker"} {
		cons, err := js.Consumer(ctx, canonical.Name, durable)
		require.NoError(t, err)
		info, err := cons.Info(ctx)
		require.NoError(t, err)
		require.Equal(t, uint64(0), info.NumPending, "durable %s still has pending", durable)
	}

	// Assert something got seeded and is reachable.
	var room model.Room
	err = db.Collection("rooms").FindOne(ctx, bson.M{"_id": fixtures.Rooms[0].ID}).Decode(&room)
	require.NoError(t, err)
	require.Equal(t, fixtures.Rooms[0].ID, room.ID)
}

// parseUserRoomSiteSubject is a local re-impl because the test can't use the
// internal subject package without introducing a cycle.
func parseUserRoomSiteSubject(s string) (account, roomID, siteID string, ok bool) {
	// chat.user.{account}.room.{roomID}.{siteID}.msg.send
	parts := splitDot(s)
	if len(parts) < 7 || parts[0] != "chat" || parts[1] != "user" || parts[3] != "room" {
		return "", "", "", false
	}
	return parts[2], parts[4], parts[5], true
}

func splitDot(s string) []string {
	var out []string
	start := 0
	for i := 0; i < len(s); i++ {
		if s[i] == '.' {
			out = append(out, s[start:i])
			start = i + 1
		}
	}
	return append(out, s[start:])
}
```

- [ ] **Step 2: Run the integration test**

Run: `cd /home/user/chat && make test-integration SERVICE=tools/loadgen`
Expected: PASS. Docker must be running.

- [ ] **Step 3: Commit**

```bash
cd /home/user/chat
git add tools/loadgen/integration_test.go
git commit -m "test(loadgen): integration test for end-to-end wiring"
```

---

## Task 14: Operator README

**Files:**
- Create: `tools/loadgen/README.md`

- [ ] **Step 1: Write the README**

Create `tools/loadgen/README.md`:

````markdown
# loadgen

Capacity-baseline load generator for the single-site messaging pipeline
(`message-gatekeeper` → `MESSAGES_CANONICAL` → `message-worker` +
`broadcast-worker`). Single Go binary with three subcommands.
+

## Quick start

```
make -C tools/loadgen/deploy up
make -C tools/loadgen/deploy seed PRESET=medium
make -C tools/loadgen/deploy run PRESET=medium RATE=500 DURATION=60s
```

For live dashboards:

```
make -C tools/loadgen/deploy run-dashboards PRESET=medium
# Grafana at http://localhost:3000 (anonymous admin)
```

Tear down:

```
make -C tools/loadgen/deploy down
```

## Presets

| preset      | users  | rooms | notes                                                   |
|-------------|--------|-------|---------------------------------------------------------|
| `small`     | 10     | 5     | uniform, 200-byte content                               |
| `medium`    | 1 000  | 100   | uniform, 200-byte content                               |
| `large`     | 10 000 | 1 000 | uniform, 200-byte content                               |
| `realistic` | 1 000  | 100   | Zipf senders, mixed room sizes, 50–2000 bytes, mentions |

## Subcommands

- `loadgen seed --preset=<name> [--seed=42]` — idempotently populate
  MongoDB with deterministic fixtures.
- `loadgen run --preset=<name> [flags]` — open-loop publish at `--rate`
  msgs/sec for `--duration`, print a summary at the end. Flags:
  `--seed`, `--warmup`, `--inject=frontdoor|canonical`, `--csv=<path>`.
- `loadgen teardown` — drop the three seeded collections.

## Reading the summary

- `final_pending == 0` on both durables, zero errors → the pipeline is
  sustaining your target rate.
- `final_pending` climbing, or error counts > 0 → over capacity or a
  regression upstream of the worker.

## Non-goals

- Not a CI regression gate. Invoked manually.
- Not an auth benchmark. Uses shared `backend.creds`.
- Not a cross-site benchmark. Single-site only.
- Not an absolute-number tool. Numbers vary by host — compare within one
  machine across changes, don't compare across machines.
````

- [ ] **Step 2: Commit**

```bash
cd /home/user/chat
git add tools/loadgen/README.md
git commit -m "docs(loadgen): add operator README"
```

---

## Task 15: Lint + final full-test pass

- [ ] **Step 1: Run the linter**

Run: `cd /home/user/chat && make lint`
Expected: PASS (zero issues). Fix any findings before proceeding.

- [ ] **Step 2: Run the unit test suite for the whole repo**

Run: `cd /home/user/chat && make test`
Expected: PASS.

- [ ] **Step 3: Run coverage for `tools/loadgen`**

Run: `cd /home/user/chat && go test -race -coverprofile=coverage.out ./tools/loadgen/ && go tool cover -func=coverage.out | tail -n 1`
Expected: total coverage ≥ 80%.

If below 80%, identify the uncovered file(s) with
`go tool cover -func=coverage.out | sort -k3 -n` and add tests to reach
the threshold. Core files (`preset.go`, `generator.go`, `collector.go`,
`report.go`) should each be ≥ 90%.

- [ ] **Step 4: Commit any coverage-gap fixes**

```bash
cd /home/user/chat
git add tools/loadgen/
git commit -m "test(loadgen): raise coverage to project threshold"
```

- [ ] **Step 5: Push the branch**

```bash
cd /home/user/chat
git push -u origin claude/load-test-messaging-workers-tDKZn
```

---

## Done when

- `make test SERVICE=tools/loadgen` passes locally.
- `make test-integration SERVICE=tools/loadgen` passes locally.
- `make lint` passes for the whole repo.
- `tools/loadgen` coverage ≥ 80% overall, ≥ 90% on core files.
- Running `make -C tools/loadgen/deploy up seed run PRESET=small RATE=50 DURATION=10s` prints a well-formed summary with exit code 0 against a clean Docker host.
- All commits are on `claude/load-test-messaging-workers-tDKZn` and pushed.
diff --git a/docs/superpowers/specs/2026-04-21-load-test-messaging-workers-design.md b/docs/superpowers/specs/2026-04-21-load-test-messaging-workers-design.md
new file mode 100644
index 00000000..395d6417
--- /dev/null
+++ b/docs/superpowers/specs/2026-04-21-load-test-messaging-workers-design.md
@@ -0,0 +1,620 @@
+# Messaging Workers Load Test Harness — Design

## Purpose

A capacity-baseline load test for the single-site messaging pipeline
(`message-gatekeeper` → `MESSAGES_CANONICAL` → `message-worker` +
`broadcast-worker`).

The harness answers one question: **how many messages per second can one
site sustain, and at what latency?** It produces a repeatable terminal
summary, an optional CSV dump, and an opt-in Grafana dashboard.

## Scope

### In scope

- A Go-based CLI load generator at `tools/loadgen/` (flat service, standard
  file layout per the repo's conventions).
- A docker-compose harness at `tools/loadgen/deploy/docker-compose.loadtest.yml`
  bringing up one NATS (JetStream), one MongoDB, one Cassandra, one
  `message-gatekeeper`, one `message-worker`, one `broadcast-worker`, and
  the loadgen container.
- Programmatic seeding of users, rooms, and subscriptions into MongoDB
  based on a named preset + RNG seed.
- Open-loop rate generation with named presets: `small`, `medium`,
  `large`, `realistic`.
- Front-door injection (via `chat.user.{account}.room.{roomID}.{siteID}.msg.send`)
  by default, with a flag to inject directly at `MESSAGES_CANONICAL` for
  isolating downstream-worker capacity.
- End-of-run terminal summary and optional CSV export.
- Optional Prometheus + Grafana compose profile with a pre-baked
  dashboard JSON.

### Out of scope (v1)

- Multi-site / supercluster topology. The harness stays single-site;
  topology is left pluggable for later.
- Per-user NATS credentials. The loadgen authenticates with the shared
  `backend.creds` from `docker-local/` and impersonates users via subject
  tokens.
- Persistence-read latency measurement from Cassandra. Replaced by
  JetStream consumer-lag sampling (see measurement section).
- CI regression gating / pass-fail thresholds. The baseline run returns a
  summary; CI gating is a later phase.
- Soak / long-duration stability runs. Different use case; different
  tool settings; revisit later.

## Topology

Single-site stack, defined in `tools/loadgen/deploy/docker-compose.loadtest.yml`:

```
loadgen ──▶ nats (JetStream) ──▶ message-gatekeeper ──▶ MESSAGES_CANONICAL ──┬──▶ message-worker ──▶ cassandra
   │                                   │                                     └──▶ broadcast-worker ──▶ mongodb
   │                                   └──▶ mongodb (subscriptions lookup)
   ├──◀─ reply subject (chat.user.*.response.>)
   ├──◀─ broadcast subject (chat.room.*.event)
   └──◀─ consumer info (JetStream API)

optional profile "dashboards":
   prometheus ──▶ grafana (pre-baked dashboard JSON)
```

- One NATS server with JetStream enabled, client port `4222`,
  monitoring `8222`.
- One MongoDB, one Cassandra. Site scoping is handled by the `SITE_ID`
  environment variable shared by all services in the stack
  (`site-local`).
- One instance each of `message-gatekeeper`, `message-worker`,
  `broadcast-worker`, all built from their existing `deploy/Dockerfile`
  images with build context at the repo root.
- The `loadgen` container joins the same compose network and reaches
  services by name (`nats`, `mongodb`, `cassandra`). Its host-side
  port `9099` is exposed for Prometheus scraping.
+- The `dashboards` profile adds `prometheus` and `grafana` containers
  with file-provisioned scrape config and dashboard JSON.

## File layout

Following the repo's flat-service convention. All loadgen code lives in
`tools/loadgen/`:

```
tools/loadgen/
├── README.md
├── main.go              # config parsing, wiring, subcommand dispatch
├── seed.go              # programmatic seeding of users/rooms/subs
├── preset.go            # preset definitions + RNG-based workload spec
├── generator.go         # open-loop publisher, rate-limited
├── collector.go         # reply + broadcast subscribers, latency samples
├── consumerlag.go       # polls JetStream ConsumerInfo every 1s
├── report.go            # terminal summary, CSV export, Prometheus gauges
├── preset_test.go
├── generator_test.go
├── collector_test.go
├── report_test.go
├── integration_test.go  # //go:build integration
└── deploy/
    ├── Dockerfile
    ├── Makefile         # scoped make targets
    ├── docker-compose.loadtest.yml
    ├── grafana/
    │   ├── dashboards/loadtest.json
    │   └── provisioning/
    │       ├── dashboards/loadtest.yaml
    │       └── datasources/prometheus.yaml
    └── prometheus/
        └── prometheus.yml
```

The loadgen has no dedicated `Store` interface — seeding writes directly
through `mongoutil.Connect` and the raw collection API. This keeps the
component focused and avoids mock generation for code that exists only
to populate fixtures.

## CLI surface

The loadgen is one binary with three subcommands:

```
loadgen seed --preset=<name> [--seed=<n>]
loadgen run  --preset=<name> [--seed=<n>] [--duration=60s] [--rate=500]
             [--warmup=10s] [--inject=frontdoor|canonical] [--csv=path]
loadgen teardown
```

- `seed` is idempotent. It drops and recreates the `users`, `rooms`,
  and `subscriptions` collections for the given preset, deterministically
  populated from `(preset name, seed)`. Default seed is `42`.
- `run` assumes `seed` has been applied. It opens NATS and MongoDB
  connections, subscribes to reply and broadcast subjects, starts a
  publisher at the configured rate for `duration`, and prints a summary
  at the end. `--warmup` discards samples from the first N seconds to
  avoid cold-start skew. `--inject=canonical` bypasses the gatekeeper
  and publishes `model.MessageEvent` directly on
  `chat.msg.canonical.{siteID}.created`, for isolating downstream-worker
  capacity.
- `teardown` drops the three seeded collections so a different preset
  can be seeded cleanly without lingering state.

### Environment config

All values are parsed via `caarlos0/env` into a typed `config` struct in
`main.go`. Flags take precedence for run-specific knobs; everything else
is env.

| Env Var            | Default      | Description                                          |
|--------------------|--------------|------------------------------------------------------|
| `NATS_URL`         | *required*   | NATS server URL                                      |
| `NATS_CREDS_FILE`  | *empty*      | Shared backend creds; empty disables auth            |
| `SITE_ID`          | `site-local` | Must match gatekeeper / worker `SITE_ID`             |
| `MONGO_URI`        | *required*   | MongoDB URI                                          |
| `MONGO_DB`         | `chat`       | MongoDB database name                                |
| `METRICS_ADDR`     | `:9099`      | Prometheus `/metrics` listen address                 |

### Preset structure

Presets are declared as a `map[string]Preset` in `preset.go`. Adding a
new preset is one map entry; no CLI plumbing changes.
+
```go
type Preset struct {
	Name         string
	Users        int
	Rooms        int
	RoomSizeDist Distribution // uniform | mixed
	SenderDist   Distribution // uniform | zipf
	ContentBytes Range        // min/max content size
	MentionRate  float64      // 0.0 for uniform presets, 0.10 for realistic
	ThreadRate   float64      // 0.0 for uniform presets, 0.05 for realistic
}
```

Built-in presets:

| preset      | users  | rooms | room sizes | sender dist | content bytes | mentions | threads |
|-------------|--------|-------|------------|-------------|---------------|----------|---------|
| `small`     | 10     | 5     | uniform    | uniform     | 200           | 0%       | 0%      |
| `medium`    | 1 000  | 100   | uniform    | uniform     | 200           | 0%       | 0%      |
| `large`     | 10 000 | 1 000 | uniform    | uniform     | 200           | 0%       | 0%      |
| `realistic` | 1 000  | 100   | mixed      | Zipf(s=1.1) | 50–2000       | 10%      | 5%      |

Every run prints the preset name and RNG seed in the summary, making
results reproducible on any machine.

### Makefile targets

Scoped under `tools/loadgen/deploy/Makefile`. The root Makefile is
untouched, per the precedent set by the broadcast-worker test harness.

```make
COMPOSE ?= docker compose -f docker-compose.loadtest.yml

up:
	$(COMPOSE) up -d --build

seed:
	@test -n "$(PRESET)" || (echo "PRESET=<name> required" && exit 1)
	$(COMPOSE) exec -T loadgen /loadgen seed --preset=$(PRESET)

run:
	@test -n "$(PRESET)" || (echo "PRESET=<name> required" && exit 1)
	$(COMPOSE) exec -T loadgen /loadgen run \
		--preset=$(PRESET) \
		--rate=$(or $(RATE),500) \
		--duration=$(or $(DURATION),60s)

run-dashboards:
	$(COMPOSE) --profile dashboards up -d
	$(MAKE) run PRESET=$(PRESET) RATE=$(RATE) DURATION=$(DURATION)

down:
	$(COMPOSE) --profile dashboards down -v
```

## Seeding

`loadgen seed` is responsible for producing a deterministic fixture
from `(preset name, seed)` and writing it to MongoDB. The algorithm:

1. Open a MongoDB connection via `mongoutil.Connect`.
2. Drop `users`, `rooms`, and `subscriptions` collections (idempotent
   reset so reruns are clean).
3. Seed a `math/rand.New(rand.NewSource(seed))` generator.
4. Generate `preset.Users` user documents. Each user has a stable ID
   (`u-<index>`) and account name (`user-<index>`). English
   and Chinese display names are drawn from a small fixed list cycled
   by index so enrichment paths in `broadcast-worker` exercise populated
   values.
5. Generate `preset.Rooms` room documents. Room IDs are
   `room-<index>`. Room type is `group` for uniform
   presets; `realistic` mixes `group` and `dm` with a 9:1 ratio.
6. For each room, assign members according to the preset's
   `RoomSizeDist`:
   - **uniform**: each room has `ceil(Users / Rooms)` distinct members
     drawn round-robin from the user pool (every user ends up in at
     least one room; some users are in more).
   - **mixed**: a small fraction of rooms (10%) get up to 500 members
     sampled without replacement; the remainder get 2–20 members. DM
     rooms always have exactly 2 members.
7. Write `Subscription` documents for each `(user, room)` membership,
   with `siteId = SITE_ID`.
8. Create indexes that match the worker services' expectations
   (`subscriptions.roomId`, `subscriptions.u.account`).

Seed data is never large enough to need bulk-write batching beyond
MongoDB's default batch size; `InsertMany` is used directly. At the
`large` preset (10k users, ~100k subscriptions) this completes in a
few seconds on a developer laptop.
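To make the determinism rule concrete, here is a minimal sketch of index-based user generation. The names `seedUser` and `buildUsers` are illustrative, not the repo's actual types, and the RNG-driven parts (room sizing, membership sampling) are omitted:

```go
package main

import "fmt"

// seedUser is a stand-in for the seeded user document.
type seedUser struct {
	ID, Account, DisplayName string
}

// Display names cycle a small fixed list so enrichment paths see real values.
var displayNames = []string{"Alice", "Bob", "Carol", "小明", "小紅"}

// buildUsers is a pure function of the preset's user count: the same input
// always produces the same documents, which is what makes reruns
// byte-identical.
func buildUsers(userCount int) []seedUser {
	users := make([]seedUser, userCount)
	for i := range users {
		users[i] = seedUser{
			ID:          fmt.Sprintf("u-%d", i),
			Account:     fmt.Sprintf("user-%d", i),
			DisplayName: displayNames[i%len(displayNames)],
		}
	}
	return users
}
```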
+

Because generation is a pure function of `(preset, seed)`, running
`loadgen seed --preset=large --seed=42` twice produces byte-identical
data. The same `(preset, seed)` passed to `loadgen run` produces the
same stream of publishes.

## Generator and measurement

### Open-loop publishing

A single goroutine owns a `time.Ticker` at `1s / rate`. On each tick
it selects a `(user, room)` pair according to the preset's
distributions (deterministic from the same RNG seed used in `seed`)
and publishes a `model.SendMessageRequest` with:

- `ID`: a freshly allocated UUID, used as the JetStream message-ID for
  deduplication and as the `Message.ID` after gatekeeper validation.
- `RequestID`: a freshly allocated UUID, used to correlate the
  gatekeeper reply back to the originating publish.
- `Content`: a random-length string drawn from `preset.ContentBytes`.
  Content is a benign filler — no PII, no tokens. For `realistic`,
  a mention token (`@user-<index>`) is prefixed with probability
  `MentionRate`; thread-reply fields reference a prior message with
  probability `ThreadRate`.

The publish subject is built via `pkg/subject` helpers (never
hand-assembled with `fmt.Sprintf`) and, by default, is
`chat.user.{account}.room.{roomID}.{siteID}.msg.send`. With
`--inject=canonical`, the generator instead publishes a pre-built
`model.MessageEvent` on `chat.msg.canonical.{siteID}.created` — this
bypasses the gatekeeper entirely and is used to isolate downstream
worker capacity.

Publishing is non-blocking. If the pipeline slows, messages accumulate
in JetStream and the consumer-lag signal grows — which is exactly the
backpressure signal a capacity baseline wants to reveal.

The rate limiter is `time.Ticker`. `golang.org/x/time/rate.Limiter`
would also work, but a ticker is sufficient for a fixed target rate
and keeps the dependency footprint minimal.

### Metrics measured

| ID  | Name                   | How it's measured                                                                                              |
|-----|------------------------|----------------------------------------------------------------------------------------------------------------|
| E1  | Gatekeeper ack latency | Publish time → gatekeeper reply on `chat.user.{account}.response.{requestID}`. Correlated by `requestID`.      |
| E2  | Broadcast visibility   | Publish time → appearance of matching `RoomEvent` on `chat.room.{roomID}.event`. Correlated by `message.id`.   |
| E4  | Consumer backlog       | Polled via `js.Consumer(stream, durable).Info(ctx)` every 1s for both `message-worker` and `broadcast-worker`. |

E3 (persistence-read latency from Cassandra) is deliberately not
measured. The E4 consumer-backlog curves give the relevant answer —
"is the message-worker keeping up with canonical publishes?" — without
requiring a Cassandra probe.

### Reply correlation

Before the generator begins publishing, two wildcard subscriptions are
opened:

- `chat.user.*.response.>` for gatekeeper replies (E1).
- `chat.room.*.event` for broadcast events (E2).

Every outbound publish records the publish timestamp in **two separate**
`sync.Map`s:

- `pendingByRequestID[requestID] = publishNanos` — consumed by E1.
- `pendingByMessageID[messageID] = publishNanos` — consumed by E2.

Keeping E1 and E2 bookkeeping independent means recording an E1 sample
does not affect E2 correlation (and vice versa), and each map can be
scanned at end-of-run to count its own "missing" class.
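In miniature, the two-map bookkeeping looks like the sketch below. It is simplified: the design names `sync.Map`, while this sketch uses a plain map under a mutex (the implemented `collector.go` does the same), and warmup filtering and Prometheus observation are omitted:

```go
package main

import (
	"sync"
	"time"
)

// pendingBook holds one entry per outstanding publish, keyed independently
// for E1 (requestID) and E2 (messageID).
type pendingBook struct {
	mu      sync.Mutex
	byReqID map[string]time.Time
	byMsgID map[string]time.Time
	e1, e2  []time.Duration
}

func (b *pendingBook) recordPublish(requestID, messageID string, at time.Time) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.byReqID[requestID] = at
	b.byMsgID[messageID] = at
}

// recordReply consumes only the E1 entry; the E2 entry survives until the
// matching broadcast arrives, so the two metrics never interfere.
func (b *pendingBook) recordReply(requestID string, at time.Time) {
	b.mu.Lock()
	defer b.mu.Unlock()
	published, ok := b.byReqID[requestID]
	if !ok {
		return // unknown or already consumed: ignore
	}
	delete(b.byReqID, requestID)
	b.e1 = append(b.e1, at.Sub(published))
}
```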
+ +When a reply arrives on the response subject, the collector parses +`requestID` from the last subject token, looks it up in +`pendingByRequestID`, appends `now - publishNanos` to the E1 sample +buffer, and deletes the entry. When a `RoomEvent` arrives on the +broadcast subject, the collector extracts `message.id`, looks it up +in `pendingByMessageID`, appends the delta to the E2 sample buffer, +and deletes the entry. + +At end-of-run, any remaining entries in `pendingByRequestID` are +counted as "missing replies"; any remaining in `pendingByMessageID` +are counted as "missing broadcasts". Neither contributes to percentiles. + +### Consumer-lag sampling + +A dedicated goroutine polls both durable consumers on +`MESSAGES_CANONICAL_{SITE_ID}` every 1 second using +`js.Consumer(ctx, stream, durable).Info(ctx)`. Fields recorded per +sample: + +- `num_pending` — messages in the stream that haven't been delivered. +- `num_ack_pending` — messages delivered but not yet acked. +- `num_redelivered` — accumulator of retry deliveries; delta per + sample is logged. +- `num_waiting` — pull requests in flight (worker health). + +Samples are appended to per-durable time-series buffers and exported +live as Prometheus gauges. The terminal summary reports min, peak, +and final values. + +Little's Law gives a rough latency estimate if needed: +`avg_wait ≈ num_pending / actual_throughput`. This is not reported by +default — the headline metrics are already E1 and E2 — but the raw +data supports it. + +### Sample storage + +Latency samples are `int64` nanosecond deltas appended to per-metric +slices guarded by a mutex. A 60-second run at 1000 msg/s produces +120k samples (E1 + E2 combined) consuming about 1 MB — trivial. At +end of run, the collector sorts each slice and computes P50, P95, P99, +and max. + +Should we ever need multi-hour runs, HDR histogram +(`github.com/HdrHistogram/hdrhistogram-go`) would replace the slice. +v1 does not add that dependency. + +### Warmup + +The first `--warmup` seconds (default 10s) of publishing and sampling +happens normally but the samples collected during that window are +discarded at the warmup boundary. This prevents first-connection, +JIT, and cache-cold effects from skewing the headline percentiles. + +### Error accounting + +Each of these is counted separately and surfaced explicitly in the +summary; a run is never silently "successful" if any occurred: + +- Publish failures (JetStream `PublishAsync` returned an error). +- Gatekeeper error replies (reply payload has a non-empty `error` field). +- Missing replies (requestID never received a reply by end of run). +- Missing broadcasts (message.id never received a broadcast by end of run). +- Reply-subject JSON parse failures (malformed reply payload). + +## Reporting + +### Terminal summary + +Printed to stdout at end of run via `text/tabwriter`. Always produced, +regardless of whether Prometheus/Grafana are running. Structured so a +human can eyeball it and a grep-based tool can parse it. 
+

```
=== loadgen run complete ===
preset: medium        seed: 42        site: site-local
duration: 60s (warmup: 10s, measured: 50s)    inject: frontdoor
target rate: 500 msg/s    actual rate: 499.8 msg/s

publish results
  sent:               25000
  publish errors:     0
  gatekeeper errors:  0
  missing replies:    0
  missing broadcasts: 0

latency (measured window only)
  metric          count   p50     p95     p99     max
  E1 gatekeeper   25000   2.1ms   6.3ms   11.4ms  24ms
  E2 broadcast    25000   8.7ms   24.1ms  41.0ms  88ms

consumer lag (MESSAGES_CANONICAL_site-local)
  durable           min_pending  peak_pending  final_pending  peak_ack_pending  redelivered
  message-worker    0            42            0              18                0
  broadcast-worker  0            57            0              22                0
```

The capacity signal is `final_pending == 0` with `peak_pending`
bounded: the system drained its queue within the run, so it is
sustaining the target rate. `final_pending` climbing is the signal
for "over capacity".

### CSV export

Opt-in with `--csv=path`. One file, one row per sample:

```
timestamp_ns,request_id,metric,latency_ns
1713600000000000000,9f…,E1,2100000
1713600000000000000,9f…,E2,8700000
…
```

Intended for ad-hoc analysis in a notebook or spreadsheet. Not
produced unless the flag is set.

### Prometheus metrics

Always exposed on `METRICS_ADDR` (default `:9099`), using
`prometheus/client_golang` (already an approved repo dependency).

| Metric                              | Type      | Labels              |
|-------------------------------------|-----------|---------------------|
| `loadgen_published_total`           | counter   | `preset`            |
| `loadgen_publish_errors_total`      | counter   | `preset`, `reason`  |
| `loadgen_e1_latency_seconds`        | histogram | `preset`            |
| `loadgen_e2_latency_seconds`        | histogram | `preset`            |
| `loadgen_consumer_pending`          | gauge     | `stream`, `durable` |
| `loadgen_consumer_ack_pending`      | gauge     | `stream`, `durable` |
| `loadgen_consumer_redelivered`      | gauge     | `stream`, `durable` |

### Grafana dashboard (opt-in)

Activated with `docker compose --profile dashboards up` (or
`make run-dashboards`). Prometheus is provisioned to scrape:

- The loadgen's `/metrics` endpoint.
- The NATS server's monitoring endpoint (`/varz` and `/jsz`) via the
  community `prometheus-nats-exporter`; the monitoring port itself
  serves JSON, not the Prometheus text format, so an exporter bridge
  is required.

A pre-baked dashboard JSON at
`tools/loadgen/deploy/grafana/dashboards/loadtest.json` is
provisioned via Grafana's file provisioner and includes these panels:

1. **Throughput** — `rate(loadgen_published_total[10s])` vs target rate.
2. **E1 gatekeeper ack latency** — P50/P95/P99 histogram quantiles over time.
3. **E2 broadcast latency** — P50/P95/P99 histogram quantiles over time.
4. **Consumer pending** — `loadgen_consumer_pending` stacked by durable.
5. **Ack pending** — `loadgen_consumer_ack_pending` by durable.
6. **Error rate** — `rate(loadgen_publish_errors_total[10s])` by reason.
7. **NATS health** — connections, slow consumers, JetStream bytes.

The default compose stack (without the profile) does not bring up
Prometheus or Grafana, keeping the fast path lightweight.

### Exit code

- `0` — run completed and error counts were within tolerance
  (hardcoded 0.1% of `sent` for v1).
- `1` — startup failure, publish-error rate exceeded tolerance, or
  missing-reply rate exceeded tolerance.

This establishes a foundation for CI gating later without committing
to it in v1.
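The tolerance check itself is small. A sketch consistent with the rule above follows; the name `DetermineExitCode` matches the plan's `report.go`, though the exact signature there may differ:

```go
// DetermineExitCode returns 0 when total errors stay within the hardcoded
// v1 tolerance of 0.1% of sent messages, and 1 otherwise.
func DetermineExitCode(sent, totalErrs int) int {
	if sent <= 0 {
		return 1 // nothing was published: treat the run as failed
	}
	if float64(totalErrs) <= 0.001*float64(sent) {
		return 0
	}
	return 1
}
```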
+ +## Testing + +### Unit tests + +Standard in-package tests, `package main`, following the repo's +conventions (`stretchr/testify` assertions, `go.uber.org/mock` where +mocks are useful, table-driven where applicable). + +- `preset_test.go` — same `(preset, seed)` produces the same users, + rooms, and subscriptions byte-for-byte; same `(preset, seed)` + produces the same `(user, room, content)` publish sequence. Table- + driven across all four presets. +- `generator_test.go` — rate pacing (given rate R and duration D, + exactly R·D messages are produced ±1); user/room selection honors + the preset's distributions; injects a stub publish function that + records calls (per the repo's "inject publish function as a field" + rule for testability). +- `collector_test.go` — reply correlation: given a set of fake publish + records and a stream of synthesized replies, samples land in the + right metric buffer; missing replies are counted; unknown + `requestID`s are ignored. +- `report_test.go` — percentile math over fixed sample sets; CSV + export format; exit-code logic at the error-tolerance boundary + (just below, at, and just above). + +All unit tests run via `make test SERVICE=tools/loadgen` with the +race detector enabled (handled by the root Makefile). + +### Integration test + +`integration_test.go` with build tag `//go:build integration`. Uses +`testcontainers-go` to bring up NATS, MongoDB, Cassandra, +`message-gatekeeper`, `message-worker`, and `broadcast-worker` +containers. The test then runs +`loadgen seed --preset=small` and +`loadgen run --preset=small --duration=10s --rate=50` and asserts: + +- Exit code is `0`. +- E1 sample count equals published count (no missing replies). +- E2 sample count equals published count (no missing broadcasts). +- Final `num_pending` on both durable consumers is `0`. +- `rooms.lastMsgId` in MongoDB for a sampled room matches the last + published message's ID. + +The test verifies end-to-end wiring — it does not assert on +performance numbers, which depend on the test host and are not the +point of a CI-runnable test. + +### Coverage target + +≥80% per the project rule (`CLAUDE.md`), with `generator.go`, +`collector.go`, and `preset.go` aiming for 90%+ as core logic. + +## Error handling + +All errors follow the repo's rules (`CLAUDE.md`): + +- Errors wrapped with context: `fmt.Errorf("seed users: %w", err)`. + Never bare `err`, never `fmt.Errorf("error: %w", err)`. +- NATS connect / MongoDB connect failures at startup log and + `os.Exit(1)` — the same pattern the workers use. +- Publish errors during a run are counted and logged at DEBUG; the + run continues so the overall shape of the failure is visible. +- Reply-subject JSON parse failures are counted under + `reason="bad_reply"` and the offending sample is discarded. +- Graceful shutdown on `SIGTERM` / `SIGINT` via `pkg/shutdown.Wait`: + stop the publish ticker, drain in-flight publishes with a 5-second + bound, unsubscribe from reply and broadcast subjects, `nc.Drain()`, + disconnect MongoDB, then print a partial summary before exit. + +## Logging + +`log/slog` with the JSON handler. Lifecycle events at INFO (startup, +seed complete, run started, run complete). Per-error detail at DEBUG +(publish errors, bad replies). Never log message content +(`CLAUDE.md`: "never log tokens, passwords, or full message bodies"). 
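The shutdown ordering described under Error handling maps onto stdlib signal handling roughly as in the sketch below. The function parameters are stand-ins for the real steps in `main.go`, and the repo's `pkg/shutdown.Wait` wraps the same idea; this is not the actual wiring:

```go
package main

import (
	"context"
	"os/signal"
	"syscall"
	"time"
)

// shutdownSequence runs the ordered teardown once SIGINT/SIGTERM arrives.
// Each func parameter stands in for a real step in main.
func shutdownSequence(stopTicker func(), drainPublishes func(context.Context),
	unsubscribe, drainNATS, disconnectMongo, printPartialSummary func()) {

	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	<-ctx.Done() // signal received

	stopTicker()
	drainCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	drainPublishes(drainCtx) // bounded: at most 5 seconds
	unsubscribe()
	drainNATS()
	disconnectMongo()
	printPartialSummary()
}
```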
+ +## Documentation + +- `tools/loadgen/README.md` — reference for the operator: what the + tool is, how to run each preset, how to read the terminal summary, + how to turn on the Grafana dashboard, what each metric means, + example output. Not a tutorial. +- This design document at + `docs/superpowers/specs/2026-04-21-load-test-messaging-workers-design.md`. + +The `README.md` explicitly documents what the harness does **not** do, +so future contributors don't silently retrofit responsibilities onto +it: + +- Does not run in CI by default. +- Does not test auth / NATS callout capacity. +- Does not test cross-site behavior or the OUTBOX / INBOX path. +- Does not assert on absolute performance numbers — those are + host-dependent; the pass signal is `final_pending == 0` with error + counts at zero. + +## Dependencies + +No new third-party Go dependencies are added for v1. Everything needed +is already present in `go.mod`: + +- `github.com/nats-io/nats.go` and `.../jetstream` — publish, subscribe, + consumer info. +- `go.mongodb.org/mongo-driver/v2` — seeding (via `pkg/mongoutil`). +- `github.com/caarlos0/env/v11` — config parsing. +- `github.com/google/uuid` — request/message IDs. +- `github.com/prometheus/client_golang` — metrics endpoint. +- `github.com/stretchr/testify` — test assertions. +- `go.uber.org/mock` — where mocks are useful (unlikely in loadgen, + but available). +- `github.com/testcontainers/testcontainers-go` — integration test. + +Shared packages consumed from the repo: + +- `pkg/model` — typed NATS payloads (`SendMessageRequest`, + `MessageEvent`, `RoomEvent`). +- `pkg/subject` — subject builders (never hand-construct subject + strings). +- `pkg/stream` — stream/consumer config helpers. +- `pkg/natsutil` — NATS connection helper. +- `pkg/mongoutil` — MongoDB connection helper. +- `pkg/shutdown` — graceful shutdown orchestration. + +## Future work (explicitly deferred) + +- Multi-site / supercluster topology to measure gateway cost. +- Per-user NATS creds to measure auth-callout capacity. +- HDR histogram sample storage for multi-hour soak runs. +- k6-based harness variant if HTML reports or CI threshold gating + become a priority. +- CI integration with a baseline-comparison workflow. +- Realistic workload extensions (message edits, deletes, reactions + once those features land). diff --git a/docs/superpowers/specs/2026-04-24-loadgen-worker-pool-design.md b/docs/superpowers/specs/2026-04-24-loadgen-worker-pool-design.md new file mode 100644 index 00000000..fa24480d --- /dev/null +++ b/docs/superpowers/specs/2026-04-24-loadgen-worker-pool-design.md @@ -0,0 +1,203 @@ +# Loadgen Worker-Pool Dispatch + pprof — Design + +## Purpose + +The loadgen's actual publish rate falls materially below the target rate at +moderate throughput. At `--rate=1000` observed actual rate is ~775 msg/s +(~77% delivery). Root cause: the publisher runs on the `time.Ticker`'s +goroutine serially, and `time.Ticker` drops ticks that fire while a publish +is still in progress. Any per-publish stall (NATS write-lock contention, +GC pause, scheduler hiccup) above the 1 ms/tick budget silently loses a +tick. + +This spec fixes that by dispatching publishes to a small worker pool and +adds opt-in pprof so future bottlenecks are diagnosable. + +## Scope + +### In scope + +- `Generator.Run` dispatches each tick's publish to a bounded pool of + goroutines. The ticker itself stays punctual. +- New env var `MAX_IN_FLIGHT` (default `200`) caps concurrent publishes. 
+ Saturation (pool full when a tick fires) is an explicit signal, not a + silent drop: the ticker records + `loadgen_publish_errors_total{reason="saturated"}` and moves on. +- `MAX_IN_FLIGHT=0` falls back to the current serial behavior. Useful as + a bisection tool and a conservative default for whoever wants + reproducible comparisons. +- On graceful shutdown / `ctx.Done()`, `Run` returns only after all + in-flight publishes drain (bounded by a small timeout). +- New env var `PPROF_ADDR` (default `""`, meaning disabled). When set + (e.g. `:6060`), loadgen exposes `net/http/pprof` handlers on a + separate HTTP server. Never on by default — pprof isn't exposed in + production-ish deployments unless the operator opts in. +- Docker-compose loadgen service documents both new env vars. + +### Out of scope + +- Changes to the Collector, ConsumerSampler, Report, Preset, Seed, or + integration test — none are publish-hot-path. +- `golang.org/x/time/rate.Limiter` — the worker-pool fix addresses the + real structural cause (ticker/publish coupling). If worker-pool + saturation becomes the new bottleneck, re-evaluate then. +- `sync.Pool` allocation-reuse tuning — defer until pprof identifies GC + as the next-order concern. +- Dedicated NATS connection for publishes vs. subscriptions — only + justified if pprof identifies the NATS write lock as the bottleneck + after the worker pool lands. +- Default-rate bump — reasoned about separately. + +## Architecture + +Before: + +```text +ticker goroutine: [wait tick] → publishOne (JSON + NATS write + metrics) → [wait tick] → … + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + one slow call here silently loses a tick +``` + +After: + +```text +ticker goroutine: [wait tick] → reserve sem slot → spawn publish goroutine → [wait tick] → … + +publish goroutine: [publishOne] → release sem slot +publish goroutine: [publishOne] → release sem slot +publish goroutine: [publishOne] → release sem slot (up to MAX_IN_FLIGHT concurrently) +``` + +The ticker goroutine's per-tick work shrinks to a semaphore send + goroutine +spawn — tens of nanoseconds. It cannot overshoot the ticker interval at any +realistic rate. + +## Components + +### `Generator.Run` (modified) + +- Read `g.cfg.MaxInFlight` from `GeneratorConfig`. +- If `MaxInFlight <= 0`: run serially as today (preserves legacy behavior + and gives a bisection switch). +- Else: create `sem := make(chan struct{}, MaxInFlight)` and + `var wg sync.WaitGroup`. On each tick, non-blocking `select`: + - Slot available: take it, `wg.Add(1)`, `go func() { defer wg.Done(); + defer func() { <-sem }(); g.publishOne(ctx) }()`. + - No slot: increment + `loadgen_publish_errors_total{reason="saturated"}` and continue — + the tick is dropped but at least it's observable. +- On `ctx.Done()`: stop the ticker, then `wg.Wait()` with a bounded grace + period (5 s). If the grace expires, log and return — in-flight + goroutines complete on their own after NATS drain in main. + +### `GeneratorConfig` (modified) + +Add one field: + +```go +type GeneratorConfig struct { + … existing fields … + MaxInFlight int +} +``` + +### `main.go` (modified) + +Add to `config`: + +```go +type config struct { + … existing fields … + MaxInFlight int `env:"MAX_IN_FLIGHT" envDefault:"200"` + PProfAddr string `env:"PPROF_ADDR" envDefault:""` +} +``` + +Pass `cfg.MaxInFlight` into `GeneratorConfig` when constructing the generator. 
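Putting the dispatch loop into code, here is a self-contained sketch of the mechanism. The publish and saturation hooks are injected so the snippet compiles on its own; in the real `generator.go` they are `publishOne` and the `saturated` error counter, and the serial `MaxInFlight <= 0` fallback is omitted:

```go
package main

import (
	"context"
	"log/slog"
	"sync"
	"time"
)

// runPooled paces ticks and hands each publish to a bounded goroutine pool.
// maxInFlight caps concurrency; a full pool drops the tick and reports it
// via onSaturated instead of stalling the ticker.
func runPooled(ctx context.Context, interval time.Duration, maxInFlight int,
	publish func(context.Context), onSaturated func()) {

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	sem := make(chan struct{}, maxInFlight)
	var wg sync.WaitGroup

	for {
		select {
		case <-ctx.Done():
			// Bounded drain: give in-flight publishes up to 5s to finish.
			done := make(chan struct{})
			go func() { wg.Wait(); close(done) }()
			select {
			case <-done:
			case <-time.After(5 * time.Second):
				slog.Warn("in-flight publishes did not drain before grace period")
			}
			return
		case <-ticker.C:
			select {
			case sem <- struct{}{}: // reserve a slot without blocking the ticker
				wg.Add(1)
				go func() {
					defer wg.Done()
					defer func() { <-sem }()
					publish(ctx)
				}()
			default:
				onSaturated() // pool full: count the drop, stay punctual
			}
		}
	}
}
```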
+ +On startup, if `PProfAddr != ""`: register `net/http/pprof` handlers on a +new `http.ServeMux` and start a separate `http.Server` listening on that +addr. Log the resulting URL. The server doesn't share the metrics mux — +pprof is genuinely separate, opt-in infrastructure, and keeping it off the +metrics port avoids accidental exposure when the metrics mux is scraped +by Prometheus. + +On `ctx.Done()`: gracefully shut down the pprof server with a 2 s timeout. + +### Metrics + +No new metrics. The existing `loadgen_publish_errors_total` counter with +`reason="saturated"` is the single new label value for pool saturation. +This keeps the Grafana dashboard's "Publish errors/sec by reason" panel +working out of the box. + +## Error handling + +- `sem <- struct{}{}` is never blocking because we use non-blocking + `select` — if the pool is full, we record saturation and move on. No + unbounded goroutine growth under sustained overload. +- Inside each publish goroutine, `publishOne` already handles its own + errors (counters for marshal/publish failures, `RecordPublishFailed` + on the Collector). +- Graceful shutdown: the `Run` method returns only after in-flight + publishes drain or the bounded grace period elapses. The caller + (`main.go runRun`) already calls `collector.DiscardBefore` and + `collector.Finalize` after `Run` returns, so late-arriving publishes + correctly integrate with the summary. + +## Testing + +### New unit test + +`TestGenerator_MaxInFlightZeroRunsSerially` — with `MaxInFlight=0`, the +generator's behavior is unchanged from today. Reuses the existing +`TestGenerator_SendsExpectedCount` assertion style. + +### Adjusted unit test + +`TestGenerator_SendsExpectedCount` — still valid with `MaxInFlight > 0`, +but the count may be closer to the theoretical target since the ticker +is no longer blocked. + +### New unit test + +`TestGenerator_PoolSaturationCountedAsError` — artificially slow the +publisher via an injected blocking `Publisher`. Run at a rate that +exceeds the pool's capacity. Assert the `saturated` counter increments. + +### Integration test + +No change. The existing `tools/loadgen/integration_test.go` exercises +`Generator.Run` with a fake gatekeeper + broadcast-worker and makes no +assumptions about ticker coupling. + +### Coverage target + +`generator.go` to stay at ≥ 90% for `Run`, `publishOne`, `content` per +the existing plan. + +## Dependencies + +No new third-party dependencies. All new code uses stdlib: `net/http`, +`net/http/pprof`, `sync`. + +## Rollout + +- Both env vars have safe defaults (`MAX_IN_FLIGHT=200`, `PPROF_ADDR=""`). +- Existing deployments pick up the worker pool automatically with + improved actual-rate fidelity at moderate throughput. Operators + concerned about the behavior change can set `MAX_IN_FLIGHT=0` to + get the legacy serial path. +- pprof stays off unless explicitly enabled via `PPROF_ADDR`. +- Internal-only to the loadgen service; no cross-service contract + change. + +## Future work (deferred) + +- Dedicated publish-side `*nats.Conn` — only if profiling identifies the + NATS connection write lock as the remaining bottleneck. +- `sync.Pool` for `SendMessageRequest` / `MessageEvent` / byte buffers + to reduce per-publish GC pressure — only if GC shows up in a + profile. +- Background UUID generation — only if `crypto/rand` shows up + prominently. 
diff --git a/go.mod b/go.mod
index fed3e8db..add93d47 100644
--- a/go.mod
+++ b/go.mod
@@ -15,6 +15,7 @@ require (
	github.com/nats-io/nats-server/v2 v2.12.6
	github.com/nats-io/nats.go v1.50.0
	github.com/nats-io/nkeys v0.4.15
+	github.com/prometheus/client_golang v1.23.2
	github.com/redis/go-redis/v9 v9.18.0
	github.com/stretchr/testify v1.11.1
	github.com/testcontainers/testcontainers-go v0.42.0
diff --git a/pkg/subject/subject.go b/pkg/subject/subject.go
index 64854f05..e7920ad9 100644
--- a/pkg/subject/subject.go
+++ b/pkg/subject/subject.go
@@ -251,6 +251,18 @@ func RoomsInfoBatchSubscribe(siteID string) string {
	return fmt.Sprintf("chat.server.request.room.%s.info.batch", siteID)
}

+func UserResponseWildcard() string {
+	return "chat.user.*.response.>"
+}
+
+func RoomEventWildcard() string {
+	return "chat.room.*.event"
+}
+
+func UserRoomEventWildcard() string {
+	return "chat.user.*.event.room"
+}
+
// --- natsrouter patterns (use {param} placeholders for named extraction) ---

func MsgHistoryPattern(siteID string) string {
diff --git a/tools/loadgen/README.md b/tools/loadgen/README.md
new file mode 100644
index 00000000..7dbda24c
--- /dev/null
+++ b/tools/loadgen/README.md
@@ -0,0 +1,59 @@
+# loadgen

Capacity-baseline load generator for the single-site messaging pipeline
(`message-gatekeeper` → `MESSAGES_CANONICAL` → `message-worker` +
`broadcast-worker`). Single Go binary with three subcommands.

## Quick start

```
make -C tools/loadgen/deploy up
make -C tools/loadgen/deploy seed PRESET=medium
make -C tools/loadgen/deploy run PRESET=medium RATE=500 DURATION=60s
```

For live dashboards:

```
make -C tools/loadgen/deploy run-dashboards PRESET=medium
# Grafana at http://localhost:3000 (anonymous admin)
```

Tear down:

```
make -C tools/loadgen/deploy down
```

## Presets

| preset      | users  | rooms | notes                                                   |
|-------------|--------|-------|---------------------------------------------------------|
| `small`     | 10     | 5     | uniform, 200-byte content                               |
| `medium`    | 1 000  | 100   | uniform, 200-byte content                               |
| `large`     | 10 000 | 1 000 | uniform, 200-byte content                               |
| `realistic` | 1 000  | 100   | Zipf senders, mixed room sizes, 50–2000 bytes, mentions |

## Subcommands

- `loadgen seed --preset=<name> [--seed=42]` — idempotently populate
  MongoDB with deterministic fixtures.
- `loadgen run --preset=<name> [flags]` — open-loop publish at `--rate`
  msgs/sec for `--duration`, print a summary at the end. Flags:
  `--seed`, `--warmup`, `--inject=frontdoor|canonical`, `--csv=<path>`.
- `loadgen teardown` — drop the three seeded collections.

## Reading the summary

- `final_pending == 0` on both durables, zero errors → the pipeline is
  sustaining your target rate.
- `final_pending` climbing, or error counts > 0 → over capacity or a
  regression upstream of the worker.

## Non-goals

- Not a CI regression gate. Invoked manually.
- Not an auth benchmark. Uses shared `backend.creds`.
- Not a cross-site benchmark. Single-site only.
- Not an absolute-number tool. Numbers vary by host — compare within one
  machine across changes, don't compare across machines.
diff --git a/tools/loadgen/collector.go b/tools/loadgen/collector.go
new file mode 100644
index 00000000..fa06f249
--- /dev/null
+++ b/tools/loadgen/collector.go
@@ -0,0 +1,155 @@
+package main

import (
	"sort"
	"sync"
	"time"
)

type publishEntry struct {
	publishedAt time.Time
}

// sample pairs a latency with its publish timestamp so warmup can discard by time.
+type sample struct { + publishedAt time.Time + latency time.Duration +} + +// Collector correlates publishes with replies (E1) and broadcasts (E2). +type Collector struct { + m *Metrics + preset string + mu sync.Mutex + byReqID map[string]publishEntry + byMsgID map[string]publishEntry + e1 []sample + e2 []sample +} + +// NewCollector returns a ready-to-use Collector. +func NewCollector(m *Metrics, preset string) *Collector { + return &Collector{ + m: m, preset: preset, + byReqID: make(map[string]publishEntry), + byMsgID: make(map[string]publishEntry), + } +} + +// RecordPublish stores the publish time under both correlation keys. +func (c *Collector) RecordPublish(requestID, messageID string, t time.Time) { + c.mu.Lock() + defer c.mu.Unlock() + c.byReqID[requestID] = publishEntry{publishedAt: t} + c.byMsgID[messageID] = publishEntry{publishedAt: t} +} + +// RecordReply consumes one pending publish keyed by requestID. +func (c *Collector) RecordReply(requestID string, at time.Time) { + c.mu.Lock() + defer c.mu.Unlock() + e, ok := c.byReqID[requestID] + if !ok { + return + } + delete(c.byReqID, requestID) + d := at.Sub(e.publishedAt) + c.e1 = append(c.e1, sample{publishedAt: e.publishedAt, latency: d}) + c.m.E1Latency.WithLabelValues(c.preset).Observe(d.Seconds()) +} + +// RecordPublishBroadcastOnly stores only the message-ID correlation, for +// injection modes that bypass the gatekeeper (no reply is expected). +func (c *Collector) RecordPublishBroadcastOnly(messageID string, t time.Time) { + c.mu.Lock() + defer c.mu.Unlock() + c.byMsgID[messageID] = publishEntry{publishedAt: t} +} + +// RecordPublishFailed removes entries previously stored by RecordPublish. +// Use when the publish itself failed (message never reached NATS) so the +// orphans do not inflate Finalize's missing-reply / missing-broadcast counts. +func (c *Collector) RecordPublishFailed(requestID, messageID string) { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.byReqID, requestID) + delete(c.byMsgID, messageID) +} + +// RecordBroadcast consumes one pending publish keyed by messageID. +func (c *Collector) RecordBroadcast(messageID string, at time.Time) { + c.mu.Lock() + defer c.mu.Unlock() + e, ok := c.byMsgID[messageID] + if !ok { + return + } + delete(c.byMsgID, messageID) + d := at.Sub(e.publishedAt) + c.e2 = append(c.e2, sample{publishedAt: e.publishedAt, latency: d}) + c.m.E2Latency.WithLabelValues(c.preset).Observe(d.Seconds()) +} + +// DiscardBefore drops any samples whose publish time is before cutoff (warmup). +func (c *Collector) DiscardBefore(cutoff time.Time) { + c.mu.Lock() + defer c.mu.Unlock() + c.e1 = filterAtOrAfter(c.e1, cutoff) + c.e2 = filterAtOrAfter(c.e2, cutoff) +} + +func filterAtOrAfter(in []sample, cutoff time.Time) []sample { + out := in[:0] + for i := range in { + if !in[i].publishedAt.Before(cutoff) { + out = append(out, in[i]) + } + } + return out +} + +// Finalize returns the count of unmatched publishes as missing replies and broadcasts. +func (c *Collector) Finalize() (missingReplies int, missingBroadcasts int) { + c.mu.Lock() + defer c.mu.Unlock() + return len(c.byReqID), len(c.byMsgID) +} + +// E1Count returns the number of matched E1 samples. +func (c *Collector) E1Count() int { + c.mu.Lock() + defer c.mu.Unlock() + return len(c.e1) +} + +// E2Count returns the number of matched E2 samples. +func (c *Collector) E2Count() int { + c.mu.Lock() + defer c.mu.Unlock() + return len(c.e2) +} + +// E1Samples returns a sorted copy of E1 latencies for tests/reporting. 
+func (c *Collector) E1Samples() []time.Duration { + c.mu.Lock() + defer c.mu.Unlock() + return c.snapshotLatenciesLocked(c.e1) +} + +// E2Samples returns a sorted copy of E2 latencies for tests/reporting. +func (c *Collector) E2Samples() []time.Duration { + c.mu.Lock() + defer c.mu.Unlock() + return c.snapshotLatenciesLocked(c.e2) +} + +// snapshotLatenciesLocked copies and sorts latencies from in. +// Callers must hold c.mu before calling this method. +func (c *Collector) snapshotLatenciesLocked(in []sample) []time.Duration { + out := make([]time.Duration, len(in)) + for i := range in { + out[i] = in[i].latency + } + sort.Slice(out, func(i, j int) bool { return out[i] < out[j] }) + return out +} diff --git a/tools/loadgen/collector_test.go b/tools/loadgen/collector_test.go new file mode 100644 index 00000000..86ae5301 --- /dev/null +++ b/tools/loadgen/collector_test.go @@ -0,0 +1,170 @@ +package main + +import ( + "strconv" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCollector_E1ReplyMatches(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("req-1", "msg-1", now) + c.RecordReply("req-1", now.Add(5*time.Millisecond)) + assert.Equal(t, 1, c.E1Count()) + assert.Equal(t, []time.Duration{5 * time.Millisecond}, c.E1Samples()) +} + +func TestCollector_E1UnknownIgnored(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + c.RecordReply("unknown", time.Unix(0, 0)) + assert.Equal(t, 0, c.E1Count()) +} + +func TestCollector_E2BroadcastMatches(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("req-1", "msg-1", now) + c.RecordBroadcast("msg-1", now.Add(8*time.Millisecond)) + assert.Equal(t, 1, c.E2Count()) + assert.Equal(t, []time.Duration{8 * time.Millisecond}, c.E2Samples()) +} + +func TestCollector_E1AndE2Independent(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("req-1", "msg-1", now) + c.RecordReply("req-1", now.Add(5*time.Millisecond)) + c.RecordBroadcast("msg-1", now.Add(8*time.Millisecond)) + assert.Equal(t, 1, c.E1Count()) + assert.Equal(t, 1, c.E2Count()) +} + +func TestCollector_MissingCountsAtFinalize(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("req-1", "msg-1", now) + c.RecordPublish("req-2", "msg-2", now) + c.RecordReply("req-1", now.Add(5*time.Millisecond)) + // req-2 reply never arrives; msg-1 and msg-2 broadcasts never arrive + missingReplies, missingBroadcasts := c.Finalize() + assert.Equal(t, 1, missingReplies) + assert.Equal(t, 2, missingBroadcasts) +} + +func TestCollector_WarmupDiscards(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + start := time.Unix(0, 0) + warmupEnd := start.Add(1 * time.Second) + // In warmup window: + c.RecordPublish("req-warm", "msg-warm", start) + c.RecordReply("req-warm", start.Add(10*time.Millisecond)) + // Past warmup: + c.RecordPublish("req-real", "msg-real", warmupEnd.Add(100*time.Millisecond)) + c.RecordReply("req-real", warmupEnd.Add(105*time.Millisecond)) + + c.DiscardBefore(warmupEnd) + require.Equal(t, 1, c.E1Count()) + assert.Equal(t, []time.Duration{5 * time.Millisecond}, c.E1Samples()) +} + +func TestCollector_E2UnknownIgnored(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + c.RecordBroadcast("unknown", time.Unix(0, 0)) + assert.Equal(t, 0, c.E2Count()) +} + +func 
TestCollector_SamplesReturnedSorted(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + // Publish three messages, record replies in a non-sorted order. + c.RecordPublish("r-1", "m-1", now) + c.RecordPublish("r-2", "m-2", now) + c.RecordPublish("r-3", "m-3", now) + c.RecordReply("r-1", now.Add(10*time.Millisecond)) + c.RecordReply("r-2", now.Add(2*time.Millisecond)) + c.RecordReply("r-3", now.Add(7*time.Millisecond)) + assert.Equal(t, []time.Duration{ + 2 * time.Millisecond, 7 * time.Millisecond, 10 * time.Millisecond, + }, c.E1Samples()) +} + +func TestCollector_ConcurrentRecordAndSnapshot(t *testing.T) { + // Race-detector-friendly stress: one goroutine records publishes and + // replies; another polls E1Samples. Verifies that no data race occurs + // when snapshots are taken concurrently with mutations. + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + + done := make(chan struct{}) + go func() { + defer close(done) + for i := 0; i < 500; i++ { + rid := "r-" + strconv.Itoa(i) + mid := "m-" + strconv.Itoa(i) + c.RecordPublish(rid, mid, now) + c.RecordReply(rid, now.Add(time.Duration(i)*time.Microsecond)) + } + }() + for i := 0; i < 500; i++ { + _ = c.E1Samples() + } + <-done + require.GreaterOrEqual(t, c.E1Count(), 1) +} + +func TestCollector_RecordPublishFailedRemovesOrphans(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("r-1", "m-1", now) + c.RecordPublish("r-2", "m-2", now) + // r-1 / m-1 get replied + broadcast; r-2 / m-2 "failed to publish" and get cleaned up. + c.RecordReply("r-1", now.Add(5*time.Millisecond)) + c.RecordBroadcast("m-1", now.Add(8*time.Millisecond)) + c.RecordPublishFailed("r-2", "m-2") + + missingReplies, missingBroadcasts := c.Finalize() + assert.Equal(t, 0, missingReplies) + assert.Equal(t, 0, missingBroadcasts) +} + +func TestCollector_RecordPublishBroadcastOnly_IgnoredByE1(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublishBroadcastOnly("m-1", now) + // A reply correlated by requestID should NOT find this message + // because we didn't populate byReqID. + c.RecordReply("some-req-id", now.Add(5*time.Millisecond)) + assert.Equal(t, 0, c.E1Count()) + + // A broadcast matching the msg-id should be recorded. + c.RecordBroadcast("m-1", now.Add(8*time.Millisecond)) + assert.Equal(t, 1, c.E2Count()) +} + +func TestCollector_RecordPublishBroadcastOnly_FinalizeNoMissingReplies(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublishBroadcastOnly("m-1", now) + c.RecordPublishBroadcastOnly("m-2", now) + c.RecordBroadcast("m-1", now.Add(5*time.Millisecond)) + // m-2 never gets a broadcast — that's the only missing event class. + missingReplies, missingBroadcasts := c.Finalize() + assert.Equal(t, 0, missingReplies, "canonical mode should never produce missing replies") + assert.Equal(t, 1, missingBroadcasts) +} diff --git a/tools/loadgen/consumerlag.go b/tools/loadgen/consumerlag.go new file mode 100644 index 00000000..b749f5ec --- /dev/null +++ b/tools/loadgen/consumerlag.go @@ -0,0 +1,100 @@ +package main + +import ( + "context" + "log/slog" + "time" + + "github.com/nats-io/nats.go/jetstream" +) + +// ConsumerSampler polls a single durable consumer's info every interval and +// records min/peak/final samples. Start with Run(ctx); stop by cancelling ctx. 
+type ConsumerSampler struct { + js jetstream.JetStream + stream string + durable string + metrics *Metrics + interval time.Duration + + hasSample bool + minPending uint64 + peakPending uint64 + finalPending uint64 + peakAckPending uint64 + finalRedelivered uint64 +} + +// NewConsumerSampler constructs a sampler. +func NewConsumerSampler(js jetstream.JetStream, stream, durable string, m *Metrics, interval time.Duration) *ConsumerSampler { + return &ConsumerSampler{js: js, stream: stream, durable: durable, metrics: m, interval: interval} +} + +// Run polls ConsumerInfo until ctx is cancelled. +func (s *ConsumerSampler) Run(ctx context.Context) { + t := time.NewTicker(s.interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + s.sampleOnce(ctx) + } + } +} + +func (s *ConsumerSampler) sampleOnce(ctx context.Context) { + cons, err := s.js.Consumer(ctx, s.stream, s.durable) + if err != nil { + slog.Warn("consumer lookup failed", "stream", s.stream, "durable", s.durable, "error", err) + return + } + info, err := cons.Info(ctx) + if err != nil { + slog.Warn("consumer info failed", "stream", s.stream, "durable", s.durable, "error", err) + return + } + pending := info.NumPending + ack := uint64(info.NumAckPending) + redel := uint64(info.NumRedelivered) + + s.metrics.ConsumerPending.WithLabelValues(s.stream, s.durable).Set(float64(pending)) + s.metrics.ConsumerAckPending.WithLabelValues(s.stream, s.durable).Set(float64(ack)) + s.metrics.ConsumerRedelivered.WithLabelValues(s.stream, s.durable).Set(float64(redel)) + + if !s.hasSample { + s.hasSample = true + s.minPending = pending + s.peakPending = pending + s.peakAckPending = ack + } else { + if pending < s.minPending { + s.minPending = pending + } + if pending > s.peakPending { + s.peakPending = pending + } + if ack > s.peakAckPending { + s.peakAckPending = ack + } + } + s.finalPending = pending + s.finalRedelivered = redel +} + +// Snapshot returns a ConsumerStat from what has been observed so far. +// Must only be called after Run has returned (i.e., after the context +// passed to Run has been cancelled and its goroutine has exited); +// concurrent calls to Snapshot while Run is still ticking are unsafe. 
+func (s *ConsumerSampler) Snapshot() ConsumerStat { + return ConsumerStat{ + Stream: s.stream, + Durable: s.durable, + MinPending: s.minPending, + PeakPending: s.peakPending, + FinalPending: s.finalPending, + PeakAckPending: s.peakAckPending, + Redelivered: s.finalRedelivered, + } +} diff --git a/tools/loadgen/consumerlag_test.go b/tools/loadgen/consumerlag_test.go new file mode 100644 index 00000000..07c9c0a8 --- /dev/null +++ b/tools/loadgen/consumerlag_test.go @@ -0,0 +1,35 @@ +package main + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestNewConsumerSampler_SnapshotInitialState(t *testing.T) { + m := NewMetrics() + s := NewConsumerSampler(nil, "MESSAGES_CANONICAL_site-local", "message-worker", m, 1*time.Second) + snap := s.Snapshot() + assert.Equal(t, "MESSAGES_CANONICAL_site-local", snap.Stream) + assert.Equal(t, "message-worker", snap.Durable) + assert.Equal(t, uint64(0), snap.MinPending) + assert.Equal(t, uint64(0), snap.PeakPending) + assert.Equal(t, uint64(0), snap.FinalPending) + assert.Equal(t, uint64(0), snap.PeakAckPending) + assert.Equal(t, uint64(0), snap.Redelivered) +} + +func TestNewConsumerSampler_SnapshotDifferentParams(t *testing.T) { + m := NewMetrics() + s := NewConsumerSampler(nil, "MESSAGES_CANONICAL_site-remote", "broadcast-worker", m, 500*time.Millisecond) + snap := s.Snapshot() + assert.Equal(t, "MESSAGES_CANONICAL_site-remote", snap.Stream) + assert.Equal(t, "broadcast-worker", snap.Durable) + // All counters start at zero before any samples are taken. + assert.Equal(t, uint64(0), snap.MinPending) + assert.Equal(t, uint64(0), snap.PeakPending) + assert.Equal(t, uint64(0), snap.FinalPending) + assert.Equal(t, uint64(0), snap.PeakAckPending) + assert.Equal(t, uint64(0), snap.Redelivered) +} diff --git a/tools/loadgen/deploy/Dockerfile b/tools/loadgen/deploy/Dockerfile new file mode 100644 index 00000000..7f38fff0 --- /dev/null +++ b/tools/loadgen/deploy/Dockerfile @@ -0,0 +1,16 @@ +FROM golang:1.25.8-alpine AS builder + +WORKDIR /app + +COPY go.mod go.sum ./ +RUN go mod download + +COPY pkg/ pkg/ +COPY tools/loadgen/ tools/loadgen/ + +RUN CGO_ENABLED=0 go build -o /loadgen ./tools/loadgen/ + +FROM alpine:3.21 +RUN apk add --no-cache ca-certificates +COPY --from=builder /loadgen /loadgen +ENTRYPOINT ["/loadgen"] diff --git a/tools/loadgen/deploy/Makefile b/tools/loadgen/deploy/Makefile new file mode 100644 index 00000000..a2904e34 --- /dev/null +++ b/tools/loadgen/deploy/Makefile @@ -0,0 +1,27 @@ +COMPOSE ?= docker compose -f docker-compose.loadtest.yml + +.PHONY: up seed run run-dashboards down logs + +up: + $(COMPOSE) up -d --build + +seed: + @test -n "$(PRESET)" || (echo "PRESET= required" && exit 1) + $(COMPOSE) exec -T loadgen /loadgen seed --preset=$(PRESET) + +run: + @test -n "$(PRESET)" || (echo "PRESET= required" && exit 1) + $(COMPOSE) exec -T loadgen /loadgen run \ + --preset=$(PRESET) \ + --rate=$(or $(RATE),500) \ + --duration=$(or $(DURATION),60s) + +run-dashboards: + $(COMPOSE) --profile dashboards up -d + $(MAKE) run PRESET=$(PRESET) RATE=$(RATE) DURATION=$(DURATION) + +down: + $(COMPOSE) --profile dashboards down -v + +logs: + $(COMPOSE) logs -f loadgen diff --git a/tools/loadgen/deploy/docker-compose.loadtest.yml b/tools/loadgen/deploy/docker-compose.loadtest.yml new file mode 100644 index 00000000..9f0c7a6b --- /dev/null +++ b/tools/loadgen/deploy/docker-compose.loadtest.yml @@ -0,0 +1,135 @@ +name: loadgen + +services: + nats: + image: nats:2.11-alpine + command: ["-js", "-m", "8222"] + ports: + - 
"4222:4222" + - "8222:8222" + networks: [loadtest] + + mongodb: + image: mongo:8 + ports: + - "27017:27017" + networks: [loadtest] + + cassandra: + image: cassandra:4.1 + environment: + - CASSANDRA_CLUSTER_NAME=loadtest + ports: + - "9042:9042" + networks: [loadtest] + healthcheck: + test: ["CMD-SHELL", "nodetool status | grep -q '^UN'"] + interval: 10s + timeout: 5s + retries: 30 + + cassandra-init: + image: cassandra:4.1 + depends_on: + cassandra: + condition: service_healthy + entrypoint: + - sh + - -c + - | + cqlsh cassandra -e "CREATE KEYSPACE IF NOT EXISTS chat WITH replication = {'class':'SimpleStrategy','replication_factor':1};" + networks: [loadtest] + restart: "no" + + message-gatekeeper: + build: + context: ../../.. + dockerfile: message-gatekeeper/deploy/Dockerfile + environment: + - NATS_URL=nats://nats:4222 + - SITE_ID=site-local + - MONGO_URI=mongodb://mongodb:27017 + - MONGO_DB=chat + depends_on: [nats, mongodb] + networks: [loadtest] + + message-worker: + build: + context: ../../.. + dockerfile: message-worker/deploy/Dockerfile + environment: + - NATS_URL=nats://nats:4222 + - SITE_ID=site-local + - MONGO_URI=mongodb://mongodb:27017 + - MONGO_DB=chat + - CASSANDRA_HOSTS=cassandra + - CASSANDRA_KEYSPACE=chat + depends_on: + nats: + condition: service_started + mongodb: + condition: service_started + cassandra-init: + condition: service_completed_successfully + networks: [loadtest] + + broadcast-worker: + build: + context: ../../.. + dockerfile: broadcast-worker/deploy/Dockerfile + environment: + - NATS_URL=nats://nats:4222 + - SITE_ID=site-local + - MONGO_URI=mongodb://mongodb:27017 + - MONGO_DB=chat + depends_on: [nats, mongodb] + networks: [loadtest] + + loadgen: + build: + context: ../../.. + dockerfile: tools/loadgen/deploy/Dockerfile + environment: + - NATS_URL=nats://nats:4222 + - SITE_ID=site-local + - MONGO_URI=mongodb://mongodb:27017 + - MONGO_DB=chat + - METRICS_ADDR=:9099 + # Worker-pool cap for concurrent publishes. Set to 0 to publish + # serially on the ticker goroutine (legacy behavior). + - MAX_IN_FLIGHT=200 + # Enable pprof on a separate port by uncommenting and mapping + # the port. Off by default so the metrics endpoint doesn't expose + # profiling. 
+ # - PPROF_ADDR=:6060 + ports: + - "9099:9099" + depends_on: [nats, mongodb, message-gatekeeper, message-worker, broadcast-worker] + entrypoint: ["sleep", "infinity"] + networks: [loadtest] + + prometheus: + image: prom/prometheus:v2.55.0 + profiles: [dashboards] + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + networks: [loadtest] + + grafana: + image: grafana/grafana:11.2.2 + profiles: [dashboards] + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + ports: + - "3000:3000" + networks: [loadtest] + +networks: + loadtest: diff --git a/tools/loadgen/deploy/grafana/dashboards/loadtest.json b/tools/loadgen/deploy/grafana/dashboards/loadtest.json new file mode 100644 index 00000000..f3928176 --- /dev/null +++ b/tools/loadgen/deploy/grafana/dashboards/loadtest.json @@ -0,0 +1,53 @@ +{ + "title": "Loadgen", + "schemaVersion": 39, + "version": 1, + "refresh": "5s", + "time": {"from": "now-15m", "to": "now"}, + "panels": [ + { + "type": "timeseries", + "title": "Throughput (msg/s)", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "targets": [{"expr": "rate(loadgen_published_total[10s])", "refId": "A"}] + }, + { + "type": "timeseries", + "title": "E1 gatekeeper latency (P50/P95/P99)", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "targets": [ + {"expr": "histogram_quantile(0.50, sum(rate(loadgen_e1_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p50", "refId": "A"}, + {"expr": "histogram_quantile(0.95, sum(rate(loadgen_e1_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p95", "refId": "B"}, + {"expr": "histogram_quantile(0.99, sum(rate(loadgen_e1_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p99", "refId": "C"} + ] + }, + { + "type": "timeseries", + "title": "E2 broadcast latency (P50/P95/P99)", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "targets": [ + {"expr": "histogram_quantile(0.50, sum(rate(loadgen_e2_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p50", "refId": "A"}, + {"expr": "histogram_quantile(0.95, sum(rate(loadgen_e2_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p95", "refId": "B"}, + {"expr": "histogram_quantile(0.99, sum(rate(loadgen_e2_latency_seconds_bucket[30s])) by (le))", "legendFormat": "p99", "refId": "C"} + ] + }, + { + "type": "timeseries", + "title": "Consumer pending", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "targets": [{"expr": "loadgen_consumer_pending", "legendFormat": "{{durable}}", "refId": "A"}] + }, + { + "type": "timeseries", + "title": "Consumer ack pending", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "targets": [{"expr": "loadgen_consumer_ack_pending", "legendFormat": "{{durable}}", "refId": "A"}] + }, + { + "type": "timeseries", + "title": "Publish errors/sec", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "targets": [{"expr": "rate(loadgen_publish_errors_total[10s])", "legendFormat": "{{reason}}", "refId": "A"}] + } + ] +} diff --git a/tools/loadgen/deploy/grafana/provisioning/dashboards/loadtest.yaml b/tools/loadgen/deploy/grafana/provisioning/dashboards/loadtest.yaml new file mode 100644 index 00000000..91e33949 --- /dev/null +++ b/tools/loadgen/deploy/grafana/provisioning/dashboards/loadtest.yaml @@ -0,0 +1,7 @@ +apiVersion: 1 +providers: + - name: loadtest + folder: "" + type: file + options: + path: 
/var/lib/grafana/dashboards diff --git a/tools/loadgen/deploy/grafana/provisioning/datasources/prometheus.yaml b/tools/loadgen/deploy/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 00000000..0eddf262 --- /dev/null +++ b/tools/loadgen/deploy/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,7 @@ +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true diff --git a/tools/loadgen/deploy/prometheus/prometheus.yml b/tools/loadgen/deploy/prometheus/prometheus.yml new file mode 100644 index 00000000..9c7a8180 --- /dev/null +++ b/tools/loadgen/deploy/prometheus/prometheus.yml @@ -0,0 +1,10 @@ +global: + scrape_interval: 5s + evaluation_interval: 5s + +scrape_configs: + - job_name: loadgen + static_configs: + - targets: ["loadgen:9099"] + # NATS monitoring on :8222 serves JSON (/varz, /jsz) — not Prometheus. + # Add prometheus-nats-exporter as a sidecar if NATS metrics are needed. diff --git a/tools/loadgen/generator.go b/tools/loadgen/generator.go new file mode 100644 index 00000000..0c5e8514 --- /dev/null +++ b/tools/loadgen/generator.go @@ -0,0 +1,215 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "math/rand" + "strings" + "sync" + "time" + + "github.com/google/uuid" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/subject" +) + +// InjectMode selects which subject the generator publishes onto. +type InjectMode string + +const ( + InjectFrontdoor InjectMode = "frontdoor" + InjectCanonical InjectMode = "canonical" +) + +// Publisher abstracts NATS publishing so tests can inject a recorder. +type Publisher interface { + Publish(ctx context.Context, subject string, data []byte) error +} + +// GeneratorConfig is the parameter bundle for a Generator. +// Preset is *Preset because the struct is large enough that gocritic's +// hugeParam rule would flag the embedded value. +type GeneratorConfig struct { + Preset *Preset + Fixtures Fixtures + SiteID string + Rate int + Inject InjectMode + Publisher Publisher + Metrics *Metrics + Collector *Collector + WarmupDeadline time.Time + // MaxInFlight caps concurrent publishes dispatched from the ticker. + // Set to 0 to publish serially on the ticker goroutine (legacy behavior, + // useful for bisection). + MaxInFlight int +} + +// Generator is the open-loop publisher. +type Generator struct { + cfg GeneratorConfig + rngMu sync.Mutex + rng *rand.Rand + maxBody string +} + +// NewGenerator returns a Generator seeded from `seed`. +func NewGenerator(cfg *GeneratorConfig, seed int64) *Generator { + max := cfg.Preset.ContentBytes.Max + if max <= 0 { + max = 1 + } + return &Generator{ + cfg: *cfg, + rng: rand.New(rand.NewSource(seed)), + maxBody: strings.Repeat("x", max), + } +} + +// drainGracePeriod bounds how long Run waits for in-flight publishes +// to complete after ctx cancels. +const drainGracePeriod = 5 * time.Second + +// Run publishes at the configured rate until ctx is cancelled. When +// MaxInFlight > 0, each tick dispatches the publish to a bounded +// goroutine pool so the ticker stays punctual under load; saturation +// (pool full when a tick fires) is recorded as a publish error with +// reason="saturated" rather than silently dropping the tick. 
+func (g *Generator) Run(ctx context.Context) error { + if g.cfg.Rate <= 0 { + return fmt.Errorf("rate must be > 0") + } + interval := time.Second / time.Duration(g.cfg.Rate) + if interval <= 0 { + interval = time.Nanosecond + } + tick := time.NewTicker(interval) + defer tick.Stop() + + if g.cfg.MaxInFlight <= 0 { + for { + select { + case <-ctx.Done(): + return nil + case <-tick.C: + g.publishOne(ctx) + } + } + } + + sem := make(chan struct{}, g.cfg.MaxInFlight) + var wg sync.WaitGroup + for { + select { + case <-ctx.Done(): + done := make(chan struct{}) + go func() { wg.Wait(); close(done) }() + select { + case <-done: + case <-time.After(drainGracePeriod): + } + return nil + case <-tick.C: + select { + case sem <- struct{}{}: + wg.Add(1) + go func() { + defer func() { + <-sem + wg.Done() + }() + g.publishOne(ctx) + }() + default: + g.cfg.Metrics.PublishErrors.WithLabelValues(g.cfg.Preset.Name, "saturated").Inc() + } + } + } +} + +// intn returns rng.Intn(n) with mutex protection so publishOne is +// safe to call from multiple worker goroutines. +func (g *Generator) intn(n int) int { + g.rngMu.Lock() + defer g.rngMu.Unlock() + return g.rng.Intn(n) +} + +func (g *Generator) float64() float64 { + g.rngMu.Lock() + defer g.rngMu.Unlock() + return g.rng.Float64() +} + +func (g *Generator) publishOne(ctx context.Context) { + if len(g.cfg.Fixtures.Subscriptions) == 0 { + return + } + subIdx := g.intn(len(g.cfg.Fixtures.Subscriptions)) + sub := g.cfg.Fixtures.Subscriptions[subIdx] + content := g.content() + msgID := uuid.NewString() + publishTime := time.Now() + + var ( + subj string + data []byte + reqID string + err error + ) + switch g.cfg.Inject { + case InjectCanonical: + now := time.Now().UTC() + evt := model.MessageEvent{ + Message: model.Message{ + ID: msgID, RoomID: sub.RoomID, + UserID: sub.User.ID, UserAccount: sub.User.Account, + Content: content, CreatedAt: now, + }, + SiteID: g.cfg.SiteID, + Timestamp: now.UnixMilli(), + } + data, err = json.Marshal(evt) + subj = subject.MsgCanonicalCreated(g.cfg.SiteID) + g.cfg.Collector.RecordPublishBroadcastOnly(msgID, publishTime) + default: + reqID = uuid.NewString() + req := model.SendMessageRequest{ID: msgID, Content: content, RequestID: reqID} + data, err = json.Marshal(req) + subj = subject.MsgSend(sub.User.Account, sub.RoomID, g.cfg.SiteID) + g.cfg.Collector.RecordPublish(reqID, msgID, publishTime) + } + if err != nil { + g.cfg.Metrics.PublishErrors.WithLabelValues(g.cfg.Preset.Name, "marshal").Inc() + return + } + if perr := g.cfg.Publisher.Publish(ctx, subj, data); perr != nil { + g.cfg.Collector.RecordPublishFailed(reqID, msgID) + g.cfg.Metrics.PublishErrors.WithLabelValues(g.cfg.Preset.Name, "publish").Inc() + return + } + phase := "measured" + if publishTime.Before(g.cfg.WarmupDeadline) { + phase = "warmup" + } + g.cfg.Metrics.Published.WithLabelValues(g.cfg.Preset.Name, phase).Inc() +} + +func (g *Generator) content() string { + r := g.cfg.Preset.ContentBytes + size := r.Min + if r.Max > r.Min { + size = r.Min + g.intn(r.Max-r.Min+1) + } + if size <= 0 { + size = 1 + } + body := g.maxBody[:size] + if g.cfg.Preset.MentionRate > 0 && g.float64() < g.cfg.Preset.MentionRate { + target := g.intn(g.cfg.Preset.Users) + body = fmt.Sprintf("@user-%d %s", target, body) + } + return body +} diff --git a/tools/loadgen/generator_test.go b/tools/loadgen/generator_test.go new file mode 100644 index 00000000..f3e6e9c2 --- /dev/null +++ b/tools/loadgen/generator_test.go @@ -0,0 +1,338 @@ +package main + +import ( + "context" + "fmt" + "strings" + 
"sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type recordingPublisher struct { + mu sync.Mutex + calls []publishCall +} + +type publishCall struct { + subject string + data []byte +} + +func (r *recordingPublisher) Publish(_ context.Context, subject string, data []byte) error { + r.mu.Lock() + defer r.mu.Unlock() + r.calls = append(r.calls, publishCall{subject: subject, data: append([]byte(nil), data...)}) + return nil +} + +func (r *recordingPublisher) count() int { + r.mu.Lock() + defer r.mu.Unlock() + return len(r.calls) +} + +func (r *recordingPublisher) snapshot() []publishCall { + r.mu.Lock() + defer r.mu.Unlock() + out := make([]publishCall, len(r.calls)) + copy(out, r.calls) + return out +} + +type errorPublisher struct{} + +func (e *errorPublisher) Publish(_ context.Context, _ string, _ []byte) error { + return fmt.Errorf("publish error") +} + +func TestGenerator_SendsExpectedCount(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + c := NewCollector(m, p.Name) + g := NewGenerator(&GeneratorConfig{ + Preset: &p, + Fixtures: f, + SiteID: "site-local", + Rate: 200, + Inject: InjectFrontdoor, + Publisher: rp, + Metrics: m, + Collector: c, + }, 1) + + ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond) + defer cancel() + require.NoError(t, g.Run(ctx)) + + count := rp.count() + // 200 msg/s for ~250ms: expect 30-70 publishes (wide tolerance for scheduler). + assert.GreaterOrEqual(t, count, 30) + assert.LessOrEqual(t, count, 70) +} + +func TestGenerator_UsesFrontdoorSubject(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: f, SiteID: "site-local", + Rate: 100, Inject: InjectFrontdoor, + Publisher: rp, Metrics: m, + Collector: NewCollector(m, p.Name), + }, 1) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + calls := rp.snapshot() + require.NotEmpty(t, calls) + for i := range calls { + assert.Contains(t, calls[i].subject, ".msg.send") + assert.Contains(t, calls[i].subject, "site-local") + } +} + +func TestGenerator_UsesCanonicalSubjectWhenInjectCanonical(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + c := NewCollector(m, p.Name) + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: f, SiteID: "site-local", + Rate: 100, Inject: InjectCanonical, + Publisher: rp, Metrics: m, + Collector: c, + }, 1) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + calls := rp.snapshot() + require.NotEmpty(t, calls) + for i := range calls { + assert.Contains(t, calls[i].subject, "chat.msg.canonical.site-local.created") + } + + // In canonical mode, the Generator should NOT populate byReqID because + // canonical injection bypasses the gatekeeper (no reply is expected). + // Consequently Finalize should report zero missing replies even though + // no replies ever arrived. 
+ missingReplies, _ := c.Finalize() + assert.Equal(t, 0, missingReplies) +} + +func TestGenerator_IncrementsPublishedMetric(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: f, SiteID: "site-local", + Rate: 100, Inject: InjectFrontdoor, + Publisher: rp, Metrics: m, + Collector: NewCollector(m, p.Name), + }, 1) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + + var got int64 + metrics, err := m.Registry.Gather() + require.NoError(t, err) + for _, mf := range metrics { + if mf.GetName() == "loadgen_published_total" { + for _, metric := range mf.GetMetric() { + got += int64(metric.GetCounter().GetValue()) + } + } + } + assert.Greater(t, got, int64(0)) +} + +func TestGenerator_Run_ReturnsErrorForZeroRate(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: f, SiteID: "site-local", + Rate: 0, Inject: InjectFrontdoor, + Publisher: rp, Metrics: m, + Collector: NewCollector(m, p.Name), + }, 1) + err := g.Run(context.Background()) + require.Error(t, err) + assert.Contains(t, err.Error(), "rate must be > 0") +} + +func TestGenerator_PublishError_IncrementsErrorMetric(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + ep := &errorPublisher{} + m := NewMetrics() + c := NewCollector(m, p.Name) + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: f, SiteID: "site-local", + Rate: 100, Inject: InjectFrontdoor, + Publisher: ep, Metrics: m, + Collector: c, + }, 1) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + + var publishErrors int64 + metrics, err := m.Registry.Gather() + require.NoError(t, err) + for _, mf := range metrics { + if mf.GetName() == "loadgen_publish_errors_total" { + for _, metric := range mf.GetMetric() { + publishErrors += int64(metric.GetCounter().GetValue()) + } + } + } + assert.Greater(t, publishErrors, int64(0)) + + // Publish errors should have cleaned up the pending entries, so Finalize + // reports no "missing replies" or "missing broadcasts" attributable to + // publish-side failures. + missingReplies, missingBroadcasts := c.Finalize() + assert.Equal(t, 0, missingReplies) + assert.Equal(t, 0, missingBroadcasts) +} + +func TestGenerator_Content_WithMentionRate(t *testing.T) { + p, _ := BuiltinPreset("realistic") + f := BuildFixtures(&p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + // Run long enough to statistically hit the 10% mention rate. + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: f, SiteID: "site-local", + Rate: 500, Inject: InjectFrontdoor, + Publisher: rp, Metrics: m, + Collector: NewCollector(m, p.Name), + }, 99) + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + calls := rp.snapshot() + require.NotEmpty(t, calls) + // With 10% mention rate and ~100 messages, at least one should contain "@user-". 
+ foundMention := false + for i := range calls { + if strings.Contains(string(calls[i].data), "@user-") { + foundMention = true + break + } + } + assert.True(t, foundMention, "expected at least one message with a mention") +} + +func TestGenerator_EmptySubscriptions_NoPublish(t *testing.T) { + p, _ := BuiltinPreset("small") + rp := &recordingPublisher{} + m := NewMetrics() + // Use empty fixtures — no subscriptions. + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: Fixtures{}, SiteID: "site-local", + Rate: 200, Inject: InjectFrontdoor, + Publisher: rp, Metrics: m, + Collector: NewCollector(m, p.Name), + }, 1) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + assert.Equal(t, 0, rp.count()) +} + +func TestGenerator_MaxInFlightZeroRunsSerially(t *testing.T) { + // MaxInFlight=0 preserves the legacy serial-on-ticker behavior. + p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + rp := &recordingPublisher{} + m := NewMetrics() + c := NewCollector(m, p.Name) + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: f, SiteID: "site-local", + Rate: 200, Inject: InjectFrontdoor, + Publisher: rp, Metrics: m, + Collector: c, + MaxInFlight: 0, + }, 1) + + ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond) + defer cancel() + require.NoError(t, g.Run(ctx)) + + // Same tolerance as the default SendsExpectedCount test. + count := rp.count() + assert.GreaterOrEqual(t, count, 30) + assert.LessOrEqual(t, count, 70) +} + +// blockingPublisher blocks every Publish call until unblock is closed. +// Used to force worker-pool saturation. +type blockingPublisher struct { + unblock chan struct{} + mu sync.Mutex + count int +} + +func (b *blockingPublisher) Publish(ctx context.Context, _ string, _ []byte) error { + select { + case <-b.unblock: + case <-ctx.Done(): + return ctx.Err() + } + b.mu.Lock() + b.count++ + b.mu.Unlock() + return nil +} + +func TestGenerator_PoolSaturationCountedAsError(t *testing.T) { + // With MaxInFlight=1 and a publisher that never returns while the run is + // active, every tick after the first must see the pool saturated and + // increment loadgen_publish_errors_total{reason="saturated"}. 
+ p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + bp := &blockingPublisher{unblock: make(chan struct{})} + m := NewMetrics() + c := NewCollector(m, p.Name) + g := NewGenerator(&GeneratorConfig{ + Preset: &p, Fixtures: f, SiteID: "site-local", + Rate: 500, Inject: InjectFrontdoor, + Publisher: bp, Metrics: m, + Collector: c, + MaxInFlight: 1, + }, 1) + + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Millisecond) + defer cancel() + _ = g.Run(ctx) + close(bp.unblock) + + mfs, err := m.Registry.Gather() + require.NoError(t, err) + var saturated float64 + for _, mf := range mfs { + if mf.GetName() != "loadgen_publish_errors_total" { + continue + } + for _, metric := range mf.GetMetric() { + for _, l := range metric.GetLabel() { + if l.GetName() == "reason" && l.GetValue() == "saturated" { + saturated += metric.GetCounter().GetValue() + } + } + } + } + assert.Greater(t, saturated, float64(0), "expected saturated counter to increment under pool-full conditions") +} diff --git a/tools/loadgen/integration_test.go b/tools/loadgen/integration_test.go new file mode 100644 index 00000000..1f6d647e --- /dev/null +++ b/tools/loadgen/integration_test.go @@ -0,0 +1,187 @@ +//go:build integration + +package main + +import ( + "context" + "encoding/json" + "fmt" + "testing" + "time" + + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" + "github.com/stretchr/testify/require" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/modules/mongodb" + "github.com/testcontainers/testcontainers-go/wait" + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/mongoutil" + "github.com/hmchangw/chat/pkg/stream" + "github.com/hmchangw/chat/pkg/subject" +) + +// setupNATS starts a JetStream-enabled NATS container via the generic +// testcontainers interface (no dedicated NATS module is required). +func setupNATS(t *testing.T) (string, func()) { + t.Helper() + ctx := context.Background() + c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: testcontainers.ContainerRequest{ + Image: "nats:2.11-alpine", + Cmd: []string{"-js"}, + ExposedPorts: []string{"4222/tcp"}, + WaitingFor: wait.ForLog("Server is ready").WithStartupTimeout(30 * time.Second), + }, + Started: true, + }) + require.NoError(t, err) + host, err := c.Host(ctx) + require.NoError(t, err) + port, err := c.MappedPort(ctx, "4222") + require.NoError(t, err) + return fmt.Sprintf("nats://%s:%s", host, port.Port()), func() { _ = c.Terminate(ctx) } +} + +func setupMongo(t *testing.T) (string, func()) { + t.Helper() + ctx := context.Background() + c, err := mongodb.Run(ctx, "mongo:8") + require.NoError(t, err) + uri, err := c.ConnectionString(ctx) + require.NoError(t, err) + return uri, func() { _ = c.Terminate(ctx) } +} + +// TestLoadgenSmallPreset_EndToEnd verifies the generator publishes messages, +// a fake gatekeeper forwards them to MESSAGES_CANONICAL, two JetStream +// consumers drain the stream, a fake broadcast-worker emits room events, +// and MongoDB shows the seeded room data. 
+func TestLoadgenSmallPreset_EndToEnd(t *testing.T) { + ctx := context.Background() + natsURI, stopNATS := setupNATS(t) + defer stopNATS() + mongoURI, stopMongo := setupMongo(t) + defer stopMongo() + + nc, err := nats.Connect(natsURI) + require.NoError(t, err) + defer nc.Drain() + + js, err := jetstream.New(nc) + require.NoError(t, err) + + siteID := "site-test" + canonical := stream.MessagesCanonical(siteID) + _, err = js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{ + Name: canonical.Name, + Subjects: canonical.Subjects, + }) + require.NoError(t, err) + + // Two durable consumers that simply ack — stand in for message-worker + // and broadcast-worker so the canonical stream drains to zero. + for _, durable := range []string{"message-worker", "broadcast-worker"} { + cons, err := js.CreateOrUpdateConsumer(ctx, canonical.Name, jetstream.ConsumerConfig{ + Durable: durable, + AckPolicy: jetstream.AckExplicitPolicy, + }) + require.NoError(t, err) + cc, err := cons.Consume(func(msg jetstream.Msg) { _ = msg.Ack() }) + require.NoError(t, err) + defer cc.Stop() + } + + // Connect Mongo and seed fixtures. + client, err := mongoutil.Connect(ctx, mongoURI) + require.NoError(t, err) + defer mongoutil.Disconnect(ctx, client) + db := client.Database("chat") + + preset, _ := BuiltinPreset("small") + fixtures := BuildFixtures(&preset, 42, siteID) + require.NoError(t, Seed(ctx, db, fixtures)) + + metrics := NewMetrics() + collector := NewCollector(metrics, preset.Name) + + // Fake gatekeeper: frontdoor subject → publish MessageEvent to canonical. + gkSub, err := nc.Subscribe( + subject.MsgSendWildcard(siteID), + func(m *nats.Msg) { + var req model.SendMessageRequest + if err := json.Unmarshal(m.Data, &req); err != nil { + return + } + evt := model.MessageEvent{ + Message: model.Message{ + ID: req.ID, + Content: req.Content, + CreatedAt: time.Now().UTC(), + }, + SiteID: siteID, + Timestamp: time.Now().UnixMilli(), + } + data, _ := json.Marshal(evt) + _, _ = js.Publish(ctx, subject.MsgCanonicalCreated(siteID), data) + }, + ) + require.NoError(t, err) + defer gkSub.Unsubscribe() + + // Fake broadcast-worker: canonical event → room event. + bwSub, err := nc.Subscribe( + subject.MsgCanonicalCreated(siteID), + func(m *nats.Msg) { + var evt model.MessageEvent + if err := json.Unmarshal(m.Data, &evt); err != nil { + return + } + roomEvt := model.RoomEvent{ + Type: model.RoomEventNewMessage, + RoomID: "r", + Message: &model.ClientMessage{Message: evt.Message}, + } + data, _ := json.Marshal(roomEvt) + _ = nc.Publish("chat.room.r.event", data) + }, + ) + require.NoError(t, err) + defer bwSub.Unsubscribe() + + publisher := &natsCorePublisher{nc: nc} + gen := NewGenerator(&GeneratorConfig{ + Preset: &preset, + Fixtures: fixtures, + SiteID: siteID, + Rate: 50, + Inject: InjectFrontdoor, + Publisher: publisher, + Metrics: metrics, + Collector: collector, + }, 42) + + runCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + require.NoError(t, gen.Run(runCtx)) + + // Allow trailing events to flow. + time.Sleep(2 * time.Second) + + // Assert the canonical stream drained. + for _, durable := range []string{"message-worker", "broadcast-worker"} { + cons, err := js.Consumer(ctx, canonical.Name, durable) + require.NoError(t, err) + info, err := cons.Info(ctx) + require.NoError(t, err) + require.Equal(t, uint64(0), info.NumPending, "durable %s still has pending", durable) + } + + // Assert seed data is visible in Mongo. 
+ var room model.Room + err = db.Collection("rooms").FindOne(ctx, bson.M{"_id": fixtures.Rooms[0].ID}).Decode(&room) + require.NoError(t, err) + require.Equal(t, fixtures.Rooms[0].ID, room.ID) +} diff --git a/tools/loadgen/main.go b/tools/loadgen/main.go new file mode 100644 index 00000000..f1b1095d --- /dev/null +++ b/tools/loadgen/main.go @@ -0,0 +1,455 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "flag" + "fmt" + "log/slog" + "net/http" + _ "net/http/pprof" // registers /debug/pprof/* on http.DefaultServeMux; only served if PPROF_ADDR is set. + "os" + "os/signal" + "strings" + "sync" + "syscall" + "time" + + "github.com/caarlos0/env/v11" + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" + dto "github.com/prometheus/client_model/go" + + "github.com/hmchangw/chat/pkg/model" + "github.com/hmchangw/chat/pkg/mongoutil" + "github.com/hmchangw/chat/pkg/natsutil" + "github.com/hmchangw/chat/pkg/stream" + "github.com/hmchangw/chat/pkg/subject" +) + +type config struct { + NatsURL string `env:"NATS_URL,required"` + NatsCredsFile string `env:"NATS_CREDS_FILE" envDefault:""` + SiteID string `env:"SITE_ID" envDefault:"site-local"` + MongoURI string `env:"MONGO_URI,required"` + MongoDB string `env:"MONGO_DB" envDefault:"chat"` + MetricsAddr string `env:"METRICS_ADDR" envDefault:":9099"` + MaxInFlight int `env:"MAX_IN_FLIGHT" envDefault:"200"` + PProfAddr string `env:"PPROF_ADDR" envDefault:""` +} + +func main() { + slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil))) + if len(os.Args) < 2 { + fmt.Fprintln(os.Stderr, "usage: loadgen [flags]") + os.Exit(2) + } + cfg, err := env.ParseAs[config]() + if err != nil { + slog.Error("parse config", "error", err) + os.Exit(1) + } + // SIGINT / SIGTERM cancel the base context. Each subcommand treats ctx + // cancellation as "stop early but still run the end-of-run finalizers + // (print summary, drain NATS, disconnect Mongo)". + // + // This deviates from CLAUDE.md's "use pkg/shutdown.Wait" guidance: that + // helper blocks waiting for a signal and fires shutdown callbacks, which + // doesn't fit a time-bounded CLI where the primary termination trigger is + // the --duration timeout rather than an external signal. NotifyContext + // gives us the same cleanup guarantee via context cancellation propagation. 
+ ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + code := dispatch(ctx, &cfg) + stop() + os.Exit(code) +} + +func dispatch(ctx context.Context, cfg *config) int { + switch os.Args[1] { + case "seed": + return runSeed(ctx, cfg, os.Args[2:]) + case "run": + return runRun(ctx, cfg, os.Args[2:]) + case "teardown": + return runTeardown(ctx, cfg) + default: + fmt.Fprintf(os.Stderr, "unknown subcommand: %s\n", os.Args[1]) + return 2 + } +} + +func runSeed(ctx context.Context, cfg *config, args []string) int { + fs := flag.NewFlagSet("seed", flag.ExitOnError) + preset := fs.String("preset", "", "preset name") + seed := fs.Int64("seed", 42, "RNG seed") + _ = fs.Parse(args) + if *preset == "" { + fmt.Fprintln(os.Stderr, "--preset required") + return 2 + } + p, ok := BuiltinPreset(*preset) + if !ok { + fmt.Fprintf(os.Stderr, "unknown preset: %s\n", *preset) + return 2 + } + client, err := mongoutil.Connect(ctx, cfg.MongoURI) + if err != nil { + slog.Error("mongo connect", "error", err) + return 1 + } + defer mongoutil.Disconnect(ctx, client) + db := client.Database(cfg.MongoDB) + fixtures := BuildFixtures(&p, *seed, cfg.SiteID) + if err := Seed(ctx, db, fixtures); err != nil { + slog.Error("seed", "error", err) + return 1 + } + slog.Info("seed complete", + "preset", p.Name, + "users", len(fixtures.Users), + "rooms", len(fixtures.Rooms), + "subs", len(fixtures.Subscriptions)) + return 0 +} + +func runTeardown(ctx context.Context, cfg *config) int { + client, err := mongoutil.Connect(ctx, cfg.MongoURI) + if err != nil { + slog.Error("mongo connect", "error", err) + return 1 + } + defer mongoutil.Disconnect(ctx, client) + db := client.Database(cfg.MongoDB) + if err := Teardown(ctx, db); err != nil { + slog.Error("teardown", "error", err) + return 1 + } + slog.Info("teardown complete") + return 0 +} + +func runRun(ctx context.Context, cfg *config, args []string) int { + fs := flag.NewFlagSet("run", flag.ExitOnError) + preset := fs.String("preset", "", "preset name") + seed := fs.Int64("seed", 42, "RNG seed") + duration := fs.Duration("duration", 60*time.Second, "run duration") + rate := fs.Int("rate", 500, "target msgs/sec") + warmup := fs.Duration("warmup", 10*time.Second, "warmup window (samples discarded)") + inject := fs.String("inject", "frontdoor", "injection point: frontdoor|canonical") + csvPath := fs.String("csv", "", "optional csv output path") + _ = fs.Parse(args) + if *preset == "" { + fmt.Fprintln(os.Stderr, "--preset required") + return 2 + } + p, ok := BuiltinPreset(*preset) + if !ok { + fmt.Fprintf(os.Stderr, "unknown preset: %s\n", *preset) + return 2 + } + var injectMode InjectMode + switch *inject { + case "frontdoor": + injectMode = InjectFrontdoor + case "canonical": + injectMode = InjectCanonical + default: + fmt.Fprintf(os.Stderr, "unknown inject mode: %s\n", *inject) + return 2 + } + + nc, err := natsutil.Connect(cfg.NatsURL, cfg.NatsCredsFile) + if err != nil { + slog.Error("nats connect", "error", err) + return 1 + } + js, err := jetstream.New(nc.NatsConn()) + if err != nil { + slog.Error("jetstream init", "error", err) + return 1 + } + + metrics := NewMetrics() + metricsSrv := &http.Server{ + Addr: cfg.MetricsAddr, + Handler: metrics.Handler(), + ReadHeaderTimeout: 5 * time.Second, + } + go func() { + if err := metricsSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + slog.Warn("metrics server stopped", "error", err) + } + }() + + // pprof lives on a separate port, opt-in via PPROF_ADDR. 
Off by default + // so the metrics endpoint (which Prometheus scrapes) doesn't + // inadvertently expose profiling. + var pprofSrv *http.Server + if cfg.PProfAddr != "" { + pprofSrv = &http.Server{ + Addr: cfg.PProfAddr, + Handler: http.DefaultServeMux, // net/http/pprof registers on DefaultServeMux via side-effect import. + ReadHeaderTimeout: 5 * time.Second, + } + go func() { + if err := pprofSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + slog.Warn("pprof server stopped", "error", err) + } + }() + slog.Info("pprof server listening", "addr", cfg.PProfAddr) + } + + fixtures := BuildFixtures(&p, *seed, cfg.SiteID) + collector := NewCollector(metrics, p.Name) + + // E1 subscription: gatekeeper replies. + e1Sub, err := nc.NatsConn().Subscribe(subject.UserResponseWildcard(), func(msg *nats.Msg) { + reqID := lastToken(msg.Subject) + var payload struct { + Error string `json:"error"` + } + if err := json.Unmarshal(msg.Data, &payload); err != nil { + // Malformed reply; count and drop per spec. + metrics.PublishErrors.WithLabelValues(p.Name, "bad_reply").Inc() + return + } + if payload.Error != "" { + metrics.PublishErrors.WithLabelValues(p.Name, "gatekeeper").Inc() + } + collector.RecordReply(reqID, time.Now()) + }) + if err != nil { + slog.Error("subscribe e1", "error", err) + return 1 + } + defer func() { _ = e1Sub.Unsubscribe() }() + + // E2 subscription: broadcast events. + e2Handler := func(msg *nats.Msg) { + var evt model.RoomEvent + if err := json.Unmarshal(msg.Data, &evt); err != nil { + return + } + if evt.Message == nil || evt.Message.ID == "" { + return + } + collector.RecordBroadcast(evt.Message.ID, time.Now()) + } + + e2Sub, err := nc.NatsConn().Subscribe(subject.RoomEventWildcard(), e2Handler) + if err != nil { + slog.Error("subscribe e2", "error", err) + return 1 + } + defer func() { _ = e2Sub.Unsubscribe() }() + + // Broadcast-worker emits DM broadcasts on chat.user.{account}.event.room + // (see pkg/subject.UserRoomEvent). Subscribe to both so E2 correlation + // covers both group and DM rooms. + e2DMSub, err := nc.NatsConn().Subscribe(subject.UserRoomEventWildcard(), e2Handler) + if err != nil { + slog.Error("subscribe e2 dm", "error", err) + return 1 + } + defer func() { _ = e2DMSub.Unsubscribe() }() + + canonical := stream.MessagesCanonical(cfg.SiteID) + samplerCtx, cancelSamplers := context.WithCancel(ctx) + defer cancelSamplers() + samplers := []*ConsumerSampler{ + NewConsumerSampler(js, canonical.Name, "message-worker", metrics, 1*time.Second), + NewConsumerSampler(js, canonical.Name, "broadcast-worker", metrics, 1*time.Second), + } + var samplerWG sync.WaitGroup + for _, s := range samplers { + samplerWG.Add(1) + go func(s *ConsumerSampler) { + defer samplerWG.Done() + s.Run(samplerCtx) + }(s) + } + + publisher := newNatsCorePublisher(nc.NatsConn(), injectMode, js) + + warmupDeadline := time.Now().Add(*warmup) + gen := NewGenerator(&GeneratorConfig{ + Preset: &p, + Fixtures: fixtures, + SiteID: cfg.SiteID, + Rate: *rate, + Inject: injectMode, + Publisher: publisher, + Metrics: metrics, + Collector: collector, + WarmupDeadline: warmupDeadline, + MaxInFlight: cfg.MaxInFlight, + }, *seed) + + runCtx, cancelRun := context.WithTimeout(ctx, *duration) + defer cancelRun() + genErr := gen.Run(runCtx) + // Wait up to 2 seconds for trailing replies and broadcasts to arrive. 
+ time.Sleep(2 * time.Second) + collector.DiscardBefore(warmupDeadline) + missingReplies, missingBroadcasts := collector.Finalize() + + cancelSamplers() + samplerWG.Wait() + + shutCtx, cancelShut := context.WithTimeout(context.Background(), 5*time.Second) + _ = metricsSrv.Shutdown(shutCtx) + if pprofSrv != nil { + _ = pprofSrv.Shutdown(shutCtx) + } + cancelShut() + _ = nc.Drain() + + if genErr != nil { + slog.Error("generator error", "error", genErr) + } + + mfs, gerr := metrics.Registry.Gather() + if gerr != nil { + slog.Warn("metrics gather", "error", gerr) + mfs = nil + } + publishErrs := gatheredCounterValue(mfs, "loadgen_publish_errors_total", "", "") + gkErrs := gatheredCounterValue(mfs, "loadgen_publish_errors_total", "reason", "gatekeeper") + sentWarmup := int(gatheredCounterValue(mfs, "loadgen_published_total", "phase", "warmup")) + sentMeasured := int(gatheredCounterValue(mfs, "loadgen_published_total", "phase", "measured")) + sent := sentWarmup + sentMeasured + measured := *duration - *warmup + actualRate := 0.0 + if measured > 0 { + // In canonical mode, byReqID is never populated, so E1Count/missingReplies + // are both 0. Fall back to sentMeasured to compute the true publish rate + // for the measured window only. + switch injectMode { + case InjectCanonical: + actualRate = float64(sentMeasured) / measured.Seconds() + default: + actualRate = float64(collector.E1Count()+missingReplies) / measured.Seconds() + } + } + + summary := Summary{ + Preset: p.Name, + Seed: *seed, + Site: cfg.SiteID, + TargetRate: *rate, + ActualRate: actualRate, + Duration: *duration, + Warmup: *warmup, + Inject: *inject, + Sent: sent, + SentMeasured: sentMeasured, + PublishErrors: int(publishErrs - gkErrs), + GatekeeperErrors: int(gkErrs), + MissingReplies: missingReplies, + MissingBroadcasts: missingBroadcasts, + E1: ComputePercentiles(collector.E1Samples()), + E2: ComputePercentiles(collector.E2Samples()), + E1Count: collector.E1Count(), + E2Count: collector.E2Count(), + Consumers: []ConsumerStat{samplers[0].Snapshot(), samplers[1].Snapshot()}, + } + if err := PrintSummary(os.Stdout, &summary); err != nil { + slog.Warn("print summary", "error", err) + } + + if *csvPath != "" { + if err := writeCSVFile(*csvPath, collector); err != nil { + slog.Error("csv export", "error", err) + } + } + + totalErrs := summary.PublishErrors + summary.GatekeeperErrors + summary.MissingReplies + summary.MissingBroadcasts + return DetermineExitCode(summary.SentMeasured, totalErrs) +} + +type natsCorePublisher struct { + nc *nats.Conn + useJetStream bool + js jetstream.JetStream +} + +func newNatsCorePublisher(nc *nats.Conn, inject InjectMode, js jetstream.JetStream) *natsCorePublisher { + return &natsCorePublisher{nc: nc, useJetStream: inject == InjectCanonical, js: js} +} + +func (p *natsCorePublisher) Publish(ctx context.Context, subject string, data []byte) error { + if p.useJetStream { + if _, err := p.js.Publish(ctx, subject, data); err != nil { + return fmt.Errorf("jetstream publish: %w", err) + } + return nil + } + if err := p.nc.Publish(subject, data); err != nil { + return fmt.Errorf("core publish: %w", err) + } + return nil +} + +func lastToken(subj string) string { + i := strings.LastIndex(subj, ".") + if i < 0 { + return subj + } + return subj[i+1:] +} + +func writeCSVFile(path string, c *Collector) error { + f, err := os.Create(path) + if err != nil { + return fmt.Errorf("create csv: %w", err) + } + defer func() { _ = f.Close() }() + var rows []CSVSample + for i, d := range c.E1Samples() { + rows = 
append(rows, CSVSample{TimestampNs: int64(i), Metric: "E1", LatencyNs: d.Nanoseconds()}) + } + for i, d := range c.E2Samples() { + rows = append(rows, CSVSample{TimestampNs: int64(i), Metric: "E2", LatencyNs: d.Nanoseconds()}) + } + return WriteCSV(f, rows) +} + +func gatheredCounterValue(mfs []*dto.MetricFamily, name string, labelName, labelValue string) float64 { + var total float64 + for _, mf := range mfs { + if mf.GetName() != name { + continue + } + for _, metric := range mf.GetMetric() { + if labelName == "" { + total += metric.GetCounter().GetValue() + continue + } + for _, l := range metric.GetLabel() { + if l.GetName() == labelName && l.GetValue() == labelValue { + total += metric.GetCounter().GetValue() + } + } + } + } + return total +} + +func counterValue(m *Metrics, name string) float64 { + mfs, err := m.Registry.Gather() + if err != nil { + slog.Warn("metrics gather", "error", err) + return 0 + } + return gatheredCounterValue(mfs, name, "", "") +} + +func counterValueLabeled(m *Metrics, name, labelName, labelValue string) float64 { + mfs, err := m.Registry.Gather() + if err != nil { + slog.Warn("metrics gather", "error", err) + return 0 + } + return gatheredCounterValue(mfs, name, labelName, labelValue) +} diff --git a/tools/loadgen/main_test.go b/tools/loadgen/main_test.go new file mode 100644 index 00000000..a7715dd8 --- /dev/null +++ b/tools/loadgen/main_test.go @@ -0,0 +1,131 @@ +package main + +import ( + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestLastToken(t *testing.T) { + cases := []struct{ in, want string }{ + {"chat.user.alice.response.abc-123", "abc-123"}, + {"abc", "abc"}, // no dot + {"", ""}, // empty + {"a.b.c.d.e.f", "f"}, // many dots + } + for _, c := range cases { + t.Run(c.in, func(t *testing.T) { + assert.Equal(t, c.want, lastToken(c.in)) + }) + } +} + +func TestCounterValue(t *testing.T) { + m := NewMetrics() + m.Published.WithLabelValues("small", "measured").Inc() + m.Published.WithLabelValues("small", "measured").Inc() + m.Published.WithLabelValues("medium", "measured").Inc() + assert.Equal(t, float64(3), counterValue(m, "loadgen_published_total")) + assert.Equal(t, float64(0), counterValue(m, "nonexistent_metric")) +} + +func TestCounterValueLabeled(t *testing.T) { + m := NewMetrics() + m.PublishErrors.WithLabelValues("small", "publish").Inc() + m.PublishErrors.WithLabelValues("small", "publish").Inc() + m.PublishErrors.WithLabelValues("small", "gatekeeper").Inc() + m.PublishErrors.WithLabelValues("large", "publish").Inc() + // By reason=publish: two "small" + one "large" = 3 + assert.Equal(t, float64(3), counterValueLabeled(m, "loadgen_publish_errors_total", "reason", "publish")) + // By reason=gatekeeper: one + assert.Equal(t, float64(1), counterValueLabeled(m, "loadgen_publish_errors_total", "reason", "gatekeeper")) + // Unknown label value + assert.Equal(t, float64(0), counterValueLabeled(m, "loadgen_publish_errors_total", "reason", "nope")) +} + +func TestWriteCSVFile_RoundTrip(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + now := time.Unix(0, 0) + c.RecordPublish("r-1", "m-1", now) + c.RecordReply("r-1", now.Add(5*time.Millisecond)) + c.RecordBroadcast("m-1", now.Add(8*time.Millisecond)) + + path := filepath.Join(t.TempDir(), "out.csv") + require.NoError(t, writeCSVFile(path, c)) + + data, err := os.ReadFile(path) + require.NoError(t, err) + out := string(data) + // Header present + 
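	// Note: writeCSVFile emits the sample index as timestamp_ns and leaves
	// request_id empty, so only ordering, metric, and latency survive the
	// round trip.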
require.True(t, strings.HasPrefix(out, "timestamp_ns,request_id,metric,latency_ns")) + // At least one E1 row and one E2 row + require.Contains(t, out, ",E1,") + require.Contains(t, out, ",E2,") +} + +func TestWriteCSVFile_EmptyCollector(t *testing.T) { + m := NewMetrics() + c := NewCollector(m, "small") + + path := filepath.Join(t.TempDir(), "empty.csv") + require.NoError(t, writeCSVFile(path, c)) + + data, err := os.ReadFile(path) + require.NoError(t, err) + out := string(data) + // Header still present, no data rows + require.True(t, strings.HasPrefix(out, "timestamp_ns,request_id,metric,latency_ns")) + require.NotContains(t, out, ",E1,") + require.NotContains(t, out, ",E2,") +} + +func TestNewNatsCorePublisher_CanonicalSetsUseJetStream(t *testing.T) { + p := newNatsCorePublisher(nil, InjectCanonical, nil) + require.True(t, p.useJetStream) +} + +func TestNewNatsCorePublisher_FrontdoorDoesNotSetUseJetStream(t *testing.T) { + p := newNatsCorePublisher(nil, InjectFrontdoor, nil) + require.False(t, p.useJetStream) +} + +func TestNewNatsCorePublisher_FieldWiring(t *testing.T) { + p := newNatsCorePublisher(nil, InjectCanonical, nil) + assert.Nil(t, p.nc) + assert.Nil(t, p.js) + assert.True(t, p.useJetStream) + + p2 := newNatsCorePublisher(nil, InjectFrontdoor, nil) + assert.Nil(t, p2.nc) + assert.Nil(t, p2.js) + assert.False(t, p2.useJetStream) +} + +func TestMetricsHandler_ServesOpenMetrics(t *testing.T) { + m := NewMetrics() + m.Published.WithLabelValues("small", "measured").Inc() + req := httptest.NewRequest("GET", "/metrics", nil) + rec := httptest.NewRecorder() + m.Handler().ServeHTTP(rec, req) + require.Equal(t, 200, rec.Code) + require.Contains(t, rec.Body.String(), "loadgen_published_total") +} + +func TestMetricsHandler_ContentType(t *testing.T) { + m := NewMetrics() + req := httptest.NewRequest("GET", "/metrics", nil) + rec := httptest.NewRecorder() + m.Handler().ServeHTTP(rec, req) + require.Equal(t, 200, rec.Code) + ct := rec.Header().Get("Content-Type") + require.NotEmpty(t, ct) + // Prometheus text format + require.Contains(t, ct, "text/plain") +} diff --git a/tools/loadgen/metrics.go b/tools/loadgen/metrics.go new file mode 100644 index 00000000..84ddb438 --- /dev/null +++ b/tools/loadgen/metrics.go @@ -0,0 +1,72 @@ +package main + +import ( + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +// Metrics holds the Prometheus collectors used across loadgen components. +type Metrics struct { + Registry *prometheus.Registry + Published *prometheus.CounterVec + PublishErrors *prometheus.CounterVec + E1Latency *prometheus.HistogramVec + E2Latency *prometheus.HistogramVec + ConsumerPending *prometheus.GaugeVec + ConsumerAckPending *prometheus.GaugeVec + ConsumerRedelivered *prometheus.GaugeVec +} + +// NewMetrics constructs a dedicated Prometheus registry with all loadgen +// collectors registered. A dedicated registry avoids colliding with default +// Go/process collectors. 
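//
// The buckets below span 1ms to 5s, an assumed envelope for E1/E2 latencies
// on a single-site run; widen them if the pipeline under test falls outside
// it. A minimal wiring sketch (server construction is illustrative, not the
// exact run-subcommand code):
//
//	m := NewMetrics()
//	srv := &http.Server{Addr: cfg.MetricsAddr, Handler: m.Handler()}
//	go func() { _ = srv.ListenAndServe() }()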
+func NewMetrics() *Metrics { + r := prometheus.NewRegistry() + buckets := []float64{ + 0.001, 0.002, 0.005, 0.010, 0.025, 0.050, 0.100, 0.250, 0.500, 1.000, 2.500, 5.000, + } + m := &Metrics{ + Registry: r, + Published: prometheus.NewCounterVec( + prometheus.CounterOpts{Name: "loadgen_published_total", Help: "Messages published by preset and phase (warmup|measured)."}, + []string{"preset", "phase"}, + ), + PublishErrors: prometheus.NewCounterVec( + prometheus.CounterOpts{Name: "loadgen_publish_errors_total", Help: "Publish-side errors."}, + []string{"preset", "reason"}, + ), + E1Latency: prometheus.NewHistogramVec( + prometheus.HistogramOpts{Name: "loadgen_e1_latency_seconds", Help: "Gatekeeper ack latency.", Buckets: buckets}, + []string{"preset"}, + ), + E2Latency: prometheus.NewHistogramVec( + prometheus.HistogramOpts{Name: "loadgen_e2_latency_seconds", Help: "Broadcast-visible latency.", Buckets: buckets}, + []string{"preset"}, + ), + ConsumerPending: prometheus.NewGaugeVec( + prometheus.GaugeOpts{Name: "loadgen_consumer_pending", Help: "JetStream consumer num_pending."}, + []string{"stream", "durable"}, + ), + ConsumerAckPending: prometheus.NewGaugeVec( + prometheus.GaugeOpts{Name: "loadgen_consumer_ack_pending", Help: "JetStream consumer num_ack_pending."}, + []string{"stream", "durable"}, + ), + ConsumerRedelivered: prometheus.NewGaugeVec( + prometheus.GaugeOpts{Name: "loadgen_consumer_redelivered", Help: "JetStream consumer num_redelivered."}, + []string{"stream", "durable"}, + ), + } + r.MustRegister( + m.Published, m.PublishErrors, + m.E1Latency, m.E2Latency, + m.ConsumerPending, m.ConsumerAckPending, m.ConsumerRedelivered, + ) + return m +} + +// Handler returns an http.Handler serving this metrics registry. +func (m *Metrics) Handler() http.Handler { + return promhttp.HandlerFor(m.Registry, promhttp.HandlerOpts{}) +} diff --git a/tools/loadgen/preset.go b/tools/loadgen/preset.go new file mode 100644 index 00000000..9e6940a5 --- /dev/null +++ b/tools/loadgen/preset.go @@ -0,0 +1,196 @@ +package main + +import ( + "fmt" + "math/rand" + "time" + + "github.com/hmchangw/chat/pkg/model" +) + +// Distribution names the shape of a per-preset random selection. +type Distribution string + +const ( + DistUniform Distribution = "uniform" + DistMixed Distribution = "mixed" + DistZipf Distribution = "zipf" +) + +// Range holds an inclusive min/max for integer quantities like content size. +type Range struct { + Min int + Max int +} + +// Preset is a named, fully deterministic workload specification. +type Preset struct { + Name string + Users int + Rooms int + RoomSizeDist Distribution + SenderDist Distribution + ContentBytes Range + MentionRate float64 + ThreadRate float64 +} + +var builtinPresets = map[string]Preset{ + "small": { + Name: "small", Users: 10, Rooms: 5, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "medium": { + Name: "medium", Users: 1000, Rooms: 100, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "large": { + Name: "large", Users: 10000, Rooms: 1000, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + }, + "realistic": { + Name: "realistic", Users: 1000, Rooms: 100, + RoomSizeDist: DistMixed, SenderDist: DistZipf, + ContentBytes: Range{Min: 50, Max: 2000}, + MentionRate: 0.10, + ThreadRate: 0.05, + }, +} + +// BuiltinPreset looks up a preset by name. 
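// The boolean follows the comma-ok convention so callers can distinguish an
// unknown name from a zero-valued Preset. A hypothetical CLI lookup:
//
//	p, ok := BuiltinPreset(presetFlag) // presetFlag is illustrative
//	if !ok {
//		return fmt.Errorf("unknown preset %q", presetFlag)
//	}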
+func BuiltinPreset(name string) (Preset, bool) { + p, ok := builtinPresets[name] + return p, ok +} + +// Fixtures is the full seed data for a preset run. +type Fixtures struct { + Users []model.User + Rooms []model.Room + Subscriptions []model.Subscription +} + +var ( + engNameBank = []string{"Alice Wang", "Bob Chen", "Carol Lee", "Dave Liu", "Eve Zhang"} + chineseNameBank = []string{"愛麗絲", "鮑勃", "卡蘿", "戴夫", "伊芙"} +) + +// BuildFixtures is a pure function of (preset, seed, siteID) producing the +// full fixture set. Two calls with equal inputs produce equal outputs. +func BuildFixtures(p *Preset, seed int64, siteID string) Fixtures { + r := rand.New(rand.NewSource(seed)) + now := time.Unix(0, 0).UTC() // fixed so output is deterministic + + users := make([]model.User, p.Users) + for i := 0; i < p.Users; i++ { + users[i] = model.User{ + ID: fmt.Sprintf("u-%06d", i), + Account: fmt.Sprintf("user-%d", i), + SiteID: siteID, + EngName: engNameBank[i%len(engNameBank)], + ChineseName: chineseNameBank[i%len(chineseNameBank)], + } + } + + rooms := make([]model.Room, p.Rooms) + // realistic: last 10% of rooms are DMs + dmStart := p.Rooms + if p.RoomSizeDist == DistMixed { + dmStart = p.Rooms - p.Rooms/10 + } + for i := 0; i < p.Rooms; i++ { + rtype := model.RoomTypeChannel + if i >= dmStart { + rtype = model.RoomTypeDM + } + rooms[i] = model.Room{ + ID: fmt.Sprintf("room-%06d", i), + Name: fmt.Sprintf("room-%d", i), + Type: rtype, + SiteID: siteID, + UserCount: 0, // filled after membership + CreatedAt: now, + UpdatedAt: now, + } + } + + var subs []model.Subscription + for i := range rooms { + members := pickMembers(r, p, i, p.Rooms, &rooms[i], users) + rooms[i].UserCount = len(members) + for j := range members { + subs = append(subs, model.Subscription{ + ID: fmt.Sprintf("sub-%s-%s", rooms[i].ID, members[j].ID), + User: model.SubscriptionUser{ID: members[j].ID, Account: members[j].Account}, + RoomID: rooms[i].ID, + SiteID: siteID, + Roles: []model.Role{model.RoleMember}, + JoinedAt: now, + }) + } + } + return Fixtures{Users: users, Rooms: rooms, Subscriptions: subs} +} + +func pickMembers(r *rand.Rand, p *Preset, roomIdx, totalRooms int, room *model.Room, users []model.User) []model.User { + if room.Type == model.RoomTypeDM { + // Two distinct users. + i := r.Intn(len(users)) + j := r.Intn(len(users) - 1) + if j >= i { + j++ + } + return []model.User{users[i], users[j]} + } + switch p.RoomSizeDist { + case DistMixed: + // 10% of rooms get up to 500 members; rest get 2-20. + size := 2 + r.Intn(19) + if r.Intn(10) == 0 { + size = 2 + r.Intn(499) + } + return sampleWithoutReplacement(r, users, size) + default: + // Assign each user to exactly one room via round-robin so that every + // user appears in at least one room. + var members []model.User + for i := range users { + if i%totalRooms == roomIdx { + members = append(members, users[i]) + } + } + if len(members) < 2 { + // Pad with random extras to ensure at least 2 members. 
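			// sampleWithoutReplacement returns two distinct users (assuming
			// the preset has at least two), and this branch only runs when
			// len(members) <= 1, so at most one extra can be a duplicate and
			// the loop below always reaches two members.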
+ extra := sampleWithoutReplacement(r, users, 2) + seen := make(map[string]bool) + for i := range members { + seen[members[i].ID] = true + } + for i := range extra { + if !seen[extra[i].ID] { + members = append(members, extra[i]) + seen[extra[i].ID] = true + } + if len(members) >= 2 { + break + } + } + } + return members + } +} + +func sampleWithoutReplacement(r *rand.Rand, users []model.User, n int) []model.User { + if n > len(users) { + n = len(users) + } + idx := r.Perm(len(users))[:n] + out := make([]model.User, n) + for i, k := range idx { + out[i] = users[k] + } + return out +} diff --git a/tools/loadgen/preset_test.go b/tools/loadgen/preset_test.go new file mode 100644 index 00000000..14a31dd1 --- /dev/null +++ b/tools/loadgen/preset_test.go @@ -0,0 +1,131 @@ +package main + +import ( + "math/rand" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hmchangw/chat/pkg/model" +) + +func TestBuiltinPresets_ContainsAllFour(t *testing.T) { + names := []string{"small", "medium", "large", "realistic"} + for _, name := range names { + t.Run(name, func(t *testing.T) { + p, ok := BuiltinPreset(name) + require.True(t, ok, "preset %q must exist", name) + assert.Equal(t, name, p.Name) + assert.Greater(t, p.Users, 0) + assert.Greater(t, p.Rooms, 0) + }) + } +} + +func TestBuiltinPresets_UnknownReturnsFalse(t *testing.T) { + _, ok := BuiltinPreset("nonexistent") + assert.False(t, ok) +} + +func TestBuiltinPresets_UniformShape(t *testing.T) { + for _, name := range []string{"small", "medium", "large"} { + t.Run(name, func(t *testing.T) { + p, ok := BuiltinPreset(name) + require.True(t, ok) + assert.Equal(t, DistUniform, p.RoomSizeDist) + assert.Equal(t, DistUniform, p.SenderDist) + assert.InDelta(t, 0.0, p.MentionRate, 1e-9) + assert.InDelta(t, 0.0, p.ThreadRate, 1e-9) + }) + } +} + +func TestBuiltinPresets_RealisticShape(t *testing.T) { + p, ok := BuiltinPreset("realistic") + require.True(t, ok) + assert.Equal(t, DistMixed, p.RoomSizeDist) + assert.Equal(t, DistZipf, p.SenderDist) + assert.Greater(t, p.MentionRate, 0.0) + assert.Greater(t, p.ThreadRate, 0.0) + assert.Greater(t, p.ContentBytes.Max, p.ContentBytes.Min) +} + +func TestBuildFixtures_DeterministicAcrossCalls(t *testing.T) { + p, _ := BuiltinPreset("small") + a := BuildFixtures(&p, 42, "site-local") + b := BuildFixtures(&p, 42, "site-local") + assert.Equal(t, a.Users, b.Users) + assert.Equal(t, a.Rooms, b.Rooms) + assert.Equal(t, a.Subscriptions, b.Subscriptions) +} + +func TestBuildFixtures_SmallCountsAndShape(t *testing.T) { + p, _ := BuiltinPreset("small") + f := BuildFixtures(&p, 42, "site-local") + assert.Len(t, f.Users, 10) + assert.Len(t, f.Rooms, 5) + // uniform: every user is in at least one room + users := make(map[string]bool) + for _, s := range f.Subscriptions { + users[s.User.ID] = true + assert.Equal(t, "site-local", s.SiteID) + } + assert.Len(t, users, 10) + for _, r := range f.Rooms { + assert.Equal(t, "channel", string(r.Type)) + assert.Equal(t, "site-local", r.SiteID) + } +} + +func TestBuildFixtures_RealisticMixesChannelAndDM(t *testing.T) { + p, _ := BuiltinPreset("realistic") + f := BuildFixtures(&p, 42, "site-local") + var channels, dms int + for _, r := range f.Rooms { + switch r.Type { //nolint:exhaustive + case "channel": + channels++ + case "dm": + dms++ + } + } + assert.Greater(t, channels, 0) + assert.Greater(t, dms, 0) + // DM rooms must have exactly 2 members + dmMembers := make(map[string]int) + for _, s := range f.Subscriptions { + for _, r := 
range f.Rooms { + if r.ID == s.RoomID && r.Type == "dm" { + dmMembers[r.ID]++ + } + } + } + for id, n := range dmMembers { + assert.Equal(t, 2, n, "dm room %s must have 2 members", id) + } +} + +func TestBuildFixtures_FewerUsersThanRooms_PadsToTwoMembers(t *testing.T) { + // Synthetic preset: 3 users, 5 rooms — round-robin alone leaves rooms 3 + // and 4 with fewer than 2 members, exercising the padding branch. + p := &Preset{ + Name: "tiny", Users: 3, Rooms: 5, + RoomSizeDist: DistUniform, SenderDist: DistUniform, + ContentBytes: Range{Min: 200, Max: 200}, + } + f := BuildFixtures(p, 42, "site-local") + require.Len(t, f.Rooms, 5) + for i := range f.Rooms { + assert.GreaterOrEqual(t, f.Rooms[i].UserCount, 2, + "room %s must have at least 2 members after padding", f.Rooms[i].ID) + } +} + +func TestSampleWithoutReplacement_CapsAtUserCount(t *testing.T) { + // Requesting more samples than users available silently caps at len(users). + r := rand.New(rand.NewSource(1)) + users := []model.User{{ID: "u-0"}, {ID: "u-1"}} + out := sampleWithoutReplacement(r, users, 99) + assert.Len(t, out, 2) +} diff --git a/tools/loadgen/report.go b/tools/loadgen/report.go new file mode 100644 index 00000000..9c6bbe56 --- /dev/null +++ b/tools/loadgen/report.go @@ -0,0 +1,155 @@ +package main + +import ( + "encoding/csv" + "fmt" + "io" + "sort" + "strconv" + "text/tabwriter" + "time" +) + +// Percentiles holds summary latency percentiles. +type Percentiles struct { + P50, P95, P99, Max time.Duration +} + +// ComputePercentiles returns P50/P95/P99/max of samples. Empty input -> zeros. +// Input does not need to be sorted on entry. +func ComputePercentiles(samples []time.Duration) Percentiles { + if len(samples) == 0 { + return Percentiles{} + } + sorted := make([]time.Duration, len(samples)) + copy(sorted, samples) + sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] }) + pick := func(q float64) time.Duration { + idx := int(float64(len(sorted)-1) * q) + return sorted[idx] + } + return Percentiles{ + P50: pick(0.50), + P95: pick(0.95), + P99: pick(0.99), + Max: sorted[len(sorted)-1], + } +} + +// ConsumerStat captures the min/peak/final snapshot of a single durable. +type ConsumerStat struct { + Stream string + Durable string + MinPending uint64 + PeakPending uint64 + FinalPending uint64 + PeakAckPending uint64 + // Redelivered is the final (at-shutdown) value of NumRedelivered, not a cumulative total. + Redelivered uint64 +} + +// Summary is the full end-of-run report. +type Summary struct { + Preset, Site, Inject string + Seed int64 + TargetRate int + ActualRate float64 + Duration, Warmup time.Duration + Sent int // total across warmup + measured + SentMeasured int // post-warmup only; the denominator for E1/E2 comparisons + PublishErrors int + GatekeeperErrors int + MissingReplies int + MissingBroadcasts int + E1 Percentiles + E2 Percentiles + E1Count, E2Count int + Consumers []ConsumerStat +} + +// PrintSummary writes the terminal summary to w using text/tabwriter. 
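// The layout is stable run-to-run; latency values vary. Abridged shape with
// illustrative numbers:
//
//	=== loadgen run complete ===
//	preset: small seed: 42 site: site-local
//	...
//	metric         count  p50  p95  p99  max
//	E1 gatekeeper  2400   2ms  6ms  9ms  14ms
//	E2 broadcast   2400   5ms  12ms 18ms 40ms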
+func PrintSummary(w io.Writer, s *Summary) error {
	fmt.Fprintln(w, "=== loadgen run complete ===")
	fmt.Fprintf(w, "preset: %s seed: %d site: %s\n", s.Preset, s.Seed, s.Site)
	fmt.Fprintf(w, "duration: %s (warmup: %s, measured: %s) inject: %s\n",
		s.Duration, s.Warmup, s.Duration-s.Warmup, s.Inject)
	fmt.Fprintf(w, "target rate: %d msg/s actual rate: %.1f msg/s\n\n", s.TargetRate, s.ActualRate)

	fmt.Fprintln(w, "publish results")
	fmt.Fprintf(w, " sent (total): %d\n", s.Sent)
	fmt.Fprintf(w, " sent (measured): %d ← compared to E1/E2 counts below\n", s.SentMeasured)
	fmt.Fprintf(w, " publish errors: %d\n", s.PublishErrors)
	fmt.Fprintf(w, " gatekeeper errors: %d\n", s.GatekeeperErrors)
	fmt.Fprintf(w, " missing replies: %d\n", s.MissingReplies)
	fmt.Fprintf(w, " missing broadcasts: %d\n\n", s.MissingBroadcasts)

	tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
	fmt.Fprintln(tw, "latency (measured window only)")
	fmt.Fprintln(tw, "metric\tcount\tp50\tp95\tp99\tmax")
	fmt.Fprintf(tw, "E1 gatekeeper\t%d\t%s\t%s\t%s\t%s\n", s.E1Count, s.E1.P50, s.E1.P95, s.E1.P99, s.E1.Max)
	fmt.Fprintf(tw, "E2 broadcast\t%d\t%s\t%s\t%s\t%s\n", s.E2Count, s.E2.P50, s.E2.P95, s.E2.P99, s.E2.Max)
	if err := tw.Flush(); err != nil {
		return fmt.Errorf("flush latency table: %w", err)
	}

	fmt.Fprintln(w)
	if len(s.Consumers) > 0 {
		fmt.Fprintf(w, "consumer lag (%s)\n", s.Consumers[0].Stream)
		tw2 := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
		fmt.Fprintln(tw2, "durable\tmin_pending\tpeak_pending\tfinal_pending\tpeak_ack_pending\tredelivered")
		for i := range s.Consumers {
			c := &s.Consumers[i]
			fmt.Fprintf(tw2, "%s\t%d\t%d\t%d\t%d\t%d\n",
				c.Durable, c.MinPending, c.PeakPending, c.FinalPending, c.PeakAckPending, c.Redelivered)
		}
		if err := tw2.Flush(); err != nil {
			return fmt.Errorf("flush consumer table: %w", err)
		}
	}
	return nil
}

// CSVSample is one row in the per-sample CSV dump.
type CSVSample struct {
	TimestampNs int64
	RequestID   string
	Metric      string
	LatencyNs   int64
}

// WriteCSV writes a header and one row per sample.
// csv.Writer buffers internally, so an underlying write error typically
// surfaces only via cw.Error() after Flush, not from individual Write calls.
func WriteCSV(w io.Writer, rows []CSVSample) error {
	cw := csv.NewWriter(w)
	// Errors are intentionally discarded here: csv.Writer buffers all writes
	// and accumulates the first error internally. cw.Error() below is the
	// canonical way to retrieve it after Flush.
	_ = cw.Write([]string{"timestamp_ns", "request_id", "metric", "latency_ns"})
	for i := range rows {
		r := &rows[i]
		_ = cw.Write([]string{
			strconv.FormatInt(r.TimestampNs, 10),
			r.RequestID, r.Metric,
			strconv.FormatInt(r.LatencyNs, 10),
		})
	}
	cw.Flush()
	return cw.Error()
}

// DetermineExitCode returns 0 if error count is within 0.1% of sent.
// With sent == 0, any error is a failure.
func DetermineExitCode(sent, errs int) int {
	if sent == 0 {
		if errs == 0 {
			return 0
		}
		return 1
	}
	// 0.1% tolerance inclusive: errs * 1000 <= sent
	if errs*1000 <= sent {
		return 0
	}
	return 1
}
diff --git a/tools/loadgen/report_test.go b/tools/loadgen/report_test.go
new file mode 100644
index 00000000..20a22ae6
--- /dev/null
+++ b/tools/loadgen/report_test.go
@@ -0,0 +1,156 @@
+package main
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// failWriter returns an error on the first write.
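+// Subsequent writes succeed; with the small fixtures here csv.Writer flushes
+// in a single underlying Write, so that first-call failure is exactly what
+// cw.Error() reports in TestWriteCSV_WriterError.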
+type failWriter struct{ called bool }

func (f *failWriter) Write(p []byte) (int, error) {
	if !f.called {
		f.called = true
		return 0, errors.New("write failed")
	}
	return len(p), nil
}

func TestPercentiles_FixedSet(t *testing.T) {
	// 100 sorted values: 1ms..100ms
	samples := make([]time.Duration, 100)
	for i := range samples {
		samples[i] = time.Duration(i+1) * time.Millisecond
	}
	p := ComputePercentiles(samples)
	assert.Equal(t, 50*time.Millisecond, p.P50)
	assert.Equal(t, 95*time.Millisecond, p.P95)
	assert.Equal(t, 99*time.Millisecond, p.P99)
	assert.Equal(t, 100*time.Millisecond, p.Max)
}

func TestPercentiles_Empty(t *testing.T) {
	p := ComputePercentiles(nil)
	assert.Zero(t, p.P50)
	assert.Zero(t, p.P95)
	assert.Zero(t, p.P99)
	assert.Zero(t, p.Max)
}

func TestPrintSummary_ContainsKeyFields(t *testing.T) {
	var buf bytes.Buffer
	s := Summary{
		Preset: "medium", Seed: 42, Site: "site-local",
		TargetRate: 500, ActualRate: 499.8,
		Duration: 60 * time.Second, Warmup: 10 * time.Second,
		Inject: "frontdoor", Sent: 25000,
	}
	require.NoError(t, PrintSummary(&buf, &s))
	out := buf.String()
	for _, want := range []string{
		"preset: medium", "seed: 42", "site: site-local",
		"sent (total):", "sent (measured):", "25000", "inject: frontdoor",
	} {
		assert.True(t, strings.Contains(out, want), "summary missing %q; got:\n%s", want, out)
	}
}

func TestWriteCSV_OneRowPerSample(t *testing.T) {
	var buf bytes.Buffer
	rows := []CSVSample{
		{TimestampNs: 1, RequestID: "r1", Metric: "E1", LatencyNs: 2_100_000},
		{TimestampNs: 2, RequestID: "r1", Metric: "E2", LatencyNs: 8_700_000},
	}
	require.NoError(t, WriteCSV(&buf, rows))
	lines := strings.Split(strings.TrimSpace(buf.String()), "\n")
	require.Len(t, lines, 3) // header + 2 rows
	assert.Equal(t, "timestamp_ns,request_id,metric,latency_ns", lines[0])
	assert.Equal(t, "1,r1,E1,2100000", lines[1])
	assert.Equal(t, "2,r1,E2,8700000", lines[2])
}

func TestPrintSummary_WithConsumers(t *testing.T) {
	var buf bytes.Buffer
	s := Summary{
		Preset: "heavy", Seed: 1, Site: "site-a",
		TargetRate: 1000, ActualRate: 998.5,
		Duration: 120 * time.Second, Warmup: 20 * time.Second,
		Inject: "gateway",
		Consumers: []ConsumerStat{
			{
				Stream: "MESSAGES_CANONICAL_site-a", Durable: "message-worker",
				MinPending: 0, PeakPending: 150, FinalPending: 2,
				PeakAckPending: 10, Redelivered: 1,
			},
		},
	}
	require.NoError(t, PrintSummary(&buf, &s))
	out := buf.String()
	assert.True(t, strings.Contains(out, "consumer lag"), "missing consumer lag header; got:\n%s", out)
	assert.True(t, strings.Contains(out, "message-worker"), "missing durable name; got:\n%s", out)
	assert.True(t, strings.Contains(out, "150"), "missing peak pending; got:\n%s", out)
}

func TestWriteCSV_Empty(t *testing.T) {
	var buf bytes.Buffer
	require.NoError(t, WriteCSV(&buf, nil))
	lines := strings.Split(strings.TrimSpace(buf.String()), "\n")
	require.Len(t, lines, 1) // header only
	assert.Equal(t, "timestamp_ns,request_id,metric,latency_ns", lines[0])
}

func TestWriteCSV_WriterError(t *testing.T) {
	// failWriter errors on the first write; csv buffers internally so the
	// error surfaces via cw.Error() after Flush, not from cw.Write directly.
	err := WriteCSV(&failWriter{}, []CSVSample{})
	require.Error(t, err)
}

func TestWriteCSV_RowWriteError(t *testing.T) {
	// Write to a pipe whose write side is closed before WriteCSV runs, so
	// the underlying writes fail and the error must surface via cw.Error().
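	// io.Pipe is synchronous: once the write side is closed, Writes fail fast
	// with io.ErrClosedPipe, so the drain goroutine below is defensive rather
	// than strictly required.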
+ pr, pw := io.Pipe() + pw.Close() // close write end immediately so subsequent writes fail + + // Drain the reader so csv can flush the header without blocking. + doneCh := make(chan struct{}) + go func() { + defer close(doneCh) + _, _ = io.Copy(io.Discard, pr) + }() + + rows := []CSVSample{ + {TimestampNs: 1, RequestID: "r1", Metric: "E1", LatencyNs: 100}, + } + err := WriteCSV(pw, rows) + <-doneCh + require.Error(t, err) +} + +func TestDetermineExitCode(t *testing.T) { + cases := []struct { + name string + sent int + errs int + wantExitCode int + }{ + {"zero errors", 10000, 0, 0}, + {"under tolerance", 10000, 9, 0}, // 0.09% < 0.1% + {"at tolerance boundary", 10000, 10, 0}, // exactly 0.1%: pass + {"over tolerance", 10000, 11, 1}, // 0.11% > 0.1% + {"no sends no errors", 0, 0, 0}, + {"no sends - any error fails", 0, 1, 1}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.wantExitCode, DetermineExitCode(tc.sent, tc.errs)) + }) + } +} diff --git a/tools/loadgen/seed.go b/tools/loadgen/seed.go new file mode 100644 index 00000000..be6c98dc --- /dev/null +++ b/tools/loadgen/seed.go @@ -0,0 +1,76 @@ +package main + +import ( + "context" + "fmt" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" +) + +func insertDocs[T any](ctx context.Context, coll *mongo.Collection, items []T) error { + if len(items) == 0 { + return nil + } + docs := make([]interface{}, len(items)) + for i := range items { + docs[i] = items[i] + } + if _, err := coll.InsertMany(ctx, docs); err != nil { + return fmt.Errorf("insert into %s: %w", coll.Name(), err) + } + return nil +} + +// Seed drops and repopulates users/rooms/subscriptions in db from fixtures. +// Idempotent: safe to rerun. +func Seed(ctx context.Context, db *mongo.Database, f Fixtures) error { + if err := db.Collection("users").Drop(ctx); err != nil { + return fmt.Errorf("drop users: %w", err) + } + if err := db.Collection("rooms").Drop(ctx); err != nil { + return fmt.Errorf("drop rooms: %w", err) + } + if err := db.Collection("subscriptions").Drop(ctx); err != nil { + return fmt.Errorf("drop subscriptions: %w", err) + } + + if err := insertDocs(ctx, db.Collection("users"), f.Users); err != nil { + return err + } + if err := insertDocs(ctx, db.Collection("rooms"), f.Rooms); err != nil { + return err + } + if err := insertDocs(ctx, db.Collection("subscriptions"), f.Subscriptions); err != nil { + return err + } + + subsIdx := db.Collection("subscriptions") + if _, err := subsIdx.Indexes().CreateMany(ctx, []mongo.IndexModel{ + {Keys: bson.D{{Key: "roomId", Value: 1}}}, + {Keys: bson.D{{Key: "u.account", Value: 1}}}, + {Keys: bson.D{{Key: "u.account", Value: 1}, {Key: "roomId", Value: 1}}}, + }); err != nil { + return fmt.Errorf("create subscription indexes: %w", err) + } + + // broadcast-worker and message-gatekeeper look up users by account + // (not _id) during enrichment — index it to avoid a COLLSCAN per message. + usersIdx := db.Collection("users") + if _, err := usersIdx.Indexes().CreateMany(ctx, []mongo.IndexModel{ + {Keys: bson.D{{Key: "account", Value: 1}}}, + }); err != nil { + return fmt.Errorf("create user indexes: %w", err) + } + return nil +} + +// Teardown drops the three seeded collections without repopulating. +func Teardown(ctx context.Context, db *mongo.Database) error { + for _, c := range []string{"users", "rooms", "subscriptions"} { + if err := db.Collection(c).Drop(ctx); err != nil { + return fmt.Errorf("drop %s: %w", c, err) + } + } + return nil +}
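
// Typical lifecycle as driven by the seed and teardown subcommands (a
// sketch; "client" stands in for a connected *mongo.Client):
//
//	p, _ := BuiltinPreset("small")
//	f := BuildFixtures(&p, 42, cfg.SiteID)
//	if err := Seed(ctx, client.Database(cfg.MongoDB), f); err != nil {
//		return err
//	}
//	// ... run load, then clean up ...
//	if err := Teardown(ctx, client.Database(cfg.MongoDB)); err != nil {
//		return err
//	}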