
Commit 42813f5

Merge branch 'feat/concurrent-change-detection-and-analysis'

2 parents 614ce95 + 23a5bbf

64 files changed: +4411, -3587 lines


cli/src/cli.cr

Lines changed: 6 additions & 0 deletions

@@ -101,6 +101,12 @@ module Statbus
       end
       parser.on("worker", "Run Statbus Worker for background processing") do
        @mode = Mode::Worker
+        parser.on("--stop-when-idle", "Exit when all queues are idle (for testing)") do
+          @worker.stop_when_idle = true
+        end
+        parser.on("--database DB", "Override database name") do |db|
+          @config.postgres_db = db
+        end
       end
       parser.on("import", "Import into installed StatBus") do
        @mode = Mode::Import

cli/src/worker.cr

Lines changed: 197 additions & 260 deletions
Large diffs are not rendered by default.

doc/data-model.md

Lines changed: 2 additions & 1 deletion

@@ -195,7 +195,8 @@ Handles background processing. A long-running worker process calls `worker.proce
 - `command_registry(command, created_at, handler_procedure, before_procedure, after_procedure, description, queue, batches_per_wave)`
   - Key FKs: queue.
 - `queue_registry(queue, description, default_concurrency)`
-- `last_processed(table_name, transaction_id)`
+- `base_change_log(establishment_ids, legal_unit_ids, enterprise_ids, edited_by_valid_range)`
+- `base_change_log_has_pending(has_pending)`
 
 ## Auth & System Tables/Views
 

doc/derive-pipeline.md

Lines changed: 68 additions & 31 deletions

@@ -74,7 +74,7 @@ group. The queues are fully independent — work on one queue never blocks anoth
 ```
 
 **Import queue** (1 fiber): Processes `import_job_process` one at a time.
-When an import modifies data, it triggers `check_table` on the analytics queue.
+When an import modifies data, it triggers `collect_changes` on the analytics queue.
 
 **Analytics queue** (4 fibers = 1 top + 3 child): Derives all statistical
 tables. The top fiber runs each pipeline stage sequentially; when a stage
@@ -102,9 +102,9 @@ all downstream tables. Each box below is a **top-level task** — they run
                    │ lifecycle trigger
 
 ┌─────────────────────────────────────────────┐
-① │ check_table                               │
-│ Detects changed legal_unit / establishment  │
-│ rows. Enqueues derive_statistical_unit.     │
+① │ collect_changes                           │
+│ Drains base_change_log accumulator and      │
+│ enqueues derive_statistical_unit.           │
 └──────────────────┬──────────────────────────┘
                    │ (strictly sequential — next task)
 
@@ -160,13 +160,25 @@ all downstream tables. Each box below is a **top-level task** — they run
 
 
 ┌─────────────────────────────────────────────┐
-⑥ │ derive_statistical_unit_facet  MONOLITHIC │
+⑥ │ derive_statistical_unit_facet      PARENT │
 │                                             │
-│ Aggregates statistical_unit into facets     │
-│ for the drilldown UI. Runs as a single      │
-│ operation (see "Why monolithic?" below).    │
+│ Spawns partition children (parallel):       │
+│ ┌──────────┐ ┌──────────┐ ┌──────────┐      │
+│ │ part 0   │ │ part 1   │ │ part N   │...   │
+│ └──────────┘ └──────────┘ └──────────┘      │
+│ derive_statistical_unit_facet_partition     │
 │                                             │
+│ Enqueues → statistical_unit_facet_reduce    │
 │ Enqueues → derive_statistical_history_facet │
+│                                             │
+│ Parent "waiting" → children → done          │
+└──────────────────┬──────────────────────────┘
+
+
+┌─────────────────────────────────────────────┐
+⑥b │ statistical_unit_facet_reduce    SERIAL  │
+│ Merges partition staging data into          │
+│ main statistical_unit_facet table.          │
 └──────────────────┬──────────────────────────┘
 
 
@@ -195,15 +207,36 @@ all children finish. This is structured concurrency — concurrency is scoped
 inside the parent, never between top-level tasks.
 
 
-## Why derive_statistical_unit_facet Is Monolithic
+## Staging Pattern and Race Safety
+
+Step ② (`derive_statistical_unit`) writes to an UNLOGGED staging table
+(`statistical_unit_staging`) via batch children, then step ③
+(`statistical_unit_flush_staging`) merges staging into the main table and
+TRUNCATEs staging.
+
+**Why there is no TRUNCATE at the start of derive_statistical_unit:**
+An earlier version TRUNCATEd staging at the start of each derive cycle to
+"clean up interrupted runs." This created a latent race: if `collect_changes`
+enqueued a new `derive_statistical_unit` before the previous cycle's
+`flush_staging` ran, the TRUNCATE would destroy all staged data. The race was
+nearly triggered in concurrent testing (priority gaps as small as 2 sequence
+values). The TRUNCATE was removed because:
+
+1. Batch children already do `DELETE FROM staging WHERE unit_type/unit_id`
+   before inserting — stale data from a previous cycle gets overwritten
+2. `flush_staging` TRUNCATEs at the end after merging staging → main
+3. UNLOGGED tables auto-truncate on unclean shutdown (PostgreSQL guarantee)
+
+
+## Partitioning derive_statistical_unit_facet
 
 The three derived tables have different data models:
 
 | Table                      | Keyed by                               | Natural partition |
 |----------------------------|----------------------------------------|-------------------|
 | `statistical_history`      | `(resolution, year, month, unit_type)` | period            |
 | `statistical_history_facet`| `(resolution, year, month, ...dims)`   | period            |
-| `statistical_unit_facet`   | `(valid_from, valid_until, ...dims)`   | date range        |
+| `statistical_unit_facet`   | `(valid_from, valid_until, ...dims)`   | `(unit_type, unit_id)` |
 
 `statistical_history` and `statistical_history_facet` use period-based keys
 (`year`, `month`), so each period child touches **disjoint rows** — perfect
@@ -214,10 +247,13 @@ A facet row with `valid_from=2020, valid_until=2025` would overlap **65
 periods** (5 year + 60 month). Splitting by period would cause each child to
 redundantly DELETE and re-INSERT the same row. Correct but 65x wasteful.
 
-At current scale (3.1M statistical units), DSUF takes 30–180 seconds as a
-monolithic operation. This is fast enough. If it becomes a bottleneck with
-larger datasets, the right approach is splitting by `(unit_type, unit_id)`
-using a map-reduce pattern (not by period).
+Instead, DSUF partitions by `(unit_type, unit_id)` using a **map-reduce**
+pattern: each partition child writes partial aggregations to an UNLOGGED
+staging table, then `statistical_unit_facet_reduce` merges and swaps the
+results into the main table in a single transaction.
+
+Only **dirty partitions** (those with changed data tracked in
+`statistical_unit_facet_dirty_partitions`) are recomputed.
 
 
 ## Production Performance (1.1M LU + 826K ES = 3.1M stat units)
@@ -254,23 +290,24 @@ the next-largest costs. The reporting stages (DSH, DSHF) take seconds.
 
 All commands and their queue assignments:
 
-| Queue       | Command                                   | Role        | Notes                        |
-|-------------|-------------------------------------------|-------------|------------------------------|
-| analytics   | `check_table`                             | top-level   | Detects changed rows         |
-| analytics   | `derive_statistical_unit`                 | parent      | Spawns batch children        |
-| analytics   | `statistical_unit_refresh_batch`          | child       | Parallel batch processing    |
-| analytics   | `derive_statistical_unit_continue`        | top-level   | ANALYZE sync point           |
-| analytics   | `statistical_unit_flush_staging`          | top-level   | Merge staging → main table   |
-| analytics   | `derive_reports`                          | top-level   | Enqueues DSH                 |
-| analytics   | `derive_statistical_history`              | parent      | Spawns period children       |
-| analytics   | `derive_statistical_history_period`       | child       | Per-period aggregation       |
-| analytics   | `derive_statistical_unit_facet`           | top-level   | Monolithic facet derivation  |
-| analytics   | `derive_statistical_history_facet`        | parent      | Spawns period children       |
-| analytics   | `derive_statistical_history_facet_period` | child       | Per-period facet aggregation |
-| analytics   | `deleted_row`                             | top-level   | Handle deletions             |
-| import      | `import_job_process`                      | top-level   | One import at a time         |
-| maintenance | `task_cleanup`                            | top-level   | Clean old tasks              |
-| maintenance | `import_job_cleanup`                      | top-level   | Clean expired imports        |
+| Queue       | Command                                   | Role        | Notes                        |
+|-------------|-------------------------------------------|-------------|------------------------------|
+| analytics   | `collect_changes`                         | top-level   | Drains base_change_log       |
+| analytics   | `derive_statistical_unit`                 | parent      | Spawns batch children        |
+| analytics   | `statistical_unit_refresh_batch`          | child       | Parallel batch processing    |
+| analytics   | `derive_statistical_unit_continue`        | top-level   | ANALYZE sync point           |
+| analytics   | `statistical_unit_flush_staging`          | top-level   | Merge staging → main table   |
+| analytics   | `derive_reports`                          | top-level   | Enqueues DSH                 |
+| analytics   | `derive_statistical_history`              | parent      | Spawns period children       |
+| analytics   | `derive_statistical_history_period`       | child       | Per-period aggregation       |
+| analytics   | `derive_statistical_unit_facet`           | parent      | Spawns partition children    |
+| analytics   | `derive_statistical_unit_facet_partition` | child       | Per-partition facet compute  |
+| analytics   | `statistical_unit_facet_reduce`           | top-level   | Merge partitions → main      |
+| analytics   | `derive_statistical_history_facet`        | parent      | Spawns period children       |
+| analytics   | `derive_statistical_history_facet_period` | child       | Per-period facet aggregation |
+| import      | `import_job_process`                      | top-level   | One import at a time         |
+| maintenance | `task_cleanup`                            | top-level   | Clean old tasks              |
+| maintenance | `import_job_cleanup`                      | top-level   | Clean expired imports        |
 
 
 ## Frontend Status Detection
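The map-reduce split that the derive-pipeline.md changes describe can be sketched in a few lines. This is a Python stand-in for the SQL implementation, not the real code: partition children compute partial facet aggregates in parallel (the "map"), and a single serial reduce step merges them, mirroring `derive_statistical_unit_facet_partition` followed by `statistical_unit_facet_reduce`. The `sector` dimension and row shapes are illustrative assumptions.

```python
from collections import Counter

def map_partition(units):
    """One partition child: aggregate its units into partial facet counts."""
    staging = Counter()
    for unit in units:
        # Facet key: validity range plus a dimension (illustrative).
        key = (unit["valid_from"], unit["valid_until"], unit["sector"])
        staging[key] += 1
    return staging

def reduce_partitions(partials):
    """The serial reduce step: merge partial aggregates into the final facet."""
    facet = Counter()
    for partial in partials:
        facet.update(partial)
    return facet

# Two (unit_type, unit_id) partitions, processed independently.
partitions = [
    [{"valid_from": 2020, "valid_until": 2025, "sector": "A"}],
    [{"valid_from": 2020, "valid_until": 2025, "sector": "A"},
     {"valid_from": 2021, "valid_until": 2022, "sector": "B"}],
]
facet = reduce_partitions(map_partition(p) for p in partitions)
```

Because each partition touches disjoint `(unit_type, unit_id)` rows, the map steps never conflict; only the reduce needs to run serially, which is why it gets its own top-level `statistical_unit_facet_reduce` task.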

doc/worker-structured-concurrency.md

Lines changed: 1 addition & 1 deletion

@@ -160,7 +160,7 @@ Uncle tasks:
 For the full pipeline diagram, see [derive-pipeline.md](./derive-pipeline.md).
 
 ```
-1. check_table detects changes → enqueues derive_statistical_unit
+1. collect_changes drains base_change_log → enqueues derive_statistical_unit
 
 2. derive_statistical_unit runs:
    - Computes closed groups of affected enterprises
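The drain-and-enqueue handoff in step 1 above can be sketched as follows. This is a hedged Python stand-in: the real worker drains `base_change_log` in SQL, and the row shape and helper name here are illustrative assumptions.

```python
def collect_changes(change_log, task_queue):
    """Drain the change accumulator and enqueue one derive task covering it.

    In the real pipeline this is a SQL step (read-and-clear in one
    transaction); lists stand in for tables here.
    """
    if not change_log:
        return None  # nothing pending, enqueue nothing
    # Drain: collect every changed unit id, then clear the accumulator.
    changed_ids = sorted({uid for row in change_log for uid in row["unit_ids"]})
    change_log.clear()
    task = {"command": "derive_statistical_unit", "unit_ids": changed_ids}
    task_queue.append(task)
    return task

log = [{"unit_ids": [1, 2]}, {"unit_ids": [2, 3]}]
queue = []
collect_changes(log, queue)
```

The key property is that draining and enqueuing happen as one step, so a change logged after the drain lands in the next cycle rather than being lost.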
Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+BEGIN;
+
+-- Restore the original function (without datname filter)
+CREATE OR REPLACE FUNCTION worker.reset_abandoned_processing_tasks()
+RETURNS integer
+LANGUAGE plpgsql
+AS $function$
+DECLARE
+    v_reset_count int := 0;
+    v_task RECORD;
+    v_stale_pid INT;
+BEGIN
+    -- Terminate all other lingering worker backends.
+    -- The current worker holds the global advisory lock, so any other process with
+    -- application_name = 'worker' is a stale remnant from a previous crash.
+    FOR v_stale_pid IN
+        SELECT pid FROM pg_stat_activity
+        WHERE application_name = 'worker' AND pid <> pg_backend_pid()
+    LOOP
+        RAISE LOG 'Terminating stale worker PID %', v_stale_pid;
+        PERFORM pg_terminate_backend(v_stale_pid);
+    END LOOP;
+
+    -- Find tasks stuck in 'processing' and reset their status to 'pending'.
+    -- The backends have already been terminated above.
+    FOR v_task IN
+        SELECT id FROM worker.tasks WHERE state = 'processing'::worker.task_state FOR UPDATE
+    LOOP
+        -- Reset the task to pending state.
+        UPDATE worker.tasks
+        SET state = 'pending'::worker.task_state,
+            worker_pid = NULL,
+            processed_at = NULL,
+            error = NULL,
+            duration_ms = NULL
+        WHERE id = v_task.id;
+
+        v_reset_count := v_reset_count + 1;
+    END LOOP;
+    RETURN v_reset_count;
+END;
+$function$;
+
+END;
Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
+BEGIN;
+
+-- Fix: reset_abandoned_processing_tasks was killing worker connections
+-- across ALL databases (pg_stat_activity is cluster-wide).
+-- When running multiple workers on different databases (e.g., test worker
+-- on test_concurrent_* alongside production worker on statbus_local),
+-- they would enter a death spiral — each one's reset function killing
+-- the other's connections.
+--
+-- Fix: Add datname = current_database() filter so workers only terminate
+-- stale connections to their OWN database.
+CREATE OR REPLACE FUNCTION worker.reset_abandoned_processing_tasks()
+RETURNS integer
+LANGUAGE plpgsql
+AS $function$
+DECLARE
+    v_reset_count int := 0;
+    v_task RECORD;
+    v_stale_pid INT;
+BEGIN
+    -- Terminate all other lingering worker backends FOR THIS DATABASE ONLY.
+    -- The current worker holds the global advisory lock, so any other process with
+    -- application_name = 'worker' connected to the same database is a stale remnant
+    -- from a previous crash.
+    -- CRITICAL: Filter by datname = current_database() because pg_stat_activity is
+    -- cluster-wide. Without this filter, workers on different databases (e.g., test
+    -- databases) would kill each other's connections.
+    FOR v_stale_pid IN
+        SELECT pid FROM pg_stat_activity
+        WHERE application_name = 'worker'
+          AND pid <> pg_backend_pid()
+          AND datname = current_database()
+    LOOP
+        RAISE LOG 'Terminating stale worker PID %', v_stale_pid;
+        PERFORM pg_terminate_backend(v_stale_pid);
+    END LOOP;
+
+    -- Find tasks stuck in 'processing' and reset their status to 'pending'.
+    -- The backends have already been terminated above.
+    FOR v_task IN
+        SELECT id FROM worker.tasks WHERE state = 'processing'::worker.task_state FOR UPDATE
+    LOOP
+        -- Reset the task to pending state.
+        UPDATE worker.tasks
+        SET state = 'pending'::worker.task_state,
+            worker_pid = NULL,
+            processed_at = NULL,
+            error = NULL,
+            duration_ms = NULL
+        WHERE id = v_task.id;
+
+        v_reset_count := v_reset_count + 1;
+    END LOOP;
+    RETURN v_reset_count;
+END;
+$function$;
+
+END;
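The effect of the added `datname = current_database()` filter can be shown with a small simulation of `pg_stat_activity` rows. Plain Python dicts stand in for the catalog view (the column names `pid`, `application_name`, and `datname` mirror the real columns; the helper function itself is hypothetical):

```python
def stale_worker_pids(activity, my_pid, my_database):
    """Return backend pids the reset function would terminate.

    Mirrors the fixed SQL predicate: same application_name, not our own
    backend, and — the fix — connected to the SAME database only.
    """
    return [
        row["pid"] for row in activity
        if row["application_name"] == "worker"
        and row["pid"] != my_pid
        and row["datname"] == my_database  # the added datname filter
    ]

# Simulated cluster-wide pg_stat_activity contents:
activity = [
    {"pid": 10, "application_name": "worker", "datname": "statbus_local"},
    {"pid": 20, "application_name": "worker", "datname": "test_concurrent_1"},
    {"pid": 30, "application_name": "psql",   "datname": "statbus_local"},
]
```

With the filter, the production worker (pid 10 on `statbus_local`) no longer sees the test worker (pid 20) as stale, which breaks the death spiral described in the migration comment.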
