benzsevern
diff --git a/‎packages/goldenmatch-js/README.md‎
Lines changed: 54 additions & 2 deletions b/‎packages/goldenmatch-js/README.md‎
Lines changed: 54 additions & 2 deletions
diff --git a/‎packages/goldenmatch-js/examples/README.md‎
Lines changed: 2 additions & 0 deletions b/‎packages/goldenmatch-js/examples/README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎packages/goldenmatch-js/examples/strictModeParity.ts‎
Lines changed: 78 additions & 0 deletions b/‎packages/goldenmatch-js/examples/strictModeParity.ts‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎packages/goldenmatch-js/examples/verificationInspection.ts‎
Lines changed: 99 additions & 0 deletions b/‎packages/goldenmatch-js/examples/verificationInspection.ts‎
Lines changed: 99 additions & 0 deletions
diff --git a/‎packages/goldenmatch-js/package.json‎
Lines changed: 1 addition & 1 deletion b/‎packages/goldenmatch-js/package.json‎
Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ npm install goldenmatch
 [![npm](https://img.shields.io/npm/v/goldenmatch?color=d4a017)](https://www.npmjs.com/package/goldenmatch)
 [![Node](https://img.shields.io/node/v/goldenmatch?color=339933)](https://nodejs.org/)
 [![License: MIT](https://img.shields.io/badge/license-MIT-green)](https://github.com/benzsevern/goldenmatch/blob/main/LICENSE)
-[![Tests](https://img.shields.io/badge/tests-478%20passing-brightgreen)](https://github.com/benzsevern/goldenmatch/tree/main/packages/goldenmatch-js/tests)
+[![Tests](https://img.shields.io/badge/tests-590%20passing-brightgreen)](https://github.com/benzsevern/goldenmatch/tree/main/packages/goldenmatch-js/tests)
 
 ---
 
@@ -18,7 +18,7 @@ npm install goldenmatch
 - **Edge-safe core** — the matching engine runs in browsers, Workers, Vercel Edge Runtime, Deno
 - **Pure TypeScript** — no native dependencies required; peer deps unlock performance (hnswlib, ONNX, piscina)
 - **Feature parity with Python goldenmatch** — same scorers, same clustering, same YAML configs
-- **478 tests, strict TypeScript** — `noUncheckedIndexedAccess`, `exactOptionalPropertyTypes`
+- **590 tests, strict TypeScript** — `noUncheckedIndexedAccess`, `exactOptionalPropertyTypes`
 
 ## Quick Start
 
@@ -45,6 +45,58 @@ for (const record of result.goldenRecords) {
 }
 ```
 
+## Auto-Config Verification (v0.3)
+
+Auto-generated configs are now checked both before the pipeline runs and after
+scoring finishes, so you get actionable diagnostics instead of silent failures
+on edge-case data.
+
+### Preflight — six static checks
+
+When you call `autoConfigureRows(rows)`, the returned config ships with a
+`_preflightReport` summarising six config-time checks:
+
+1. **missing_column** — matchkey/blocking references a column not in the data
+2. **cardinality_high** — a column is near-unique (poor blocking signal)
+3. **cardinality_low** — a column has too few distinct values to discriminate
+4. **block_size** — a blocking key would produce oversized blocks
+5. **remote_asset** — a scorer requires a model download (gated offline)
+6. **weight_confidence** — a weighted matchkey's weights look unbalanced
+
+Many findings trigger **auto-repairs** (field dropped, scorer swapped,
+weight clamped). If an error cannot be repaired, the report has
+`hasErrors === true` and a `ConfigValidationError` is thrown with the full report attached.
+
+```ts
+import { autoConfigureRows, ConfigValidationError } from "goldenmatch";
+
+const cfg = autoConfigureRows(rows);
+for (const f of cfg._preflightReport!.findings) {
+  console.log(`[${f.severity}] ${f.check}/${f.subject}: ${f.message}`);
+}
+```
+
+Defaults are **offline-safe**: remote-asset scorers (cross-encoder, remote
+embeddings) are dropped unless you opt in with `allowRemoteAssets: true`.
+
+### Postflight — four runtime signals
+
+Inside `dedupe()` / `match()`, after scoring but before clustering, the
+pipeline computes four signals attached as `result.postflightReport`:
+
+1. **scoreHistogram** — 100-bin pair-score distribution
+2. **blockSizePercentiles** + **preliminaryClusterSizes** — block sizes (p50/p95/p99/max) and preliminary cluster sizes (count/p50/p95/max)
+3. **thresholdOverlapPct** — fraction of pairs near the current threshold
+4. **oversizedClusters** — components above the size limit, each with its bottleneck pair
+
+If the score distribution is clearly bimodal, postflight proposes a
+threshold adjustment. In **strict mode** (`autoConfigureRows(rows, { strict: true })`,
+or `_strictAutoconfig: true` set manually on the config) the signals are still emitted but the
+threshold is never touched — use this for reproducible CI pipelines.
+
+See `examples/verificationInspection.ts` and `examples/strictModeParity.ts`
+for runnable demos.
+
 ## Three entrypoints
 
 ```typescript
 
@@ -19,6 +19,8 @@ npx tsx examples/<name>.ts
 | 09 | `09-llm-scorer.ts` | LLM scorer for borderline pairs (needs OPENAI_API_KEY) |
 | 10 | `10-explain.ts` | Template NL explanation of a pair match |
 | 11 | `11-evaluate.ts` | Evaluate against ground truth (precision/recall/F1) |
+| 12 | `verificationInspection.ts` | Inspect preflight findings + postflight signals |
+| 13 | `strictModeParity.ts` | Use `_strictAutoconfig` to disable runtime threshold shifts |
 
 ## Running
 
 
@@ -0,0 +1,78 @@
+/**
+ * Strict-mode auto-config: show how `_strictAutoconfig: true` disables
+ * runtime postflight threshold adjustments, so the configured threshold is
+ * left unchanged at run time — suitable for CI / reproducible pipelines.
+ *
+ * Compares two runs on the same data:
+ *   (a) normal auto-config — postflight may shift threshold
+ *   (b) strict auto-config — postflight reports signals but adjusts nothing
+ *
+ * Run: npx tsx examples/strictModeParity.ts
+ */
+import { autoConfigureRows, dedupe } from "goldenmatch";
+import type { Row } from "goldenmatch";
+
+// Bimodal synthetic data: three tight duplicate clusters + scattered singletons.
+// The bimodal score distribution is what tempts postflight to shift threshold.
+const rows: Row[] = [
+  { id: 1,  name: "John Smith",    email: "john@a.com",  zip: "10001" },
+  { id: 2,  name: "Jon Smith",     email: "john@a.com",  zip: "10001" },
+  { id: 3,  name: "Johnny Smith",  email: "JOHN@a.com",  zip: "10001" },
+  { id: 4,  name: "Jane Doe",      email: "jane@b.com",  zip: "20002" },
+  { id: 5,  name: "Jane Doh",      email: "jane@b.com",  zip: "20002" },
+  { id: 6,  name: "Janet Doe",     email: "jane@b.com",  zip: "20002" },
+  { id: 7,  name: "Bob Jones",     email: "bob@c.com",   zip: "30003" },
+  { id: 8,  name: "Robert Jones",  email: "bob@c.com",   zip: "30003" },
+  { id: 9,  name: "Alice Zhang",   email: "alice@d.com", zip: "40004" },
+  { id: 10, name: "Carlos Ruiz",   email: "c@e.com",     zip: "50005" },
+  { id: 11, name: "Dana White",    email: "dana@f.com",  zip: "60006" },
+  { id: 12, name: "Eve Black",     email: "eve@g.com",   zip: "70007" },
+];
+
+function run(
+  label: string,
+  strict: boolean,
+): void {
+  console.log("\n" + "=".repeat(60));
+  console.log(`${label} (strict=${strict})`);
+  console.log("=".repeat(60));
+
+  const cfg = autoConfigureRows(rows, { strict });
+  console.log(`_strictAutoconfig flag on config: ${cfg._strictAutoconfig === true}`);
+
+  const result = dedupe(rows, { config: cfg });
+  const post = result.postflightReport;
+
+  console.log(`clusters: ${result.stats.totalClusters}`);
+  console.log(`match rate: ${(result.stats.matchRate * 100).toFixed(1)}%`);
+
+  if (post === undefined) {
+    console.log("no postflight report");
+    return;
+  }
+  console.log(`threshold used: ${post.signals.currentThreshold}`);
+  console.log(`adjustments proposed: ${post.adjustments.length}`);
+  for (const adj of post.adjustments) {
+    console.log(
+      `  - ${adj.field}: ${adj.fromValue} -> ${adj.toValue} (${adj.reason})`,
+    );
+  }
+  if (strict && post.adjustments.length === 0) {
+    console.log("  -> strict mode: no adjustments applied (as expected)");
+  }
+  for (const adv of post.advisories) {
+    console.log(`  advisory: ${adv}`);
+  }
+}
+
+run("A. normal auto-config", false);
+run("B. strict auto-config", true);
+
+console.log("\n" + "=".repeat(60));
+console.log("takeaway");
+console.log("=".repeat(60));
+console.log(
+  "Use strict=true when you need reproducible, deterministic configs " +
+    "across environments (CI, prod). Postflight still emits diagnostic " +
+    "signals, but the pipeline will not silently shift your threshold.",
+);
@@ -0,0 +1,99 @@
+/**
+ * Inspect auto-config verification signals — preflight + postflight.
+ *
+ * Shows how to:
+ *   1. Generate a config from raw rows with `autoConfigureRows`.
+ *   2. Read `cfg._preflightReport` before dedupe runs.
+ *   3. Run dedupe with the auto-configured config.
+ *   4. Read `result.postflightReport` produced by the pipeline.
+ *
+ * Run: npx tsx examples/verificationInspection.ts
+ */
+import { autoConfigureRows, dedupe } from "goldenmatch";
+
+// A small synthetic dataset with deliberate duplicates + noisy fields.
+const rows = [
+  { id: 1, name: "John Smith",   email: "john@x.com",   zip: "12345" },
+  { id: 2, name: "Jon  Smith",   email: "JOHN@X.COM",   zip: "12345" },
+  { id: 3, name: "Jane Doe",     email: "jane@y.com",   zip: "54321" },
+  { id: 4, name: "Jane Doh",     email: "jane@y.com",   zip: "54321" },
+  { id: 5, name: "Robert Brown", email: "bob@z.com",    zip: "99999" },
+  { id: 6, name: "Rob Brown",    email: "bob@z.com",    zip: "99999" },
+  { id: 7, name: "Alice Jones",  email: "alice@a.com",  zip: "11111" },
+  { id: 8, name: "Alicia Jones", email: "alice@a.com",  zip: "11111" },
+];
+
+console.log("=".repeat(60));
+console.log("STEP 1 — auto-configure from rows");
+console.log("=".repeat(60));
+
+const cfg = autoConfigureRows(rows);
+console.log(`matchkeys: ${cfg.matchkeys?.length ?? 0}`);
+console.log(`blocking strategy: ${cfg.blocking?.strategy}`);
+console.log(`threshold: ${cfg.threshold}`);
+
+console.log("\n" + "=".repeat(60));
+console.log("STEP 2 — preflight report (config-time checks)");
+console.log("=".repeat(60));
+
+const pre = cfg._preflightReport;
+if (pre === undefined) {
+  console.log("no preflight report attached");
+} else {
+  console.log(`findings: ${pre.findings.length}`);
+  console.log(`configWasModified: ${pre.configWasModified}`);
+  console.log(`hasErrors: ${pre.hasErrors}`);
+  for (const f of pre.findings) {
+    console.log(
+      `  - [${f.severity}] ${f.check} / ${f.subject}: ${f.message}` +
+        (f.repaired ? ` (repaired: ${f.repairNote ?? "auto"})` : ""),
+    );
+  }
+}
+
+console.log("\n" + "=".repeat(60));
+console.log("STEP 3 — run dedupe with the auto-configured config");
+console.log("=".repeat(60));
+
+const result = dedupe(rows, { config: cfg });
+console.log(`input: ${result.stats.totalRecords} rows`);
+console.log(`clusters: ${result.stats.totalClusters}`);
+console.log(`match rate: ${(result.stats.matchRate * 100).toFixed(1)}%`);
+
+console.log("\n" + "=".repeat(60));
+console.log("STEP 4 — postflight report (runtime signals)");
+console.log("=".repeat(60));
+
+const post = result.postflightReport;
+if (post === undefined) {
+  console.log("no postflight report attached (expected when no preflight ran)");
+} else {
+  const s = post.signals;
+  console.log(`totalPairsScored: ${s.totalPairsScored}`);
+  console.log(`currentThreshold: ${s.currentThreshold}`);
+  console.log(`blockingRecall: ${s.blockingRecall}`);
+  console.log(
+    `blockSize p50/p95/p99/max: ${s.blockSizePercentiles.p50}/` +
+      `${s.blockSizePercentiles.p95}/${s.blockSizePercentiles.p99}/` +
+      `${s.blockSizePercentiles.max}`,
+  );
+  console.log(
+    `clusterSize count/p50/p95/max: ${s.preliminaryClusterSizes.count}/` +
+      `${s.preliminaryClusterSizes.p50}/${s.preliminaryClusterSizes.p95}/` +
+      `${s.preliminaryClusterSizes.max}`,
+  );
+  console.log(
+    `thresholdOverlapPct: ${(s.thresholdOverlapPct * 100).toFixed(2)}%`,
+  );
+  console.log(`oversizedClusters: ${s.oversizedClusters.length}`);
+  console.log(`adjustments applied: ${post.adjustments.length}`);
+  for (const adj of post.adjustments) {
+    console.log(
+      `  - ${adj.field}: ${adj.fromValue} -> ${adj.toValue} ` +
+        `(${adj.signal}: ${adj.reason})`,
+    );
+  }
+  for (const adv of post.advisories) {
+    console.log(`  advisory: ${adv}`);
+  }
+}
@@ -1,6 +1,6 @@
 {
   "name": "goldenmatch",
-  "version": "0.1.0",
+  "version": "0.3.0",
   "description": "Entity resolution toolkit — deduplicate, match, and create golden records",
   "type": "module",
   "exports": {
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "goldenmatch",`
`3`		`- "version": "0.1.0",`
	`3`	`+ "version": "0.3.0",`
`4`	`4`	`"description": "Entity resolution toolkit — deduplicate, match, and create golden records",`
`5`	`5`	`"type": "module",`
`6`	`6`	`"exports": {`