Merge pull request #121 from harvard-lil/markdown-detail-dedup

jcushman · web-flow · commit f733c6a9c496 · 2026-06-29T13:41:30.000-04:00
Dedup summary-covered detail bullets + regenerate stale docs
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -56,7 +56,7 @@ jobs:
 
       - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable
         with:
-          toolchain: 1.88.0
+          toolchain: 1.95.0
 
       - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
 
diff --git a/Cargo.toml b/Cargo.toml
@@ -44,7 +44,7 @@ debug = "line-tables-only"
 [workspace.package]
 version = "0.2.0"
 edition = "2021"
-rust-version = "1.88"
+rust-version = "1.95"
 license = "MIT"
 repository = "https://github.com/harvard-lil/binoc"
 homepage = "https://github.com/harvard-lil/binoc"
diff --git a/binoc-stdlib/src/renderers/markdown.rs b/binoc-stdlib/src/renderers/markdown.rs
@@ -873,11 +873,19 @@ fn specialized_detail_verb(verb: &str) -> bool {
     )
 }
 
+/// True when the node summary already states this edit, so a generic detail
+/// bullet would only repeat it (and, for structural edits, dump raw params).
+/// Each arm pairs the edit verb with the tag that proves the summary covers it.
 fn summary_covered_generic_verb(node: &DiffNode, edit: &serde_json::Value) -> bool {
     let Some(verb) = edit.get("verb").and_then(|value| value.as_str()) else {
         return false;
     };
-    matches!(verb, "tabular.rename_column") && node.tags.contains("binoc.column-rename")
+    match verb {
+        "tabular.rename_column" => node.tags.contains("binoc.column-rename"),
+        "tabular.reorder_columns" => node.tags.contains("binoc.column-reorder"),
+        "document.serialization_change" => node.tags.contains("binoc.serialization-change"),
+        _ => false,
+    }
 }
 
 fn humanize_edit_verb(verb: &str) -> String {
diff --git a/docs/adr/2026-04-10-rust_msrv_and_dependency_update_policy.md b/docs/adr/2026-04-10-rust_msrv_and_dependency_update_policy.md
@@ -1,7 +1,18 @@
 # Rust MSRV and dependency update policy
 
 **Date:** 2026-04-10
-**Status:** Implemented
+**Status:** Implemented (MSRV raised to 1.95 on 2026-06-29 — see Addendum)
+
+## Addendum (2026-06-29): MSRV raised to 1.95
+
+The workspace MSRV is now `1.95` (was `1.88`). This is the intentional,
+called-out bump the policy below requires: the `rusqlite` 0.39 → 0.40 update
+pulls `libsqlite3-sys` 0.38, whose build script uses `cfg_select!`, stabilized
+in Rust 1.95. Rather than pin `rusqlite` back to hold the 1.88 floor, we accept
+the bump — the project is pre-1.0 and moving quickly, and contributors and CI
+already run ≥1.95. `rust-version` in the workspace manifest and the MSRV CI job
+(`.github/workflows/ci.yml`) move together to `1.95.0`. The original 1.88
+decision and its rationale are preserved below.
 
 ## Context
 
diff --git a/docs/adr/README.md b/docs/adr/README.md
@@ -6,7 +6,7 @@ Newer entries appear first. Each entry shows its date and current status. Create
 
 | Date | Title | Status |
 |---|---|---|
-| 2026-06-22 | [The Vintage Audience: a Kept Benchmark for Metadata-Over-Data Reading](2026-06-22-vintage_audience_and_metadata_only_benchmark.md) | Accepted (benchmark landed; features deferred) |
+| 2026-06-22 | [The Vintage Audience: a Kept Benchmark for Metadata-Over-Data Reading](2026-06-22-vintage_audience_and_metadata_only_benchmark.md) | Accepted (benchmark landed; features deliberately deferred) |
 | 2026-06-15 | [Tiered Artifact Metadata: Column, Table, and a `parser_metadata_v1` Artifact](2026-06-15-tiered_artifact_metadata.md) | Implemented (channels + producers in CFM-80; rendering + significance in CFM-82) |
 | 2026-06-15 | [The Engine Overhaul, Told Whole: Single-Tree to Correspondence-First](2026-06-15-engine_overhaul_retrospective.md) | Retrospective |
 | 2026-06-15 | [Partition Identities: a JIT, Format-Owned Capability for N↔M Correspondence (CFM-72)](2026-06-15-partition_identities_jit_format_capability.md) | Implemented |
@@ -42,7 +42,7 @@ Newer entries appear first. Each entry shows its date and current status. Create
 | 2026-04-16 | [Test vector materialization: plugin trait, not a runtime plugin point](2026-04-16-test_vector_materialization.md) | Implemented |
 | 2026-04-16 | [Opportunistic ItemRef Metadata, Transformer-Hydrated for Correlation](2026-04-16-opportunistic_itemref_metadata.md) | Implemented |
 | 2026-04-10 | [Security posture and how to audit Binoc (core and plugins)](2026-04-10-security_posture_and_auditing.md) | Accepted |
-| 2026-04-10 | [Rust MSRV and dependency update policy](2026-04-10-rust_msrv_and_dependency_update_policy.md) | Implemented |
+| 2026-04-10 | [Rust MSRV and dependency update policy](2026-04-10-rust_msrv_and_dependency_update_policy.md) | Implemented (MSRV raised to 1.95 on 2026-06-29 — see Addendum) |
 | 2026-04-10 | [Independent release tags and published version policy](2026-04-10-independent_release_tags_and_published_version_policy.md) | Implemented |
 | 2026-04-08 | [Release Surface And Automated Publishing](2026-04-08-release_surface_and_automated_publishing.md) | Implemented |
 | 2026-03-20 | [Transformer Dispatch Refinement](2026-03-20-transformer_dispatch_refinement.md) | Implemented |
diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -129,7 +129,6 @@ binoc diff ./test-vectors-materialized/csv-column-reorder/snapshot-a ./test-vect
 # Changelog: ./test-vectors-materialized/csv-column-reorder/snapshot-a → ./test-vectors-materialized/csv-column-reorder/snapshot-b
 
 - **data.csv**: Columns reordered
-  - Reorder Columns: order: ["city","name","age"]
 
 ```
 
@@ -150,7 +149,6 @@ binoc diff ./test-vectors-materialized/csv-mixed-changes/snapshot-a ./test-vecto
 - **data.csv**: Column added: 'email'; Columns reordered; 1 row added
   - Rows added
     - row 3: 'SF', 'Charlie', '35'
-  - Reorder Columns: order: ["city","name","age"]
   - Add Column: name: 'email'; values: {"total_values":3,"truncated":false,"values":["a@test.com","b@test.com","c@test.com"]}
 
 ```
diff --git a/docs/users/explanation/test-vectors-gallery.md b/docs/users/explanation/test-vectors-gallery.md
@@ -13,7 +13,7 @@ audience: new user, data steward, archivist
 
 These are runnable examples from binoc's test suite. Each example links to its source folder on GitHub, tells you whether it needs any extra setup, gives you the exact command to run, and shows the Markdown changelog binoc is expected to print.
 
-Binoc currently ships **62 shared examples** in this gallery.
+Binoc currently ships **63 shared examples** in this gallery.
 
 ## One-time setup
 
@@ -46,6 +46,7 @@ just materialize
 | [`csv-stacked-tables`](#csv-stacked-tables) | Detects two logical tables stacked in one messy CSV | data.csv/>table_2: 1 row added | Default pipeline |
 | [`csv-to-tsv-reformat`](#csv-to-tsv-reformat) | Table reformatted from CSV to TSV with row edits: detected as one reformatted-and-modified table, not remove + add | data.tsv: | Default pipeline |
 | [`csv-verbosity-full`](#csv-verbosity-full) | Markdown full verbosity renders every captured changed-cell example. | data.csv: 5 cells changed | Custom config |
+| [`csv-vintage-benchmark`](#csv-vintage-benchmark) | A 'vintage' reader compares two editions of the same published dataset and wants the structural story (a column appeared, a category vocabulary shifted) surfaced above the bulk data churn they intend to ignore. | facilities.csv: Column added: 'region'; 1 cell changed | Custom config |
 | [`directory-file-copy`](#directory-file-copy) | New file with same content as an existing unchanged file detected as a copy | duplicate.txt: Copied from original.txt | Default pipeline |
 | [`directory-nested`](#directory-nested) | Subdirectories with mixed changes | data/records.csv: 1 row added | Default pipeline |
 | [`directory-nested-with-tar`](#directory-nested-with-tar) | Shows binoc diffing a tar archive and a plain directory that contain overlapping internal paths. | data.tar.gz/>records.csv: 1 cell changed | Default pipeline |
@@ -233,7 +234,6 @@ Result:
 # Changelog: snapshot-a → snapshot-b
 
 - **data.csv**: Columns reordered
-  - Reorder Columns: order: ["city","name","age"]
 ```
 
 ## csv-distribution-shift
@@ -380,7 +380,6 @@ Result:
 - **data.csv**: Column added: 'email'; Columns reordered; 1 row added
   - Rows added
     - row 2: 'LA', 'Bob', '25'
-  - Reorder Columns: order: ["city","name","age"]
   - Add Column: name: 'email'; values: {"total_values":3,"truncated":false,"values":["alice@example.test","bob@example.test","charlie@example.test"]}
 ```
 
@@ -405,7 +404,6 @@ Result:
 - **data.csv**: Column added: 'email'; Columns reordered; 1 row added
   - Rows added
     - row 3: 'SF', 'Charlie', '35'
-  - Reorder Columns: order: ["city","name","age"]
   - Add Column: name: 'email'; values: {"total_values":3,"truncated":false,"values":["a@test.com","b@test.com","c@test.com"]}
 ```
 
@@ -571,6 +569,102 @@ Result:
     - row 5, column 'score': '50' -> '51'
 ```
 
+## csv-vintage-benchmark
+
+A 'vintage' reader compares two editions of the same published dataset and wants the structural story (a column appeared, a category vocabulary shifted) surfaced above the bulk data churn they intend to ignore.
+
+- **Browse source:** [csv-vintage-benchmark](https://github.com/harvard-lil/binoc/tree/main/test-vectors/csv-vintage-benchmark)
+- **Tags:** `csv`, `vintage`, `metadata`, `benchmark`
+- **Snapshots:** `snapshot-a` has 2 files — `facilities.csv`, `inspections.csv`; `snapshot-b` has 2 files — `facilities.csv`, `inspections.csv`
+- **Setup:** The dataset is a yearly facilities register published as a small directory of
+CSVs. Between the two editions:
+
+  * `facilities.csv` gains a `region` column (schema change) and one row's
+    `status` moves to a brand-new category value, `decommissioned`
+    (a *vocabulary* shift — the set of distinct values in a categorical column
+    grew).
+  * `inspections.csv` changes only in its data: several scores are edited and
+    two rows are appended. This is exactly the churn a vintage reader does not
+    want to read.
+
+The markdown config models the vintage stance as significance: schema/structural
+tags are the high-priority group, bulk cell/row tags the low-priority group.
+Because `classify_tags` promotes a node to the highest-priority group among its
+tags, `facilities.csv` (which carries both schema and cell tags) floats up to
+"Schema & vocabulary changes" while the pure-data `inspections.csv` sinks to
+"Bulk data updates". That file-granularity separation is the best vintage view
+binoc offers today.
+
+WHAT THIS BENCHMARK IS FOR — the gap between today's output (see
+`expected-output/changelog.snap`) and the target (see `VINTAGE-IDEAL.md`):
+
+  1. Within-node significance. `facilities.csv`'s `region` addition and its
+     `status` cell edit live on one node, so they cannot be separated: the
+     vintage reader still sees the cell bullet. There is no config-driven
+     edit-level drop/keep (only `EditProjection.visible`, set by writers).
+  2. Vocabulary as a first-class change. The `active -> decommissioned` shift is
+     reported as an ordinary `binoc.cell-change`, not as "the `status` vocabulary
+     gained a value". Columns are not first-class nodes and distinct-value-set
+     diffing does not exist.
+  3. Summary statistics. `inspections.csv` is rendered as full cell/row detail,
+     not as a one-line vintage statistic ("142 -> 144 rows, 3 cells changed").
+     The Summary/GlobalClaim seams exist to carry such a fact; no rule emits one.
+
+This vector is a kept benchmark, not a feature. It is expected to PASS against current output; as the vintage story
+improves, update the snapshot and watch it converge on VINTAGE-IDEAL.md. See docs/adr for the design rationale.
+
+Save this dataset config as `/tmp/csv-vintage-benchmark.yaml`:
+
+```yaml
+output:
+  markdown:
+    groups:
+      - heading: Schema & vocabulary changes
+        tags:
+          - binoc.schema-change
+          - binoc.column-addition
+          - binoc.column-removal
+          - binoc.column-rename
+          - binoc.metadata.value-label-set
+      - heading: Bulk data updates
+        tags:
+          - binoc.cell-change
+          - binoc.row-addition
+          - binoc.row-removal
+```
+
+
+Run it:
+```bash
+binoc diff \
+  ./test-vectors-materialized/csv-vintage-benchmark/snapshot-a \
+  ./test-vectors-materialized/csv-vintage-benchmark/snapshot-b \
+  --config /tmp/csv-vintage-benchmark.yaml
+```
+Result:
+```markdown
+# Changelog: snapshot-a → snapshot-b
+
+## Schema & vocabulary changes
+
+- **facilities.csv**: Column added: 'region'; 1 cell changed
+  - Changed cells
+    - row 2, column 'status': 'active' -> 'decommissioned'
+  - Set Headers: from: ["facility_id","name","status"]; to: ["facility_id","name","status","region"]
+  - Add Column: name: 'region'; values: {"total_values":4,"truncated":false,"values":["north","east","west","south"]}
+
+## Bulk data updates
+
+- **inspections.csv**: 2 rows added; 3 cells changed
+  - Changed cells
+    - row 1, column 'score': '82' -> '85'
+    - row 3, column 'score': '90' -> '91'
+    - row 4, column 'score': '68' -> '70'
+  - Rows added
+    - row 5: 'I104', 'F001', '88'
+    - row 6: 'I105', 'F002', '73'
+```
+
 ## directory-file-copy
 
 New file with same content as an existing unchanged file detected as a copy
@@ -965,7 +1059,6 @@ Result:
 # Changelog: snapshot-a → snapshot-b
 
 - **metadata.json**: Document serialization changed
-  - Serialization Change: kinds: ["object_key_order","formatting"]; left: {"byte_len":70,"line_ending":"lf","object_key_orders":[{"keys":["id","name"],"path":"$.fields"},{"keys":["name","version","fields"],"path":"$"}],"trailing_newli...; right: {"byte_len":98,"indentation":"2 spaces","line_ending":"lf","object_key_orders":[{"keys":["name","id"],"path":"$.fields"},{"keys":["fields","version","name"],"pa...
 ```
 
 ## json-records-cell-change
@@ -1101,7 +1194,6 @@ Result:
     - '\nFAKEICONv1'
 - **license-copy.txt**: Copied from license.txt
 - **metrics.csv**: Columns reordered
-  - Reorder Columns: order: ["category","year","value"]
 - **summary.txt**: Moved from report.txt
 ```
 
@@ -1623,7 +1715,6 @@ Result:
 # Changelog: snapshot-a → snapshot-b
 
 - **archive.zip/>metadata.json**: Document serialization changed
-  - Serialization Change: kinds: ["object_key_order","formatting"]; left: {"byte_len":82,"line_ending":"lf","object_key_orders":[{"keys":["id","name"],"path":"$.schema"},{"keys":["dataset","issued","schema"],"path":"$"}],"trailing_new...; right: {"byte_len":110,"indentation":"2 spaces","line_ending":"lf","object_key_orders":[{"keys":["name","id"],"path":"$.schema"},{"keys":["schema","issued","dataset"],...
 ```
 
 ## zip-nested
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -171,6 +171,7 @@ nav:
       - Architectural decisions:
           # BEGIN-ADR-NAV
           - adr/README.md
+          - 'The Vintage Audience: a Kept Benchmark for Metadata-Over-Data Reading': adr/2026-06-22-vintage_audience_and_metadata_only_benchmark.md
           - 'Tiered Artifact Metadata: Column, Table, and a `parser_metadata_v1` Artifact': adr/2026-06-15-tiered_artifact_metadata.md
           - 'The Engine Overhaul, Told Whole: Single-Tree to Correspondence-First': adr/2026-06-15-engine_overhaul_retrospective.md
           - 'Partition Identities: a JIT, Format-Owned Capability for N↔M Correspondence (CFM-72)': adr/2026-06-15-partition_identities_jit_format_capability.md
diff --git a/test-vectors/csv-column-reorder/expected-output/changelog.snap b/test-vectors/csv-column-reorder/expected-output/changelog.snap
@@ -5,4 +5,3 @@ expression: "&md"
 # Changelog: snapshot-a → snapshot-b
 
 - **data.csv**: Columns reordered
-  - Reorder Columns: order: ["city","name","age"]
diff --git a/test-vectors/csv-mid-row-insertion/expected-output/changelog.snap b/test-vectors/csv-mid-row-insertion/expected-output/changelog.snap
@@ -7,5 +7,4 @@ expression: "&md"
 - **data.csv**: Column added: 'email'; Columns reordered; 1 row added
   - Rows added
     - row 2: 'LA', 'Bob', '25'
-  - Reorder Columns: order: ["city","name","age"]
   - Add Column: name: 'email'; values: {"total_values":3,"truncated":false,"values":["alice@example.test","bob@example.test","charlie@example.test"]}
diff --git a/test-vectors/csv-mixed-changes/expected-output/changelog.snap b/test-vectors/csv-mixed-changes/expected-output/changelog.snap
@@ -7,5 +7,4 @@ expression: "&md"
 - **data.csv**: Column added: 'email'; Columns reordered; 1 row added
   - Rows added
     - row 3: 'SF', 'Charlie', '35'
-  - Reorder Columns: order: ["city","name","age"]
   - Add Column: name: 'email'; values: {"total_values":3,"truncated":false,"values":["a@test.com","b@test.com","c@test.com"]}
diff --git a/test-vectors/json-key-order-reexport/expected-output/changelog.snap b/test-vectors/json-key-order-reexport/expected-output/changelog.snap
@@ -5,4 +5,3 @@ expression: "&md"
 # Changelog: snapshot-a → snapshot-b
 
 - **metadata.json**: Document serialization changed
-  - Serialization Change: kinds: ["object_key_order","formatting"]; left: {"byte_len":70,"line_ending":"lf","object_key_orders":[{"keys":["id","name"],"path":"$.fields"},{"keys":["name","version","fields"],"path":"$"}],"trailing_newli...; right: {"byte_len":98,"indentation":"2 spaces","line_ending":"lf","object_key_orders":[{"keys":["name","id"],"path":"$.fields"},{"keys":["fields","version","name"],"pa...
diff --git a/test-vectors/kitchen-sink/expected-output/changelog.snap b/test-vectors/kitchen-sink/expected-output/changelog.snap
@@ -27,5 +27,4 @@ expression: "&md"
     - '\nFAKEICONv1'
 - **license-copy.txt**: Copied from license.txt
 - **metrics.csv**: Columns reordered
-  - Reorder Columns: order: ["category","year","value"]
 - **summary.txt**: Moved from report.txt
diff --git a/test-vectors/zip-json-key-order-reexport/expected-output/changelog.snap b/test-vectors/zip-json-key-order-reexport/expected-output/changelog.snap
@@ -5,4 +5,3 @@ expression: "&md"
 # Changelog: snapshot-a → snapshot-b
 
 - **archive.zip/>metadata.json**: Document serialization changed
-  - Serialization Change: kinds: ["object_key_order","formatting"]; left: {"byte_len":82,"line_ending":"lf","object_key_orders":[{"keys":["id","name"],"path":"$.schema"},{"keys":["dataset","issued","schema"],"path":"$"}],"trailing_new...; right: {"byte_len":110,"indentation":"2 spaces","line_ending":"lf","object_key_orders":[{"keys":["name","id"],"path":"$.schema"},{"keys":["schema","issued","dataset"],...

Original file line number	Diff line number	Diff line change
`@@ -873,11 +873,19 @@ fn specialized_detail_verb(verb: &str) -> bool {`
`873`	`873`	`)`
`874`	`874`	`}`
`875`	`875`
	`876`	`+/// True when the node summary already states this edit, so a generic detail`
	`877`	`+/// bullet would only repeat it (and, for structural edits, dump raw params).`
	`878`	`+/// Each arm pairs the edit verb with the tag that proves the summary covers it.`
`876`	`879`	`fn summary_covered_generic_verb(node: &DiffNode, edit: &serde_json::Value) -> bool {`
`877`	`880`	`let Some(verb) = edit.get("verb").and_then(\|value\| value.as_str()) else {`
`878`	`881`	`return false;`
`879`	`882`	`};`
`880`		`- matches!(verb, "tabular.rename_column") && node.tags.contains("binoc.column-rename")`
	`883`	`+ match verb {`
	`884`	`+ "tabular.rename_column" => node.tags.contains("binoc.column-rename"),`
	`885`	`+ "tabular.reorder_columns" => node.tags.contains("binoc.column-reorder"),`
	`886`	`+ "document.serialization_change" => node.tags.contains("binoc.serialization-change"),`
	`887`	`+ _ => false,`
	`888`	`+ }`
`881`	`889`	`}`
`882`	`890`
`883`	`891`	`fn humanize_edit_verb(verb: &str) -> String {`
Original file line number	Diff line number	Diff line change
`@@ -5,4 +5,3 @@ expression: "&md"`
`5`	`5`	`# Changelog: snapshot-a → snapshot-b`
`6`	`6`
`7`	`7`	`- data.csv: Columns reordered`
`8`		`- - Reorder Columns: order: ["city","name","age"]`