Skip to content

Commit 501c84a

Browse files
authored
Merge pull request #112 from harvard-lil/move-output
Improve reporting
2 parents 2d131cc + 550239a commit 501c84a

55 files changed

Lines changed: 1541 additions & 499 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.github/workflows/docs.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,24 @@ jobs:
3030

3131
- uses: extractions/setup-just@53165ef7e734c5c07cb06b3c8e7b647c5aa16db3 # v4.0.0
3232

33+
# The browser-demo wasm build compiles bundled SQLite (rusqlite) for
34+
# wasm32-wasip1, which needs a WASI sysroot and a wasm-targeting clang.
35+
# The WASI SDK bundles both. The justfile's docs-browser-demo recipe
36+
# picks these up via WASI_SYSROOT / CC_wasm32_wasip1 (consumed by the
37+
# `cc` crate); locally they come from `brew install wasi-libc llvm`.
38+
- name: Install WASI SDK (for the SQLite browser-demo wasm build)
39+
run: |
40+
set -euo pipefail
41+
tag=wasi-sdk-33
42+
asset=wasi-sdk-33.0-x86_64-linux
43+
curl -fsSL -o /tmp/wasi-sdk.tar.gz \
44+
"https://github.com/WebAssembly/wasi-sdk/releases/download/${tag}/${asset}.tar.gz"
45+
sudo mkdir -p /opt/wasi-sdk
46+
sudo tar -xzf /tmp/wasi-sdk.tar.gz -C /opt/wasi-sdk --strip-components=1
47+
echo "WASI_SYSROOT=/opt/wasi-sdk/share/wasi-sysroot" >> "$GITHUB_ENV"
48+
echo "CC_wasm32_wasip1=/opt/wasi-sdk/bin/clang" >> "$GITHUB_ENV"
49+
echo "AR_wasm32_wasip1=/opt/wasi-sdk/bin/llvm-ar" >> "$GITHUB_ENV"
50+
3351
# Regenerate every docs input the same way contributors do locally, so
3452
# CI and local flows stay in lockstep. See ADR
3553
# 2026-04-17-documentation_platform_and_info_design.md §6.

binoc-cli/tests/cli.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ dataset:
265265
.assert()
266266
.success()
267267
.stdout(predicates::str::contains("data.csv"))
268-
.stdout(predicates::str::contains("1 edit"));
268+
.stdout(predicates::str::contains("2 rows added"));
269269
}
270270

271271
#[test]

binoc-core/src/correspondence/project.rs

Lines changed: 104 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -126,12 +126,18 @@ pub fn project(
126126

127127
let left_path = &store.item(left_id).logical_path;
128128
let right_path = &store.item(right_id).logical_path;
129+
let line_is_container = !store
130+
.tree(TreeSide::Right)
131+
.node(link.right)
132+
.children
133+
.is_empty()
134+
|| !store
135+
.tree(TreeSide::Left)
136+
.node(link.left)
137+
.children
138+
.is_empty();
129139
let edits = edit_lists.get(&index).cloned().unwrap_or_default();
130-
let visible_edits: Vec<Edit> = edits
131-
.iter()
132-
.filter(|edit| edit.projection.visible)
133-
.cloned()
134-
.collect();
140+
let mut projected_edits = edits.clone();
135141
// Grab each endpoint's own `item_type` BEFORE merging the two
136142
// projections together — the merge collapses them into one, which would
137143
// hide the fact that the representation changed across the link. These
@@ -142,23 +148,35 @@ pub fn project(
142148
let mut projection = store.projection(right_id).clone();
143149
projection.merge_from(store.projection(left_id));
144150
overlay_projection(&mut projection, &link.projection);
145-
for edit in &edits {
151+
for edit in &projected_edits {
146152
overlay_projection(&mut projection, &edit.projection.hint);
147153
}
148154
let carried = carried_path_change(store, link.left, link.right);
149155
let copied = projection.action.as_deref() == Some("copy");
150156
let moved = left_path != right_path && !carried && !copied;
151-
let changed = !edits.is_empty();
152-
let line_is_container = !store
153-
.tree(TreeSide::Right)
154-
.node(link.right)
155-
.children
156-
.is_empty()
157-
|| !store
158-
.tree(TreeSide::Left)
159-
.node(link.left)
160-
.children
161-
.is_empty();
157+
if !line_is_container
158+
&& !copied
159+
&& !moved
160+
&& projection.action.is_none()
161+
&& projected_edits.iter().all(|edit| !edit.projection.visible)
162+
&& content_hashes_differ(store.item(left_id), store.item(right_id))
163+
{
164+
let edit = Edit::new(
165+
"content.differs",
166+
json!({
167+
"reason": "linked endpoints have different content hashes, but no visible edit explained the difference"
168+
}),
169+
)
170+
.with_item_type("item");
171+
overlay_projection(&mut projection, &edit.projection.hint);
172+
projected_edits.push(edit);
173+
}
174+
let visible_edits: Vec<Edit> = projected_edits
175+
.iter()
176+
.filter(|edit| edit.projection.visible)
177+
.cloned()
178+
.collect();
179+
let changed = !projected_edits.is_empty();
162180
let derived_action = match (copied, moved, changed) {
163181
(true, _, _) => "copy",
164182
(false, true, _) => "move",
@@ -215,7 +233,10 @@ pub fn project(
215233
.with_evidence(link.evidence.clone())
216234
.with_action(action)],
217235
evidence: Some(link.evidence.clone()),
218-
verbs: edits.iter().map(|edit| edit.verb.clone()).collect(),
236+
verbs: projected_edits
237+
.iter()
238+
.map(|edit| edit.verb.clone())
239+
.collect(),
219240
edits: visible_edits,
220241
container: line_is_container,
221242
depth: store.tree(TreeSide::Right).ancestors(link.right).len(),
@@ -322,6 +343,16 @@ pub fn project(
322343
}
323344
}
324345

346+
/// Reports whether two linked endpoints carry different content hashes.
///
/// Returns `true` only when neither endpoint is a directory, both endpoints
/// have a content hash, and the two hashes are unequal. A directory on either
/// side, or a missing hash on either side, yields `false`.
fn content_hashes_differ(left: &binoc_sdk::ItemRef, right: &binoc_sdk::ItemRef) -> bool {
    let both_are_files = !(left.is_dir || right.is_dir);
    both_are_files
        && matches!(
            (&left.content_hash, &right.content_hash),
            (Some(lhs), Some(rhs)) if lhs != rhs
        )
}
355+
325356
fn default_summary(action: &str, left_path: &str, edits: &[Edit]) -> Summary {
326357
match action {
327358
"copy" => Summary::new()
@@ -580,6 +611,13 @@ mod tests {
580611
}
581612
}
582613

614+
fn hashed_item(path: &str, hash: &str) -> ItemRef {
615+
ItemRef {
616+
content_hash: Some(hash.into()),
617+
..item(path)
618+
}
619+
}
620+
583621
#[test]
584622
fn projection_uses_rule_supplied_metadata_for_visible_output() {
585623
let mut store = Store::new(
@@ -635,6 +673,54 @@ mod tests {
635673
assert_eq!(node.details["edits"][0]["verb"], json!("test.edit"));
636674
}
637675

676+
/// Links two `data.csv` items whose content hashes differ, supplies no edits,
/// and checks that the projected node is a `modify` whose first edit has the
/// `content.differs` verb.
#[test]
fn projection_surfaces_unexplained_content_difference() {
    let tree_hint = || ProjectionHint::default().item_type("tree");
    let tabular_hint = || ProjectionHint::default().item_type("tabular");
    let tree_root = || ItemRef {
        is_dir: true,
        projection_hint: tree_hint(),
        ..item("")
    };

    let mut store = Store::new(tree_root(), tree_root(), tree_hint());
    let left = store
        .left
        .add_child(0, hashed_item("data.csv", "left-hash"), tabular_hint());
    let right = store
        .right
        .add_child(0, hashed_item("data.csv", "right-hash"), tabular_hint());

    let proposal = LinkProposal {
        left,
        right,
        evidence: "test.evidence".into(),
        settled: false,
        projection: ProjectionHint::default(),
    };
    store.links.apply(proposal, "test-rule", 1);

    // No rule-supplied edit lists: the only edit can come from projection itself.
    let no_edits = BTreeMap::new();
    let changeset = project(&store, &no_edits, &[]).to_changeset("left", "right");
    let root = changeset.root.expect("root");
    let node = root
        .children
        .iter()
        .find(|child| child.path == "data.csv")
        .unwrap();

    assert_eq!(node.action, "modify");
    assert_eq!(node.details["edits"][0]["verb"], json!("content.differs"));
}
723+
638724
#[test]
639725
fn projected_collision_keeps_first_class_sources() {
640726
let mut store = Store::new(

binoc-stdlib/src/correspondence/mod.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ fn summarize_known_edits(edits: &[Edit]) -> Option<String> {
318318
parts.push("Columns reordered".into());
319319
}
320320

321-
let rows_added = count_verb(edits, "tabular.add_row");
321+
let rows_added = count_verb(edits, "tabular.add_row") + count_appended_rows(edits);
322322
let rows_removed = count_verb(edits, "tabular.remove_row");
323323
if rows_added > 0 {
324324
parts.push(count_phrase(rows_added, "row added", "rows added"));
@@ -372,6 +372,15 @@ fn count_verb(edits: &[Edit], verb: &str) -> usize {
372372
edits.iter().filter(|edit| edit.verb == verb).count()
373373
}
374374

375+
fn count_appended_rows(edits: &[Edit]) -> usize {
376+
edits
377+
.iter()
378+
.filter(|edit| edit.verb == "tabular.append_rows")
379+
.filter_map(|edit| edit.params.get("rows").and_then(|value| value.as_array()))
380+
.map(Vec::len)
381+
.sum()
382+
}
383+
375384
fn unique_keyed_rows(edits: &[&Edit]) -> BTreeSet<String> {
376385
edits
377386
.iter()

binoc-stdlib/src/correspondence/parse.rs

Lines changed: 68 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,9 @@ impl ParseRule for CsvParse {
6060

6161
fn parse(&self, item: &ItemRef, data: &dyn DataAccess) -> BinocResult<ParseOutput> {
6262
let bytes = data.read_bytes(item)?;
63-
let tabular = parse_csv_bytes(&bytes, delimiter_for(item))?;
64-
let sections = detect_stacked_sections(&tabular);
63+
let records = parse_csv_records(&bytes, delimiter_for(item))?;
64+
let tabular = table_from_csv_records(records.clone());
65+
let sections = detect_stacked_sections_from_rows(&records);
6566

6667
// Fewer than two qualifying regions: a plain CSV is a single table,
6768
// emitted as a LEAF `tabular_v1` artifact with no children.
@@ -697,31 +698,62 @@ fn delimiter_for(item: &ItemRef) -> u8 {
697698
}
698699
}
699700

701+
/// Test-only convenience: parses `bytes` into raw records and converts them
/// straight into a `TabularData`.
#[cfg(test)]
fn parse_csv_bytes(bytes: &[u8], delimiter: u8) -> BinocResult<TabularData> {
    let records = parse_csv_records(bytes, delimiter)?;
    Ok(table_from_csv_records(records))
}
705+
706+
fn parse_csv_records(bytes: &[u8], delimiter: u8) -> BinocResult<Vec<Vec<String>>> {
701707
let mut reader = csv::ReaderBuilder::new()
702708
.delimiter(delimiter)
709+
.has_headers(false)
703710
.flexible(true)
704711
.from_reader(bytes);
705-
let headers = reader
706-
.byte_headers()
707-
.map_err(|err| BinocError::Csv(err.to_string()))?
708-
.iter()
709-
.map(|field| String::from_utf8_lossy(field).into_owned())
710-
.collect();
711-
let mut rows = Vec::new();
712+
let mut records = Vec::new();
712713
let mut record = csv::ByteRecord::new();
713714
while reader
714715
.read_byte_record(&mut record)
715716
.map_err(|err| BinocError::Csv(err.to_string()))?
716717
{
717-
rows.push(
718+
records.push(
718719
record
719720
.iter()
720721
.map(|field| String::from_utf8_lossy(field).into_owned())
721722
.collect(),
722723
);
723724
}
724-
Ok(TabularData::from_string_rows(headers, rows))
725+
Ok(records)
726+
}
727+
728+
fn table_from_csv_records(records: Vec<Vec<String>>) -> TabularData {
729+
let Some(first) = records.first() else {
730+
return TabularData::from_string_rows(Vec::new(), Vec::new());
731+
};
732+
let width = records.iter().map(Vec::len).max().unwrap_or(first.len());
733+
let headers = complete_csv_headers(first, width);
734+
let rows = records.into_iter().skip(1).collect();
735+
TabularData::from_string_rows(headers, rows)
736+
}
737+
738+
/// Builds exactly `width` unique header names from the first CSV record.
///
/// Each position takes the trimmed cell from `first`; a missing or blank cell
/// becomes `column_<n>` (1-based). A name that was already produced gets
/// `_2`, `_3`, ... appended to its base until it is unique.
fn complete_csv_headers(first: &[String], width: usize) -> Vec<String> {
    let mut taken = BTreeSet::new();
    (0..width)
        .map(|position| {
            let base = match first.get(position).map(|cell| cell.trim()) {
                Some(trimmed) if !trimmed.is_empty() => trimmed.to_string(),
                _ => format!("column_{}", position + 1),
            };
            let mut name = base.clone();
            let mut next_suffix = 2usize;
            while taken.contains(&name) {
                name = format!("{base}_{next_suffix}");
                next_suffix += 1;
            }
            taken.insert(name.clone());
            name
        })
        .collect()
}
726758

727759
#[derive(Debug, Clone)]
@@ -747,13 +779,11 @@ struct StackedSection {
747779
/// has more than 10 rows (≥ 11, counting its header). When it qualifies, each
748780
/// region's first row is the header and the rest are data rows, trimmed to the
749781
/// region width. Otherwise an empty `Vec` is returned (a single flat table).
750-
fn detect_stacked_sections(table: &TabularData) -> Vec<StackedSection> {
751-
let rows = raw_rows(table);
752-
782+
fn detect_stacked_sections_from_rows(rows: &[Vec<String>]) -> Vec<StackedSection> {
753783
// Partition into regions of consecutive same-width rows, skipping blanks.
754784
let mut regions: Vec<Vec<Vec<String>>> = Vec::new();
755785
let mut current_width: Option<usize> = None;
756-
for row in &rows {
786+
for row in rows {
757787
let width = normalized_width(row);
758788
if width == 0 {
759789
// Blank rows are transparent.
@@ -816,17 +846,6 @@ fn children_from_sections(parent_path: &str, sections: &[StackedSection]) -> Vec
816846
children
817847
}
818848

819-
fn raw_rows(table: &TabularData) -> Vec<Vec<String>> {
820-
std::iter::once(table.headers.clone())
821-
.chain(
822-
table
823-
.rows
824-
.iter()
825-
.map(|row| row.iter().map(|cell| cell.as_text().into_owned()).collect()),
826-
)
827-
.collect()
828-
}
829-
830849
fn normalized_width(row: &[String]) -> usize {
831850
row.iter()
832851
.rposition(|cell| !cell.trim().is_empty())
@@ -849,8 +868,8 @@ mod tests {
849868
use super::*;
850869

851870
fn detect(csv: &str) -> Vec<StackedSection> {
852-
let table = parse_csv_bytes(csv.as_bytes(), b',').expect("parse csv");
853-
detect_stacked_sections(&table)
871+
let records = parse_csv_records(csv.as_bytes(), b',').expect("parse csv");
872+
detect_stacked_sections_from_rows(&records)
854873
}
855874

856875
/// Build a CSV body of `count` rows, each `width` comma-separated cells,
@@ -868,6 +887,27 @@ mod tests {
868887
out
869888
}
870889

890+
/// A one-cell banner line followed by wider rows: the banner becomes the
/// first header, the header row is padded with `column_N` placeholders, and
/// the fields of the wider rows are all retained.
#[test]
fn csv_parse_preserves_fields_after_single_cell_banner() {
    let csv = "Land-Ocean: Global Means\n\
               Year,Jan,Feb\n\
               1880,-.18,-.24\n";
    let table = parse_csv_bytes(csv.as_bytes(), b',').expect("parse csv");

    let expected_headers = vec![
        "Land-Ocean: Global Means".to_string(),
        "column_2".to_string(),
        "column_3".to_string(),
    ];
    assert_eq!(table.headers, expected_headers);

    assert_eq!(table.rows.len(), 2);
    // The real column names arrive as the first data row, fully intact.
    for (index, expected) in ["Year", "Jan", "Feb"].into_iter().enumerate() {
        assert_eq!(table.rows[0][index].as_text(), expected);
    }
    assert_eq!(table.rows[1][2].as_text(), "-.24");
}
910+
871911
#[test]
872912
fn flat_ragged_csv_is_not_stacked() {
873913
// A plain flat table with a few ragged rows (brfss / fda shape). Width

0 commit comments

Comments
 (0)