Skip to content

Commit 4a7e0e4

Browse files
committed
Merge remote-tracking branch 'upstream/main'
# Conflicts:
#	napi/package.json
2 parents c503755 + 1f28e52 commit 4a7e0e4

4 files changed

Lines changed: 82 additions & 7 deletions

File tree

napi/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@docmost/pdf-inspector",
3-
"version": "1.9.3",
3+
"version": "1.9.4",
44
"description": "Fast PDF classification, text extraction, and image extraction. Native Rust performance via napi-rs.",
55
"main": "index.js",
66
"types": "index.d.ts",

src/lib.rs

Lines changed: 26 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2168,13 +2168,15 @@ fn try_expand_multi_row_cells(
21682168
/// * `phantom_empty_row` — a row whose every cell is empty, surrounded
21692169
/// above and below by rows with content. SLANet sometimes emits an
21702170
/// extra row that doesn't correspond to any visible PDF row.
2171-
/// * `multi_row_in_cell` — at least one `rowspan==1` cell encloses
2172-
/// PDF text items that cluster into two distinct visual lines
2171+
/// * `multi_row_in_cell` — at least one non-label `rowspan==1` cell
2172+
/// encloses PDF text items that cluster into two distinct visual lines
21732173
/// separated by a whitespace gap larger than the line height. Cells
21742174
/// declared as `rowspan>1` are excluded since they are *expected*
2175-
/// to span multiple lines. SLANet's row under-detection on
2176-
/// tightly-packed tables produces the rowspan==1-but-multi-line
2177-
/// pattern (the FNBO failure mode).
2175+
/// to span multiple lines. First-row/first-column wraps are ignored
2176+
/// unless the in-place row expansion has enough support to repair them,
2177+
/// because those are often legitimate wrapped headers or row labels.
2178+
/// SLANet's row under-detection on tightly-packed tables produces the
2179+
/// rowspan==1-but-multi-line pattern (the FNBO failure mode).
21782180
fn detect_tsr_quality_issue(
21792181
buffer: &[u8],
21802182
input: &TsrTableInput,
@@ -2232,6 +2234,8 @@ fn detect_tsr_quality_issue(
22322234
};
22332235
let expanded_cells =
22342236
try_expand_multi_row_cells(cells, &items, page_h, coords, adaptive_threshold);
2237+
let first_row = cells.iter().map(|cell| cell.row).min().unwrap_or(0);
2238+
let first_col = cells.iter().map(|cell| cell.col).min().unwrap_or(0);
22352239

22362240
for cell in cells {
22372241
// rowspan>1 cells are intentionally multi-line — skip them.
@@ -2245,14 +2249,30 @@ fn detect_tsr_quality_issue(
22452249
if cell_items.len() < 2 {
22462250
continue;
22472251
}
2248-
if cluster_tsr_cell_text_lines(cell_items).len() >= 2 {
2252+
if cluster_tsr_cell_text_lines(cell_items).len() < 2 {
2253+
continue;
2254+
}
2255+
if expanded_cells.is_some() {
22492256
return Ok(Some(TsrQualityIssue::MultiRowInCell { expanded_cells }));
22502257
}
2258+
if !is_wrapped_tsr_label_cell(cell, first_row, first_col) {
2259+
return Ok(Some(TsrQualityIssue::MultiRowInCell {
2260+
expanded_cells: None,
2261+
}));
2262+
}
22512263
}
22522264

22532265
Ok(None)
22542266
}
22552267

2268+
fn is_wrapped_tsr_label_cell(
2269+
cell: &tables::StructuredCell,
2270+
first_row: usize,
2271+
first_col: usize,
2272+
) -> bool {
2273+
cell.is_header || cell.row == first_row || cell.col == first_col
2274+
}
2275+
22562276
/// Auto-fallback variant of [`extract_tables_with_structure_mem`]:
22572277
/// runs the TSR-hybrid path, checks the resulting cells for known
22582278
/// SLANet detection pathologies (phantom rows, multi-row-in-cell text),
41 KB
Binary file not shown.

tests/integration_tests.rs

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2533,6 +2533,61 @@ fn test_auto_expands_under_counted_vector_grid_rows() {
25332533
);
25342534
}
25352535

2536+
#[test]
2537+
fn test_auto_keeps_wrapped_header_vector_grid_doc51() {
2538+
use pdf_inspector::{extract_tables_with_structure_auto_mem, TsrTableInput};
2539+
2540+
let buf = std::fs::read("tests/fixtures/government_positions_women.pdf").unwrap();
2541+
let crop = [0.0, 0.0, 612.0, 792.0];
2542+
let grid = detect_vector_grid_in_region_mem(&buf, 0, crop, 200.0)
2543+
.unwrap()
2544+
.expect("expected doc 51 vector grid");
2545+
assert_eq!(
2546+
grid.cell_bboxes.len(),
2547+
36,
2548+
"doc 51 should have a 9x4 vector grid"
2549+
);
2550+
2551+
let results = extract_tables_with_structure_auto_mem(
2552+
&buf,
2553+
&[TsrTableInput {
2554+
page: 0,
2555+
crop_pdf_pt_bbox: crop,
2556+
render_dpi: 200.0,
2557+
structure_tokens: grid.structure_tokens,
2558+
cell_bboxes: grid.cell_bboxes,
2559+
}],
2560+
)
2561+
.unwrap();
2562+
2563+
assert_eq!(results.len(), 1);
2564+
let r = &results[0];
2565+
assert!(
2566+
r.fallback_reason.is_none(),
2567+
"wrapped header/label text should not trigger heuristic fallback: {:?}\n{}",
2568+
r.fallback_reason,
2569+
r.markdown
2570+
);
2571+
let md = &r.markdown;
2572+
assert!(md.contains("Government Position"), "missing header: {md}");
2573+
assert!(
2574+
md.contains("Aquino Administration"),
2575+
"missing Aquino header: {md}"
2576+
);
2577+
assert!(
2578+
md.contains("Ramos Administration"),
2579+
"missing Ramos header: {md}"
2580+
);
2581+
assert!(
2582+
md.contains("City Municipal Councilor"),
2583+
"row label was truncated: {md}"
2584+
);
2585+
assert!(
2586+
!md.contains("|Position||Administration"),
2587+
"heuristic fallback split the header row: {md}"
2588+
);
2589+
}
2590+
25362591
#[test]
25372592
fn test_auto_returns_empty_inputs() {
25382593
use pdf_inspector::extract_tables_with_structure_auto_mem;

0 commit comments

Comments
 (0)