@@ -2168,13 +2168,15 @@ fn try_expand_multi_row_cells(
21682168/// * `phantom_empty_row` — a row whose every cell is empty, surrounded
21692169/// above and below by rows with content. SLANet sometimes emits an
21702170/// extra row that doesn't correspond to any visible PDF row.
2171- /// * `multi_row_in_cell` — at least one `rowspan==1` cell encloses
2172- /// PDF text items that cluster into two distinct visual lines
2171+ /// * `multi_row_in_cell` — at least one non-label `rowspan==1` cell
2172+ /// encloses PDF text items that cluster into two distinct visual lines
21732173/// separated by a whitespace gap larger than the line height. Cells
21742174/// declared as `rowspan>1` are excluded since they are *expected*
2175- /// to span multiple lines. SLANet's row under-detection on
2176- /// tightly-packed tables produces the rowspan==1-but-multi-line
2177- /// pattern (the FNBO failure mode).
2175+ /// to span multiple lines. Wraps in header, first-row, or first-column cells
2176+ /// are ignored unless the in-place row expansion has enough support to repair
2177+ /// them, because those are often legitimate wrapped headers or row labels.
2178+ /// SLANet's row under-detection on tightly-packed tables produces the
2179+ /// rowspan==1-but-multi-line pattern (the FNBO failure mode).
21782180fn detect_tsr_quality_issue (
21792181 buffer : & [ u8 ] ,
21802182 input : & TsrTableInput ,
@@ -2232,6 +2234,8 @@ fn detect_tsr_quality_issue(
22322234 } ;
22332235 let expanded_cells =
22342236 try_expand_multi_row_cells ( cells, & items, page_h, coords, adaptive_threshold) ;
2237+ let first_row = cells. iter ( ) . map ( |cell| cell. row ) . min ( ) . unwrap_or ( 0 ) ;
2238+ let first_col = cells. iter ( ) . map ( |cell| cell. col ) . min ( ) . unwrap_or ( 0 ) ;
22352239
22362240 for cell in cells {
22372241 // rowspan>1 cells are intentionally multi-line — skip them.
@@ -2245,14 +2249,30 @@ fn detect_tsr_quality_issue(
22452249 if cell_items. len ( ) < 2 {
22462250 continue ;
22472251 }
2248- if cluster_tsr_cell_text_lines ( cell_items) . len ( ) >= 2 {
2252+ if cluster_tsr_cell_text_lines ( cell_items) . len ( ) < 2 {
2253+ continue ;
2254+ }
2255+ if expanded_cells. is_some ( ) {
22492256 return Ok ( Some ( TsrQualityIssue :: MultiRowInCell { expanded_cells } ) ) ;
22502257 }
2258+ if !is_wrapped_tsr_label_cell ( cell, first_row, first_col) {
2259+ return Ok ( Some ( TsrQualityIssue :: MultiRowInCell {
2260+ expanded_cells : None ,
2261+ } ) ) ;
2262+ }
22512263 }
22522264
22532265 Ok ( None )
22542266}
22552267
2268+ fn is_wrapped_tsr_label_cell (
2269+ cell : & tables:: StructuredCell ,
2270+ first_row : usize ,
2271+ first_col : usize ,
2272+ ) -> bool {
2273+ cell. is_header || cell. row == first_row || cell. col == first_col
2274+ }
2275+
22562276/// Auto-fallback variant of [`extract_tables_with_structure_mem`]:
22572277/// runs the TSR-hybrid path, checks the resulting cells for known
22582278/// SLANet detection pathologies (phantom rows, multi-row-in-cell text),
0 commit comments