|
1 | | -use pdf_render::TextSpan; |
2 | | -use pathfinder_geometry::{ |
3 | | - vector::Vector2F, |
4 | | - rect::RectF |
5 | | -}; |
| 1 | +mod gap; |
| 2 | +mod line; |
| 3 | +mod render; |
6 | 4 |
|
7 | | -use std::collections::BTreeSet; |
| 5 | +use gap::{dist_x, dist_y, gaps, left_right_gap, top_bottom_gap}; |
| 6 | +use line::{analyze_lines, overlapping_lines, Lines}; |
| 7 | +use pdf_render::TextSpan; |
| 8 | +use pathfinder_geometry::rect::RectF; |
8 | 9 |
|
9 | 10 | use itertools::Itertools; |
10 | 11 | use ordered_float::NotNan; |
@@ -37,7 +38,7 @@ pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]]) |
37 | 38 | }; |
38 | 39 | let probably_footer = |boxes: &mut [(RectF, usize)]| { |
39 | 40 | sort_x(boxes); |
40 | | - let x_gaps: Vec<f32> = gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x())) |
| 41 | + let x_gaps: Vec<f32> = gap::gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x())) |
41 | 42 | .collect(); |
42 | 43 |
|
43 | 44 | let count = split_by(boxes, &x_gaps, |r| r.min_x()).filter(|cell| probably_header(cell)).count(); |
@@ -72,71 +73,6 @@ pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]]) |
72 | 73 | split(boxes, &spans, &lines) |
73 | 74 | } |
74 | 75 |
|
75 | | -fn analyze_lines(lines: &[[f32; 4]]) -> Lines { |
76 | | - let mut hlines = BTreeSet::new(); |
77 | | - let mut vlines = BTreeSet::new(); |
78 | | - |
79 | | - for &[x1, y1, x2, y2] in lines { |
80 | | - if x1 == x2 { |
81 | | - vlines.insert(NotNan::new(x1).unwrap()); |
82 | | - } else if y1 == y2 { |
83 | | - hlines.insert(NotNan::new(y1).unwrap()); |
84 | | - } |
85 | | - } |
86 | | - |
87 | | - fn dedup(lines: impl Iterator<Item=NotNan<f32>>) -> Vec<(f32, f32)> { |
88 | | - let threshold = 10.0; |
89 | | - let mut out = vec![]; |
90 | | - let mut lines = lines.map(|f| *f).peekable(); |
91 | | - while let Some(start) = lines.next() { |
92 | | - let mut last = start; |
93 | | - while let Some(&p) = lines.peek() { |
94 | | - if last + threshold > p { |
95 | | - last = p; |
96 | | - lines.next(); |
97 | | - } else { |
98 | | - break; |
99 | | - } |
100 | | - } |
101 | | - out.push((start, last)); |
102 | | - } |
103 | | - out |
104 | | - } |
105 | | - |
106 | | - let hlines = dedup(hlines.iter().cloned()); |
107 | | - let vlines = dedup(vlines.iter().cloned()); |
108 | | - |
109 | | - let mut line_grid = vec![false; vlines.len() * hlines.len()]; |
110 | | - for &[x1, y1, x2, y2] in lines { |
111 | | - if x1 == x2 { |
112 | | - let v_idx = vlines.iter().position(|&(a, b)| a <= x1 && x1 <= b).unwrap_or(vlines.len()); |
113 | | - let h_start = hlines.iter().position(|&(a, b)| y1 >= a).unwrap_or(hlines.len()); |
114 | | - let h_end = hlines.iter().position(|&(a, b)| y2 <= b).unwrap_or(hlines.len()); |
115 | | - for h in h_start .. h_end { |
116 | | - line_grid[v_idx * hlines.len() + h] = true; |
117 | | - } |
118 | | - } else if y1 == y2 { |
119 | | - let h_idx = hlines.iter().position(|&(a, b)| a <= y1 && y1 <= b).unwrap_or(hlines.len()); |
120 | | - let v_start = vlines.iter().position(|&(a, b)| x1 >= a).unwrap_or(vlines.len()); |
121 | | - let v_end = vlines.iter().position(|&(a, b)| x2 <= b).unwrap_or(vlines.len()); |
122 | | - for v in v_start .. v_end { |
123 | | - line_grid[v * hlines.len() + h_idx] = true; |
124 | | - } |
125 | | - } |
126 | | - } |
127 | | - |
128 | | - |
129 | | - //println!("hlines: {:?}", hlines); |
130 | | - //println!("vlines: {:?}", vlines); |
131 | | - |
132 | | - Lines { hlines, vlines, line_grid } |
133 | | -} |
134 | | - |
/// Result of `analyze_lines`: the ruling lines of a page, grouped into bands.
#[derive(Debug, Clone)]
pub struct Lines {
    /// `(first, last)` y-coordinate bands of the horizontal lines, in ascending order.
    hlines: Vec<(f32, f32)>,
    /// `(first, last)` x-coordinate bands of the vertical lines, in ascending order.
    vlines: Vec<(f32, f32)>,
    /// Occupancy flags, `vlines.len() * hlines.len()` entries, indexed as
    /// `v * hlines.len() + h`.
    line_grid: Vec<bool>,
}
140 | 76 |
|
141 | 77 | #[derive(Copy, Clone, Debug)] |
142 | 78 | struct Span { |
@@ -384,68 +320,6 @@ pub enum NodeTag { |
384 | 320 | Complex, |
385 | 321 | } |
386 | 322 |
|
387 | | -pub fn render<E: Encoder>(w: &mut String, spans: &[TextSpan<E>], node: &Node, bbox: RectF) { |
388 | | - _render(w, spans, node, bbox, 0) |
389 | | -} |
390 | | -fn _render<E: Encoder>(w: &mut String, spans: &[TextSpan<E>], node: &Node, bbox: RectF, level: usize) { |
391 | | - use std::fmt::Write; |
392 | | - |
393 | | - match *node { |
394 | | - Node::Final { ref indices } => { |
395 | | - /* |
396 | | - for i in start..end { |
397 | | - if let Span::Text(ref t) = spans[i] { |
398 | | - write!(w, r#"<text"#).unwrap(); |
399 | | - write!(w, r#" font-size="{}""#, t.font_size).unwrap(); |
400 | | - write!(w, r#" transform="{}""#, Transform::from(t.transform)).unwrap(); |
401 | | - write_text_span(w, t); |
402 | | - write!(w, "</text>").unwrap(); |
403 | | - } |
404 | | - } |
405 | | - */ |
406 | | - |
407 | | - if indices.len() > 0 { |
408 | | - let class = classify(indices.iter().cloned().filter_map(|i| spans.get(i))); |
409 | | - |
410 | | - for &i in indices.iter() { |
411 | | - let r = spans[i].rect; |
412 | | - write!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" class="{:?}" />"#, |
413 | | - r.min_x(), r.max_x(), r.max_y(), r.max_y(), |
414 | | - class |
415 | | - ); |
416 | | - } |
417 | | - } |
418 | | - } |
419 | | - Node::Grid { ref x, ref y, ref cells, tag } => { |
420 | | - use std::iter::once; |
421 | | - let columns = x.len() + 1; |
422 | | - write!(w, r#"<rect x="{}" y="{}" width="{}" height="{}" class="{:?}" />"#, |
423 | | - bbox.min_x(), bbox.min_y(), bbox.width(), bbox.height(), tag |
424 | | - ); |
425 | | - |
426 | | - for (j, ((min_y, max_y), row)) in once(bbox.min_y()).chain(y.iter().cloned()).chain(once(bbox.max_y())).tuple_windows().zip(cells.chunks_exact(columns)).enumerate() { |
427 | | - if j > 0 { |
428 | | - writeln!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" level="{level}"></line>"#, |
429 | | - bbox.min_x(), bbox.max_x(), min_y, min_y); |
430 | | - } |
431 | | - |
432 | | - for (i, ((min_x, max_x), cell)) in once(bbox.min_x()).chain(x.iter().cloned()).chain(once(bbox.max_x())).tuple_windows().zip(row).enumerate() { |
433 | | - if i > 0 { |
434 | | - writeln!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" level="{level}"></line>"#, |
435 | | - min_x, min_x, bbox.min_y(), bbox.max_y()); |
436 | | - } |
437 | | - |
438 | | - let bbox = RectF::from_points(Vector2F::new(min_x, min_y), Vector2F::new(max_x, max_y)); |
439 | | - _render(w, spans, cell, bbox, level+1); |
440 | | - } |
441 | | - } |
442 | | - } |
443 | | - Node::Table { .. } => { |
444 | | - |
445 | | - } |
446 | | - } |
447 | | -} |
448 | | - |
449 | 323 | fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines: &Lines) -> Node { |
450 | 324 | let num_boxes = boxes.len(); |
451 | 325 | if num_boxes < 2 { |
@@ -567,147 +441,13 @@ fn split_v(boxes: &mut [(RectF, usize)]) -> Node { |
567 | 441 | } |
568 | 442 | } |
569 | 443 |
|
570 | | -fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) { |
571 | | - let num_boxes = boxes.len(); |
572 | | - if num_boxes < 2 { |
573 | | - return (None, None); |
574 | | - } |
575 | | - |
576 | | - let mut gaps = gap_list(boxes, |r| ( |
577 | | - // top left y |
578 | | - r.min_y(), |
579 | | - // bottom right y |
580 | | - r.max_y() |
581 | | - )); |
582 | | - let top_limit = bbox.min_y() + bbox.height() * 0.2; |
583 | | - let bottom_limit = bbox.min_y() + bbox.height() * 0.8; |
584 | | - |
585 | | - match gaps.next() { |
586 | | - Some((y, _, top)) if y < top_limit => { |
587 | | - match gaps.last() { |
588 | | - Some((y, _, bottom)) if y > bottom_limit => (Some(top), Some(bottom)), |
589 | | - _ => (Some(top), None) |
590 | | - } |
591 | | - } |
592 | | - Some((y, _, bottom)) if y > bottom_limit => (None, Some(bottom)), |
593 | | - _ => (None, None) |
594 | | - } |
595 | | -} |
596 | | - |
597 | | -fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) { |
598 | | - let num_boxes = boxes.len(); |
599 | | - if num_boxes < 2 { |
600 | | - return (None, None); |
601 | | - } |
602 | | - |
603 | | - let mut gaps = gap_list(boxes, |r| (r.min_x(), r.max_x())); |
604 | | - let left_limit = bbox.min_x() + bbox.width() * 0.2; |
605 | | - let right_limit = bbox.min_x() + bbox.width() * 0.8; |
606 | | - match gaps.next() { |
607 | | - Some((x, _, left)) if x < left_limit => { |
608 | | - match gaps.last() { |
609 | | - Some((x, _, right)) if x > right_limit => (Some(left), Some(right)), |
610 | | - _ => (Some(left), None) |
611 | | - } |
612 | | - } |
613 | | - Some((x, _, right)) if x > right_limit => (None, Some(right)), |
614 | | - _ => (None, None) |
615 | | - } |
616 | | -} |
617 | | - |
618 | 444 | fn sort_x(boxes: &mut [(RectF, usize)]) { |
619 | 445 | boxes.sort_unstable_by(|a, b| a.0.min_x().partial_cmp(&b.0.min_x()).unwrap()); |
620 | 446 | } |
621 | 447 | fn sort_y(boxes: &mut [(RectF, usize)]) { |
622 | 448 | boxes.sort_unstable_by(|a, b| a.0.min_y().partial_cmp(&b.0.min_y()).unwrap()); |
623 | 449 | } |
624 | | -fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node { |
625 | | - sort_y(boxes); |
626 | | - let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap(); |
627 | | - |
628 | | - let mut y_center = boxes[0].0.center().y(); |
629 | | - let mut lines = vec![]; |
630 | | - let mut y_splits = vec![]; |
631 | | - |
632 | | - let mut start = 0; |
633 | | - 'a: loop { |
634 | | - for (i, &(r, _)) in boxes[start..].iter().enumerate() { |
635 | | - if r.center().y() > 0.5 * avg_height + y_center { |
636 | | - let end = start + i; |
637 | | - sort_x(&mut boxes[start..end]); |
638 | | - let bbox = boxes[start..end].iter().map(|&(r, _)| r).reduce(|a, b| a.union_rect(b)).unwrap(); |
639 | | - |
640 | | - y_splits.push(bbox.max_y()); |
641 | | - lines.push(Node::singleton(&boxes[start..end])); |
642 | | - y_center = r.center().y(); |
643 | | - |
644 | | - start = end; |
645 | | - continue 'a; |
646 | | - } |
647 | | - } |
648 | 450 |
|
649 | | - sort_x(&mut boxes[start..]); |
650 | | - lines.push(Node::singleton(&boxes[start..])); |
651 | | - |
652 | | - break; |
653 | | - } |
654 | | - match lines.len() { |
655 | | - 0 => Node::singleton(&[]), |
656 | | - 1 => lines.pop().unwrap(), |
657 | | - _ => Node::Grid { |
658 | | - x: vec![], |
659 | | - y: y_splits, |
660 | | - cells: lines, |
661 | | - tag: NodeTag::Paragraph |
662 | | - } |
663 | | - } |
664 | | -} |
665 | | - |
666 | | -fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=(f32, f32, usize)> + 'a { |
667 | | - let mut boxes = boxes.iter(); |
668 | | - let &(ref r, _) = boxes.next().unwrap(); |
669 | | - let (_, mut last_max) = span(r); |
670 | | - boxes.enumerate().filter_map(move |(idx, &(ref r, _))| { |
671 | | - // top left y, bottom right y |
672 | | - let (min, max) = span(&r); |
673 | | - let r = if min > last_max { |
674 | | - Some((last_max, min, idx+1)) |
675 | | - } else { |
676 | | - None |
677 | | - }; |
678 | | - last_max = max.max(last_max); |
679 | | - r |
680 | | - }) |
681 | | -} |
682 | | - |
683 | | -fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=f32> + 'a { |
684 | | - let mut boxes = boxes.iter(); |
685 | | - let &(ref r, _) = boxes.next().unwrap(); |
686 | | - let (_, mut last_max) = span(r); |
687 | | - boxes.filter_map(move |&(ref r, _)| { |
688 | | - let (min, max) = span(&r); |
689 | | - let r = if min - last_max >= threshold { |
690 | | - Some(0.5 * (last_max + min)) |
691 | | - } else { |
692 | | - None |
693 | | - }; |
694 | | - last_max = max.max(last_max); |
695 | | - r |
696 | | - }) |
697 | | -} |
698 | | - |
699 | | -fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> { |
700 | | - gap_list(boxes, span) |
701 | | - .max_by_key(|&(a, b, _)| NotNan::new(b - a).unwrap()) |
702 | | - .map(|(a, b, _)| (b - a, 0.5 * (a + b))) |
703 | | -} |
704 | | - |
705 | | -fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> { |
706 | | - max_gap(boxes, |r| (r.min_x(), r.max_x())) |
707 | | -} |
708 | | -fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> { |
709 | | - max_gap(boxes, |r| (r.min_y(), r.max_y())) |
710 | | -} |
711 | 451 | fn split_by<'a>(list: &'a mut [(RectF, usize)], at: &'a [f32], by: impl Fn(&RectF) -> f32) -> impl Iterator<Item=&'a mut [(RectF, usize)]> { |
712 | 452 | SplitBy { |
713 | 453 | data: list, |
|
0 commit comments