Skip to content

Commit 5182bab

Browse files
author
vidy
committed
Reorganize the tree into several small files
1 parent 509df96 commit 5182bab

File tree

1 file changed

+8
-268
lines changed

1 file changed

+8
-268
lines changed

src/tree.rs

Lines changed: 8 additions & 268 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
use pdf_render::TextSpan;
2-
use pathfinder_geometry::{
3-
vector::Vector2F,
4-
rect::RectF
5-
};
1+
mod gap;
2+
mod line;
3+
mod render;
64

7-
use std::collections::BTreeSet;
5+
use gap::{dist_x, dist_y, gaps, left_right_gap, top_bottom_gap};
6+
use line::{analyze_lines, overlapping_lines, Lines};
7+
use pdf_render::TextSpan;
8+
use pathfinder_geometry::rect::RectF;
89

910
use itertools::Itertools;
1011
use ordered_float::NotNan;
@@ -37,7 +38,7 @@ pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]])
3738
};
3839
let probably_footer = |boxes: &mut [(RectF, usize)]| {
3940
sort_x(boxes);
40-
let x_gaps: Vec<f32> = gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x()))
41+
let x_gaps: Vec<f32> = gap::gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x()))
4142
.collect();
4243

4344
let count = split_by(boxes, &x_gaps, |r| r.min_x()).filter(|cell| probably_header(cell)).count();
@@ -72,71 +73,6 @@ pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]])
7273
split(boxes, &spans, &lines)
7374
}
7475

75-
fn analyze_lines(lines: &[[f32; 4]]) -> Lines {
76-
let mut hlines = BTreeSet::new();
77-
let mut vlines = BTreeSet::new();
78-
79-
for &[x1, y1, x2, y2] in lines {
80-
if x1 == x2 {
81-
vlines.insert(NotNan::new(x1).unwrap());
82-
} else if y1 == y2 {
83-
hlines.insert(NotNan::new(y1).unwrap());
84-
}
85-
}
86-
87-
fn dedup(lines: impl Iterator<Item=NotNan<f32>>) -> Vec<(f32, f32)> {
88-
let threshold = 10.0;
89-
let mut out = vec![];
90-
let mut lines = lines.map(|f| *f).peekable();
91-
while let Some(start) = lines.next() {
92-
let mut last = start;
93-
while let Some(&p) = lines.peek() {
94-
if last + threshold > p {
95-
last = p;
96-
lines.next();
97-
} else {
98-
break;
99-
}
100-
}
101-
out.push((start, last));
102-
}
103-
out
104-
}
105-
106-
let hlines = dedup(hlines.iter().cloned());
107-
let vlines = dedup(vlines.iter().cloned());
108-
109-
let mut line_grid = vec![false; vlines.len() * hlines.len()];
110-
for &[x1, y1, x2, y2] in lines {
111-
if x1 == x2 {
112-
let v_idx = vlines.iter().position(|&(a, b)| a <= x1 && x1 <= b).unwrap_or(vlines.len());
113-
let h_start = hlines.iter().position(|&(a, b)| y1 >= a).unwrap_or(hlines.len());
114-
let h_end = hlines.iter().position(|&(a, b)| y2 <= b).unwrap_or(hlines.len());
115-
for h in h_start .. h_end {
116-
line_grid[v_idx * hlines.len() + h] = true;
117-
}
118-
} else if y1 == y2 {
119-
let h_idx = hlines.iter().position(|&(a, b)| a <= y1 && y1 <= b).unwrap_or(hlines.len());
120-
let v_start = vlines.iter().position(|&(a, b)| x1 >= a).unwrap_or(vlines.len());
121-
let v_end = vlines.iter().position(|&(a, b)| x2 <= b).unwrap_or(vlines.len());
122-
for v in v_start .. v_end {
123-
line_grid[v * hlines.len() + h_idx] = true;
124-
}
125-
}
126-
}
127-
128-
129-
//println!("hlines: {:?}", hlines);
130-
//println!("vlines: {:?}", vlines);
131-
132-
Lines { hlines, vlines, line_grid }
133-
}
134-
135-
pub struct Lines {
136-
hlines: Vec<(f32, f32)>,
137-
vlines: Vec<(f32, f32)>,
138-
line_grid: Vec<bool>,
139-
}
14076

14177
#[derive(Copy, Clone, Debug)]
14278
struct Span {
@@ -384,68 +320,6 @@ pub enum NodeTag {
384320
Complex,
385321
}
386322

387-
pub fn render<E: Encoder>(w: &mut String, spans: &[TextSpan<E>], node: &Node, bbox: RectF) {
388-
_render(w, spans, node, bbox, 0)
389-
}
390-
fn _render<E: Encoder>(w: &mut String, spans: &[TextSpan<E>], node: &Node, bbox: RectF, level: usize) {
391-
use std::fmt::Write;
392-
393-
match *node {
394-
Node::Final { ref indices } => {
395-
/*
396-
for i in start..end {
397-
if let Span::Text(ref t) = spans[i] {
398-
write!(w, r#"<text"#).unwrap();
399-
write!(w, r#" font-size="{}""#, t.font_size).unwrap();
400-
write!(w, r#" transform="{}""#, Transform::from(t.transform)).unwrap();
401-
write_text_span(w, t);
402-
write!(w, "</text>").unwrap();
403-
}
404-
}
405-
*/
406-
407-
if indices.len() > 0 {
408-
let class = classify(indices.iter().cloned().filter_map(|i| spans.get(i)));
409-
410-
for &i in indices.iter() {
411-
let r = spans[i].rect;
412-
write!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" class="{:?}" />"#,
413-
r.min_x(), r.max_x(), r.max_y(), r.max_y(),
414-
class
415-
);
416-
}
417-
}
418-
}
419-
Node::Grid { ref x, ref y, ref cells, tag } => {
420-
use std::iter::once;
421-
let columns = x.len() + 1;
422-
write!(w, r#"<rect x="{}" y="{}" width="{}" height="{}" class="{:?}" />"#,
423-
bbox.min_x(), bbox.min_y(), bbox.width(), bbox.height(), tag
424-
);
425-
426-
for (j, ((min_y, max_y), row)) in once(bbox.min_y()).chain(y.iter().cloned()).chain(once(bbox.max_y())).tuple_windows().zip(cells.chunks_exact(columns)).enumerate() {
427-
if j > 0 {
428-
writeln!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" level="{level}"></line>"#,
429-
bbox.min_x(), bbox.max_x(), min_y, min_y);
430-
}
431-
432-
for (i, ((min_x, max_x), cell)) in once(bbox.min_x()).chain(x.iter().cloned()).chain(once(bbox.max_x())).tuple_windows().zip(row).enumerate() {
433-
if i > 0 {
434-
writeln!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" level="{level}"></line>"#,
435-
min_x, min_x, bbox.min_y(), bbox.max_y());
436-
}
437-
438-
let bbox = RectF::from_points(Vector2F::new(min_x, min_y), Vector2F::new(max_x, max_y));
439-
_render(w, spans, cell, bbox, level+1);
440-
}
441-
}
442-
}
443-
Node::Table { .. } => {
444-
445-
}
446-
}
447-
}
448-
449323
fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines: &Lines) -> Node {
450324
let num_boxes = boxes.len();
451325
if num_boxes < 2 {
@@ -567,147 +441,13 @@ fn split_v(boxes: &mut [(RectF, usize)]) -> Node {
567441
}
568442
}
569443

570-
fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
571-
let num_boxes = boxes.len();
572-
if num_boxes < 2 {
573-
return (None, None);
574-
}
575-
576-
let mut gaps = gap_list(boxes, |r| (
577-
// top left y
578-
r.min_y(),
579-
// bottom right y
580-
r.max_y()
581-
));
582-
let top_limit = bbox.min_y() + bbox.height() * 0.2;
583-
let bottom_limit = bbox.min_y() + bbox.height() * 0.8;
584-
585-
match gaps.next() {
586-
Some((y, _, top)) if y < top_limit => {
587-
match gaps.last() {
588-
Some((y, _, bottom)) if y > bottom_limit => (Some(top), Some(bottom)),
589-
_ => (Some(top), None)
590-
}
591-
}
592-
Some((y, _, bottom)) if y > bottom_limit => (None, Some(bottom)),
593-
_ => (None, None)
594-
}
595-
}
596-
597-
fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
598-
let num_boxes = boxes.len();
599-
if num_boxes < 2 {
600-
return (None, None);
601-
}
602-
603-
let mut gaps = gap_list(boxes, |r| (r.min_x(), r.max_x()));
604-
let left_limit = bbox.min_x() + bbox.width() * 0.2;
605-
let right_limit = bbox.min_x() + bbox.width() * 0.8;
606-
match gaps.next() {
607-
Some((x, _, left)) if x < left_limit => {
608-
match gaps.last() {
609-
Some((x, _, right)) if x > right_limit => (Some(left), Some(right)),
610-
_ => (Some(left), None)
611-
}
612-
}
613-
Some((x, _, right)) if x > right_limit => (None, Some(right)),
614-
_ => (None, None)
615-
}
616-
}
617-
618444
fn sort_x(boxes: &mut [(RectF, usize)]) {
619445
boxes.sort_unstable_by(|a, b| a.0.min_x().partial_cmp(&b.0.min_x()).unwrap());
620446
}
621447
fn sort_y(boxes: &mut [(RectF, usize)]) {
622448
boxes.sort_unstable_by(|a, b| a.0.min_y().partial_cmp(&b.0.min_y()).unwrap());
623449
}
624-
fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node {
625-
sort_y(boxes);
626-
let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap();
627-
628-
let mut y_center = boxes[0].0.center().y();
629-
let mut lines = vec![];
630-
let mut y_splits = vec![];
631-
632-
let mut start = 0;
633-
'a: loop {
634-
for (i, &(r, _)) in boxes[start..].iter().enumerate() {
635-
if r.center().y() > 0.5 * avg_height + y_center {
636-
let end = start + i;
637-
sort_x(&mut boxes[start..end]);
638-
let bbox = boxes[start..end].iter().map(|&(r, _)| r).reduce(|a, b| a.union_rect(b)).unwrap();
639-
640-
y_splits.push(bbox.max_y());
641-
lines.push(Node::singleton(&boxes[start..end]));
642-
y_center = r.center().y();
643-
644-
start = end;
645-
continue 'a;
646-
}
647-
}
648450

649-
sort_x(&mut boxes[start..]);
650-
lines.push(Node::singleton(&boxes[start..]));
651-
652-
break;
653-
}
654-
match lines.len() {
655-
0 => Node::singleton(&[]),
656-
1 => lines.pop().unwrap(),
657-
_ => Node::Grid {
658-
x: vec![],
659-
y: y_splits,
660-
cells: lines,
661-
tag: NodeTag::Paragraph
662-
}
663-
}
664-
}
665-
666-
fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=(f32, f32, usize)> + 'a {
667-
let mut boxes = boxes.iter();
668-
let &(ref r, _) = boxes.next().unwrap();
669-
let (_, mut last_max) = span(r);
670-
boxes.enumerate().filter_map(move |(idx, &(ref r, _))| {
671-
// top left y, bottom right y
672-
let (min, max) = span(&r);
673-
let r = if min > last_max {
674-
Some((last_max, min, idx+1))
675-
} else {
676-
None
677-
};
678-
last_max = max.max(last_max);
679-
r
680-
})
681-
}
682-
683-
fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=f32> + 'a {
684-
let mut boxes = boxes.iter();
685-
let &(ref r, _) = boxes.next().unwrap();
686-
let (_, mut last_max) = span(r);
687-
boxes.filter_map(move |&(ref r, _)| {
688-
let (min, max) = span(&r);
689-
let r = if min - last_max >= threshold {
690-
Some(0.5 * (last_max + min))
691-
} else {
692-
None
693-
};
694-
last_max = max.max(last_max);
695-
r
696-
})
697-
}
698-
699-
fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> {
700-
gap_list(boxes, span)
701-
.max_by_key(|&(a, b, _)| NotNan::new(b - a).unwrap())
702-
.map(|(a, b, _)| (b - a, 0.5 * (a + b)))
703-
}
704-
705-
fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
706-
max_gap(boxes, |r| (r.min_x(), r.max_x()))
707-
}
708-
fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
709-
max_gap(boxes, |r| (r.min_y(), r.max_y()))
710-
}
711451
fn split_by<'a>(list: &'a mut [(RectF, usize)], at: &'a [f32], by: impl Fn(&RectF) -> f32) -> impl Iterator<Item=&'a mut [(RectF, usize)]> {
712452
SplitBy {
713453
data: list,

0 commit comments

Comments
 (0)