Skip to content

Commit 59e6331

Browse files
author
vidy
committed
Rename tree to node
1 parent 5182bab commit 59e6331

File tree

10 files changed

+356
-86
lines changed

10 files changed

+356
-86
lines changed

examples/text.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@ fn main() {
77
let resolver = file.resolver();
88

99
// for (page_nr, page) in file.pages().enumerate() {
10-
let page = file.get_page(0).unwrap();
10+
let page: pdf::object::PageRc = file.get_page(0).unwrap();
1111
let flow = pdf_text::run(&file, &page, &resolver, Default::default()).expect("can't render page");
1212
println!("# page {}", 0 + 1);
1313
for run in flow.runs {
1414
for line in run.lines {
1515
for w in line.words {
16-
// println!(": {}", w.text);
16+
println!(": {}", w.text);
1717
}
1818
}
1919
println!();

src/classify.rs

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@ use pdf_render::TextSpan;
55

66
use crate::util::is_number;
77

8-
use super::util::Tri;
9-
108
#[derive(Copy, Clone, Debug, PartialEq)]
119
pub enum Class {
1210
Number,
@@ -15,33 +13,6 @@ pub enum Class {
1513
Mixed,
1614
}
1715

18-
#[derive(Debug)]
19-
pub struct TriCount {
20-
tru: usize,
21-
fal: usize,
22-
}
23-
impl TriCount {
24-
fn new() -> Self {
25-
TriCount {
26-
tru: 0,
27-
fal: 0
28-
}
29-
}
30-
fn add(&mut self, b: bool) {
31-
match b {
32-
false => self.fal += 1,
33-
true => self.tru += 1,
34-
}
35-
}
36-
fn count(&self) -> Tri {
37-
match (self.fal, self.tru) {
38-
(0, 0) => Tri::Unknown,
39-
(0, _) => Tri::True,
40-
(_, 0) => Tri::False,
41-
(f, t) => Tri::Maybe(t as f32 / (t + f) as f32)
42-
}
43-
}
44-
}
4516
pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator<Item=&'a TextSpan<E>>) -> Class {
4617
use pdf_render::FontEntry;
4718

@@ -72,4 +43,39 @@ pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator<Item=&'a TextSpan<E>>)
7243
(_, Tri::Maybe(_), _) => Class::Paragraph,
7344
_ => Class::Mixed
7445
}
46+
}
47+
48+
/// Four-valued summary of a boolean property tallied over a set of items.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Tri {
    /// At least one observation was recorded and all of them were `false`.
    False,
    /// At least one observation was recorded and all of them were `true`.
    True,
    /// Both outcomes were seen; the payload is the fraction of `true`
    /// observations, strictly between 0 and 1.
    Maybe(f32),
    /// No observation was recorded.
    Unknown,
}

/// Running tally of `true` / `false` observations that can be collapsed
/// into a [`Tri`].
#[derive(Debug, Default)]
pub struct TriCount {
    /// Number of `true` observations.
    tru: usize,
    /// Number of `false` observations.
    fal: usize,
}
impl TriCount {
    /// Creates a tally with both counters at zero.
    fn new() -> Self {
        Self::default()
    }
    /// Records one observation.
    fn add(&mut self, b: bool) {
        if b {
            self.tru += 1;
        } else {
            self.fal += 1;
        }
    }
    /// Collapses the tally into a [`Tri`].
    ///
    /// Returns `Unknown` when nothing was recorded, `True` / `False` when only
    /// one outcome was seen, and `Maybe(ratio)` otherwise, where `ratio` is
    /// `tru / (tru + fal)`.
    fn count(&self) -> Tri {
        match (self.fal, self.tru) {
            (0, 0) => Tri::Unknown,
            (0, _) => Tri::True,
            (_, 0) => Tri::False,
            (f, t) => Tri::Maybe(t as f32 / (t + f) as f32),
        }
    }
}

src/flow.rs

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::classify::{classify, Class};
2-
use crate::tree::{Node, NodeTag};
3-
use crate::util::{avg, CellContent, Rect};
2+
use crate::node::{Node, NodeTag};
3+
use crate::util::avg;
44
use crate::text::concat_text;
55
use std::iter::once;
66
use pathfinder_geometry::rect::RectF;
@@ -34,6 +34,33 @@ pub enum RunType {
3434
Cell,
3535
}
3636

37+
38+
#[derive(Copy, Clone, Debug)]
39+
#[derive(Serialize, Deserialize)]
40+
#[repr(C)]
41+
pub struct Rect {
42+
pub x: f32,
43+
pub y: f32,
44+
pub w: f32,
45+
pub h: f32
46+
}
47+
impl From<RectF> for Rect {
48+
fn from(r: RectF) -> Self {
49+
Rect {
50+
x: r.origin_x(),
51+
y: r.origin_y(),
52+
w: r.width(),
53+
h: r.height()
54+
}
55+
}
56+
}
57+
58+
#[derive(Clone, Debug, Serialize)]
59+
pub struct CellContent {
60+
pub text: String,
61+
pub rect: Rect,
62+
}
63+
3764
#[derive(Serialize, Deserialize)]
3865
pub struct Flow {
3966
pub lines: Vec<Line>,

src/lib.rs

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use pathfinder_geometry::transform2d::Transform2F;
55
use pdf::{backend::Backend, object::{Page, Resolve}, PdfError};
66
use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode, font::OutlineBuilder};
77

8-
mod tree;
8+
mod node;
99
mod util;
1010
mod text;
1111
mod classify;
@@ -17,14 +17,13 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
1717
let mut clip_paths = vec![];
1818
let mut tracer = Tracer::new(&mut cache, &mut clip_paths);
1919

20-
// The tracer backend can be used to get text, pattern, image, etc.
21-
// We will use text and pattern to do further text processing.
20+
// Collect text, patterns and images through the Tracer backend.
2221
render_page(&mut tracer, resolve, &page, transform)?;
2322

2423
let bbox = tracer.view_box();
2524

2625
let items: Vec<DrawItem<OutlineBuilder>> = tracer.finish();
27-
//Get patterns which may have lines and texts inside.
26+
// Collect all patterns, which may contain lines and text.
2827
let mut patterns = HashSet::new();
2928
for item in items.iter() {
3029
if let DrawItem::Vector(ref v) = item {
@@ -84,20 +83,12 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
8483
}
8584
}
8685

87-
// After this loop, all the text and lines are ready
86+
// After this loop, all the text and lines are ready for further processing.
8887
for item in items {
8988
visit_item(item);
9089
}
91-
92-
spans.sort_unstable_by(|a, b| a.rect.min_y().partial_cmp(&b.rect.min_y()).unwrap());
9390

94-
spans.sort_unstable_by(|a, b| a.rect.min_x().partial_cmp(&b.rect.min_x()).unwrap());
95-
96-
for s in spans.iter().map(|s|s.text.as_str()) {
97-
println!(":{}", s)
98-
}
99-
100-
let root = tree::build(&spans, bbox, &lines);
91+
let root = node::build(&spans, bbox, &lines);
10192

10293
let mut flow = Flow::new();
10394
flow::build(&mut flow, &spans, &root, bbox.min_x());
File renamed without changes.

src/node/gap.rs

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
use ordered_float::NotNan;
2+
use pathfinder_geometry::rect::RectF;
3+
4+
/// Lists the empty intervals ("gaps") between boxes along one axis.
///
/// `span` projects a rectangle onto the axis of interest and returns its
/// `(min, max)` extent. The boxes are walked in slice order while tracking the
/// largest `max` seen so far; whenever a box starts strictly beyond that
/// running maximum, a gap is reported.
///
/// Each item is `(gap_start, gap_end, index)`, where `index` is the position
/// in `boxes` of the box that begins right after the gap.
///
/// An empty `boxes` slice yields an empty iterator instead of panicking.
///
/// NOTE(review): the running-maximum scan presumably expects `boxes` to be
/// sorted by `min` along the projected axis -- confirm against the callers.
pub fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=(f32, f32, usize)> + 'a {
    let mut rest = boxes.iter();
    // Seed the running maximum from the first box. With no boxes at all the
    // seed is never read, because `rest` is already exhausted.
    let mut last_max = rest
        .next()
        .map_or(f32::NEG_INFINITY, |(first, _)| span(first).1);
    rest.enumerate().filter_map(move |(idx, (rect, _))| {
        let (min, max) = span(rect);
        // `idx` counts from the second box, hence the `+ 1`.
        let gap = if min > last_max {
            Some((last_max, min, idx + 1))
        } else {
            None
        };
        last_max = max.max(last_max);
        gap
    })
}
20+
21+
/// Yields the midpoint of every gap between boxes that is at least
/// `threshold` wide along one axis.
///
/// `span` projects a rectangle onto the axis of interest and returns its
/// `(min, max)` extent. The boxes are walked in slice order while tracking the
/// largest `max` seen so far; a gap is the distance from that running maximum
/// to the `min` of the next box.
///
/// An empty `boxes` slice yields an empty iterator instead of panicking.
///
/// NOTE(review): the running-maximum scan presumably expects `boxes` to be
/// sorted by `min` along the projected axis -- confirm against the callers.
pub fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=f32> + 'a {
    let mut rest = boxes.iter();
    // Seed the running maximum from the first box. With no boxes at all the
    // seed is never read, because `rest` is already exhausted.
    let mut last_max = rest
        .next()
        .map_or(f32::NEG_INFINITY, |(first, _)| span(first).1);
    rest.filter_map(move |(rect, _)| {
        let (min, max) = span(rect);
        let midpoint = if min - last_max >= threshold {
            Some(0.5 * (last_max + min))
        } else {
            None
        };
        last_max = max.max(last_max);
        midpoint
    })
}
36+
37+
pub fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> {
38+
gap_list(boxes, span)
39+
.max_by_key(|&(a, b, _)| NotNan::new(b - a).unwrap())
40+
.map(|(a, b, _)| (b - a, 0.5 * (a + b)))
41+
}
42+
43+
pub fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
44+
max_gap(boxes, |r| (r.min_x(), r.max_x()))
45+
}
46+
pub fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
47+
max_gap(boxes, |r| (r.min_y(), r.max_y()))
48+
}
49+
50+
/// Looks for a vertical gap near the top and near the bottom of `bbox`.
///
/// Returns `(top, bottom)`, where each entry is the index in `boxes` of the
/// box that follows the corresponding gap:
///
/// * `top` is set when the first gap starts within the upper 20% of `bbox`;
/// * `bottom` is set when the last gap starts within the lower 20% of `bbox`.
///
/// Fewer than two boxes always yields `(None, None)`.
///
/// NOTE(review): when the first gap is not near the top, only that first gap
/// is tested against the bottom limit; later gaps are ignored. Confirm that
/// this is intended.
pub fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let top_limit = bbox.min_y() + bbox.height() * 0.2;
    let bottom_limit = bbox.min_y() + bbox.height() * 0.8;

    // Project every box onto the y axis: (min_y, max_y).
    let mut vertical_gaps = gap_list(boxes, |r| (r.min_y(), r.max_y()));

    let (first_start, first_idx) = match vertical_gaps.next() {
        Some((start, _, idx)) => (start, idx),
        None => return (None, None),
    };

    if first_start < top_limit {
        // A top gap exists; the bottom candidate is the last remaining gap.
        let bottom = vertical_gaps
            .last()
            .filter(|&(start, _, _)| start > bottom_limit)
            .map(|(_, _, idx)| idx);
        (Some(first_idx), bottom)
    } else if first_start > bottom_limit {
        (None, Some(first_idx))
    } else {
        (None, None)
    }
}
76+
77+
/// Looks for a horizontal gap near the left and near the right edge of `bbox`.
///
/// Returns `(left, right)`, where each entry is the index in `boxes` of the
/// box that follows the corresponding gap:
///
/// * `left` is set when the first gap starts within the leftmost 20% of `bbox`;
/// * `right` is set when the last gap starts within the rightmost 20% of `bbox`.
///
/// Fewer than two boxes always yields `(None, None)`.
///
/// NOTE(review): when the first gap is not near the left edge, only that
/// first gap is tested against the right limit; later gaps are ignored.
/// Confirm that this is intended.
pub fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let left_limit = bbox.min_x() + bbox.width() * 0.2;
    let right_limit = bbox.min_x() + bbox.width() * 0.8;

    // Project every box onto the x axis: (min_x, max_x).
    let mut horizontal_gaps = gap_list(boxes, |r| (r.min_x(), r.max_x()));

    let (first_start, first_idx) = match horizontal_gaps.next() {
        Some((start, _, idx)) => (start, idx),
        None => return (None, None),
    };

    if first_start < left_limit {
        // A left gap exists; the right candidate is the last remaining gap.
        let right = horizontal_gaps
            .last()
            .filter(|&(start, _, _)| start > right_limit)
            .map(|(_, _, idx)| idx);
        (Some(first_idx), right)
    } else if first_start > right_limit {
        (None, Some(first_idx))
    } else {
        (None, None)
    }
}

0 commit comments

Comments
 (0)