Skip to content

Commit 9a6e33b

Browse files
committed
add paragraph parser
1 parent 6865523 commit 9a6e33b

14 files changed

+312
-136
lines changed

depdf/__init__.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
1-
from depdf.api import convert_pdf_to_html, convert_pdf_to_html_by_page
1+
from depdf.api import *
2+
from depdf.config import Config
23
from depdf.pdf import DePDF
4+
from depdf.page import DePage
35
from depdf.version import __version__
46

57
__all__ = [
6-
'convert_pdf_to_html',
7-
'convert_pdf_to_html_by_page',
8+
'Config',
89
'DePDF',
10+
'DePage',
11+
'convert_pdf_to_html',
12+
'convert_page_to_html',
13+
'extract_page_tables',
14+
'extract_page_paragraphs',
915
]

depdf/api.py

+16-16
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from depdf.error import PDFTypeError
66
from depdf.log import logger_init
77
from depdf.pdf import DePDF
8+
from depdf.page import DePage
89

910
log = logger_init(__name__)
1011

@@ -30,47 +31,46 @@ def wrapper(pdf_file_path, *args, **kwargs):
3031

3132

3233
@api_load_pdf
33-
def convert_pdf_to_html(pdf_file_path, **kwargs):
34+
def convert_pdf_to_html(pdf_file, **kwargs):
3435
"""
35-
:param pdf_file_path: pdf file absolute path
36+
:param pdf_file: pdf file absolute path
3637
:param kwargs: config keyword arguments
3738
:return:
3839
"""
39-
html = []
40-
return html
40+
return pdf_file.html
4141

4242

4343
@api_load_pdf
44-
def convert_pdf_to_html_by_page(pdf_file_path, pid, **kwargs):
44+
def convert_page_to_html(pdf_file, pid, **kwargs):
4545
"""
46-
:param pdf_file_path: pdf file absolute path
46+
:param pdf_file: pdf file absolute path
4747
:param pid: page number start from 1
4848
:param kwargs: config keyword arguments
4949
:return:
5050
"""
51-
html_page = ''
52-
return html_page
51+
page = pdf_file.pages[pid - 1]
52+
return page.html
5353

5454

5555
@api_load_pdf
56-
def extract_page_tables(pdf_file_path, pid, **kwargs):
56+
def extract_page_tables(pdf_file, pid, **kwargs):
5757
"""
58-
:param pdf_file_path: pdf file absolute path
58+
:param pdf_file: pdf file absolute path
5959
:param pid: page number start from 1
6060
:param kwargs: config keyword arguments
6161
:return:
6262
"""
63-
tables = []
64-
return tables
63+
page = pdf_file.pages[pid - 1]
64+
return page.tables
6565

6666

6767
@api_load_pdf
68-
def extract_page_paragraphs(pdf_file_path, pid, **kwargs):
68+
def extract_page_paragraphs(pdf_file, pid, **kwargs):
6969
"""
70-
:param pdf_file_path: pdf file absolute path
70+
:param pdf_file: pdf file absolute path
7171
:param pid: page number start from 1
7272
:param kwargs: config keyword arguments
7373
:return:
7474
"""
75-
paragraphs = []
76-
return paragraphs
75+
page = pdf_file.pages[pid - 1]
76+
return page.paragraphs

depdf/base.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ class Box(object):
1111
bottom = Decimal(0)
1212
_bbox = (x0, top, x1, bottom)
1313

14+
def __repr__(self):
15+
return '<depdf.Box: {}>'.format(tuple(self.bbox))
16+
1417
@property
1518
def width(self):
1619
return self.x1 - self.x0
@@ -27,7 +30,7 @@ def bbox(self):
2730
def bbox(self, value):
2831
if value is not None:
2932
bbox = self.normalize_bbox(value)
30-
(self.x0, self.top, self.x1, self.bottom) = bbox
33+
self.x0, self.top, self.x1, self.bottom = bbox
3134
self._bbox = bbox
3235

3336
@staticmethod
@@ -36,7 +39,9 @@ def normalize_bbox(bbox):
3639
raise BoxValueError(bbox)
3740
if isinstance(bbox, str):
3841
raise BoxValueError(bbox)
39-
bbox = (Decimal(i) for i in bbox)
42+
if len(bbox) != 4:
43+
raise BoxValueError(bbox)
44+
bbox = [Decimal(i) for i in bbox]
4045
return bbox
4146

4247

@@ -86,3 +91,10 @@ def refresh(self):
8691
def reset(self):
8792
pass
8893

94+
95+
class InnerWrapper(Base):
96+
_inner_objects = []
97+
98+
@property
99+
def inner_objects(self):
100+
return [obj.to_dict if hasattr(obj, 'to_dict') else obj for obj in self._inner_objects]

depdf/components/paragraph.py

+17-12
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,36 @@
1-
from depdf.base import Base, Box
1+
from depdf.base import Box, InnerWrapper
22
from depdf.config import check_config
33
from depdf.log import logger_init
4+
from depdf.utils import calc_bbox, construct_style
45

56
log = logger_init(__name__)
67

78

8-
class Paragraph(Base, Box):
9+
class Paragraph(InnerWrapper, Box):
910
object_type = 'paragraph'
1011

1112
@check_config
12-
def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_object=None):
13-
self.bbox = bbox
13+
def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None):
1414
para_id = 'page-{pid}-paragraph-{para_id}'.format(pid=pid, para_id=para_idx)
1515
para_class = '{para_class} page-{pid}'.format(para_class=getattr(config, 'paragraph_class'), pid=pid)
16-
html = '<p id="{para_id}" class="{para_class}">'.format(
17-
para_id=para_id, para_class=para_class
16+
style = construct_style(style=style)
17+
html = '<p id="{para_id}" class="{para_class}"{style}>'.format(
18+
para_id=para_id, para_class=para_class, style=style
1819
)
20+
self.pid = pid
21+
self.para_id = para_idx
22+
self.bbox = bbox
1923
if text:
2024
self.text = text
2125
html += str(text)
2226
else:
23-
self._inner_object = [inner_object]
24-
for obj in inner_object:
25-
self.html += getattr(obj, 'html', '')
27+
if bbox is None:
28+
self.bbox = calc_bbox(inner_objects)
29+
self._inner_objects = inner_objects
30+
for obj in inner_objects:
31+
html += getattr(obj, 'html', '')
2632
html += '</p>'
2733
self.html = html
2834

29-
@property
30-
def inner_object(self):
31-
return [obj.to_dict if hasattr(obj, 'to_dict') else obj for obj in self._inner_object]
35+
def __repr__(self):
36+
return '<depdf.Paragraph: ({}, {})>'.format(self.pid, self.para_id)

depdf/components/span.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from depdf.base import Base, Box
22
from depdf.config import check_config
33
from depdf.log import logger_init
4+
from depdf.utils import construct_style
45

56
log = logger_init(__name__)
67

@@ -9,10 +10,11 @@ class Span(Base, Box):
910
object_type = 'span'
1011

1112
@check_config
12-
def __init__(self, bbox=None, span_text='', pid=1, config=None):
13+
def __init__(self, bbox=None, span_text='', config=None, style=None):
1314
self.bbox = bbox
1415
self.text = span_text
1516
span_class = getattr(config, 'span_class')
16-
self.html = '<span class="{span_class} page-{pid}">{span_text}</span>'.format(
17-
span_class=span_class, pid=pid, span_text=span_text
17+
style = construct_style(style=style)
18+
self.html = '<span class="{span_class}{style}">{span_text}</span>'.format(
19+
span_class=span_class, span_text=span_text, style=style
1820
)

depdf/components/table.py

+16-29
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,24 @@
1-
from depdf.base import Base, Box
1+
from depdf.base import Base, Box, InnerWrapper
22
from depdf.config import check_config
33
from depdf.log import logger_init
4+
from depdf.utils import calc_bbox
45

56
log = logger_init(__name__)
67

78

8-
class Cell(Base, Box):
9+
class Cell(InnerWrapper, Box):
910
object_type = 'cell'
1011

11-
def __init__(self, bbox=None, text='', font_size=14, inner_object=None):
12+
def __init__(self, bbox=None, text='', inner_objects=None):
1213
self.bbox = bbox
13-
self.fs = font_size
1414
if text:
1515
self.text = text
1616
self.html = text
1717
else:
18-
self._inner_object = inner_object
19-
for obj in inner_object:
18+
self._inner_objects = inner_objects
19+
for obj in inner_objects:
2020
self.html += getattr(obj, 'html', '')
2121

22-
@property
23-
def inner_object(self):
24-
return self._inner_object.to_dict if hasattr(self._inner_object, 'to_dict') else self._inner_object
25-
2622

2723
class Table(Base, Box):
2824
object_type = 'table'
@@ -33,24 +29,10 @@ def __init__(self, rows, pid=1, tid=1, config=None, bbox=None):
3329
self.tid = tid
3430
self.rows = rows
3531
self.config = config
36-
self.bbox = bbox if bbox else self.calc_table_bbox_by_rows(rows)
32+
self.bbox = bbox if bbox else calc_bbox(rows)
3733

38-
@staticmethod
39-
def calc_table_bbox_by_rows(rows):
40-
x0_list, top_list, x1_list, bottom_list = [], [], [], []
41-
for row in rows:
42-
for cell in row:
43-
x0_list.append(cell.x0)
44-
top_list.append(cell.top)
45-
x1_list.append(cell.x1)
46-
bottom_list.append(cell.bottom)
47-
bbox = (
48-
min(x0_list),
49-
min(top_list),
50-
max(x1_list),
51-
max(bottom_list),
52-
)
53-
return bbox
34+
def __repr__(self):
35+
return '<depdf.Table: ({}, {})>'.format(self.pid, self.tid)
5436

5537
@property
5638
def to_dict(self):
@@ -63,16 +45,21 @@ def to_dict(self):
6345
]
6446
return table_dict
6547

48+
@property
49+
def html(self):
50+
if not self._html and hasattr(self, 'to_html'):
51+
return self.to_html
52+
return self._html
53+
6654
@property
6755
def to_html(self):
6856
table_class = getattr(self.config, 'table_class')
6957
table_cell_merge_tolerance = getattr(self.config, 'table_cell_merge_tolerance')
7058
skip_empty_table = getattr(self.config, 'skip_empty_table')
71-
self.html = convert_table_to_html(
59+
return convert_table_to_html(
7260
self.to_dict, pid=self.pid, tid=self.tid, tc_mt=table_cell_merge_tolerance,
7361
table_class=table_class, skip_et=skip_empty_table
7462
)
75-
return self.html
7663

7764

7865
def gen_column_cell_sizes(t):

depdf/components/text.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
from depdf.base import Base
1+
from depdf.base import Base, Box
22

33

4-
class Text(Base):
4+
class Text(Base, Box):
55
object_type = 'text'
66

7-
def __init__(self, text):
7+
def __init__(self, bbox='', text=''):
8+
self.bbox = bbox
89
self.text = text
910
self.html = text

depdf/config.py

+5
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class Config(Base):
6161
table_class = DEFAULT_TABLE_CLASS
6262
pdf_class = DEFAULT_PDF_CLASS
6363
image_class = DEFAULT_IMAGE_CLASS
64+
page_class = DEFAULT_PAGE_CLASS
6465

6566
def __init__(self, **kwargs):
6667
# set log level automatically if debug mode enabled
@@ -71,10 +72,14 @@ def __init__(self, **kwargs):
7172

7273
# add configuration parameters
7374
self.update(**kwargs)
75+
self._kwargs = kwargs
7476

7577
# set logging level by log_level parameter
7678
logging.getLogger('depdf').setLevel(self.log_level)
7779

80+
def __repr__(self):
81+
return '<depdf.Config: {}>'.format(self._kwargs)
82+
7883
def update(self, **kwargs):
7984
for key, value in kwargs.items():
8085
if hasattr(self, key):

0 commit comments

Comments
 (0)