Skip to content

Commit 6865523

Browse files
committed
add tables and images extraction methods
1 parent 2138046 commit 6865523

File tree

9 files changed

+481
-53
lines changed

9 files changed

+481
-53
lines changed

depdf/components/__init__.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,17 @@
22
from depdf.components.text import Text
33
from depdf.components.span import Span
44
from depdf.components.table import Table, Cell
5+
from depdf.components.image import Image
56

67
component_list = [
78
Paragraph,
89
Table,
910
Span,
1011
Text,
1112
Cell,
13+
Image,
1214
]
1315

1416
__all__ = [
15-
'Paragraph', 'Table', 'Span', 'Text', 'Cell',
17+
'Paragraph', 'Table', 'Span', 'Text', 'Cell', 'Image',
1618
]

depdf/components/image.py

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from depdf.base import Base, Box
2+
from depdf.config import check_config
3+
from depdf.log import logger_init
4+
5+
log = logger_init(__name__)
6+
7+
8+
class Image(Base, Box):
    """Image component rendered as an HTML ``<img>`` element.

    The constructor stores ``bbox`` and ``scan`` unchanged on the instance
    and builds the markup for the image into ``self.html``.
    """
    object_type = 'image'

    @check_config
    def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None):
        """Build the HTML markup for a single image.

        :param bbox: indexable box; ``bbox[2] - bbox[0]`` is used as the
            rendered width (presumably ``(x0, top, x1, bottom)`` -- confirm
            against ``Box``). When ``None`` the ``width`` attribute is
            omitted instead of raising ``TypeError``.
        :param src: value written into the ``src`` attribute.
        :param pid: page number, used in the element id and class.
        :param img_idx: index of the image, used in the element id.
        :param scan: flag stored as ``self.scan``.
        :param config: object providing ``image_class``.
            NOTE(review): ``check_config`` presumably supplies a default
            config when none is passed -- confirm in ``depdf.config``.
        """
        self.bbox = bbox
        self.scan = scan

        img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)
        img_class = '{img_class} page-{pid}'.format(
            img_class=getattr(config, 'image_class'), pid=pid
        )

        # Every attribute value is double-quoted: an unquoted value is cut
        # off at the first space, which breaks file names containing spaces.
        attributes = [
            'id="{}"'.format(img_id),
            'class="{}"'.format(img_class),
            'src="{}"'.format(src),
        ]
        if bbox is not None:
            # The width can only be derived when a bounding box was given.
            attributes.append('width="{}"'.format(bbox[2] - bbox[0]))

        # NOTE(review): the explicit ``</img>`` end tag is kept as in the
        # original output in case downstream parsing relies on it.
        self.html = '<img {}></img>'.format(' '.join(attributes))

depdf/components/paragraph.py

-4
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,3 @@ def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_obj
2929
@property
def inner_object(self):
    """Return the inner objects in a serialisable form.

    Each element of ``self._inner_object`` that exposes a ``to_dict``
    attribute is replaced by the value of that attribute; every other
    element is passed through unchanged. A new list is returned.
    """
    serialised = []
    for item in self._inner_object:
        if hasattr(item, 'to_dict'):
            # ``to_dict`` is read as an attribute, not called.
            serialised.append(item.to_dict)
        else:
            serialised.append(item)
    return serialised
32-
33-
34-
def extract_pdf_paragraph_by_page(page):
35-
pass

depdf/components/table.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,10 @@ def convert_table_to_html(table_dict, pid=1, tid=1, tc_mt=5, table_class='pdf-ta
129129
html_table_string += ' rowspan="{}"'.format(row_span)
130130
if col_span > 1:
131131
html_table_string += ' colspan="{}"'.format(col_span)
132-
html_table_string += ' style="font-size: {font_size}px;">{tc_text}</td>'.format(
133-
font_size=tc['fs'], tc_text=tc['html']
134-
)
132+
html_table_string += '>{tc_text}</td>'.format(tc_text=tc['html'])
135133
none_text_table = False if tc['html'] else none_text_table
136134
html_table_string += '</tr>'
137135
html_table_string += '</table>'
138136
if skip_et and none_text_table:
139137
return empty_table_html
140138
return html_table_string
141-
142-
143-
def extract_pdf_table_by_page(page):
144-
pass

depdf/config.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import uuid
21
from functools import wraps
32

43
from depdf.base import Base
@@ -13,18 +12,22 @@ class Config(Base):
1312
# pdf
1413
logo_flag = DEFAULT_LOGO_FLAG
1514
header_footer_flag = DEFAULT_HEADER_FOOTER_FLAG
15+
temp_dir_prefix = DEFAULT_TEMP_DIR_PREFIX
16+
unique_prefix = None # this parameter is updated automatically from the pdf file name
1617

1718
# page
1819
table_flag = DEFAULT_TABLE_FLAG
1920
paragraph_flag = DEFAULT_PARAGRAPH_FLAG
20-
img_flag = DEFAULT_IMG_FLAG
21+
image_flag = DEFAULT_IMAGE_FLAG
2122
resolution = DEFAULT_RESOLUTION
22-
main_frame_tolerance = DEFAULT_MAIN_FRAME_TOLERANCE
23+
main_frame_tolerance = None # this parameter can be inferred automatically from the page content
2324
x_tolerance = None # this parameter can be inferred automatically from the page content
2425
y_tolerance = None # this parameter can be inferred automatically from the page content
2526
page_num_top_fraction = DEFAULT_PAGE_NUM_TOP_FRACTION
2627
page_num_left_fraction = DEFAULT_PAGE_NUM_LEFT_FRACTION
2728
page_num_right_fraction = DEFAULT_PAGE_NUM_RIGHT_FRACTION
29+
dotted_line_flag = True
30+
curved_line_flag = False
2831

2932
# chars
3033
char_overlap_size = DEFAULT_CHAR_OVERLAP_SIZE
@@ -35,10 +38,15 @@ class Config(Base):
3538
# table
3639
snap_flag = DEFAULT_SNAP_FLAG
3740
add_line_flag = DEFAULT_ADD_LINE_FLAG
38-
double_line_tolerance = DEFAULT_DOUBLE_LINE_TOLERANCE
41+
min_double_line_tolerance = DEFAULT_MIN_DOUBLE_LINE_TOLERANCE # used in page class
42+
max_double_line_tolerance = DEFAULT_MAX_DOUBLE_LINE_TOLERANCE # used in page class
43+
vertical_double_line_tolerance = DEFAULT_VERTICAL_DOUBLE_LINE_TOLERANCE # used in page class
3944
table_cell_merge_tolerance = DEFAULT_TABLE_CELL_MERGE_TOLERANCE
4045
skip_empty_table = DEFAULT_SKIP_EMPTY_TABLE
4146

47+
# image
48+
min_image_size = DEFAULT_MIN_IMAGE_SIZE
49+
4250
# head & tail
4351
default_head_tail_page_offset_percent = DEFAULT_HEAD_TAIL_PAGE_OFFSET_PERCENT
4452

@@ -52,13 +60,14 @@ class Config(Base):
5260
paragraph_class = DEFAULT_PARAGRAPH_CLASS
5361
table_class = DEFAULT_TABLE_CLASS
5462
pdf_class = DEFAULT_PDF_CLASS
63+
image_class = DEFAULT_IMAGE_CLASS
5564

5665
def __init__(self, **kwargs):
57-
# add unique prefix to dePDF instance
58-
self.unique_prefix = uuid.uuid4()
59-
66+
# set log level automatically if debug mode enabled
6067
if kwargs.get('debug_flag'):
6168
self.log_level = logging.DEBUG
69+
if kwargs.get('verbose_flag'):
70+
self.log_level = logging.INFO
6271

6372
# add configuration parameters
6473
self.update(**kwargs)

0 commit comments

Comments
 (0)