Skip to content

Commit dfd5b5b

Browse files
committed
welcome to the first release
1 parent 9a6e33b commit dfd5b5b

21 files changed

+328
-35
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
.DS_Store
22
.idea
33
__pycache__
4-
*.py[cod]
4+
*.py[cod]
5+
temp_depdf/

README.md

+116-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,118 @@
1-
# depdf
1+
# DePDF
22

3-
An ultimate pdf file disintegration tool. Yet able to extract pages embedded with tables and paragraphs into structured markup language.
3+
An ultimate PDF file disintegration tool. DePDF is designed to extract the tables and paragraphs embedded in PDF pages into a structured markup language (e.g. HTML). You can also use it to convert a single page or an entire PDF to HTML.
44

5-
Built on [`pdfplumber`](https://github.com/jsvine/pdfplumber)
5+
Built on top of [`pdfplumber`](https://github.com/jsvine/pdfplumber)
6+
7+
# Table of Contents
8+
[toc]
9+
10+
11+
# Installation
12+
`pip install depdf`
13+
14+
# Example
15+
```python
16+
from depdf import DePDF
17+
from depdf import DePage
18+
19+
# general
20+
with DePDF.load('test/test_general.pdf') as pdf:
21+
pdf_html = pdf.to_html
22+
print(pdf_html)
23+
24+
# with dedicated configurations
25+
c = Config(
26+
debug_flag=True,
27+
verbose_flag=True,
28+
add_line_flag=True
29+
)
30+
pdf = DePDF.load('test/test_general.pdf', config=c)
31+
page_index = 23 # start from zero
32+
page = pdf.pages[page_index]
33+
page_soup = page.soup
34+
print(page_soup.text)
35+
```
36+
37+
38+
# APIs
39+
| **functions** | usage |
40+
|:---:|---|
41+
| `extract_page_paragraphs` | extract paragraphs from specific page |
42+
| `extract_page_tables` | extract tables from specific page |
43+
| `convert_pdf_to_html` | convert the entire pdf to html |
44+
| `convert_page_to_html` | convert specific page to html |
45+
46+
47+
# In-Depth
48+
49+
## In-page elements
50+
* Paragraph
51+
+ Text
52+
+ Span
53+
* Table
54+
+ Cell
55+
* Image
56+
57+
## Common properties
58+
| **property & method** | explanation |
59+
|:---:|---|
60+
| `html` | converted html string |
61+
| `soup` | converted beautiful soup |
62+
| `bbox` | bounding box region |
63+
| `save_html` | write html tag to local file |
64+
65+
## DePDF HTML structure
66+
```html
67+
<div class="{pdf_class}">
68+
%for <!--page-{pid}-->
69+
<div id="page-{}" class="{}">
70+
%for {html_elements} endfor%
71+
</div>
72+
endfor%
73+
</div>
74+
```
75+
76+
## DePage HTML element structure
77+
78+
### Paragraph
79+
```html
80+
<p>
81+
{paragraph-content}
82+
<span> {span-content} </span>
83+
...
84+
</p>
85+
```
86+
87+
### Table
88+
```html
89+
<table>
90+
<tr>
91+
<td> {cell_0_0} </td>
92+
<td> {cell_0_1} </td>
93+
...
94+
</tr>
95+
<tr colspan=2>
96+
<td> {cell_1_0} </td>
97+
...
98+
</tr>
99+
...
100+
</table>
101+
```
102+
103+
### Image
104+
```
105+
<img src="temp_depdf/$prefix.png"></img>
106+
```
107+
# Appendix
108+
109+
## DePage element denotations
110+
> Useful element properties within page
111+
112+
![page element](annotations.jpg)
113+
114+
## todo
115+
116+
* [ ] add support for multiple-column pdf page
117+
* [ ] better table structure recognition
118+
* [x] recognize embedded objects inside page elements

annotations.jpg

116 KB
Loading

depdf/api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def wrapper(pdf_file_path, *args, **kwargs):
2121
elif isinstance(pdf_file_path, PDF):
2222
pdf = DePDF(pdf_file_path, config=config, **kwargs)
2323
elif isinstance(pdf_file_path, str):
24-
pdf = DePDF.open(pdf_file_path, config=config, **kwargs)
24+
pdf = DePDF.load(pdf_file_path, config=config, **kwargs)
2525
else:
2626
raise PDFTypeError
2727
res = api_func(pdf, pid) if pid > 0 else api_func(pdf)

depdf/base.py

+4
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ def html(self, html_value):
6161
def soup(self):
6262
return convert_html_to_soup(self._html)
6363

64+
def write_to(self, file_name):
65+
with open(file_name, "w") as file:
66+
file.write(self.html)
67+
6468
@property
6569
def to_dict(self):
6670
return {

depdf/components/image.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None)
1515
width = bbox[2] - bbox[0]
1616
img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)
1717
img_class = '{img_class} page-{pid}'.format(img_class=getattr(config, 'image_class'), pid=pid)
18-
html = '<img id="{img_id}" class="{img_class}" src={src} width={width}>'.format(
18+
html = '<img id="{img_id}" class="{img_class}" src="{src}" width="{width}">'.format(
1919
img_id=img_id, img_class=img_class, src=src, width=width
2020
)
2121
html += '</img>'

depdf/components/paragraph.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,17 @@ class Paragraph(InnerWrapper, Box):
1010
object_type = 'paragraph'
1111

1212
@check_config
13-
def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None):
13+
def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None, align=None):
1414
para_id = 'page-{pid}-paragraph-{para_id}'.format(pid=pid, para_id=para_idx)
1515
para_class = '{para_class} page-{pid}'.format(para_class=getattr(config, 'paragraph_class'), pid=pid)
16-
style = construct_style(style=style)
17-
html = '<p id="{para_id}" class="{para_class}"{style}>'.format(
18-
para_id=para_id, para_class=para_class, style=style
16+
style_text = construct_style(style=style)
17+
align_text = ' align="{}"'.format(align) if align else ''
18+
html = '<p id="{para_id}" class="{para_class}"{align_text}{style_text}>'.format(
19+
para_id=para_id, para_class=para_class, style_text=style_text, align_text=align_text
1920
)
2021
self.pid = pid
2122
self.para_id = para_idx
23+
self.config = config
2224
self.bbox = bbox
2325
if text:
2426
self.text = text
@@ -34,3 +36,7 @@ def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_obj
3436

3537
def __repr__(self):
3638
return '<depdf.Paragraph: ({}, {})>'.format(self.pid, self.para_id)
39+
40+
def save_html(self):
41+
paragraph_file_name = '{}_page_{}_paragraph_{}.html'.format(self.config.unique_prefix, self.pid, self.para_id)
42+
return super().write_to(paragraph_file_name)

depdf/components/span.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def __init__(self, bbox=None, span_text='', config=None, style=None):
1414
self.bbox = bbox
1515
self.text = span_text
1616
span_class = getattr(config, 'span_class')
17-
style = construct_style(style=style)
18-
self.html = '<span class="{span_class}{style}">{span_text}</span>'.format(
19-
span_class=span_class, span_text=span_text, style=style
17+
style_text = construct_style(style=style)
18+
self.html = '<span class="{span_class}"{style_text}>{span_text}</span>'.format(
19+
span_class=span_class, span_text=span_text, style_text=style_text
2020
)

depdf/components/table.py

+4
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ def to_dict(self):
4545
]
4646
return table_dict
4747

48+
def save_html(self):
49+
table_file_name = '{}_page_{}_table_{}.html'.format(self.config.unique_prefix, self.pid, self.tid)
50+
return super().write_to(table_file_name)
51+
4852
@property
4953
def html(self):
5054
if not self._html and hasattr(self, 'to_html'):

depdf/config.py

+8
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from functools import wraps
2+
import os
23

34
from depdf.base import Base
45
from depdf.error import ConfigTypeError
@@ -43,6 +44,9 @@ class Config(Base):
4344
vertical_double_line_tolerance = DEFAULT_VERTICAL_DOUBLE_LINE_TOLERANCE # used in page class
4445
table_cell_merge_tolerance = DEFAULT_TABLE_CELL_MERGE_TOLERANCE
4546
skip_empty_table = DEFAULT_SKIP_EMPTY_TABLE
47+
add_vertical_lines_flag = DEFAULT_ADD_VERTICAL_LINES_FLAG
48+
add_horizontal_lines_flag = DEFAULT_ADD_HORIZONTAL_LINES_FLAG
49+
add_horizontal_line_tolerance = DEFAULT_ADD_HORIZONTAL_LINE_TOLERANCE
4650

4751
# image
4852
min_image_size = DEFAULT_MIN_IMAGE_SIZE
@@ -74,6 +78,10 @@ def __init__(self, **kwargs):
7478
self.update(**kwargs)
7579
self._kwargs = kwargs
7680

81+
# create temporary folder
82+
if not os.path.isdir(self.temp_dir_prefix):
83+
os.mkdir(self.temp_dir_prefix)
84+
7785
# set logging level by log_level parameter
7886
logging.getLogger('depdf').setLevel(self.log_level)
7987

depdf/page.py

+35-9
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
1-
import os
21
from statistics import mean, median
32
import uuid
43

54
from pdfplumber.page import Page
65

76
from depdf.base import Base
8-
from depdf.components import Image, Paragraph, Text, Span
9-
from depdf.config import check_config, check_config_type
7+
from depdf.components import Paragraph, Text, Span
8+
from depdf.config import check_config_type
109
from depdf.error import PageTypeError
1110
from depdf.page_tools import *
1211

@@ -164,6 +163,10 @@ def images(self):
164163
def images_raw(self):
165164
return self._images_raw
166165

166+
def save_html(self):
167+
page_file_name = '{}_page_{}.html'.format(self.prefix, self.pid)
168+
return super().write_to(page_file_name)
169+
167170
@property
168171
def html(self):
169172
if not self._html and hasattr(self, 'to_html'):
@@ -173,7 +176,9 @@ def html(self):
173176
@property
174177
def to_html(self):
175178
page_class = getattr(self.config, 'page_class')
176-
html = '<div id="page-{}" class="{}">'.format(self.pid, page_class)
179+
html = '<div id="page-{}" class="{}" new_para_start="{}" new_para_end="{}">'.format(
180+
self.pid, page_class, self.new_para_start_flag, self.new_para_end_flag
181+
)
177182
for obj in self.objects:
178183
html += getattr(obj, 'html', '')
179184
html += '</div>'
@@ -254,7 +259,9 @@ def analyze_main_frame(self):
254259

255260
def extract_phrases(self):
256261
phrases = [
257-
i for i in self.page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
262+
i for i in self.page.extract_words(x_tolerance=self.x_tolerance,
263+
y_tolerance=self.y_tolerance,
264+
keep_blank_chars=True)
258265
if 'top' in i and i['top'] >= self.frame_top and 'bottom' in i and i['bottom'] <= self.frame_bottom
259266
]
260267
self.phrases = phrases
@@ -314,6 +321,19 @@ def analyze_lines(self):
314321
h_lines.extend(h_curves)
315322
v_lines.extend(v_curves)
316323

324+
# 增加竖线
325+
add_vlf = getattr(self.config, 'add_vertical_lines_flag')
326+
if add_vlf:
327+
v_lines_add = add_vertical_lines(v_lines, h_lines, rect_edges_raw, self.page, self.ave_cs)
328+
v_lines.extend(v_lines_add)
329+
330+
# 增加顶部和底部的横线
331+
add_hlf = getattr(self.config, 'add_horizontal_lines_flag')
332+
vlts_tolerance = getattr(self.config, 'add_horizontal_line_tolerance')
333+
if add_hlf:
334+
h_lines_add = add_horizontal_lines(v_lines, h_lines, vlts_tolerance=vlts_tolerance)
335+
h_lines.extend(h_lines_add)
336+
317337
# 设定页面的横竖线列表
318338
self.h_edges = [{'top': i['top'], 'x0': i['x0'], 'x1': i['x1']} for i in h_lines]
319339
self.v_edges = [{'x': i['x0'], 'top': i['top'], 'bottom': i['bottom']} for i in v_lines]
@@ -389,7 +409,7 @@ def extract_images(self):
389409
for image in images_raw:
390410
try:
391411
image_area = self.page.within_bbox(image['bbox'])
392-
image_words.extend(image_area.extract_words(x_tolerance=self.ave_cs * 3 / 2))
412+
image_words.extend(image_area.extract_words(x_tolerance=self.ave_cs * 3 / 2, keep_blank_chars=True))
393413
except:
394414
pass
395415
self._image_phrases = image_words
@@ -411,7 +431,7 @@ def extract_paragraph(self):
411431
para_idx, paragraphs, paragraph_objects = 1, [], []
412432
ave_ts = ave_cs = self.ave_cs
413433
ave_lh, page_width = self.ave_lh, self.width
414-
div_flag = center_flag = False
434+
div_flag = center_flag = right_flag = False
415435
para_style = {}
416436
for i in self.phrases:
417437
if i in self.same_tmp or i in self._image_phrases or \
@@ -444,18 +464,21 @@ def extract_paragraph(self):
444464
if abs(left - ll) <= 1 and p_right >= lr - ave_ts * 3 / 2:
445465
new_para_flag = False # 如果该行的左边距特别小且上一行的右边距相对较小,则认为是同一个段落
446466
if new_para_flag:
447-
if abs(page_width - right - left) <= ave_ts / 2:
467+
if abs(page_width - right - left) <= ave_ts * 2:
448468
if abs(lr - right) >= 4 * ave_ts: # 段前有四个 char_size 大小的空白
449469
center_flag = True
450470
if left > ll + ave_ts * 4:
451471
div_flag = True
472+
if right >= lr - ave_ts:
473+
right_flag = True
452474
elif abs(left - p_right) >= ave_ts * 2: # 同一行需要判定该段落是否为文本框组合
453475
if abs(top - p_top) <= ave_ts / 2:
454476
new_line_flag = new_para_flag = False
455477

456478
if new_para_flag and paragraph_objects:
479+
align = para_style.pop('align') if 'align' in para_style else None
457480
paragraphs.append(Paragraph(
458-
pid=self.pid, para_idx=para_idx, config=self.config,
481+
pid=self.pid, para_idx=para_idx, config=self.config, align=align,
459482
inner_objects=paragraph_objects, style=para_style
460483
))
461484
para_style = {}
@@ -469,6 +492,8 @@ def extract_paragraph(self):
469492
para_style.update({'align': 'center'})
470493
elif div_flag:
471494
para_style.update({'margin-left': '{0}px'.format((left - ll))})
495+
if right_flag:
496+
para_style.update({'align': 'right'})
472497

473498
if new_line_flag:
474499
paragraph_objects.append(Text(bbox=bbox, text=text))
@@ -486,6 +511,7 @@ def extract_paragraph(self):
486511
if center_flag:
487512
para_style.update({'align': 'center'})
488513
elif div_flag:
514+
para_style.update({'align': 'left'})
489515
para_style.update({'margin-left': '{0}px'.format((left - ll))})
490516
paragraphs.append(Paragraph(
491517
pid=self.pid, para_idx=para_idx, config=self.config,

0 commit comments

Comments
 (0)