Skip to content

Commit dfd5b5b

Browse files
committed
welcome to the first release
1 parent 9a6e33b commit dfd5b5b

21 files changed

+328
-35
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
.DS_Store
22
.idea
33
__pycache__
4-
*.py[cod]
4+
*.py[cod]
5+
temp_depdf/

README.md

Lines changed: 116 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,118 @@
1-
# depdf
1+
# DePDF
22

3-
An ultimate pdf file disintegration tool. Yet able to extract pages embedded with tables and paragraphs into structured markup language.
3+
An ultimate PDF file disintegration tool. DePDF is designed to extract tables and paragraphs embedded in PDF pages into a structured markup language (e.g. HTML). You can also use it to convert a single page or an entire PDF to HTML.
44

5-
Built on [`pdfplumber`](https://github.com/jsvine/pdfplumber)
5+
Built on top of [`pdfplumber`](https://github.com/jsvine/pdfplumber)
6+
7+
# Table of Contents
8+
[toc]
9+
10+
11+
# Installation
12+
`pip install depdf`
13+
14+
# Example
15+
```python
16+
from depdf import DePDF
17+
from depdf import DePage
18+
19+
# general
20+
with DePDF.load('test/test_general.pdf') as pdf:
21+
pdf_html = pdf.to_html
22+
print(pdf_html)
23+
24+
# with dedicated configurations
25+
c = Config(
26+
debug_flag=True,
27+
verbose_flag=True,
28+
add_line_flag=True
29+
)
30+
pdf = DePDF.load('test/test_general.pdf', config=c)
31+
page_index = 23 # start from zero
32+
page = pdf.pages[page_index]
33+
page_soup = page.soup
34+
print(page_soup.text)
35+
```
36+
37+
38+
# APIs
39+
| **functions** | usage |
40+
|:---:|---|
41+
| `extract_page_paragraphs` | extract paragraphs from specific page |
42+
| `extract_page_tables` | extract tables from specific page |
43+
| `convert_pdf_to_html` | convert the entire pdf to html |
44+
| `convert_page_to_html` | convert specific page to html |
45+
46+
47+
# In-Depth
48+
49+
## In-page elements
50+
* Paragraph
51+
+ Text
52+
+ Span
53+
* Table
54+
+ Cell
55+
* Image
56+
57+
## Common properties
58+
| **property & method** | explanation |
59+
|:---:|---|
60+
| `html` | converted html string |
61+
| `soup` | converted BeautifulSoup object |
62+
| `bbox` | bounding box region |
63+
| `save_html` | write html tag to a local file |
64+
65+
## DePDF HTML structure
66+
```html
67+
<div class="{pdf_class}">
68+
%for <!--page-{pid}-->
69+
<div id="page-{}" class="{}">
70+
%for {html_elements} endfor%
71+
</div>
72+
endfor%
73+
</div>
74+
```
75+
76+
## DePage HTML element structure
77+
78+
### Paragraph
79+
```html
80+
<p>
81+
{paragraph-content}
82+
<span> {span-content} </span>
83+
...
84+
</p>
85+
```
86+
87+
### Table
88+
```html
89+
<table>
90+
<tr>
91+
<td> {cell_0_0} </td>
92+
<td> {cell_0_1} </td>
93+
...
94+
</tr>
95+
<tr colspan=2>
96+
<td> {cell_1_0} </td>
97+
...
98+
</tr>
99+
...
100+
</table>
101+
```
102+
103+
### Image
104+
```
105+
<img src="temp_depdf/$prefix.png"></img>
106+
```
107+
# Appendix
108+
109+
## DePage element denotations
110+
> Useful element properties within a page
111+
112+
![page element](annotations.jpg)
113+
114+
## todo
115+
116+
* [ ] add support for multi-column PDF pages
117+
* [ ] better table structure recognition
118+
* [x] recognize embedded objects inside page elements

annotations.jpg

116 KB
Loading

depdf/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def wrapper(pdf_file_path, *args, **kwargs):
2121
elif isinstance(pdf_file_path, PDF):
2222
pdf = DePDF(pdf_file_path, config=config, **kwargs)
2323
elif isinstance(pdf_file_path, str):
24-
pdf = DePDF.open(pdf_file_path, config=config, **kwargs)
24+
pdf = DePDF.load(pdf_file_path, config=config, **kwargs)
2525
else:
2626
raise PDFTypeError
2727
res = api_func(pdf, pid) if pid > 0 else api_func(pdf)

depdf/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ def html(self, html_value):
6161
def soup(self):
6262
return convert_html_to_soup(self._html)
6363

64+
def write_to(self, file_name):
65+
with open(file_name, "w") as file:
66+
file.write(self.html)
67+
6468
@property
6569
def to_dict(self):
6670
return {

depdf/components/image.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None)
1515
width = bbox[2] - bbox[0]
1616
img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)
1717
img_class = '{img_class} page-{pid}'.format(img_class=getattr(config, 'image_class'), pid=pid)
18-
html = '<img id="{img_id}" class="{img_class}" src={src} width={width}>'.format(
18+
html = '<img id="{img_id}" class="{img_class}" src="{src}" width="{width}">'.format(
1919
img_id=img_id, img_class=img_class, src=src, width=width
2020
)
2121
html += '</img>'

depdf/components/paragraph.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,17 @@ class Paragraph(InnerWrapper, Box):
1010
object_type = 'paragraph'
1111

1212
@check_config
13-
def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None):
13+
def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None, align=None):
1414
para_id = 'page-{pid}-paragraph-{para_id}'.format(pid=pid, para_id=para_idx)
1515
para_class = '{para_class} page-{pid}'.format(para_class=getattr(config, 'paragraph_class'), pid=pid)
16-
style = construct_style(style=style)
17-
html = '<p id="{para_id}" class="{para_class}"{style}>'.format(
18-
para_id=para_id, para_class=para_class, style=style
16+
style_text = construct_style(style=style)
17+
align_text = ' align="{}"'.format(align) if align else ''
18+
html = '<p id="{para_id}" class="{para_class}"{align_text}{style_text}>'.format(
19+
para_id=para_id, para_class=para_class, style_text=style_text, align_text=align_text
1920
)
2021
self.pid = pid
2122
self.para_id = para_idx
23+
self.config = config
2224
self.bbox = bbox
2325
if text:
2426
self.text = text
@@ -34,3 +36,7 @@ def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_obj
3436

3537
def __repr__(self):
3638
return '<depdf.Paragraph: ({}, {})>'.format(self.pid, self.para_id)
39+
40+
def save_html(self):
41+
paragraph_file_name = '{}_page_{}_paragraph_{}.html'.format(self.config.unique_prefix, self.pid, self.para_id)
42+
return super().write_to(paragraph_file_name)

depdf/components/span.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def __init__(self, bbox=None, span_text='', config=None, style=None):
1414
self.bbox = bbox
1515
self.text = span_text
1616
span_class = getattr(config, 'span_class')
17-
style = construct_style(style=style)
18-
self.html = '<span class="{span_class}{style}">{span_text}</span>'.format(
19-
span_class=span_class, span_text=span_text, style=style
17+
style_text = construct_style(style=style)
18+
self.html = '<span class="{span_class}"{style_text}>{span_text}</span>'.format(
19+
span_class=span_class, span_text=span_text, style_text=style_text
2020
)

depdf/components/table.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ def to_dict(self):
4545
]
4646
return table_dict
4747

48+
def save_html(self):
49+
table_file_name = '{}_page_{}_table_{}.html'.format(self.config.unique_prefix, self.pid, self.tid)
50+
return super().write_to(table_file_name)
51+
4852
@property
4953
def html(self):
5054
if not self._html and hasattr(self, 'to_html'):

depdf/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from functools import wraps
2+
import os
23

34
from depdf.base import Base
45
from depdf.error import ConfigTypeError
@@ -43,6 +44,9 @@ class Config(Base):
4344
vertical_double_line_tolerance = DEFAULT_VERTICAL_DOUBLE_LINE_TOLERANCE # used in page class
4445
table_cell_merge_tolerance = DEFAULT_TABLE_CELL_MERGE_TOLERANCE
4546
skip_empty_table = DEFAULT_SKIP_EMPTY_TABLE
47+
add_vertical_lines_flag = DEFAULT_ADD_VERTICAL_LINES_FLAG
48+
add_horizontal_lines_flag = DEFAULT_ADD_HORIZONTAL_LINES_FLAG
49+
add_horizontal_line_tolerance = DEFAULT_ADD_HORIZONTAL_LINE_TOLERANCE
4650

4751
# image
4852
min_image_size = DEFAULT_MIN_IMAGE_SIZE
@@ -74,6 +78,10 @@ def __init__(self, **kwargs):
7478
self.update(**kwargs)
7579
self._kwargs = kwargs
7680

81+
# create temporary folder
82+
if not os.path.isdir(self.temp_dir_prefix):
83+
os.mkdir(self.temp_dir_prefix)
84+
7785
# set logging level by log_level parameter
7886
logging.getLogger('depdf').setLevel(self.log_level)
7987

0 commit comments

Comments
 (0)