# pdfplumber.py
import json
import logging
from pathlib import Path
from typing import List, Union

import pandas as pd
import pdfplumber

from .annotations import Page, PageToken

logger = logging.getLogger("uvicorn")


class PDFPlumberTokenExtractor:
    @staticmethod
    def convert_to_pagetoken(row: pd.Series) -> PageToken:
        """Convert a row of a word DataFrame to a page token."""
        return dict(
            text=row["text"],
            x=row["x0"],
            width=row["width"],
            y=row["top"],
            height=row["height"],
        )

    def extract(self, pdf_path: Union[str, Path]) -> List[Page]:
        """Extract token text and positions from a PDF file.

        Args:
            pdf_path (Union[str, Path]): The path to the PDF file.

        Returns:
            List[Page]: One entry per page, each pairing the page
            geometry with its word tokens.
        """
        pages = []
        # Use a context manager so the underlying file handle is closed.
        with pdfplumber.open(pdf_path) as plumber_pdf_object:
            for page_id, cur_page in enumerate(plumber_pdf_object.pages):
                tokens = self.obtain_word_tokens(cur_page)
                page = dict(
                    page=dict(
                        width=float(cur_page.width),
                        height=float(cur_page.height),
                        index=page_id,
                    ),
                    tokens=tokens,
                )
                pages.append(page)
        return pages

    def obtain_word_tokens(self, cur_page: pdfplumber.page.Page) -> List[PageToken]:
        """Obtain all word tokens from the current page.

        Args:
            cur_page (pdfplumber.page.Page):
                The pdfplumber.page.Page object with PDF token information.

        Returns:
            List[PageToken]:
                A list of page tokens stored in PageToken format.
        """
        # x/y tolerances control how close characters must be to merge
        # into one word; extra_attrs carries font style info through.
        words = cur_page.extract_words(
            x_tolerance=1.5,
            y_tolerance=3,
            keep_blank_chars=False,
            use_text_flow=True,
            horizontal_ltr=True,
            vertical_ttb=True,
            extra_attrs=["fontname", "size"],
        )
        if len(words) == 0:
            return []

        df = pd.DataFrame(words)

        # Clip word boxes to the page boundaries so no token extends
        # outside the page, then derive box width and height.
        df[["x0", "x1"]] = (
            df[["x0", "x1"]].clip(lower=0, upper=int(cur_page.width)).astype("float")
        )
        df[["top", "bottom"]] = (
            df[["top", "bottom"]].clip(lower=0, upper=int(cur_page.height)).astype("float")
        )
        df["height"] = df["bottom"] - df["top"]
        df["width"] = df["x1"] - df["x0"]

        word_tokens = df.apply(self.convert_to_pagetoken, axis=1).tolist()
        return word_tokens
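
# Example (a sketch, not part of the original module): using the extractor
# directly on a hypothetical "paper.pdf". Each page dict pairs the page
# geometry with its word tokens:
#
#   extractor = PDFPlumberTokenExtractor()
#   pages = extractor.extract("paper.pdf")
#   pages[0]["page"]       # {"width": ..., "height": ..., "index": 0}
#   pages[0]["tokens"][0]  # {"text": ..., "x": ..., "y": ..., ...}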


def process_pdfplumber(file_path: Union[str, Path]) -> Path:
    """
    Run the pdfplumber pre-processor on a PDF and write the resulting
    token information to ``pdf_structure.json`` alongside the PDF.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        msg = f"Cannot find {file_path}"
        raise ValueError(msg)

    structure_path = file_path.parent / "pdf_structure.json"
    if not structure_path.exists():
        logger.info(f"Processing {file_path} using pdfplumber...")
        pdf_extractor = PDFPlumberTokenExtractor()
        data = pdf_extractor.extract(file_path)
        with open(structure_path, mode="w", encoding="utf-8") as f:
            json.dump(data, f)
    else:
        # logging.warn is deprecated; use the module logger instead.
        logger.warning(f"Parsed {structure_path} exists, skipping...")
    return structure_path
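

# Minimal sketch of invoking the pre-processor from the command line;
# the argument handling below is an assumption, not part of the module.
if __name__ == "__main__":
    import sys

    # Usage: python pdfplumber.py path/to/file.pdf
    output_path = process_pdfplumber(sys.argv[1])
    print(f"Token structure written to {output_path}")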