-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrape_admin_statements.py
53 lines (38 loc) · 1.27 KB
/
scrape_admin_statements.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
Constructs the admin_statements distributions.
"""
import glob
import pdfplumber
from parameters import *
from utils import *
def _strip_preamble(text):
    """
    Drops everything before the first "The Administration" and removes
    asterisks and surrounding whitespace from what remains.
    """
    loc = text.find("The Administration")
    # str.find returns -1 when the phrase is absent. Only trim when the phrase
    # was found after the start of the text; comparing against 0 alone treated
    # -1 as a hit and ``text[-1:]`` reduced the document to its last character.
    if loc > 0:
        # NOTE(review): newlines are removed only on this branch, exactly as
        # before, so a trimmed text reaches split_delimiter_ without any "\n"
        # -- confirm this is intended.
        text = text[loc:].replace("\n", "")
    return text.replace("*", "").strip()


def scrape():
    """
    Scrapes the statements of administration policy from the
    statements-of-administration-policy-main repository.

    Reads every matching PDF under
    ``{DOWNLOAD_FOLDER}/admin_statements`` for each administration, extracts
    its text, and passes it through ``split_delimiter_`` and
    ``split_truncate``.

    Returns:
        dict mapping each administration folder name (e.g. "44-Obama") to
        the list of text pieces collected from its PDFs.

    PDFs that fail to open or process are reported and skipped rather than
    aborting the whole scrape.
    """
    NAME = "admin_statements"
    directory = f"{DOWNLOAD_FOLDER}/{NAME}"
    administrations = ["44-Obama", "45-Trump", "46-Biden"]
    data = {}
    for admin in administrations:
        print(admin)
        # Without recursive=True, "**" behaves like "*": this matches PDFs
        # exactly one directory level below the administration folder.
        files = glob.glob(
            f"{directory}/statements-of-administration-policy-main/archive/statements/{admin}/**/*.pdf"
        )
        statements = []
        for path in files:
            try:
                with pdfplumber.open(path) as pdf:
                    # Guard against extract_text() yielding None for a page
                    # with no extractable text (seen in some pdfplumber
                    # versions); treat such a page as empty instead of
                    # losing the whole file to a TypeError.
                    text = "".join(
                        (page.extract_text() or "") + " " for page in pdf.pages
                    )
                text = _strip_preamble(text)
                texts = split_delimiter_(text, "\n")
                texts = split_truncate(texts)
                statements.extend(texts)
            except Exception as exc:
                # Best-effort scrape: one unreadable PDF must not stop the
                # run, but say which file was skipped and why. Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                print(f"Skipping {path}: {exc!r}")
        data[admin] = statements
    return data