-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpdf_to_txt.py
56 lines (37 loc) · 1.33 KB
/
pdf_to_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import json
import requests
from wasabi import msg
from dotenv import load_dotenv
load_dotenv()
# Convert each PDF listed in `file_names` to a plain-text file next to it.
#
# Every PDF is uploaded to the Unstructured partitioning endpoint; the "text"
# field of each returned element is concatenated (space-separated) and written
# to "<same path without extension>.txt". The API key is read from the
# UNSTRUCTURED_API_KEY environment variable (populated from .env by
# load_dotenv() above). Without a key nothing is converted.
msg.divider("Starting PDF Conversion to Text")

file_names = ["./data/combining_taste.pdf", "./data/taste_and_smell.pdf"]
url = "https://api.unstructured.io/general/v0/general"
api_key = os.environ.get("UNSTRUCTURED_API_KEY", "")

if api_key:
    msg.good("Unstructured API Key available")

    headers = {
        "accept": "application/json",
        "unstructured-api-key": api_key,
    }
    data = {
        "strategy": "auto",
    }

    for file_path in file_names:
        msg.info(f"Converting {file_path}")

        # The context manager guarantees the PDF handle is closed even when
        # the upload raises (connection error, interrupted transfer, ...).
        with open(file_path, "rb") as pdf_file:
            response = requests.post(
                url,
                headers=headers,
                data=data,
                files={"files": pdf_file},
            )

        # An error response must not be turned into an empty .txt file that
        # is then reported as a success; report it and move to the next PDF.
        if not response.ok:
            msg.fail(
                f"Request for {file_path} failed with HTTP status "
                f"{response.status_code}"
            )
            continue

        try:
            json_response = response.json()
        except ValueError:
            # requests raises a ValueError subclass when the body is not JSON.
            msg.fail(f"Response for {file_path} was not valid JSON")
            continue

        # NOTE(review): the payload is expected to be a list of element
        # dicts, each optionally carrying a "text" key - confirm against the
        # Unstructured API reference.
        if not isinstance(json_response, list):
            msg.fail(f"Unexpected response format for {file_path}")
            continue

        # Each text fragment is followed by a single space, so the output
        # keeps the same layout (including the trailing space) as before.
        full_content = "".join(
            chunk["text"] + " "
            for chunk in json_response
            if isinstance(chunk, dict) and "text" in chunk
        )

        root, _ = os.path.splitext(file_path)
        new_file_path = root + ".txt"

        # Explicit UTF-8: text extracted from PDFs routinely contains
        # characters that a locale-default encoding cannot represent.
        with open(new_file_path, "w", encoding="utf-8") as writer:
            writer.write(full_content)

        msg.good(f"Successfully saved to {new_file_path}")
else:
    msg.fail("No Unstructured API Key available. Please add your key to your .env file")