-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess-pdf-with-langchain.py
More file actions
34 lines (25 loc) · 1.04 KB
/
Copy pathprocess-pdf-with-langchain.py
File metadata and controls
34 lines (25 loc) · 1.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
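# To use this script, install the required packages first:
# pip install langchain langchain-community pypdf
# The RapidOCRBlobParser image parser used below additionally needs:
# pip install rapidocr-onnxruntime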
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.parsers.images import RapidOCRBlobParser
# Demo: load a PDF through LangChain's PyPDFLoader and dump every returned
# document (text first, then metadata) to stdout.

# OCR parser handed to the loader for the images it extracts from the PDF.
ocr_parser = RapidOCRBlobParser()

# Target the sample PDF that contains images. A text-only sample is available
# at "./testing_files/inputs/pdf-without-images-1-page.pdf" if you want to
# compare the output without OCR involved.
loader = PyPDFLoader(
    file_path="./testing_files/inputs/pdf-with-images.pdf",
    mode="single",  # the other option is "page"
    extract_images=True,
    images_parser=ocr_parser,
)

# Eager load of the entire PDF. loader.lazy_load() is the load-as-needed
# alternative; NOTE(review): it presumably returns an iterator, in which case
# the len() call below would need to change - confirm before switching.
documents = loader.load()
print(len(documents))

for document in documents:
    # Extracted text, framed by start/end banners.
    print("----------- CONTENT STARTS --------------\n")
    print(document.page_content)
    print("\n----------- CONTENT ENDS ----------------")

    # Metadata as "key: value" lines; the "---" fences are only printed when
    # the metadata mapping is non-empty.
    print("----------- METADATA STARTS --------------\n")
    metadata = document.metadata
    if metadata:
        print("---")
        for field, field_value in metadata.items():
            print(f"{field}: {field_value}")
        print("---")
    print("\n----------- METADATA ENDS ----------------\n")