-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathlc_pymupdf.py
76 lines (62 loc) · 2.8 KB
/
lc_pymupdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
PDF Content Extraction Script using PyMuPDF (fitz)
This script demonstrates the use of PyMuPDFLoader from LangChain to extract content
from PDF files. PyMuPDF (also known as fitz) is a high-performance PDF processing
library that excels at handling complex PDFs with various content types.
Dependencies:
- langchain_community.document_loaders: For PDF loading interface
- PyMuPDF (fitz): Backend PDF processing library with advanced features
Usage:
Run the script directly to process the selected PDF file and write its extracted text to output.txt.
Different sample files can be uncommented in the main function to test various PDF types.
Advantages:
- High-performance processing
- Better handling of complex layouts
- Support for extracting images and annotations
- Ability to handle encrypted PDFs
- Memory efficient for large documents
"""
import os
from langchain_community.document_loaders import PyMuPDFLoader
# Resolve the project root: three dirname() hops from this file's path
# (the file's directory, then its parent, then its grandparent).
# abspath() is applied first so the result stays correct when __file__ is a
# relative path (e.g. a script started from its own directory on Python < 3.9);
# without it the repeated dirname() calls collapse to "" and every path built
# from project_root would silently point at the wrong location.
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
def main():
    """
    Extract the text of one sample PDF with PyMuPDFLoader and save it to disk.

    The PDF is read from the ``input`` directory under ``project_root``. The
    active sample is selected by which ``file_path`` assignment below is left
    uncommented:
    - sample-1.pdf: Contains standard tables
    - sample-2.pdf: Contains image-based simple tables
    - sample-3.pdf: Contains image-based complex tables
    - sample-4.pdf: Contains mixed content (text, images, complex tables)
    - sample-5.pdf: Multi-column Texts

    The ``page_content`` of every loaded document is written, each followed by
    a newline, to ``output.txt`` in the current working directory. The file is
    overwritten on every run.

    Returns:
        None: The extracted content is written to ``output.txt``.
    """
    # Select PDF file to process - uncomment desired sample file
    # file_path = os.path.join(project_root, "input", "sample-1.pdf")  # Table in pdf
    # file_path = os.path.join(project_root, "input", "sample-2.pdf")  # Image based simple table in pdf
    # file_path = os.path.join(project_root, "input", "sample-3.pdf")  # Image based complex table in pdf
    file_path = os.path.join(project_root, "input", "sample-4.pdf")  # Complex PDF with mixed content types
    # file_path = os.path.join(project_root, "input", "sample-5.pdf")  # Multi-column Texts

    # Initialize the PyMuPDF-backed loader for the selected file
    loader = PyMuPDFLoader(file_path)

    # Load the documents produced by the loader; each one exposes its
    # extracted text through the page_content attribute
    docs = loader.load()

    # Concatenate the text of all documents, terminating each with a newline.
    # join() builds the result in one pass instead of repeated string +=.
    extracted_content = "".join(doc.page_content + "\n" for doc in docs)

    # Output extracted content to output.txt.
    # The encoding is explicit because text extracted from PDFs routinely
    # contains non-ASCII characters; relying on the platform default encoding
    # can raise UnicodeEncodeError or produce a mis-encoded file.
    with open("output.txt", "w", encoding="utf-8") as file:
        file.write(extracted_content)
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()