-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathlc_amazon_textract.py
99 lines (82 loc) · 3.18 KB
/
lc_amazon_textract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
PDF Content Extraction Script using Amazon Textract
This script demonstrates the use of AmazonTextractPDFLoader from LangChain to extract content
from PDF files using AWS Textract service. Textract is an AWS service that automatically
extracts text, handwriting, and data from scanned documents.
Dependencies:
- langchain_community.document_loaders: For Textract integration
- boto3: AWS SDK for Python
- AWS credentials: Properly configured AWS access
- Amazon Textract service access
Usage:
Requires:
- Configured AWS credentials
- Appropriate IAM permissions for Textract
Can process PDFs from:
- Local files
- S3 buckets directly
Advantages:
- Advanced OCR capabilities
- Table extraction
- Form processing
- Handwriting recognition
- Integration with AWS services
"""
import os
import boto3
from langchain_community.document_loaders import AmazonTextractPDFLoader
# Grandparent of the directory that contains this file; main() builds its
# input PDF path relative to this location.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
def main() -> None:
    """Extract the text of a sample PDF with Amazon Textract and save it.

    Builds the path to a sample PDF under ``<project_root>/input``, creates a
    boto3 Textract client, runs ``AmazonTextractPDFLoader`` on the file, and
    writes the text of every returned document (one trailing newline per
    document) to ``output.txt`` in the current working directory.

    The loader accepts either a local file path or an ``s3://`` URI; see the
    commented alternatives below for other inputs.

    Requires AWS credentials that boto3 can discover and permission to call
    Textract in the configured region.

    Returns:
        None. The extracted text is written to ``output.txt``.
    """
    # Sample input to process. Swap the file name to try the other samples:
    #   sample-1.pdf  standard table format
    #   sample-2.pdf  image-based simple table
    #   sample-3.pdf  image-based complex table
    #   sample-4.pdf  complex PDF with mixed content (text and tables in images)
    #   sample-5.pdf  multi-column text
    # os.path.join avoids producing a root-anchored "/input/..." path when
    # project_root happens to be an empty string.
    file_path = os.path.join(project_root, "input", "sample-1.pdf")

    # Alternative: process a document directly from S3.
    # file_path = "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf"

    # Textract client; credentials are resolved by boto3's default chain.
    textract_client = boto3.client(
        "textract",
        region_name="us-east-1",  # Specify your AWS region
    )

    loader = AmazonTextractPDFLoader(
        file_path,
        client=textract_client,
    )

    # Call load() exactly once: every call sends the document to Textract
    # again, so a second call only adds latency and cost.
    docs = loader.load()

    # One trailing newline per document, joined in a single pass.
    extracted_content = "".join(doc.page_content + "\n" for doc in docs)

    # Explicit UTF-8 so OCR output containing non-ASCII characters does not
    # fail on platforms whose default encoding cannot represent it.
    with open("output.txt", "w", encoding="utf-8") as output_file:
        output_file.write(extracted_content)
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()