@@ -15,9 +15,9 @@ from extract_thinker import DocumentLoaderAzureForm
1515
1616# Initialize with Azure credentials
1717loader = DocumentLoaderAzureForm(
18+ subscription_key = " your_subscription_key" ,
1819 endpoint = " your_endpoint" ,
19- key = " your_api_key" ,
20- model = " prebuilt-document" # Use prebuilt document model
20+ model_id = " prebuilt-document" # Use prebuilt document model
2121)
2222
2323# Load document
@@ -29,6 +29,8 @@ for page in pages:
2929     text = page["content"]
3030     # Access tables if available
3131     tables = page.get("tables", [])
32+     # Access form fields if available
33+     forms = page.get("forms", {})
3234```
3335
3436### Configuration-based Usage
@@ -38,12 +40,11 @@ from extract_thinker import DocumentLoaderAzureForm, AzureConfig
3840
3941# Create configuration
4042config = AzureConfig(
43+ subscription_key = " your_subscription_key" ,
4144 endpoint = " your_endpoint" ,
42- key = " your_api_key" ,
43- model = " prebuilt-read" , # Use layout model for enhanced layout analysis
44- language = " en" , # Specify document language
45- pages = [1 , 2 , 3 ], # Process specific pages
46- cache_ttl = 600 # Cache results for 10 minutes
45+ model_id = " prebuilt-layout" , # Use layout model for enhanced layout analysis
46+ cache_ttl = 600 , # Cache results for 10 minutes
47+ features = [" ocrHighResolution" , " barcodes" ] # Enable advanced features
4748)
4849
4950# Initialize loader with configuration
@@ -53,36 +54,144 @@ loader = DocumentLoaderAzureForm(config)
5354pages = loader.load(" path/to/your/document.pdf" )
5455```
5556
57+ ### Advanced Features Usage
58+
59+ ``` python
60+ from extract_thinker import DocumentLoaderAzureForm, AzureConfig
61+
62+ # Configuration with multiple advanced features
63+ config = AzureConfig(
64+ subscription_key = " your_subscription_key" ,
65+ endpoint = " your_endpoint" ,
66+ model_id = " prebuilt-layout" ,
67+ features = [
68+ " ocrHighResolution" , # High resolution OCR for small text
69+ " formulas" , # Extract mathematical formulas in LaTeX
70+ " styleFont" , # Extract font properties
71+ " barcodes" , # Extract barcodes and QR codes
72+ " languages" , # Detect document languages
73+ " keyValuePairs" # Extract key-value pairs from forms
74+ ]
75+ )
76+
77+ loader = DocumentLoaderAzureForm(config)
78+ pages = loader.load(" document_with_advanced_content.pdf" )
79+
80+ for page in pages:
81+     # Standard content
82+     print(f"Text content: {page['content']}")
83+     print(f"Tables: {page['tables']}")
84+     print(f"Forms: {page['forms']}")
85+
86+     # Advanced features (if detected in document)
87+     if 'formulas' in page:
88+         print(f"Mathematical formulas: {page['formulas']}")
89+
90+     if 'fonts' in page:
91+         print(f"Font information: {page['fonts']}")
92+
93+     if 'barcodes' in page:
94+         print(f"Barcodes found: {page['barcodes']}")
95+
96+     if 'languages' in page:
97+         print(f"Detected languages: {page['languages']}")
98+ ```
99+
100+ ### Specialized Models Usage
101+
102+ ``` python
103+ # Use specialized invoice model
104+ config = AzureConfig(
105+ subscription_key = " your_subscription_key" ,
106+ endpoint = " your_endpoint" ,
107+ model_id = " prebuilt-invoice"
108+ )
109+
110+ loader = DocumentLoaderAzureForm(config)
111+ pages = loader.load(" invoice.pdf" )
112+
113+ # Access extracted invoice fields
114+ for page in pages:
115+     forms = page["forms"]
116+     vendor_name = forms.get("VendorName", "")
117+     invoice_total = forms.get("InvoiceTotal", "")
118+     print(f"Vendor: {vendor_name}, Total: {invoice_total}")
119+ ```
120+
56121## Configuration Options
57122
58123The ` AzureConfig ` class supports the following options:
59124
60125| Option | Type | Default | Description |
61126| --------| ------| ---------| -------------|
127+ | ` subscription_key ` | str | Required | Azure subscription key |
128+ | ` endpoint ` | str | Required | Azure endpoint URL |
62129| ` content ` | Any | None | Initial content to process |
63130| ` cache_ttl ` | int | 300 | Cache time-to-live in seconds |
64- | ` endpoint ` | str | None | Azure endpoint URL |
65- | ` key ` | str | None | Azure API key |
66- | ` model ` | str | "prebuilt-document" | Model ID to use |
67- | ` language ` | str | None | Document language code |
68- | ` pages ` | List[ int] | None | Specific pages to process |
69- | ` reading_order ` | str | "natural" | Text reading order |
131+ | ` model_id ` | str | "prebuilt-layout" | Model ID to use |
132+ | ` max_retries ` | int | 3 | Maximum retries for failed requests |
133+ | ` features ` | List[ str] | None | Advanced features to enable |
134+
135+ ## Available Models
136+
137+ ### General Purpose Models
138+
139+ | Model ID | Description | Best For |
140+ | ----------| -------------| ----------|
141+ | ` prebuilt-read ` | OCR/Read model | Text extraction from printed and handwritten documents |
142+ | ` prebuilt-layout ` | Layout analysis | Documents with tables, selection marks, and complex layouts |
143+ | ` prebuilt-document ` | General document | Key-value pairs, tables, and general document structure |
144+
145+ ### Specialized Models
146+
147+ | Model ID | Description |
148+ | ----------| -------------|
149+ | ` prebuilt-invoice ` | Invoice processing |
150+ | ` prebuilt-receipt ` | Receipt processing |
151+ | ` prebuilt-idDocument ` | Identity documents |
152+ | ` prebuilt-businessCard ` | Business cards |
153+ | ` prebuilt-tax.us.w2 ` | US W2 tax forms |
154+ | ` prebuilt-tax.us.1040 ` | US 1040 tax forms |
155+ | ` prebuilt-contract ` | Contracts |
156+ | ` prebuilt-healthInsuranceCard.us ` | US health insurance cards |
157+ | ` prebuilt-bankStatement.us ` | US bank statements |
158+ | ` prebuilt-payStub.us ` | US pay stubs |
159+
160+ ## Advanced Features
161+
162+ The loader supports advanced extraction features that can be enabled via the ` features ` parameter:
163+
164+ | Feature | Description | Output Field |
165+ | ---------| -------------| --------------|
166+ | ` ocrHighResolution ` | High resolution OCR for better small text recognition | Enhanced text in ` content ` |
167+ | ` formulas ` | Extract mathematical formulas in LaTeX format | ` formulas ` array |
168+ | ` styleFont ` | Extract font properties (family, style, weight, color) | ` fonts ` array |
169+ | ` barcodes ` | Extract barcodes and QR codes | ` barcodes ` array |
170+ | ` languages ` | Detect document languages | ` languages ` array |
171+ | ` keyValuePairs ` | Extract key-value pairs from forms | Enhanced ` forms ` dict |
172+ | ` queryFields ` | Enable custom field extraction | Enhanced extraction |
173+ | ` searchablePDF ` | Convert scanned PDFs to searchable format | Enhanced OCR |
70174
71175## Features
72176
73177- Text extraction with layout preservation
74178- Table detection and extraction
75- - Form field recognition
76- - Multiple model support (document, layout, read)
77- - Language specification
78- - Page selection
79- - Reading order control
80- - Caching support
81- - Support for pre-configured clients
179+ - Form field recognition with specialized models
180+ - Advanced OCR with high resolution support
181+ - Mathematical formula extraction (LaTeX format)
182+ - Font property extraction
183+ - Barcode and QR code detection
184+ - Multi-language document support
185+ - Caching support with configurable TTL
186+ - Vision mode support for image formats
187+ - Retry logic for robust processing
82188
83189## Notes
84190
85- - Available models: "prebuilt-document", "prebuilt-layout", "prebuilt-read"
86- - Vision mode is supported for image formats
87- - Azure credentials are required
191+ - Azure subscription key and endpoint are required
192+ - Advanced features may increase processing time and costs
193+ - Specialized models are optimized for specific document types
88194- Rate limits and quotas apply based on your Azure subscription
195+ - Vision mode is supported for image formats
196+ - High resolution OCR is recommended for documents with small text
197+ - Formula extraction works best with clear mathematical notation
0 commit comments