1111from tests .models .invoice import InvoiceContract
1212from tests .models .ChartWithContent import ChartWithContent
1313from tests .models .page_contract import ReportContract
14- from tests .models .gdp_contract import EUData
14+ from tests .models .gdp_contract import EUData , EUDataOptional
1515from extract_thinker .document_loader .document_loader_azure_document_intelligence import DocumentLoaderAzureForm
1616import pytest
1717import numpy as np
1818from litellm import embedding
19- from extract_thinker .document_loader .document_loader_docling import DocumentLoaderDocling
19+ from extract_thinker .document_loader .document_loader_docling import DoclingConfig , DocumentLoaderDocling
2020from tests .models .handbook_contract import HandbookContract
21+ from extract_thinker .global_models import get_lite_model , get_big_model
2122
2223
2324load_dotenv ()
@@ -32,7 +33,7 @@ def test_extract_with_pypdf_and_gpt4o_mini():
3233 extractor .load_document_loader (
3334 DocumentLoaderPyPdf ()
3435 )
35- extractor .load_llm ("gpt-4o-mini" )
36+ extractor .load_llm (get_lite_model () )
3637
3738 # Act
3839 result = extractor .extract (test_file_path , InvoiceContract )
@@ -51,7 +52,7 @@ def test_extract_with_azure_di_and_gpt4o_mini():
5152 extractor .load_document_loader (
5253 DocumentLoaderAzureForm (subscription_key , endpoint )
5354 )
54- extractor .load_llm ("gpt-4o-mini" )
55+ extractor .load_llm (get_lite_model () )
5556 # Act
5657 result = extractor .extract (test_file_path , InvoiceContract )
5758
@@ -71,7 +72,7 @@ def test_extract_with_pypdf_and_gpt4o_mini():
7172 extractor .load_llm ("gpt-4o-mini" )
7273
7374 # Act
74- result = extractor .extract (test_file_path , InvoiceContract )
75+ result = extractor .extract (test_file_path , InvoiceContract , vision = True )
7576
7677 # Assert
7778 assert result is not None
@@ -83,7 +84,7 @@ def test_extract_with_pypdf_and_gpt4o_mini():
8384def test_vision_content_pdf ():
8485 # Arrange
8586 extractor = Extractor ()
86- extractor .load_llm ("gpt-4o-mini" )
87+ extractor .load_llm (get_lite_model () )
8788 test_file_path = os .path .join (cwd , "tests" , "files" , "invoice.pdf" )
8889
8990 # Act
@@ -108,7 +109,7 @@ def test_vision_content_pdf():
108109def test_chart_with_content ():
109110 # Arrange
110111 extractor = Extractor ()
111- extractor .load_llm ("gpt-4o-mini" )
112+ extractor .load_llm (get_lite_model () )
112113 test_file_path = os .path .join (cwd , "tests" , "test_images" , "eu_tax_chart.png" )
113114
114115 # Act
@@ -131,7 +132,7 @@ def test_extract_with_loader_and_vision():
131132 extractor = Extractor ()
132133 loader = DocumentLoaderPyPdf ()
133134 extractor .load_document_loader (loader )
134- extractor .load_llm ("gpt-4o-mini" )
135+ extractor .load_llm (get_lite_model () )
135136
136137 # Act
137138 result = extractor .extract (test_file_path , InvoiceContract , vision = True )
@@ -152,7 +153,7 @@ def test_extract_with_loader_and_vision():
152153def test_extract_with_invalid_file_path ():
153154 # Arrange
154155 extractor = Extractor ()
155- extractor .load_llm ("gpt-4o-mini" )
156+ extractor .load_llm (get_lite_model () )
156157 invalid_file_path = os .path .join (cwd , "tests" , "nonexistent" , "fake_file.png" )
157158
158159 # Act & Assert
@@ -165,7 +166,7 @@ def test_forbidden_strategy_with_token_limit():
165166 test_file_path = os .path .join (os .getcwd (), "tests" , "test_images" , "eu_tax_chart.png" )
166167 tesseract_path = os .getenv ("TESSERACT_PATH" )
167168
168- llm = LLM ("gpt-4o-mini" , token_limit = 10 )
169+ llm = LLM (get_lite_model () , token_limit = 10 )
169170
170171 extractor = Extractor ()
171172 extractor .load_document_loader (DocumentLoaderTesseract (tesseract_path ))
@@ -194,7 +195,7 @@ def test_pagination_handler():
194195
195196 extractor = Extractor ()
196197 extractor .load_document_loader (DocumentLoaderDocling ())
197- extractor .load_llm ("gpt-4o" )
198+ extractor .load_llm (get_big_model () )
198199
199200 # Create and run both extractions in parallel
200201 async def run_parallel_extractions ():
@@ -204,8 +205,8 @@ async def run_parallel_extractions():
204205 )
205206 return result_1 , result_2
206207
207- # Run the async code
208- results : tuple [ EUData , EUData ] = asyncio .run (run_parallel_extractions ())
208+ # Run the parallel async extractions and collect the (result_1, result_2) pair
209+ results = asyncio .run (run_parallel_extractions ())
209210 result_1 , result_2 = results
210211
211212 # Compare top-level EU data
@@ -252,6 +253,25 @@ async def run_parallel_extractions():
252253 # assert province1.share_in_eu27_gdp == matching_province.share_in_eu27_gdp
253254 # assert province1.gdp_per_capita == matching_province.gdp_per_capita
254255
256+ def test_pagination_handler_optional ():  # Paginated, vision-enabled extraction of the regional GDP PDF into EUDataOptional
257+ test_file_path = os .path .join (os .getcwd (), "tests" , "files" , "Regional_GDP_per_capita_2018_2.pdf" )
258+
259+ extractor = Extractor ()
260+ extractor .load_document_loader (DocumentLoaderDocling ())
261+ extractor .load_llm (get_big_model ())
262+
263+ async def extract_async_optional (extractor , file_path , vision , completion_strategy ):  # thin coroutine wrapper so the call can be driven by asyncio.run below
264+ return extractor .extract (  # NOTE(review): plain, non-awaited extract() call inside an async def - confirm an awaited async API was not intended
265+ file_path ,
266+ EUDataOptional ,
267+ vision = vision ,
268+ completion_strategy = completion_strategy
269+ )
270+
271+ result = asyncio .run (extract_async_optional (extractor , test_file_path , vision = True , completion_strategy = CompletionStrategy .PAGINATE ))
272+
273+ assert len (result .countries ) == 6  # expected country count is specific to this fixture PDF
274+
255275def get_embedding (text , model = "text-embedding-ada-002" ):
256276 text = text .replace ("\n " , " " )
257277 response = embedding (
@@ -284,7 +304,7 @@ def test_concatenation_handler():
284304 tesseract_path = os .getenv ("TESSERACT_PATH" )
285305 extractor = Extractor ()
286306 extractor .load_document_loader (DocumentLoaderTesseract (tesseract_path ))
287- llm_first = LLM ("gpt-4o" , token_limit = 500 )
307+ llm_first = LLM (get_big_model () , token_limit = 500 )
288308 extractor .load_llm (llm_first )
289309
290310 result_1 : ReportContract = extractor .extract (
@@ -296,7 +316,7 @@ def test_concatenation_handler():
296316
297317 second_extractor = Extractor ()
298318 second_extractor .load_document_loader (DocumentLoaderTesseract (tesseract_path ))
299- second_extractor .load_llm ("gpt-4o" )
319+ second_extractor .load_llm (get_big_model () )
300320
301321 result_2 : ReportContract = second_extractor .extract (
302322 test_file_path ,
@@ -324,7 +344,7 @@ def test_llm_timeout():
324344 extractor .load_document_loader (DocumentLoaderPyPdf ())
325345
326346 # Create LLM with very short timeout
327- llm = LLM ("gpt-4o-mini" )
347+ llm = LLM (get_lite_model () )
328348 llm .set_timeout (1 ) # Set timeout to 1ms (extremely short to force timeout)
329349 extractor .load_llm (llm )
330350
@@ -374,7 +394,7 @@ def test_extract_with_default_backend():
374394
375395 extractor = Extractor ()
376396 extractor .load_document_loader (DocumentLoaderPyPdf ())
377- extractor .load_llm (LLM ("gpt-4o-mini" , backend = LLMEngine .DEFAULT ))
397+ extractor .load_llm (LLM (get_lite_model () , backend = LLMEngine .DEFAULT ))
378398
379399 # Act
380400 result = extractor .extract (test_file_path , InvoiceContract )
@@ -395,7 +415,7 @@ def test_extract_with_pydanticai_backend():
395415
396416 extractor = Extractor ()
397417 extractor .load_document_loader (DocumentLoaderPyPdf ())
398- extractor .load_llm (LLM ("openai:gpt-4o-mini" , backend = LLMEngine .PYDANTIC_AI ))
418+ extractor .load_llm (LLM (get_lite_model () , backend = LLMEngine .PYDANTIC_AI ))
399419
400420 # Act
401421 result = extractor .extract (test_file_path , InvoiceContract )
@@ -419,7 +439,7 @@ def test_extract_from_url_docling_and_gpt4o_mini():
419439 # Initialize the extractor, load the Docling loader and the gpt-4o-mini LLM
420440 extractor = Extractor ()
421441 extractor .load_document_loader (DocumentLoaderDocling ())
422- extractor .load_llm ("gpt-4o-mini" )
442+ extractor .load_llm (get_lite_model () )
423443
424444 # Act: Extract the document using the specified URL and the HandbookContract
425445 result = extractor .extract (url , HandbookContract )
0 commit comments