18
18
from tests .models .handbook_contract import HandbookContract
19
19
from extract_thinker .global_models import get_lite_model , get_big_model
20
20
from pydantic import BaseModel , Field
21
+ from extract_thinker .exceptions import ExtractThinkerError
21
22
22
23
23
24
load_dotenv ()
24
25
cwd = os .getcwd ()
25
26
26
- def test_extract_with_pypdf_and_gpt4o_mini ():
27
+ def test_extract_with_pypdf_and_gpt4o_mini_vision ():
27
28
28
29
# Arrange
29
- test_file_path = os .path .join (cwd , "tests" , "files " , "invoice.pdf " )
30
+ test_file_path = os .path .join (cwd , "tests" , "test_images " , "invoice.png " )
30
31
31
32
extractor = Extractor ()
32
33
extractor .load_document_loader (
@@ -35,51 +36,13 @@ def test_extract_with_pypdf_and_gpt4o_mini():
35
36
extractor .load_llm (get_lite_model ())
36
37
37
38
# Act
38
- result = extractor .extract (test_file_path , InvoiceContract )
39
+ result = extractor .extract (test_file_path , InvoiceContract , vision = True )
39
40
40
41
# Assert
41
42
assert result is not None
42
43
assert result .invoice_number == "0000001"
43
44
assert result .invoice_date == "2014-05-07"
44
45
45
- def test_extract_with_azure_di_and_gpt4o_mini ():
46
- subscription_key = os .getenv ("AZURE_SUBSCRIPTION_KEY" )
47
- endpoint = os .getenv ("AZURE_ENDPOINT" )
48
- test_file_path = os .path .join (cwd , "tests" , "test_images" , "invoice.png" )
49
-
50
- extractor = Extractor ()
51
- extractor .load_document_loader (
52
- DocumentLoaderAzureForm (subscription_key , endpoint )
53
- )
54
- extractor .load_llm (get_lite_model ())
55
- # Act
56
- result = extractor .extract (test_file_path , InvoiceContract )
57
-
58
- # Assert
59
- assert result is not None
60
- assert result .lines [0 ].description == "Website Redesign"
61
- assert result .lines [0 ].quantity == 1
62
- assert result .lines [0 ].unit_price == 2500
63
- assert result .lines [0 ].amount == 2500
64
-
65
- def test_extract_with_pypdf_and_gpt4o_mini ():
66
- test_file_path = os .path .join (cwd , "tests" , "files" , "invoice.pdf" )
67
-
68
- extractor = Extractor ()
69
- document_loader = DocumentLoaderPyPdf ()
70
- extractor .load_document_loader (document_loader )
71
- extractor .load_llm ("gpt-4o-mini" )
72
-
73
- # Act
74
- result = extractor .extract (test_file_path , InvoiceContract , vision = True )
75
-
76
- # Assert
77
- assert result is not None
78
- assert result .lines [0 ].description == "Consultation services"
79
- assert result .lines [0 ].quantity == 3
80
- assert result .lines [0 ].unit_price == 375
81
- assert result .lines [0 ].amount == 1125
82
-
83
46
def test_vision_content_pdf ():
84
47
# Arrange
85
48
extractor = Extractor ()
@@ -156,10 +119,10 @@ def test_extract_with_invalid_file_path():
156
119
invalid_file_path = os .path .join (cwd , "tests" , "nonexistent" , "fake_file.png" )
157
120
158
121
# Act & Assert
159
- with pytest .raises (ValueError ) as exc_info :
122
+ with pytest .raises (ExtractThinkerError ) as exc_info :
160
123
extractor .extract (invalid_file_path , InvoiceContract , vision = True )
161
124
162
- assert "Failed to extract from source" in str (exc_info .value . args [ 0 ] )
125
+ assert "Failed to extract from source: Cannot handle source " in str (exc_info .value )
163
126
164
127
def test_forbidden_strategy_with_token_limit ():
165
128
test_file_path = os .path .join (os .getcwd (), "tests" , "test_images" , "eu_tax_chart.png" )
@@ -358,34 +321,6 @@ def test_llm_timeout():
358
321
result = extractor .extract (test_file_path , InvoiceContract )
359
322
assert result is not None
360
323
361
- def test_dynamic_json_parsing ():
362
- """Test dynamic JSON parsing with local Ollama model."""
363
- # Initialize components
364
- llm = LLM (model = "ollama/deepseek-r1:1.5b" )
365
- llm .set_dynamic (True ) # Enable dynamic JSON parsing
366
-
367
- document_loader = DocumentLoaderPyPdf ()
368
- extractor = Extractor (document_loader = document_loader , llm = llm )
369
-
370
- # Test content that should produce JSON response
371
- test_file_path = os .path .join (cwd , "tests" , "files" , "invoice.pdf" )
372
-
373
- # Extract with dynamic parsing
374
- try :
375
- result = extractor .extract (test_file_path , InvoiceContract )
376
-
377
- # Verify the result is an InvoiceContract instance
378
- assert isinstance (result , InvoiceContract )
379
-
380
- # Verify invoice fields
381
- assert result .invoice_number is not None
382
- assert result .invoice_date is not None
383
- assert result .total_amount is not None
384
- assert isinstance (result .lines , list )
385
-
386
- except Exception as e :
387
- pytest .fail (f"Dynamic JSON parsing test failed: { str (e )} " )
388
-
389
324
def test_extract_with_default_backend ():
390
325
"""Test extraction using default LiteLLM backend"""
391
326
# Arrange
@@ -407,8 +342,6 @@ def test_extract_with_default_backend():
407
342
def test_extract_with_pydanticai_backend ():
408
343
"""Test extraction using PydanticAI backend if available"""
409
344
try :
410
- import pydantic_ai
411
-
412
345
# Arrange
413
346
test_file_path = os .path .join (cwd , "tests" , "files" , "invoice.pdf" )
414
347
@@ -439,13 +372,12 @@ def test_extract_from_url_docling_and_gpt4o_mini():
439
372
extractor = Extractor ()
440
373
extractor .load_document_loader (DocumentLoaderDocling ())
441
374
extractor .load_llm (get_lite_model ())
442
-
375
+
443
376
# Act: Extract the document using the specified URL and the HandbookContract
444
- result = extractor .extract (url , HandbookContract )
377
+ result : HandbookContract = extractor .extract (url , HandbookContract )
445
378
446
- # Assert: Verify that the extracted title matches the expected value.
447
- expected_title = "BCOBS 2A.1 Restriction on marketing or providing an optional product for which a fee is payable"
448
- assert result .title == expected_title
379
+ # Check handbook data
380
+ assert "FCA Handbook" in result .title , f"Expected title to contain 'FCA Handbook', but got: { result .title } "
449
381
450
382
def test_extract_from_multiple_sources ():
451
383
"""
@@ -480,4 +412,7 @@ class CombinedData(BaseModel):
480
412
assert result .total_amount == 1125
481
413
482
414
# Check handbook data
483
- assert "FCA Handbook" in result .handbook_title , f"Expected title to contain 'FCA Handbook', but got: { result .handbook_title } "
415
+ assert "FCA Handbook" in result .handbook_title , f"Expected title to contain 'FCA Handbook', but got: { result .handbook_title } "
416
+
417
+ if __name__ == "__main__" :
418
+ test_extract_with_invalid_file_path ()
0 commit comments