11import os
22import os .path
33import tempfile
4- from functools import partial
54from unittest .mock import ANY , mock_open , patch
65
76import numpy as np
87import pytest
98from PIL import Image
109
1110import unstructured_inference .models .base as models
12- from unstructured_inference .constants import Source
1311from unstructured_inference .inference import elements , layout , layoutelement
14- from unstructured_inference .models import detectron2
12+ from unstructured_inference .inference . elements import EmbeddedTextRegion , ImageTextRegion
1513from unstructured_inference .models .unstructuredmodel import (
1614 UnstructuredElementExtractionModel ,
1715 UnstructuredObjectDetectionModel ,
@@ -27,7 +25,7 @@ def mock_image():
2725
2826@pytest .fixture ()
2927def mock_initial_layout ():
30- text_block = layout . EmbeddedTextRegion .from_coords (
28+ text_block = EmbeddedTextRegion .from_coords (
3129 2 ,
3230 4 ,
3331 6 ,
@@ -36,7 +34,7 @@ def mock_initial_layout():
3634 source = "Mock" ,
3735 )
3836
39- title_block = layout . EmbeddedTextRegion .from_coords (
37+ title_block = EmbeddedTextRegion .from_coords (
4038 1 ,
4139 2 ,
4240 3 ,
@@ -81,7 +79,7 @@ def verify_image_array():
8179 assert page .image_array .all () == image_array .all ()
8280
8381 # Scenario 1: where self.image exists
84- page = layout .PageLayout (number = 0 , image = mock_image , layout = [] )
82+ page = layout .PageLayout (number = 0 , image = mock_image )
8583 verify_image_array ()
8684
8785 # Scenario 2: where self.image is None, but self.image_path exists
@@ -111,15 +109,9 @@ def test_get_page_elements(monkeypatch, mock_final_layout):
111109 page = layout .PageLayout (
112110 number = 0 ,
113111 image = image ,
114- layout = mock_final_layout ,
115112 detection_model = MockLayoutModel (mock_final_layout ),
116113 )
117-
118114 elements = page .get_elements_with_detection_model (inplace = False )
119-
120- assert str (elements [0 ]) == "A Catchy Title"
121- assert str (elements [1 ]).startswith ("A very repetitive narrative." )
122-
123115 page .get_elements_with_detection_model (inplace = True )
124116 assert elements == page .elements
125117
@@ -135,35 +127,6 @@ def join(self):
135127 pass
136128
137129
138- def test_read_pdf (monkeypatch , mock_initial_layout , mock_final_layout , mock_image ):
139- with tempfile .TemporaryDirectory () as tmpdir :
140- image_path1 = os .path .join (tmpdir , "mock1.jpg" )
141- image_path2 = os .path .join (tmpdir , "mock2.jpg" )
142- mock_image .save (image_path1 )
143- mock_image .save (image_path2 )
144- image_paths = [image_path1 , image_path2 ]
145-
146- layouts = [mock_initial_layout , mock_initial_layout ]
147-
148- monkeypatch .setattr (detectron2 , "is_detectron2_available" , lambda * args : True )
149-
150- with patch .object (layout , "load_pdf" , return_value = (layouts , image_paths )), patch .dict (
151- models .model_class_map ,
152- {"detectron2_lp" : partial (MockLayoutModel , layout = mock_final_layout )},
153- ):
154- model = layout .get_model ("detectron2_lp" )
155- doc = layout .DocumentLayout .from_file ("fake-file.pdf" , detection_model = model )
156-
157- assert str (doc ).startswith ("A Catchy Title" )
158- assert str (doc ).count ("A Catchy Title" ) == 2 # Once for each page
159- assert str (doc ).endswith ("A very repetitive narrative. " )
160-
161- assert doc .pages [0 ].elements [0 ].to_dict ()["text" ] == "A Catchy Title"
162-
163- pages = doc .pages
164- assert str (doc ) == "\n \n " .join ([str (page ) for page in pages ])
165-
166-
167130@pytest .mark .parametrize ("model_name" , [None , "checkbox" , "fake" ])
168131def test_process_data_with_model (monkeypatch , mock_final_layout , model_name ):
169132 monkeypatch .setattr (layout , "get_model" , lambda x : MockLayoutModel (mock_final_layout ))
@@ -236,7 +199,7 @@ def tolist(self):
236199 return [1 , 2 , 3 , 4 ]
237200
238201
239- class MockEmbeddedTextRegion (layout . EmbeddedTextRegion ):
202+ class MockEmbeddedTextRegion (EmbeddedTextRegion ):
240203 def __init__ (self , type = None , text = None ):
241204 self .type = type
242205 self .text = text
@@ -251,15 +214,16 @@ def __init__(
251214 self ,
252215 number = 1 ,
253216 image = None ,
254- layout = None ,
255217 model = None ,
256218 extract_tables = False ,
219+ detection_model = None ,
257220 ):
258221 self .image = image
259222 self .layout = layout
260223 self .model = model
261224 self .extract_tables = extract_tables
262225 self .number = number
226+ self .detection_model = detection_model
263227
264228
265229@pytest .mark .parametrize (
@@ -349,8 +313,8 @@ def mock_get_elements(self, *args, **kwargs):
349313
350314 with patch .object (
351315 layout ,
352- "load_pdf " ,
353- lambda * args , ** kwargs : ([[]], [ image_path ]),
316+ "convert_pdf_to_image " ,
317+ lambda * args , ** kwargs : ([image_path ]),
354318 ):
355319 doc = layout .DocumentLayout .from_file ("fake-file.pdf" )
356320 page = doc .pages [0 ]
@@ -369,16 +333,9 @@ def test_from_image_file_raises_isadirectoryerror_with_dir():
369333 layout .DocumentLayout .from_image_file (tempdir )
370334
371335
372- def test_from_file_raises_on_length_mismatch (monkeypatch ):
373- monkeypatch .setattr (layout , "load_pdf" , lambda * args , ** kwargs : ([None , None ], []))
374- with pytest .raises (RuntimeError ) as e :
375- layout .DocumentLayout .from_file ("fake_file" )
376- assert "images" in str (e ).lower ()
377-
378-
379336@pytest .mark .parametrize ("idx" , range (2 ))
380337def test_get_elements_from_layout (mock_initial_layout , idx ):
381- page = MockPageLayout (layout = mock_initial_layout )
338+ page = MockPageLayout ()
382339 block = mock_initial_layout [idx ]
383340 block .bbox .pad (3 )
384341 fixed_layout = [block ]
@@ -429,74 +386,19 @@ def test_remove_control_characters(text, expected):
429386 assert elements .remove_control_characters (text ) == expected
430387
431388
432- no_text_region = layout . EmbeddedTextRegion .from_coords (0 , 0 , 100 , 100 )
433- text_region = layout . EmbeddedTextRegion .from_coords (0 , 0 , 100 , 100 , text = "test" )
434- cid_text_region = layout . EmbeddedTextRegion .from_coords (
389+ no_text_region = EmbeddedTextRegion .from_coords (0 , 0 , 100 , 100 )
390+ text_region = EmbeddedTextRegion .from_coords (0 , 0 , 100 , 100 , text = "test" )
391+ cid_text_region = EmbeddedTextRegion .from_coords (
435392 0 ,
436393 0 ,
437394 100 ,
438395 100 ,
439396 text = "(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)" ,
440397)
441- overlapping_rect = layout .ImageTextRegion .from_coords (50 , 50 , 150 , 150 )
442- nonoverlapping_rect = layout .ImageTextRegion .from_coords (150 , 150 , 200 , 200 )
443- populated_text_region = layout .EmbeddedTextRegion .from_coords (50 , 50 , 60 , 60 , text = "test" )
444- unpopulated_text_region = layout .EmbeddedTextRegion .from_coords (50 , 50 , 60 , 60 , text = None )
445-
446-
447- @pytest .mark .parametrize ("filename" , ["loremipsum.pdf" , "IRS-form-1987.pdf" ])
448- def test_load_pdf (filename ):
449- layouts , images = layout .load_pdf (f"sample-docs/{ filename } " )
450- assert Source .PDFMINER in {e .source for e in layouts [0 ]}
451- assert len (layouts )
452- for lo in layouts :
453- assert len (lo )
454- assert len (images )
455- assert len (layouts ) == len (images )
456-
457-
458- def test_load_pdf_with_images ():
459- layouts , _ = layout .load_pdf ("sample-docs/loremipsum-flat.pdf" )
460- first_page_layout = layouts [0 ]
461- assert any (isinstance (obj , layout .ImageTextRegion ) for obj in first_page_layout )
462-
463-
464- def test_load_pdf_image_placement ():
465- layouts , images = layout .load_pdf ("sample-docs/layout-parser-paper.pdf" )
466- page_layout = layouts [5 ]
467- image_regions = [region for region in page_layout if isinstance (region , layout .ImageTextRegion )]
468- image_region = image_regions [0 ]
469- # Image is in top half of the page, so that should be reflected in the pixel coordinates
470- assert image_region .bbox .y1 < images [5 ].height / 2
471- assert image_region .bbox .y2 < images [5 ].height / 2
472-
473-
474- def test_load_pdf_raises_with_path_only_no_output_folder ():
475- with pytest .raises (ValueError ):
476- layout .load_pdf (
477- "sample-docs/loremipsum-flat.pdf" ,
478- path_only = True ,
479- )
480-
481-
482- @pytest .mark .skip ("Temporarily removed multicolumn to fix ordering" )
483- def test_load_pdf_with_multicolumn_layout (filename = "sample-docs/design-thinking.pdf" ):
484- layouts , images = layout .load_pdf (filename )
485- doc = layout .process_file_with_model (filename = filename , model_name = None )
486- test_snippets = [
487- "Key to design thinking" ,
488- "Design thinking also" ,
489- "But in recent years" ,
490- ]
491-
492- test_elements = []
493- for element in doc .pages [0 ].elements :
494- for snippet in test_snippets :
495- if element .text .startswith (snippet ):
496- test_elements .append (element )
497-
498- for i , element in enumerate (test_elements ):
499- assert element .text .startswith (test_snippets [i ])
398+ overlapping_rect = ImageTextRegion .from_coords (50 , 50 , 150 , 150 )
399+ nonoverlapping_rect = ImageTextRegion .from_coords (150 , 150 , 200 , 200 )
400+ populated_text_region = EmbeddedTextRegion .from_coords (50 , 50 , 60 , 60 , text = "test" )
401+ unpopulated_text_region = EmbeddedTextRegion .from_coords (50 , 50 , 60 , 60 , text = None )
500402
501403
502404@pytest .mark .parametrize (
@@ -521,7 +423,7 @@ def check_annotated_image():
521423
522424 test_image_arr = np .ones ((100 , 100 , 3 ), dtype = "uint8" )
523425 image = Image .fromarray (test_image_arr )
524- page = layout .PageLayout (number = 1 , image = image , layout = None )
426+ page = layout .PageLayout (number = 1 , image = image )
525427 coords1 = (21 , 30 , 37 , 41 )
526428 rect1 = elements .TextRegion .from_coords (* coords1 )
527429 coords2 = (1 , 10 , 7 , 11 )
@@ -571,8 +473,8 @@ def test_layout_order(mock_image):
571473 mock_image .save (mock_image_path )
572474 with patch .object (layout , "get_model" , lambda : MockDetectionModel ()), patch .object (
573475 layout ,
574- "load_pdf " ,
575- lambda * args , ** kwargs : ([[]], [ mock_image_path ]),
476+ "convert_pdf_to_image " ,
477+ lambda * args , ** kwargs : ([mock_image_path ]),
576478 ):
577479 doc = layout .DocumentLayout .from_file ("sample-docs/layout-parser-paper.pdf" )
578480 page = doc .pages [0 ]
0 commit comments