6
6
from unstructured_inference .inference import layout
7
7
8
8
from unstructured .documents .elements import NarrativeText , PageBreak , Text , Title
9
- from unstructured .partition import pdf
9
+ from unstructured .partition import pdf , strategies
10
10
11
11
12
12
class MockResponse :
@@ -161,7 +161,7 @@ def test_partition_pdf_api_raises_with_failed_api_call(
161
161
[("fakeurl" , True , False ), (None , False , True )],
162
162
)
163
163
def test_partition_pdf (url , api_called , local_called , monkeypatch ):
164
- monkeypatch .setattr (pdf , "is_pdf_text_extractable" , lambda * args , ** kwargs : True )
164
+ monkeypatch .setattr (strategies , "is_pdf_text_extractable" , lambda * args , ** kwargs : True )
165
165
with mock .patch .object (
166
166
pdf ,
167
167
attribute = "_partition_via_api" ,
@@ -177,7 +177,7 @@ def test_partition_pdf(url, api_called, local_called, monkeypatch):
177
177
[("fakeurl" , True , False ), (None , False , True )],
178
178
)
179
179
def test_partition_pdf_with_template (url , api_called , local_called , monkeypatch ):
180
- monkeypatch .setattr (pdf , "is_pdf_text_extractable" , lambda * args , ** kwargs : True )
180
+ monkeypatch .setattr (strategies , "is_pdf_text_extractable" , lambda * args , ** kwargs : True )
181
181
with mock .patch .object (
182
182
pdf ,
183
183
attribute = "_partition_via_api" ,
@@ -253,13 +253,83 @@ def test_partition_pdf_falls_back_to_fast(
253
253
caplog ,
254
254
filename = "example-docs/layout-parser-paper-fast.pdf" ,
255
255
):
256
- monkeypatch .setattr (pdf , "dependency_exists" , lambda dep : dep != "detectron2" )
256
+ def mock_exists (dep ):
257
+ return dep not in ["detectron2" , "pytesseract" ]
258
+
259
+ monkeypatch .setattr (strategies , "dependency_exists" , mock_exists )
260
+
261
+ mock_return = [Text ("Hello there!" )]
262
+ with mock .patch .object (
263
+ pdf ,
264
+ "_partition_pdf_with_pdfminer" ,
265
+ return_value = mock_return ,
266
+ ) as mock_partition :
267
+ pdf .partition_pdf (filename = filename , url = None , strategy = "hi_res" )
268
+
269
+ mock_partition .assert_called_once ()
270
+ assert "detectron2 is not installed" in caplog .text
271
+
272
+
273
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Request "ocr_only" while pytesseract is reported as missing.

    ``strategies.dependency_exists`` is replaced so that only "pytesseract"
    looks unavailable, and ``pdf._partition_pdf_with_pdfminer`` is stubbed out.
    The test passes when that stub is invoked exactly once and the captured
    log mentions that pytesseract is not installed.
    """

    def everything_but_pytesseract(dep):
        # Every dependency is reported as present except pytesseract.
        return dep != "pytesseract"

    monkeypatch.setattr(strategies, "dependency_exists", everything_but_pytesseract)

    stub_elements = [Text("Hello there!")]
    pdfminer_patch = mock.patch.object(
        pdf,
        "_partition_pdf_with_pdfminer",
        return_value=stub_elements,
    )
    with pdfminer_patch as pdfminer_mock:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    pdfminer_mock.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
293
+
294
+
295
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Request "ocr_only" with pytesseract missing and text extraction unavailable.

    ``strategies.dependency_exists`` is replaced so that only "pytesseract"
    looks unavailable, and ``strategies.is_pdf_text_extractable`` is forced to
    return ``False``. With ``pdf._partition_pdf_or_image_local`` stubbed out,
    the test passes when that stub is invoked exactly once and the captured
    log mentions that pytesseract is not installed.
    """

    def everything_but_pytesseract(dep):
        # Every dependency is reported as present except pytesseract.
        return dep != "pytesseract"

    def never_extractable(*args, **kwargs):
        # Pretend the PDF text cannot be extracted, whatever the arguments.
        return False

    monkeypatch.setattr(strategies, "dependency_exists", everything_but_pytesseract)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", never_extractable)

    stub_elements = [Text("Hello there!")]
    local_patch = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
        return_value=stub_elements,
    )
    with local_patch as local_mock:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    local_mock.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
316
+
317
+
318
+ def test_partition_pdf_falls_back_to_ocr_only (
319
+ monkeypatch ,
320
+ caplog ,
321
+ filename = "example-docs/layout-parser-paper-fast.pdf" ,
322
+ ):
323
+ def mock_exists (dep ):
324
+ return dep not in ["detectron2" ]
325
+
326
+ monkeypatch .setattr (strategies , "dependency_exists" , mock_exists )
327
+
328
+ mock_return = [Text ("Hello there!" )]
329
+ with mock .patch .object (
330
+ pdf ,
331
+ "_partition_pdf_or_image_with_ocr" ,
332
+ return_value = mock_return ,
263
333
) as mock_partition :
264
334
pdf .partition_pdf (filename = filename , url = None , strategy = "hi_res" )
265
335
@@ -276,27 +346,6 @@ def test_partition_pdf_uses_table_extraction():
276
346
assert mock_process_file_with_model .call_args [1 ]["extract_tables" ]
277
347
278
348
279
- @pytest .mark .parametrize (
280
- ("filename" , "from_file" , "expected" ),
281
- [
282
- ("layout-parser-paper-fast.pdf" , True , True ),
283
- ("copy-protected.pdf" , True , False ),
284
- ("layout-parser-paper-fast.pdf" , False , True ),
285
- ("copy-protected.pdf" , False , False ),
286
- ],
287
- )
288
- def test_is_pdf_text_extractable (filename , from_file , expected ):
289
- filename = os .path .join ("example-docs" , filename )
290
-
291
- if from_file :
292
- with open (filename , "rb" ) as f :
293
- extractable = pdf .is_pdf_text_extractable (file = f )
294
- else :
295
- extractable = pdf .is_pdf_text_extractable (filename = filename )
296
-
297
- assert extractable is expected
298
-
299
-
300
349
def test_partition_pdf_with_copy_protection ():
301
350
filename = os .path .join ("example-docs" , "copy-protected.pdf" )
302
351
elements = pdf .partition_pdf (filename = filename , strategy = "hi_res" )
@@ -314,8 +363,11 @@ def test_partition_pdf_fails_if_pdf_not_processable(
314
363
monkeypatch ,
315
364
filename = "example-docs/layout-parser-paper-fast.pdf" ,
316
365
):
317
- monkeypatch .setattr (pdf , "dependency_exists" , lambda dep : dep != "detectron2" )
318
- monkeypatch .setattr (pdf , "is_pdf_text_extractable" , lambda * args , ** kwargs : False )
366
+ def mock_exists (dep ):
367
+ return dep not in ["detectron2" , "pytesseract" ]
368
+
369
+ monkeypatch .setattr (strategies , "dependency_exists" , mock_exists )
370
+ monkeypatch .setattr (strategies , "is_pdf_text_extractable" , lambda * args , ** kwargs : False )
319
371
320
372
with pytest .raises (ValueError ):
321
373
pdf .partition_pdf (filename = filename )
0 commit comments