@@ -144,6 +144,7 @@ def partition_pdf(
144
144
starting_page_number : int = 1 ,
145
145
extract_forms : bool = False ,
146
146
form_extraction_skip_tables : bool = True ,
147
+ password : Optional [str ] = None ,
147
148
** kwargs : Any ,
148
149
) -> list [Element ]:
149
150
"""Parses a pdf document into a list of interpreted elements.
@@ -224,6 +225,7 @@ def partition_pdf(
224
225
starting_page_number = starting_page_number ,
225
226
extract_forms = extract_forms ,
226
227
form_extraction_skip_tables = form_extraction_skip_tables ,
228
+ password = password ,
227
229
** kwargs ,
228
230
)
229
231
@@ -245,6 +247,7 @@ def partition_pdf_or_image(
245
247
starting_page_number : int = 1 ,
246
248
extract_forms : bool = False ,
247
249
form_extraction_skip_tables : bool = True ,
250
+ password : Optional [str ] = None ,
248
251
** kwargs : Any ,
249
252
) -> list [Element ]:
250
253
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -273,6 +276,7 @@ def partition_pdf_or_image(
273
276
languages = languages ,
274
277
metadata_last_modified = metadata_last_modified or last_modified ,
275
278
starting_page_number = starting_page_number ,
279
+ password = password ,
276
280
** kwargs ,
277
281
)
278
282
pdf_text_extractable = any (
@@ -322,6 +326,7 @@ def partition_pdf_or_image(
322
326
starting_page_number = starting_page_number ,
323
327
extract_forms = extract_forms ,
324
328
form_extraction_skip_tables = form_extraction_skip_tables ,
329
+ password = password ,
325
330
** kwargs ,
326
331
)
327
332
out_elements = _process_uncategorized_text_elements (elements )
@@ -347,6 +352,7 @@ def partition_pdf_or_image(
347
352
is_image = is_image ,
348
353
metadata_last_modified = metadata_last_modified or last_modified ,
349
354
starting_page_number = starting_page_number ,
355
+ password = password ,
350
356
** kwargs ,
351
357
)
352
358
out_elements = _process_uncategorized_text_elements (elements )
@@ -360,6 +366,7 @@ def extractable_elements(
360
366
languages : Optional [list [str ]] = None ,
361
367
metadata_last_modified : Optional [str ] = None ,
362
368
starting_page_number : int = 1 ,
369
+ password : Optional [str ] = None ,
363
370
** kwargs : Any ,
364
371
) -> list [list [Element ]]:
365
372
if isinstance (file , bytes ):
@@ -370,6 +377,7 @@ def extractable_elements(
370
377
languages = languages ,
371
378
metadata_last_modified = metadata_last_modified ,
372
379
starting_page_number = starting_page_number ,
380
+ password = password ,
373
381
** kwargs ,
374
382
)
375
383
@@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
380
388
languages : list [str ],
381
389
metadata_last_modified : Optional [str ],
382
390
starting_page_number : int = 1 ,
391
+ password : Optional [str ] = None ,
383
392
** kwargs : Any ,
384
393
) -> list [list [Element ]]:
385
394
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
403
412
languages = languages ,
404
413
metadata_last_modified = metadata_last_modified ,
405
414
starting_page_number = starting_page_number ,
415
+ password = password ,
406
416
** kwargs ,
407
417
)
408
418
@@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
413
423
languages = languages ,
414
424
metadata_last_modified = metadata_last_modified ,
415
425
starting_page_number = starting_page_number ,
426
+ password = password ,
416
427
** kwargs ,
417
428
)
418
429
@@ -427,14 +438,16 @@ def _process_pdfminer_pages(
427
438
metadata_last_modified : Optional [str ],
428
439
annotation_threshold : Optional [float ] = env_config .PDF_ANNOTATION_THRESHOLD ,
429
440
starting_page_number : int = 1 ,
441
+ password : Optional [str ] = None ,
430
442
** kwargs ,
431
443
) -> list [list [Element ]]:
432
444
"""Uses PDFMiner to split a document into pages and process them."""
433
445
434
446
elements = []
435
447
436
448
for page_number , (page , page_layout ) in enumerate (
437
- open_pdfminer_pages_generator (fp ), start = starting_page_number
449
+ open_pdfminer_pages_generator (fp , password = password ),
450
+ start = starting_page_number ,
438
451
):
439
452
width , height = page_layout .width , page_layout .height
440
453
@@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
556
569
extract_forms : bool = False ,
557
570
form_extraction_skip_tables : bool = True ,
558
571
pdf_hi_res_max_pages : Optional [int ] = None ,
572
+ password : Optional [str ] = None ,
559
573
** kwargs : Any ,
560
574
) -> list [Element ]:
561
575
"""Partition using package installed locally"""
@@ -592,10 +606,11 @@ def _partition_pdf_or_image_local(
592
606
is_image = is_image ,
593
607
model_name = hi_res_model_name ,
594
608
pdf_image_dpi = pdf_image_dpi ,
609
+ password = password ,
595
610
)
596
611
597
612
extracted_layout , layouts_links = (
598
- process_file_with_pdfminer (filename = filename , dpi = pdf_image_dpi )
613
+ process_file_with_pdfminer (filename = filename , dpi = pdf_image_dpi , password = password )
599
614
if pdf_text_extractable
600
615
else ([], [])
601
616
)
@@ -635,20 +650,22 @@ def _partition_pdf_or_image_local(
635
650
ocr_mode = ocr_mode ,
636
651
pdf_image_dpi = pdf_image_dpi ,
637
652
ocr_layout_dumper = ocr_layout_dumper ,
653
+ password = password ,
638
654
)
639
655
else :
640
656
inferred_document_layout = process_data_with_model (
641
657
file ,
642
658
is_image = is_image ,
643
659
model_name = hi_res_model_name ,
644
660
pdf_image_dpi = pdf_image_dpi ,
661
+ password = password ,
645
662
)
646
663
647
664
if hasattr (file , "seek" ):
648
665
file .seek (0 )
649
666
650
667
extracted_layout , layouts_links = (
651
- process_data_with_pdfminer (file = file , dpi = pdf_image_dpi )
668
+ process_data_with_pdfminer (file = file , dpi = pdf_image_dpi , password = password )
652
669
if pdf_text_extractable
653
670
else ([], [])
654
671
)
@@ -690,6 +707,7 @@ def _partition_pdf_or_image_local(
690
707
ocr_mode = ocr_mode ,
691
708
pdf_image_dpi = pdf_image_dpi ,
692
709
ocr_layout_dumper = ocr_layout_dumper ,
710
+ password = password ,
693
711
)
694
712
695
713
# vectorization of the data structure ends here
@@ -837,6 +855,7 @@ def _partition_pdf_or_image_with_ocr(
837
855
is_image : bool = False ,
838
856
metadata_last_modified : Optional [str ] = None ,
839
857
starting_page_number : int = 1 ,
858
+ password : Optional [str ] = None ,
840
859
** kwargs : Any ,
841
860
):
842
861
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
@@ -861,7 +880,7 @@ def _partition_pdf_or_image_with_ocr(
861
880
elements .extend (page_elements )
862
881
else :
863
882
for page_number , image in enumerate (
864
- convert_pdf_to_images (filename , file ), start = starting_page_number
883
+ convert_pdf_to_images (filename , file , password = password ), start = starting_page_number
865
884
):
866
885
page_elements = _partition_pdf_or_image_with_ocr_from_image (
867
886
image = image ,
0 commit comments