@@ -144,6 +144,7 @@ def partition_pdf(
144
144
starting_page_number : int = 1 ,
145
145
extract_forms : bool = False ,
146
146
form_extraction_skip_tables : bool = True ,
147
+ password : Optional [str ] = None ,
147
148
** kwargs : Any ,
148
149
) -> list [Element ]:
149
150
"""Parses a pdf document into a list of interpreted elements.
@@ -224,6 +225,7 @@ def partition_pdf(
224
225
starting_page_number = starting_page_number ,
225
226
extract_forms = extract_forms ,
226
227
form_extraction_skip_tables = form_extraction_skip_tables ,
228
+ password = password ,
227
229
** kwargs ,
228
230
)
229
231
@@ -245,6 +247,7 @@ def partition_pdf_or_image(
245
247
starting_page_number : int = 1 ,
246
248
extract_forms : bool = False ,
247
249
form_extraction_skip_tables : bool = True ,
250
+ password : Optional [str ] = None ,
248
251
** kwargs : Any ,
249
252
) -> list [Element ]:
250
253
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -273,6 +276,7 @@ def partition_pdf_or_image(
273
276
languages = languages ,
274
277
metadata_last_modified = metadata_last_modified or last_modified ,
275
278
starting_page_number = starting_page_number ,
279
+ password = password ,
276
280
** kwargs ,
277
281
)
278
282
pdf_text_extractable = any (
@@ -322,6 +326,7 @@ def partition_pdf_or_image(
322
326
starting_page_number = starting_page_number ,
323
327
extract_forms = extract_forms ,
324
328
form_extraction_skip_tables = form_extraction_skip_tables ,
329
+ password = password ,
325
330
** kwargs ,
326
331
)
327
332
out_elements = _process_uncategorized_text_elements (elements )
@@ -347,6 +352,7 @@ def partition_pdf_or_image(
347
352
is_image = is_image ,
348
353
metadata_last_modified = metadata_last_modified or last_modified ,
349
354
starting_page_number = starting_page_number ,
355
+ password = password ,
350
356
** kwargs ,
351
357
)
352
358
out_elements = _process_uncategorized_text_elements (elements )
@@ -360,6 +366,7 @@ def extractable_elements(
360
366
languages : Optional [list [str ]] = None ,
361
367
metadata_last_modified : Optional [str ] = None ,
362
368
starting_page_number : int = 1 ,
369
+ password :Optional [str ] = None ,
363
370
** kwargs : Any ,
364
371
) -> list [list [Element ]]:
365
372
if isinstance (file , bytes ):
@@ -370,6 +377,7 @@ def extractable_elements(
370
377
languages = languages ,
371
378
metadata_last_modified = metadata_last_modified ,
372
379
starting_page_number = starting_page_number ,
380
+ password = password ,
373
381
** kwargs ,
374
382
)
375
383
@@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
380
388
languages : list [str ],
381
389
metadata_last_modified : Optional [str ],
382
390
starting_page_number : int = 1 ,
391
+ password :Optional [str ] = None ,
383
392
** kwargs : Any ,
384
393
) -> list [list [Element ]]:
385
394
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
403
412
languages = languages ,
404
413
metadata_last_modified = metadata_last_modified ,
405
414
starting_page_number = starting_page_number ,
415
+ password = password ,
406
416
** kwargs ,
407
417
)
408
418
@@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
413
423
languages = languages ,
414
424
metadata_last_modified = metadata_last_modified ,
415
425
starting_page_number = starting_page_number ,
426
+ password = password ,
416
427
** kwargs ,
417
428
)
418
429
@@ -427,14 +438,16 @@ def _process_pdfminer_pages(
427
438
metadata_last_modified : Optional [str ],
428
439
annotation_threshold : Optional [float ] = env_config .PDF_ANNOTATION_THRESHOLD ,
429
440
starting_page_number : int = 1 ,
441
+ password : Optional [str ] = None ,
430
442
** kwargs ,
431
443
) -> list [list [Element ]]:
432
444
"""Uses PDFMiner to split a document into pages and process them."""
433
445
434
446
elements = []
435
447
436
448
for page_number , (page , page_layout ) in enumerate (
437
- open_pdfminer_pages_generator (fp ), start = starting_page_number
449
+ open_pdfminer_pages_generator (fp , password = password ),
450
+ start = starting_page_number ,
438
451
):
439
452
width , height = page_layout .width , page_layout .height
440
453
@@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
556
569
extract_forms : bool = False ,
557
570
form_extraction_skip_tables : bool = True ,
558
571
pdf_hi_res_max_pages : Optional [int ] = None ,
572
+ password :Optional [str ] = None ,
559
573
** kwargs : Any ,
560
574
) -> list [Element ]:
561
575
"""Partition using package installed locally"""
@@ -592,10 +606,12 @@ def _partition_pdf_or_image_local(
592
606
is_image = is_image ,
593
607
model_name = hi_res_model_name ,
594
608
pdf_image_dpi = pdf_image_dpi ,
609
+ password = password ,
595
610
)
596
611
597
612
extracted_layout , layouts_links = (
598
- process_file_with_pdfminer (filename = filename , dpi = pdf_image_dpi )
613
+ process_file_with_pdfminer (filename = filename , dpi = pdf_image_dpi ,
614
+ password = password )
599
615
if pdf_text_extractable
600
616
else ([], [])
601
617
)
@@ -635,20 +651,22 @@ def _partition_pdf_or_image_local(
635
651
ocr_mode = ocr_mode ,
636
652
pdf_image_dpi = pdf_image_dpi ,
637
653
ocr_layout_dumper = ocr_layout_dumper ,
654
+ password = password ,
638
655
)
639
656
else :
640
657
inferred_document_layout = process_data_with_model (
641
658
file ,
642
659
is_image = is_image ,
643
660
model_name = hi_res_model_name ,
644
661
pdf_image_dpi = pdf_image_dpi ,
662
+ password = password ,
645
663
)
646
664
647
665
if hasattr (file , "seek" ):
648
666
file .seek (0 )
649
667
650
668
extracted_layout , layouts_links = (
651
- process_data_with_pdfminer (file = file , dpi = pdf_image_dpi )
669
+ process_data_with_pdfminer (file = file , dpi = pdf_image_dpi , password = password )
652
670
if pdf_text_extractable
653
671
else ([], [])
654
672
)
@@ -690,6 +708,7 @@ def _partition_pdf_or_image_local(
690
708
ocr_mode = ocr_mode ,
691
709
pdf_image_dpi = pdf_image_dpi ,
692
710
ocr_layout_dumper = ocr_layout_dumper ,
711
+ password = password ,
693
712
)
694
713
695
714
# vectorization of the data structure ends here
@@ -837,6 +856,7 @@ def _partition_pdf_or_image_with_ocr(
837
856
is_image : bool = False ,
838
857
metadata_last_modified : Optional [str ] = None ,
839
858
starting_page_number : int = 1 ,
859
+ password : Optional [str ] = None ,
840
860
** kwargs : Any ,
841
861
):
842
862
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
@@ -861,7 +881,8 @@ def _partition_pdf_or_image_with_ocr(
861
881
elements .extend (page_elements )
862
882
else :
863
883
for page_number , image in enumerate (
864
- convert_pdf_to_images (filename , file ), start = starting_page_number
884
+ convert_pdf_to_images (filename , file , password = password ),
885
+ start = starting_page_number
865
886
):
866
887
page_elements = _partition_pdf_or_image_with_ocr_from_image (
867
888
image = image ,
0 commit comments