Open
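Description
Calling `partition_pdf(filename=pdf_path, strategy='hi_res')` from `extract_data_from_pdf` fails during the OCR merge step with `AttributeError: 'list' object has no attribute 'element_coords'`. The traceback shows `unstructured` (`supplement_layout_with_ocr_elements` -> `build_layout_elements_from_ocr_regions`) passing a plain Python list of OCR regions to `unstructured_inference`'s `partition_groups_from_regions`, which reads `.element_coords`, `.x1`, `.y1`, etc. on its argument. This looks like a version mismatch between the installed `unstructured` and `unstructured-inference` packages (to be confirmed by checking both installed versions). Full traceback: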
[<ipython-input-2-808188905a87>](https://localhost:8080/#) in extract_data_from_pdf(pdf_path)
57 # Function to extract text using the unstructured library
58 def extract_data_from_pdf(pdf_path):
---> 59 elements = partition_pdf(filename=pdf_path, strategy='hi_res')
60 text_data = ' '.join([str(el) for el in elements]) # Concatenate all elements
61 return text_data
[/usr/local/lib/python3.10/dist-packages/unstructured/documents/elements.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
576 @functools.wraps(func)
577 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 578 elements = func(*args, **kwargs)
579 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
580
[/usr/local/lib/python3.10/dist-packages/unstructured/file_utils/filetype.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
723 @functools.wraps(func)
724 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 725 elements = func(*args, **kwargs)
726
727 for element in elements:
[/usr/local/lib/python3.10/dist-packages/unstructured/file_utils/filetype.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
681 @functools.wraps(func)
682 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 683 elements = func(*args, **kwargs)
684 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
685
[/usr/local/lib/python3.10/dist-packages/unstructured/chunking/dispatch.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
72
73 # -- call the partitioning function to get the elements --
---> 74 elements = func(*args, **kwargs)
75
76 # -- look for a chunking-strategy argument --
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py](https://localhost:8080/#) in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, **kwargs)
198 languages = check_language_args(languages or [], ocr_languages)
199
--> 200 return partition_pdf_or_image(
201 filename=filename,
202 file=file,
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py](https://localhost:8080/#) in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, **kwargs)
294 with warnings.catch_warnings():
295 warnings.simplefilter("ignore")
--> 296 elements = _partition_pdf_or_image_local(
297 filename=filename,
298 file=spooled_to_bytes_io_if_needed(file),
[/usr/local/lib/python3.10/dist-packages/unstructured/utils.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
215 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
216 run_check()
--> 217 return func(*args, **kwargs)
218
219 @wraps(func)
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf.py](https://localhost:8080/#) in _partition_pdf_or_image_local(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, starting_page_number, extract_forms, form_extraction_skip_tables, pdf_hi_res_max_pages, **kwargs)
625 )
626
--> 627 final_document_layout = process_file_with_ocr(
628 filename,
629 merged_document_layout,
[/usr/local/lib/python3.10/dist-packages/unstructured/utils.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
215 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
216 run_check()
--> 217 return func(*args, **kwargs)
218
219 @wraps(func)
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf_image/ocr.py](https://localhost:8080/#) in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper)
176 except Exception as e:
177 if os.path.isdir(filename) or os.path.isfile(filename):
--> 178 raise e
179 else:
180 raise FileNotFoundError(f'File "{filename}" not found!') from e
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf_image/ocr.py](https://localhost:8080/#) in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper)
163 extracted_regions = extracted_layout[i] if i < len(extracted_layout) else None
164 with PILImage.open(image_path) as image:
--> 165 merged_page_layout = supplement_page_layout_with_ocr(
166 page_layout=out_layout.pages[i],
167 image=image,
[/usr/local/lib/python3.10/dist-packages/unstructured/utils.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
215 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
216 run_check()
--> 217 return func(*args, **kwargs)
218
219 @wraps(func)
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf_image/ocr.py](https://localhost:8080/#) in supplement_page_layout_with_ocr(page_layout, image, infer_table_structure, ocr_languages, ocr_mode, extracted_regions, ocr_layout_dumper)
204 if ocr_layout_dumper:
205 ocr_layout_dumper.add_ocred_page(ocr_layout)
--> 206 page_layout.elements[:] = merge_out_layout_with_ocr_layout(
207 out_layout=cast(List["LayoutElement"], page_layout.elements),
208 ocr_layout=ocr_layout,
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf_image/ocr.py](https://localhost:8080/#) in merge_out_layout_with_ocr_layout(out_layout, ocr_layout, supplement_with_ocr_elements)
359
360 final_layout = (
--> 361 supplement_layout_with_ocr_elements(out_layout, ocr_layout)
362 if supplement_with_ocr_elements
363 else out_layout
[/usr/local/lib/python3.10/dist-packages/unstructured/utils.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
215 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
216 run_check()
--> 217 return func(*args, **kwargs)
218
219 @wraps(func)
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf_image/ocr.py](https://localhost:8080/#) in supplement_layout_with_ocr_elements(layout, ocr_layout, subregion_threshold)
438 ocr_regions_to_add = [region for region in ocr_layout if region not in ocr_regions_to_remove]
439 if ocr_regions_to_add:
--> 440 ocr_elements_to_add = build_layout_elements_from_ocr_regions(ocr_regions_to_add)
441 final_layout = layout + ocr_elements_to_add
442 else:
[/usr/local/lib/python3.10/dist-packages/unstructured/partition/pdf_image/inference_utils.py](https://localhost:8080/#) in build_layout_elements_from_ocr_regions(ocr_regions, ocr_text, group_by_ocr_text)
69 grouped_regions.append(regions)
70 else:
---> 71 grouped_regions = partition_groups_from_regions(ocr_regions)
72
73 merged_regions = [merge_text_regions(group) for group in grouped_regions]
[/usr/local/lib/python3.10/dist-packages/unstructured_inference/inference/layoutelement.py](https://localhost:8080/#) in partition_groups_from_regions(regions)
316 if len(regions) == 0:
317 return []
--> 318 padded_coords = regions.element_coords.copy()
319 v_pad = (regions.y2 - regions.y1) * inference_config.ELEMENTS_V_PADDING_COEF
320 h_pad = (regions.x2 - regions.x1) * inference_config.ELEMENTS_H_PADDING_COEF
AttributeError: 'list' object has no attribute 'element_coords'