Open
Description
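Describe the bug
Loading a PDF locally with `UnstructuredLoader` (`strategy="hi_res"`, `partition_via_api=False`) while passing a `coordinates` keyword argument raises `TypeError: add_element_metadata() got multiple values for keyword argument 'coordinates'`. The command and full traceback are below: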
/app $ python3 docker_test/langchain_unstructured_test_pdf_to_text.py
INFO: pikepdf C++ to Python logger bridge initialized
INFO: Reading PDF for file: /app/example-docs/pdf/2409.12431v1.pdf ...
Traceback (most recent call last):
File "/app/docker_test/langchain_unstructured_test_pdf_to_text.py", line 19, in <module>
docs = load_and_process_pdf_structured(pdf_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/docker_test/langchain_unstructured_test_pdf_to_text.py", line 11, in load_and_process_pdf_structured
for doc in loader.lazy_load():
File "/home/notebook-user/.local/lib/python3.11/site-packages/langchain_unstructured/document_loaders.py", line 178, in lazy_load
yield from load_file(f=self.file, f_path=self.file_path)
File "/home/notebook-user/.local/lib/python3.11/site-packages/langchain_unstructured/document_loaders.py", line 212, in lazy_load
else self._elements_json
^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/langchain_unstructured/document_loaders.py", line 231, in _elements_json
return self._convert_elements_to_dicts(self._elements_via_local)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/langchain_unstructured/document_loaders.py", line 249, in _elements_via_local
return partition(
^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/partition/auto.py", line 342, in partition
elements = partition_pdf(
^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/documents/elements.py", line 605, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/file_utils/filetype.py", line 731, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/file_utils/filetype.py", line 687, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/chunking/dispatch.py", line 74, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/partition/pdf.py", line 205, in partition_pdf
return partition_pdf_or_image(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/partition/pdf.py", line 307, in partition_pdf_or_image
elements = _partition_pdf_or_image_local(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/utils.py", line 217, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/partition/pdf.py", line 720, in _partition_pdf_or_image_local
elements = document_to_element_list(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured/partition/common.py", line 657, in document_to_element_list
add_element_metadata(
TypeError: unstructured.partition.common.add_element_metadata() got multiple values for keyword argument 'coordinates'
To Reproduce
def load_and_process_pdf_structured(file_path: str):
    """Load ``file_path`` with ``UnstructuredLoader`` and return its documents.

    The loader is configured for local (non-API) partitioning with the
    ``hi_res`` strategy and an explicit ``coordinates`` keyword argument.
    In the reported environment this does not work: iterating
    ``lazy_load()`` raises ``TypeError`` ("got multiple values for keyword
    argument 'coordinates'").

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        A list with every document yielded by ``loader.lazy_load()``.
    """
    # Imported inside the function, exactly as in the original reproduction.
    from langchain_unstructured import UnstructuredLoader

    loader_options = {
        "file_path": file_path,
        "strategy": "hi_res",
        "partition_via_api": False,
        # The failure is the same whether this is True or False.
        "coordinates": False,
    }
    loader = UnstructuredLoader(**loader_options)

    collected = []
    for doc in loader.lazy_load():
        collected.append(doc)
    return collected
if __name__ == "__main__":
    # Sample PDF used by the report to reproduce the failure.
    pdf_path = "/app/example-docs/pdf/2409.12431v1.pdf"

    # The traceback in the report originates from this call.
    docs = load_and_process_pdf_structured(pdf_path)

    # Print the text of each loaded document, separated by a blank line.
    for doc in docs:
        print(f"\n{doc.page_content}")
Expected behavior
The script should run without raising an error and print the `page_content` of each document loaded from the PDF.
Environment Info
I used the official Unstructured Docker image, running on Windows 11 under WSL 2.