Skip to content

Commit 7720e72

Browse files
Fix: avoid elements sharing the same memory address (#2940)
This PR attempts to fix a memory issue that resulted in errors like the one reported in #2931. The root cause seems to be in how ListItems are combined, not in how hashes or parent IDs are updated. When `assign_and_map_hash_ids()` is called and elements (or elements' metadata) do not have unique memory addresses, updating the parent_id of one element also overwrites the parent_id of another element. --------- Co-authored-by: cragwolfe <[email protected]>
1 parent fa767d6 commit 7720e72

File tree

5 files changed

+14
-4
lines changed

5 files changed

+14
-4
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.13.5-dev0
1+
## 0.13.5
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9+
* **KeyError raised when updating parent_id**: In the past, combining `ListItem` elements could result in reusing the same memory location, which then led to unexpected side effects when updating element IDs.
910
* **Bump unstructured-inference==0.7.29**: table transformer predictions are now removed if confidence is below threshold
1011

1112
## 0.13.4

Diff for: example-docs/a1977-backus-p21.pdf

98 KB
Binary file not shown.

Diff for: test_unstructured/partition/pdf_image/test_pdf.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -887,12 +887,20 @@ def test_partition_pdf_word_bbox_not_char(
887887
filename=example_doc_path("interface-config-guide-p93.pdf"),
888888
):
889889
try:
890-
elements = pdf.partition_pdf(filename=filename)
890+
elements = pdf.partition_pdf(filename=filename, strategy="fast")
891891
except Exception as e:
892892
raise ("Partitioning fail: %s" % e)
893893
assert len(elements) == 17
894894

895895

896+
def test_partition_pdf_fast_no_mapping_errors(
897+
filename=example_doc_path("a1977-backus-p21.pdf"),
898+
):
899+
"""Verify there is no regression for https://github.com/Unstructured-IO/unstructured/pull/2940,
900+
failing to map old parent_id's to new"""
901+
pdf.partition_pdf(filename=filename, strategy="fast")
902+
903+
896904
def test_partition_pdf_raises_TypeError_for_invalid_languages():
897905
filename = example_doc_path("chevron-page.pdf")
898906
with pytest.raises(TypeError):

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.13.5-dev0" # pragma: no cover
1+
__version__ = "0.13.5" # pragma: no cover

Diff for: unstructured/partition/pdf.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import contextlib
4+
import copy
45
import io
56
import os
67
import re
@@ -867,7 +868,7 @@ def _combine_coordinates_into_element1(
867868
points=points,
868869
system=coordinate_system,
869870
)
870-
return element1
871+
return copy.deepcopy(element1)
871872

872873

873874
def convert_pdf_to_images(

0 commit comments

Comments
 (0)