Skip to content

Commit a9ff1e7

Browse files
authored
Fix/fix ocr region to elements bug (#3891)
This PR fixes a bug in `build_layout_elements_from_ocr_regions` where texts are joined in the wrong order. The bug is caused by incorrect masking of the `ocr_regions` after some of them have already been selected for one of the final groups: the old statement `mask[mask][i] = False` assigned into the temporary copy produced by boolean indexing, so `mask` itself was never updated and already-used regions could be matched again. The fix uses a simpler masking method: the same indices that add the regions to the final groups are used to mask them, so they are not considered again. ## Testing This PR adds a unit test aimed specifically at this bug. Without the fix the test would fail. Additionally, any PDF file with repeated text has the potential to trigger this bug. E.g., create a simple PDF using the test text ```python "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\nLayoutParser for Deep Learning" ``` and partition it with `ocr_only` mode on the main branch; this hits the bug and outputs text where the position of the second "LayoutParser" is incorrect. ```python [ 'LayoutParser:', 'A Unified Toolkit for Deep Learning Based Document Image', 'for Deep Learning LayoutParser', ] ```
1 parent 0fbdd4e commit a9ff1e7

File tree

4 files changed

+80
-3
lines changed

4 files changed

+80
-3
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.16.17-dev1
1+
## 0.16.17-dev2
22

33
### Enhancements
44
- **Refactoring the VoyageAI integration** to use voyageai package directly, allowing extra features.
55

66
### Features
77

88
### Fixes
9+
- **Fix a bug where `build_layout_elements_from_ocr_regions` incorrectly joins texts in the wrong order**.
910

1011
## 0.16.16
1112

test_unstructured/partition/pdf_image/test_inference_utils.py

+76
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,79 @@ def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedd
9191
),
9292
]
9393
)
94+
95+
96+
def test_build_layout_elements_from_ocr_regions_with_repeated_texts(mock_embedded_text_regions):
97+
mock_embedded_text_regions.extend(
98+
[
99+
LayoutElement.from_coords(
100+
x1=453.00277777777774,
101+
y1=417.319341111111,
102+
x2=711.5338541666665,
103+
y2=458.28571222222206,
104+
text="LayoutParser",
105+
type=ElementType.UNCATEGORIZED_TEXT,
106+
),
107+
LayoutElement.from_coords(
108+
x1=453.00277777777774,
109+
y1=468.319341111111,
110+
x2=711.5338541666665,
111+
y2=478.28571222222206,
112+
text="for",
113+
type=ElementType.UNCATEGORIZED_TEXT,
114+
),
115+
LayoutElement.from_coords(
116+
x1=453.00277777777774,
117+
y1=488.319341111111,
118+
x2=711.5338541666665,
119+
y2=500.28571222222206,
120+
text="Deep",
121+
type=ElementType.UNCATEGORIZED_TEXT,
122+
),
123+
LayoutElement.from_coords(
124+
x1=453.00277777777774,
125+
y1=510.319341111111,
126+
x2=711.5338541666665,
127+
y2=550.28571222222206,
128+
text="Learning",
129+
type=ElementType.UNCATEGORIZED_TEXT,
130+
),
131+
]
132+
)
133+
text = (
134+
"LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\n"
135+
"LayoutParser for Deep Learning"
136+
)
137+
elements = build_layout_elements_from_ocr_regions(
138+
TextRegions.from_list(mock_embedded_text_regions),
139+
text,
140+
group_by_ocr_text=True,
141+
)
142+
assert elements == LayoutElements.from_list(
143+
[
144+
LayoutElement.from_coords(
145+
x1=453.00277777777774,
146+
y1=317.319341111111,
147+
x2=711.5338541666665,
148+
y2=358.28571222222206,
149+
text="LayoutParser:",
150+
type=ElementType.UNCATEGORIZED_TEXT,
151+
),
152+
LayoutElement.from_coords(
153+
x1=437.83888888888885,
154+
y1=317.319341111111,
155+
x2=1256.334784222222,
156+
y2=406.9837855555556,
157+
text="A Unified Toolkit for Deep Learning Based Document Image",
158+
type=ElementType.UNCATEGORIZED_TEXT,
159+
),
160+
LayoutElement.from_coords(
161+
x1=453.00277777777774,
162+
y1=417.319341111111,
163+
x2=711.5338541666665,
164+
y2=550.28571222222206,
165+
text="LayoutParser for Deep Learning",
166+
type=ElementType.UNCATEGORIZED_TEXT,
167+
),
168+
]
169+
)

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.17-dev1" # pragma: no cover
1+
__version__ = "0.16.17-dev2" # pragma: no cover

unstructured/partition/pdf_image/inference_utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,12 @@ def build_layout_elements_from_ocr_regions(
6262
break
6363
if text in words:
6464
regions.append(indices[mask][i])
65-
mask[mask][i] = False
6665
words.remove(text)
6766

6867
if not regions:
6968
continue
7069

70+
mask[regions] = False
7171
grouped_regions.append(ocr_regions.slice(regions))
7272
else:
7373
grouped_regions = partition_groups_from_regions(ocr_regions)

0 commit comments

Comments
 (0)