Skip to content

Commit d332b65

Browse files
authored
fix: removes multithreading for OCR (#65)
* fix: removes multithreading for OCR
* fix: added missing execution provider for ONNX
* Removed unused dependencies
* fix: removed MockPool as Pool object isn't used anymore
* Version updated
1 parent bae0948 commit d332b65

File tree

5 files changed

+10
-17
lines changed

5 files changed

+10
-17
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.2.9
2+
3+
* Removed the `multiprocessing.Pool`-based parallelism from OCR (DocumentLayout.get_elements_from_layout); elements are now processed sequentially
4+
15
## 0.2.8
26

37
* Refactored YoloX inference code to integrate better with framework

test_unstructured_inference/inference/test_layout.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,6 @@ def test_get_page_elements_with_ocr(monkeypatch):
101101

102102
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
103103
monkeypatch.setattr(layout, "ocr", lambda *args: "An Even Catchier Title")
104-
monkeypatch.setattr(layout, "Pool", MockPool)
105104

106105
image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB")
107106
print(layout.ocr(text_block, image))
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.8" # pragma: no cover
1+
__version__ = "0.2.9" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
from __future__ import annotations
22
from dataclasses import dataclass
3-
from functools import partial
4-
from multiprocessing import Pool
53
import os
64
import re
75
import tempfile
6+
from tqdm import tqdm
87
from typing import List, Optional, Tuple, Union, BinaryIO
98

109
from layoutparser.io.pdf import load_pdf
@@ -178,18 +177,9 @@ def get_elements_from_layout(self, layout: Layout) -> List[LayoutElement]:
178177
# sophisticated ordering logic for more complicated layouts.
179178
layout.sort(key=lambda element: element.coordinates[1], inplace=True)
180179
# NOTE(benjamin): Processes image elements by OCR sequentially (a multiprocessing Pool was previously used here)
181-
pool = Pool()
182-
try:
183-
get_element_partial = partial(
184-
get_element_from_block,
185-
image=self.image,
186-
layout=self.layout,
187-
ocr_strategy=self.ocr_strategy,
188-
)
189-
elements = pool.map(get_element_partial, layout)
190-
finally:
191-
pool.close()
192-
pool.join()
180+
elements = []
181+
for e in tqdm(layout):
182+
elements.append(get_element_from_block(e, self.image, self.layout, self.ocr_strategy))
193183
return elements
194184

195185
def _get_image_array(self) -> Union[np.ndarray, None]:

unstructured_inference/models/yolox.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def predict(self, x: Image):
5151
return self.image_processing(x)
5252

5353
def initialize(self, model_path: str, label_map: dict):
54-
self.model = onnxruntime.InferenceSession(model_path)
54+
self.model = onnxruntime.InferenceSession(model_path, providers=["CPUExecutionProvider"])
5555
self.layout_classes = label_map
5656

5757
def image_processing(

0 commit comments

Comments
 (0)