Skip to content

Commit b8238fe

Browse files
authored
feat: process images (#11)
Adds the ability to process images as single-page documents. Image processing follows mostly the same path as PDF processing, but it uses a null layout (since an image does not provide a PDF layout) and does not filter layout text blocks by bounding box to discover text (because there are no layout text blocks); instead, each detected element is interpreted directly.
1 parent 1b6aadd commit b8238fe

File tree

6 files changed

+84
-17
lines changed

6 files changed

+84
-17
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.2.2-dev0
1+
## 0.2.2-dev1
22

3+
* Add capability to process image files
34
* Add logic to use OCR when layout text is full of unknown characters
45

56
## 0.2.1

sample-docs/loremipsum.jpg

492 KB
Loading

sample-docs/loremipsum.png

316 KB
Loading

test_unstructured_inference/inference/test_layout.py

Lines changed: 64 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -178,24 +178,37 @@ def test_process_file_with_model_raises_on_invalid_model_name():
178178
layout.process_file_with_model("", model_name="fake")
179179

180180

181-
class MockPageLayout(layout.PageLayout):
182-
def __init__(self, ocr_text):
183-
self.ocr_text = ocr_text
184-
185-
def ocr(self, text_block):
186-
return self.ocr_text
181+
class MockPoints:
    """Stand-in for the ``points`` object of a text block in the layout tests.

    Only ``tolist`` is provided, which is the one member the mocks call.
    """

    def tolist(self):
        """Return a fixed list of four placeholder coordinates."""
        return [1, 2, 3, 4]
187184

188185

189186
class MockTextBlock(lp.TextBlock):
190-
def __init__(self, text):
187+
def __init__(self, type=None, text=None, ocr_text=None):
188+
self.type = type
191189
self.text = text
190+
self.ocr_text = ocr_text
191+
192+
@property
193+
def points(self):
194+
return MockPoints()
195+
196+
197+
class MockPageLayout(layout.PageLayout):
    """PageLayout test double with canned OCR results.

    The parent initializer is intentionally not called: only ``image``,
    ``layout`` and ``model`` are set, and ``image`` is always ``None``.
    """

    def __init__(self, layout=None, model=None):
        # NOTE: the ``layout`` parameter shadows the imported ``layout`` module
        # inside this method only; the name is kept because tests pass it by keyword.
        self.image = None
        self.layout = layout
        self.model = model

    def ocr(self, text_block: MockTextBlock):
        """Return the ``ocr_text`` carried by the mock block instead of running OCR."""
        return text_block.ocr_text
192205

193206

194207
def test_interpret_text_block_use_ocr_when_text_symbols_cid():
195208
fake_text = "(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)"
196209
fake_ocr = "ocrme"
197-
fake_text_block = MockTextBlock(fake_text)
198-
assert MockPageLayout(fake_ocr).interpret_text_block(fake_text_block) == fake_ocr
210+
fake_text_block = MockTextBlock(text=fake_text, ocr_text=fake_ocr)
211+
assert MockPageLayout().interpret_text_block(fake_text_block) == fake_ocr
199212

200213

201214
@pytest.mark.parametrize(
@@ -212,3 +225,45 @@ def test_cid_ratio(text, expected):
212225
)
213226
def test_is_cid_present(text, expected):
214227
assert layout.is_cid_present(text) == expected
228+
229+
230+
class MockLayout:
    """Minimal layout container used by the layout tests.

    Holds the elements given to the constructor and exposes only ``sort``,
    iteration and ``get_texts``.
    """

    def __init__(self, *elements):
        # Kept as the tuple of positional arguments, in the order supplied.
        self.elements = elements

    def sort(self, key, inplace):
        """Return the elements unchanged; ``key`` and ``inplace`` are accepted but ignored."""
        return self.elements

    def __iter__(self):
        """Iterate over the stored elements in insertion order."""
        return iter(self.elements)

    def get_texts(self):
        """Return the ``text`` attribute of every stored element, in order."""
        return [el.text for el in self.elements]
242+
243+
244+
def test_pagelayout_without_layout():
    """With ``layout=None``, each element's text should be its OCR text.

    The mock text blocks carry no ``text``, only ``ocr_text``, so the assertion
    passes only if the page falls back to OCR for every detected element.
    """
    mock_layout = MockLayout(
        MockTextBlock(text=None, ocr_text="textblock1"),
        MockTextBlock(text=None, ocr_text="textblock2"),
    )

    # NOTE(review): MockLayoutModel is defined elsewhere in this test module;
    # presumably its detect() returns the layout it was built with — confirm.
    model = MockLayoutModel(mock_layout)
    pl = MockPageLayout(model=model, layout=None)

    assert [el.text for el in pl.get_elements(inplace=False)] == [
        el.ocr_text for el in model.detect(None)
    ]
256+
257+
258+
@pytest.mark.parametrize("filetype", ("png", "jpg"))
def test_from_image_file(monkeypatch, mock_page_layout, filetype):
    """``DocumentLayout.from_image_file`` should yield one page for PNG and JPG samples.

    ``PageLayout.get_elements`` is patched so no model inference runs; the test
    only checks that the single page created from the image exposes the
    elements set by the patched method.
    """

    def mock_get_elements(self, *args, **kwargs):
        # Replaces element detection: stores a known sentinel on the page.
        self.elements = [mock_page_layout]

    monkeypatch.setattr(layout.PageLayout, "get_elements", mock_get_elements)
    elements = (
        layout.DocumentLayout.from_image_file(f"sample-docs/loremipsum.{filetype}")
        .pages[0]
        .elements
    )
    assert elements[0] == mock_page_layout
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.2-dev0" # pragma: no cover
1+
__version__ = "0.2.2-dev1" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,15 @@ def from_file(cls, filename: str, model: Optional[Detectron2LayoutModel] = None)
7272
pages.append(page)
7373
return cls.from_pages(pages)
7474

75+
@classmethod
def from_image_file(cls, filename: str, model: Optional[Detectron2LayoutModel] = None):
    """Creates a DocumentLayout from an image file.

    The image is treated as a single-page document: one PageLayout is built
    with ``number=0`` and ``layout=None``, its elements are computed, and the
    page is wrapped via ``cls.from_pages``.

    Args:
        filename: Path of the image file, passed to ``Image.open``.
        model: Optional layout model forwarded unchanged to ``PageLayout``.

    Returns:
        Whatever ``cls.from_pages`` returns for the one-page list.
    """
    logger.info(f"Reading image file: {filename} ...")
    # NOTE(review): Pillow documents Image.open as lazy, keeping the file open
    # until the pixel data is loaded — confirm nothing requires an explicit close.
    image = Image.open(filename)
    # An image has no PDF layout to consult, so the page is created with layout=None.
    page = PageLayout(number=0, image=image, layout=None, model=model)
    # Called for its effect on the page (default inplace=True); the return value is unused.
    page.get_elements()
    return cls.from_pages([page])
83+
7584

7685
class PageLayout:
7786
"""Class for an individual PDF page."""
@@ -80,7 +89,7 @@ def __init__(
8089
self,
8190
number: int,
8291
image: Image,
83-
layout: lp.Layout,
92+
layout: Optional[lp.Layout],
8493
model: Optional[Detectron2LayoutModel] = None,
8594
):
8695
self.image = image
@@ -107,12 +116,14 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]:
107116
# sophisticated ordering logic for more complicated layouts.
108117
image_layout.sort(key=lambda element: element.coordinates[1], inplace=True)
109118
for item in image_layout:
110-
text_blocks = self.layout.filter_by(item, center=True)
111119
text = str()
112-
for text_block in text_blocks:
113-
text_block.text = self.interpret_text_block(text_block)
114-
text = " ".join([x for x in text_blocks.get_texts() if x])
115-
120+
if self.layout is None:
121+
text = self.interpret_text_block(item)
122+
else:
123+
text_blocks = self.layout.filter_by(item, center=True)
124+
for text_block in text_blocks:
125+
text_block.text = self.interpret_text_block(text_block)
126+
text = " ".join([x for x in text_blocks.get_texts() if x])
116127
elements.append(
117128
LayoutElement(type=item.type, text=text, coordinates=item.points.tolist())
118129
)

0 commit comments

Comments
 (0)