Skip to content

Commit 9310542

Browse files
committed
Add function to group contiguous objects
1 parent 369ecd3 commit 9310542

File tree

1 file changed

+59
-17
lines changed

1 file changed

+59
-17
lines changed

rows/plugins/plugin_ocr.py

Lines changed: 59 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,56 @@
2121
from pytesseract import image_to_boxes
2222
from PIL import Image
2323

24-
from rows.plugins.plugin_pdf import PDFBackend, TextObject, pdf_table_lines
24+
from rows.plugins.plugin_pdf import group_objects, PDFBackend, TextObject, pdf_table_lines
2525
from rows.plugins.utils import create_table
2626

2727

28+
def join_text_group(group):
    """Join a list of `TextObject`s into one.

    The texts are concatenated in the order given. A single space is inserted
    between two consecutive objects when the horizontal gap between them
    (previous `x1` to next `x0`) is at least the average letter width of the
    first object in `group`.

    :param group: non-empty sequence of objects exposing `x0`, `y0`, `x1`,
        `y1` and `text`.
    :returns: a new `TextObject` whose bounding box encloses every object in
        `group` and whose `text` is the joined text.
    :raises IndexError: if `group` is empty.
    """

    first = group[0]
    # Average letter size of the first object. `or 1` keeps an empty text from
    # raising ZeroDivisionError.
    max_between = (first.x1 - first.x0) / (len(first.text) or 1)

    pieces, last_x1 = [], None
    for obj in group:
        # Only a gap *between* two objects may produce a space; skipping the
        # check for the first object prevents a spurious leading space when
        # that object has zero (or negative) width.
        if last_x1 is not None and last_x1 + max_between <= obj.x0:
            pieces.append(" ")
        pieces.append(obj.text)
        last_x1 = obj.x1

    return TextObject(
        x0=min(obj.x0 for obj in group),
        y0=min(obj.y0 for obj in group),
        x1=max(obj.x1 for obj in group),
        y1=max(obj.y1 for obj in group),
        text="".join(pieces),
    )
48+
49+
50+
def group_contiguous_objects(objs, x_threshold, y_threshold):
    """Merge contiguous objects if they're close enough.

    Objects are sorted by `y0` and handed to `group_objects` (with
    `y_threshold` and the `"y"` axis). Inside each resulting group the objects
    are sorted by `x0` and chained together for as long as the next object
    starts no further than `x_threshold` past the previous object's `x1`.
    Every chain is merged into a single object with `join_text_group`.

    :param objs: iterable of objects exposing `x0`, `y0`, `x1`, `y1` and
        `text`. It is not modified.
    :param x_threshold: maximum horizontal gap allowed between two
        consecutive objects of the same chain.
    :param y_threshold: threshold forwarded to `group_objects`.
    :returns: generator yielding one merged object per chain.
    """

    # Sort a copy so the caller's list is not reordered as a side effect of
    # consuming this generator.
    ordered = sorted(objs, key=lambda obj: obj.y0)
    y_groups = group_objects(ordered, y_threshold, "y")
    for _, y_items in y_groups.items():
        row = sorted(y_items, key=lambda obj: obj.x0)

        chain, last_x1 = [], None
        for obj in row:
            if chain and last_x1 + x_threshold < obj.x0:
                # Gap too wide: close the current chain and start a new one.
                yield join_text_group(chain)
                chain = []
            chain.append(obj)
            last_x1 = obj.x1
        if chain:
            yield join_text_group(chain)
72+
73+
2874
class TesseractBackend(PDFBackend):
2975

3076
name = "tesseract"
@@ -36,12 +82,7 @@ def __init__(self, filename_or_fobj, language):
3682

3783
@cached_property
3884
def document(self):
39-
if hasattr(self.filename_or_fobj, "read"):
40-
image = Image.open(self.filename_or_fobj)
41-
else:
42-
image = self.filename_or_fobj
43-
44-
return image
85+
return Image.open(self.filename_or_fobj)
4586

4687
@cached_property
4788
def number_of_pages(self):
@@ -51,6 +92,7 @@ def extract_text(self, page_numbers=None):
5192
return "" # TODO: image_to_string
5293

5394
def objects(self, page_numbers=None, starts_after=None, ends_before=None):
95+
_, total_y = self.document.size
5496
header = "char left bottom right top page".split()
5597
boxes = image_to_boxes(self.document, lang=self.language).splitlines()
5698
text_objs = []
@@ -60,18 +102,18 @@ def objects(self, page_numbers=None, starts_after=None, ends_before=None):
60102
if key != "char":
61103
value = int(value)
62104
row[key] = value
63-
obj = TextObject(
64-
x0=row["left"],
65-
y0=row["bottom"],
66-
x1=row["right"],
67-
y1=row["top"],
68-
text=row["char"],
105+
text_objs.append(
106+
TextObject(
107+
x0=row["left"],
108+
y0=total_y - row["bottom"],
109+
x1=row["right"],
110+
y1=total_y - row["top"],
111+
text=row["char"],
112+
)
69113
)
70-
text_objs.append(obj)
71114

72-
text_objs.sort(key=lambda obj: (obj.y0, obj.x0))
73-
# TODO: group contiguous objects before yielding
74-
yield text_objs
115+
# TODO: custom thresholds
116+
yield list(group_contiguous_objects(text_objs, 30, 12))
75117

76118
text_objects = objects
77119

0 commit comments

Comments
 (0)