21
21
from pytesseract import image_to_boxes
22
22
from PIL import Image
23
23
24
- from rows .plugins .plugin_pdf import PDFBackend , TextObject , pdf_table_lines
24
+ from rows .plugins .plugin_pdf import group_objects , PDFBackend , TextObject , pdf_table_lines
25
25
from rows .plugins .utils import create_table
26
26
27
27
28
def join_text_group(group):
    """Join a non-empty list of `TextObject`s into a single `TextObject`.

    The objects' texts are concatenated in the order given (the caller in
    this module passes them sorted by `x0`). A single space is inserted
    between two consecutive objects when the horizontal gap between the
    previous object's `x1` and the next object's `x0` is at least the average
    letter width of the first object in the group.

    The returned object's bounding box is the smallest box enclosing every
    object in `group`.

    Raises `IndexError` if `group` is empty.
    """

    first = group[0]
    # Average letter width of the first object: the minimum gap that counts
    # as a word break. `max(..., 1)` prevents a ZeroDivisionError when the
    # first object carries an empty text.
    max_between = (first.x1 - first.x0) / max(len(first.text), 1)

    parts, last_x1 = [], None
    for obj in group:
        # Never emit a separator before the first object; otherwise a
        # zero-width first object would produce a leading space.
        if last_x1 is not None and last_x1 + max_between <= obj.x0:
            parts.append(" ")
        parts.append(obj.text)
        last_x1 = obj.x1

    return TextObject(
        x0=min(obj.x0 for obj in group),
        y0=min(obj.y0 for obj in group),
        x1=max(obj.x1 for obj in group),
        y1=max(obj.y1 for obj in group),
        text="".join(parts),
    )
48
+
49
+
50
def group_contiguous_objects(objs, x_threshold, y_threshold):
    """Merge contiguous objects if they're close enough.

    `objs` (any iterable of objects exposing `x0`, `x1` and `y0`) is ordered
    by `y0` and handed to `group_objects(..., y_threshold, "y")`, which is
    expected to return a mapping whose values are lists of objects --
    presumably one list per text row; confirm against `group_objects`.

    Within each of those lists the objects are ordered by `x0` and chained
    together for as long as the next object's `x0` is no farther than
    `x_threshold` from the previous object's `x1`. Each chain is merged with
    `join_text_group` and yielded as a single object.

    The caller's `objs` is left untouched (a sorted copy is used).
    """

    # sorted() rather than list.sort(): do not reorder the caller's list and
    # accept any iterable, not only lists.
    ordered = sorted(objs, key=lambda obj: obj.y0)
    y_groups = group_objects(ordered, y_threshold, "y")

    for y_items in y_groups.values():
        current_group, last_x1 = [], None
        for obj in sorted(y_items, key=lambda obj: obj.x0):
            # Gap too wide: close the running chain before starting a new
            # one with this object.
            if current_group and last_x1 + x_threshold < obj.x0:
                yield join_text_group(current_group)
                current_group = []
            current_group.append(obj)
            last_x1 = obj.x1

        # Flush the last chain of the row.
        if current_group:
            yield join_text_group(current_group)
72
+
73
+
28
74
class TesseractBackend (PDFBackend ):
29
75
30
76
name = "tesseract"
@@ -36,12 +82,7 @@ def __init__(self, filename_or_fobj, language):
36
82
37
83
@cached_property
38
84
def document (self ):
39
- if hasattr (self .filename_or_fobj , "read" ):
40
- image = Image .open (self .filename_or_fobj )
41
- else :
42
- image = self .filename_or_fobj
43
-
44
- return image
85
+ return Image .open (self .filename_or_fobj )
45
86
46
87
@cached_property
47
88
def number_of_pages (self ):
@@ -51,6 +92,7 @@ def extract_text(self, page_numbers=None):
51
92
return "" # TODO: image_to_string
52
93
53
94
def objects (self , page_numbers = None , starts_after = None , ends_before = None ):
95
+ _ , total_y = self .document .size
54
96
header = "char left bottom right top page" .split ()
55
97
boxes = image_to_boxes (self .document , lang = self .language ).splitlines ()
56
98
text_objs = []
@@ -60,18 +102,18 @@ def objects(self, page_numbers=None, starts_after=None, ends_before=None):
60
102
if key != "char" :
61
103
value = int (value )
62
104
row [key ] = value
63
- obj = TextObject (
64
- x0 = row ["left" ],
65
- y0 = row ["bottom" ],
66
- x1 = row ["right" ],
67
- y1 = row ["top" ],
68
- text = row ["char" ],
105
+ text_objs .append (
106
+ TextObject (
107
+ x0 = row ["left" ],
108
+ y0 = total_y - row ["bottom" ],
109
+ x1 = row ["right" ],
110
+ y1 = total_y - row ["top" ],
111
+ text = row ["char" ],
112
+ )
69
113
)
70
- text_objs .append (obj )
71
114
72
- text_objs .sort (key = lambda obj : (obj .y0 , obj .x0 ))
73
- # TODO: group contiguous objects before yielding
74
- yield text_objs
115
+ # TODO: custom thresholds
116
+ yield list (group_contiguous_objects (text_objs , 30 , 12 ))
75
117
76
118
text_objects = objects
77
119
0 commit comments