Skip to content

Commit d5dbc73

Browse files
[Reconstitution] Improve reconstitution (#1750)
1 parent 2ca3928 commit d5dbc73

File tree

3 files changed

+198
-72
lines changed

3 files changed

+198
-72
lines changed

doctr/io/elements.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,10 @@ def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **
310310
def synthesize(self, **kwargs) -> np.ndarray:
311311
"""Synthesize the page from the predictions
312312
313+
Args:
314+
----
315+
**kwargs: keyword arguments passed to the `synthesize_page` method
316+
313317
Returns
314318
-------
315319
synthesized page
@@ -493,7 +497,7 @@ def synthesize(self, **kwargs) -> np.ndarray:
493497
494498
Args:
495499
----
496-
**kwargs: keyword arguments passed to the matplotlib.pyplot.show method
500+
**kwargs: keyword arguments passed to the `synthesize_kie_page` method
497501
498502
Returns:
499503
-------
@@ -603,11 +607,15 @@ def show(self, **kwargs) -> None:
603607
def synthesize(self, **kwargs) -> List[np.ndarray]:
604608
"""Synthesize all pages from their predictions
605609
610+
Args:
611+
----
612+
**kwargs: keyword arguments passed to the `Page.synthesize` method
613+
606614
Returns
607615
-------
608616
list of synthesized pages
609617
"""
610-
return [page.synthesize() for page in self.pages]
618+
return [page.synthesize(**kwargs) for page in self.pages]
611619

612620
def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
613621
"""Export the document as XML (hOCR-format)

doctr/utils/reconstitution.py

Lines changed: 151 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
# This program is licensed under the Apache License 2.0.
44
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5+
import logging
56
from typing import Any, Dict, Optional
67

78
import numpy as np
@@ -13,61 +14,163 @@
1314
__all__ = ["synthesize_page", "synthesize_kie_page"]
1415

1516

# Module-level latch: set to True once the rotation warning has been logged,
# so the message is emitted at most once per process
ROTATION_WARNING = False


def _warn_rotation(entry: Dict[str, Any]) -> None:  # pragma: no cover
    """Log a one-time warning when an entry's geometry is made of four points.

    Args:
    ----
        entry: exported element exposing a "geometry" sequence
    """
    global ROTATION_WARNING
    # Already warned: nothing to do (the entry is not even inspected)
    if ROTATION_WARNING:
        return
    # Only 4-point geometries (polygons) trigger the warning
    if len(entry["geometry"]) != 4:
        return
    logging.warning("Polygons with larger rotations will lead to inaccurate rendering")
    ROTATION_WARNING = True
26+
27+
def _synthesize(
    response: Image.Image,
    entry: Dict[str, Any],
    w: int,
    h: int,
    draw_proba: bool = False,
    font_family: Optional[str] = None,
    smoothing_factor: float = 0.75,
    min_font_size: int = 6,
    max_font_size: int = 50,
) -> Image.Image:
    """Draw one exported entry (a word, or a line made of words) on the page image.

    Args:
    ----
        response: image to draw on; the drawing is applied directly to this image
        entry: exported element with a "geometry" (either 2 corner points or a polygon) and either
            a "value" or a list of "words" (each with its own "value")
        w: page width, used to scale the x coordinates of the geometry
        h: page height, used to scale the y coordinates of the geometry
        draw_proba: if True, outline the entry box and write its confidence above it,
            colored from red (p=0) to blue (p=1)
        font_family: family of the font
        smoothing_factor: multiplicative factor applied to the font size at each shrinking step
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
    -------
        the image passed as `response`
    """
    geometry = entry["geometry"]
    if len(geometry) == 2:
        # Two points describe a straight box: expand it to its four corners
        (xmin, ymin), (xmax, ymax) = geometry
        polygon = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
    else:
        polygon = geometry

    # Calculate the (axis-aligned, page-scaled) bounding box of the word
    x_coords, y_coords = zip(*polygon)
    xmin, ymin, xmax, ymax = (
        int(round(w * min(x_coords))),
        int(round(h * min(y_coords))),
        int(round(w * max(x_coords))),
        int(round(h * max(y_coords))),
    )
    word_width = xmax - xmin
    word_height = ymax - ymin

    # If lines are provided instead of words, concatenate the word entries
    if "words" in entry:
        word_text = " ".join(word["value"] for word in entry["words"])
    else:
        word_text = entry["value"]

    # Find the optimal font size: start from the box height and shrink until the text fits
    try:
        font_size = min(word_height, max_font_size)
        font = get_font(font_family, font_size)
        text_width, text_height = font.getbbox(word_text)[2:4]

        while (text_width > word_width or text_height > word_height) and font_size > min_font_size:
            shrunk_size = max(int(font_size * smoothing_factor), min_font_size)
            if shrunk_size >= font_size:
                # A factor >= 1 cannot shrink the font: stop instead of looping forever
                break
            font_size = shrunk_size
            font = get_font(font_family, font_size)
            text_width, text_height = font.getbbox(word_text)[2:4]
    except ValueError:
        # NOTE(review): presumably raised for unusable font sizes (e.g. an empty box) - confirm
        font = get_font(font_family, min_font_size)

    # Draw the word text
    d = ImageDraw.Draw(response)
    try:
        try:
            d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt")
        except UnicodeEncodeError:
            # When characters cannot be encoded, fall back to their anyascii version
            d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt")
    # Catch generic exceptions to avoid crashing the whole rendering
    except Exception:  # pragma: no cover
        logging.warning("Could not render word: %s", word_text)

    if draw_proba:
        if "confidence" in entry:
            confidence = entry["confidence"]
        else:
            line_words = entry["words"]
            if not line_words:
                # No word confidences to average: skip the overlay rather than divide by zero
                return response
            # Line-level confidence is the mean of its word confidences
            confidence = sum(word["confidence"] for word in line_words) / len(line_words)

        p = int(255 * confidence)
        color = (255 - p, 0, p)  # Red to blue gradient based on probability
        d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2)

        prob_font = get_font(font_family, 20)
        prob_text = f"{confidence:.2f}"
        prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4]

        # Position the probability slightly above the bounding box, horizontally centered,
        # and clamp it so it never starts above the top edge of the page
        prob_x_offset = (word_width - prob_text_width) // 2
        prob_y_offset = max(0, ymin - prob_text_height - 2)

        d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt")

    return response
111+
112+
def synthesize_page(
    page: Dict[str, Any],
    draw_proba: bool = False,
    font_family: Optional[str] = None,
    smoothing_factor: float = 0.95,
    min_font_size: int = 8,
    max_font_size: int = 50,
) -> np.ndarray:
    """Draw the content of the element page (OCR response) on a blank page.

    Args:
    ----
        page: exported Page object to represent
        draw_proba: if True, outline each entry and write its confidence in a color
            representing it. Blue: p=1, red: p=0
        font_family: family of the font
        smoothing_factor: factor to smooth the font size
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
    -------
        the synthesized page
    """
    # Draw template: a white RGB canvas of the page size
    h, w = page["dimensions"]
    response = Image.new("RGB", (w, h), color=(255, 255, 255))

    # Rendering options shared by every entry drawn on this page
    render_options = {
        "w": w,
        "h": h,
        "draw_proba": draw_proba,
        "font_family": font_family,
        "smoothing_factor": smoothing_factor,
        "min_font_size": min_font_size,
        "max_font_size": max_font_size,
    }

    for block in page["blocks"]:
        lines = block["lines"]
        # If several lines are provided, draw whole lines to get better rendering results;
        # otherwise, draw each word on its own
        draw_whole_lines = len(lines) > 1
        for line in lines:
            _warn_rotation(block)  # pragma: no cover
            entries = [line] if draw_whole_lines else line["words"]
            for entry in entries:
                response = _synthesize(response=response, entry=entry, **render_options)

    return np.array(response, dtype=np.uint8)
71174

72175

73176
def synthesize_kie_page(
@@ -81,46 +184,29 @@ def synthesize_kie_page(
81184
----
82185
page: exported Page object to represent
83186
draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
84-
font_size: size of the font, default font = 13
85187
font_family: family of the font
188+
smoothing_factor: factor to smooth the font size
189+
min_font_size: minimum font size
190+
max_font_size: maximum font size
86191
87192
Returns:
88193
-------
89194
the synthesized page
90195
"""
91196
# Draw template
92197
h, w = page["dimensions"]
93-
response = 255 * np.ones((h, w, 3), dtype=np.int32)
198+
response = Image.new("RGB", (w, h), color=(255, 255, 255))
94199

95200
# Draw each word
96201
for predictions in page["predictions"].values():
97202
for prediction in predictions:
98-
# Get aboslute word geometry
99-
(xmin, ymin), (xmax, ymax) = prediction["geometry"]
100-
xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
101-
ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
102-
103-
# White drawing context adapted to font size, 0.75 factor to convert pts --> pix
104-
font = get_font(font_family, int(0.75 * (ymax - ymin)))
105-
img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
106-
d = ImageDraw.Draw(img)
107-
# Draw in black the value of the word
108-
try:
109-
d.text((0, 0), prediction["value"], font=font, fill=(0, 0, 0))
110-
except UnicodeEncodeError:
111-
# When character cannot be encoded, use its anyascii version
112-
d.text((0, 0), anyascii(prediction["value"]), font=font, fill=(0, 0, 0))
113-
114-
# Colorize if draw_proba
115-
if draw_proba:
116-
p = int(255 * prediction["confidence"])
117-
mask = np.where(np.array(img) == 0, 1, 0)
118-
proba: np.ndarray = np.array([255 - p, 0, p])
119-
color = mask * proba[np.newaxis, np.newaxis, :]
120-
white_mask = 255 * (1 - mask)
121-
img = color + white_mask
122-
123-
# Write to response page
124-
response[ymin:ymax, xmin:xmax, :] = np.array(img)
125-
126-
return response
203+
_warn_rotation(prediction) # pragma: no cover
204+
response = _synthesize(
205+
response=response,
206+
entry=prediction,
207+
w=w,
208+
h=h,
209+
draw_proba=draw_proba,
210+
font_family=font_family,
211+
)
212+
return np.array(response, dtype=np.uint8)
Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,44 @@
11
import numpy as np
2-
from test_io_elements import _mock_pages
2+
from test_io_elements import _mock_kie_pages, _mock_pages
33

44
from doctr.utils import reconstitution
55

66

def test_synthesize_page():
    """Check that `synthesize_page` returns an array of the page shape for each rendering mode."""
    pages = _mock_pages()
    first_page = pages[0]

    def _check_render(render):
        # Every rendering must be an (H, W, 3) numpy array matching the page dimensions
        assert isinstance(render, np.ndarray)
        assert render.shape == (*first_page.dimensions, 3)

    # Test without probability rendering
    _check_render(reconstitution.synthesize_page(first_page.export(), draw_proba=False))

    # Test with probability rendering
    _check_render(reconstitution.synthesize_page(first_page.export(), draw_proba=True))

    # Test with only one line: keep just the first line of the first block
    pages_one_line = first_page.export()
    first_block = pages_one_line["blocks"][0]
    first_block["lines"] = [first_block["lines"][0]]
    _check_render(reconstitution.synthesize_page(pages_one_line, draw_proba=True))

    # Test with polygons: replace the first line geometry by a 4-point polygon
    pages_poly = first_page.export()
    pages_poly["blocks"][0]["lines"][0]["geometry"] = [(0, 0), (0, 1), (1, 1), (1, 0)]
    _check_render(reconstitution.synthesize_page(pages_poly, draw_proba=True))
32+
33+
def test_synthesize_kie_page():
    """Check that `synthesize_kie_page` returns an array of the page shape, with and without probabilities."""
    pages = _mock_kie_pages()
    # First without probability rendering, then with it
    for with_proba in (False, True):
        render = reconstitution.synthesize_kie_page(pages[0].export(), draw_proba=with_proba)
        assert isinstance(render, np.ndarray)
        assert render.shape == (*pages[0].dimensions, 3)

0 commit comments

Comments
 (0)