Skip to content

Commit b3ceb2a

Browse files
committed
boosting
1 parent b639327 commit b3ceb2a

4 files changed

Lines changed: 202 additions & 10 deletions

File tree

examples/frogs.png

7.13 MB
Loading

examples/frogs.pptx

5.49 MB
Binary file not shown.

png2pptx/layout.py

Lines changed: 173 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,14 +61,27 @@ def group_into_blocks(
6161

6262
_NOISE_RE = re.compile(r'^[\s_\-—–;:.,!?\'"()|\[\]{}]+$')
6363
_SYMBOLIC_CHARS = set("|[]{}\\/@~")
64+
_INLINE_PUNCTUATION_KEEP = {"&", "+", "%"}
6465
_COMMON_SHORT_WORDS = {
6566
"a", "ai", "al", "an", "as", "at", "be", "by", "do", "go", "he",
6667
"i", "if", "in", "is", "it", "ml", "no", "of", "ok", "on", "or",
6768
"so", "to", "ui", "up", "us", "ux", "we",
6869
}
6970
_COMMON_SHORT_UNITS = {"cm", "db", "ft", "gb", "hz", "kb", "kg", "km", "mb", "mhz", "mm", "ms", "tb"}
70-
_ALLOWED_ALL_CAPS_SHORT_WORDS = {"ai", "al", "km", "ml", "no", "of", "ok", "ui", "us", "ux"}
71+
# When a 2-letter word is in ALL CAPS, it's much more likely to be
72+
# legitimate (a heading word) than noise. The allowlist mirrors common
73+
# real English/abbreviation tokens that show up in headings.
74+
_ALLOWED_ALL_CAPS_SHORT_WORDS = {
75+
"ai", "al", "an", "as", "at", "be", "by", "do", "go", "he", "if",
76+
"in", "is", "it", "km", "ml", "no", "of", "ok", "on", "or", "so",
77+
"to", "ui", "us", "ux", "we",
78+
}
7179
_SHORT_STANDALONE_ALLOWLIST = {"avg", "data", "high", "low", "time"}
80+
_ROMAN_NUMERAL_MARKER_RE = re.compile(r"^(?:[IVXLCM]{1,5}|[ivxlcm]{1,5})[.)]$")
81+
# Tesseract regularly mis-reads `II.`/`III.`/`IIV.` etc. as `Il.`/`Ill.`/`IlV.`
82+
# (capital I followed by lowercase L). Recognise the body if every character
83+
# is a Roman numeral OR a lowercase `l`, and the trailing punctuation is `.`/`)`.
84+
_ROMAN_NUMERAL_OCR_CONFUSION_RE = re.compile(r"^(?=.*[Ii])[IiVvXxLlCcMm]{1,5}[.)]$")
7285

7386

7487
def _normalized_text(text: str) -> str:
@@ -80,6 +93,43 @@ def _is_number_marker(text: str, normalized: str) -> bool:
8093
return bool(normalized) and normalized.isdigit() and bool(re.fullmatch(r"\d+[.)]", text))
8194

8295

96+
def _is_roman_marker(text: str) -> bool:
    """Return True when *text* is a short Roman-numeral list marker.

    A marker is one to five Roman-numeral letters, all upper case or all
    lower case, followed by `.` or `)` (e.g. `I.`, `IV.`, `iii)`).

    Common Tesseract confusions where an `I` was read as a lowercase `l`
    (e.g. `Il.` for `II.`, `Ill.` for `III.`) are accepted as well.
    """
    if _ROMAN_NUMERAL_MARKER_RE.fullmatch(text) is not None:
        return True
    if _ROMAN_NUMERAL_OCR_CONFUSION_RE.fullmatch(text) is None:
        return False
    # The confusion pattern only matches when the body already contains an
    # `I`/`i`, so a bare `l.` never reaches this point. Promote every
    # `l`/`L` to `I` and check the repaired marker against the strict pattern.
    numeral, terminator = text[:-1], text[-1]
    repaired = numeral.translate(str.maketrans("lL", "II"))
    return _ROMAN_NUMERAL_MARKER_RE.fullmatch(repaired + terminator) is not None
112+
113+
114+
def normalize_roman_marker_text(text: str) -> str:
    """Return the canonical upper-case form of a Roman-numeral list marker.

    *text* is treated as a marker when it is one to five Roman-numeral
    letters (all upper case or all lower case) followed by `.` or `)`, or
    when it becomes such a marker after undoing the OCR confusion that reads
    `I` as a lowercase `l` (e.g. `Il.` -> `II.`, `Ill.` -> `III.`).

    Anything else, including the empty string and mixed-case words such as
    `Vic.`, is returned unchanged.
    """
    if not text:
        return text
    if _ROMAN_NUMERAL_MARKER_RE.fullmatch(text):
        return text.upper()
    if not _ROMAN_NUMERAL_OCR_CONFUSION_RE.fullmatch(text):
        return text

    body, terminator = text[:-1], text[-1]
    # Only a lowercase `l` is an OCR stand-in for `I`. An uppercase `L` is a
    # genuine numeral (50) and must survive, so `XLl.` becomes `XLI.`.
    promoted = body.replace("l", "I")
    # Re-validate after the promotion so ordinary mixed-case words that merely
    # consist of numeral letters are not rewritten.
    if not _ROMAN_NUMERAL_MARKER_RE.fullmatch(promoted + terminator):
        return text
    return promoted.upper() + terminator
127+
128+
129+
def _is_list_marker(text: str, normalized: str) -> bool:
    """Return True when *text* is a numeric or Roman-numeral list marker.

    *normalized* is the normalised form of *text*; it is only consulted by
    the numeric-marker check.
    """
    if _is_number_marker(text, normalized):
        return True
    return _is_roman_marker(text)
131+
132+
83133
def _is_numeric_fragment(text: str, normalized: str) -> bool:
84134
if not normalized or not normalized.isdigit():
85135
return False
@@ -109,7 +159,12 @@ def _is_noise_word(word: WordBox) -> bool:
109159
if not text:
110160
return True
111161

162+
if _is_list_marker(text, normalized):
163+
return False
164+
112165
if not normalized:
166+
if text in _INLINE_PUNCTUATION_KEEP:
167+
return False
113168
return _contains_symbolic_noise(text) or word.width <= 3 or word.height <= 3
114169

115170
if _contains_symbolic_noise(text):
@@ -122,7 +177,7 @@ def _is_noise_word(word: WordBox) -> bool:
122177
return word.confidence < 80.0 or word.width <= max(4, int(word.height * 0.2))
123178
return True
124179

125-
if len(normalized) == 2 and text.isupper() and normalized not in {"ai", "al", "ok"}:
180+
if len(normalized) == 2 and text.isupper():
126181
return normalized not in _ALLOWED_ALL_CAPS_SHORT_WORDS
127182

128183
if len(normalized) == 2 and text != text.lower() and not _is_allowed_short_token(normalized) and normalized not in {"ai", "al", "ok"}:
@@ -152,25 +207,38 @@ def _drop_inline_separator(word: WordBox) -> bool:
152207
text = word.text.strip()
153208
normalized = _normalized_text(text)
154209
if not normalized:
210+
if text in _INLINE_PUNCTUATION_KEEP:
211+
return False
155212
return text not in {"-", "—", "–"}
156213
return "|" in text or "~" in text
157214

158215

159216
def _is_edge_noise_word(word: WordBox) -> bool:
160-
normalized = _normalized_text(word.text.strip())
217+
text = word.text.strip()
218+
normalized = _normalized_text(text)
219+
220+
if _is_list_marker(text, normalized):
221+
return False
222+
161223
if not normalized:
162224
return _drop_inline_separator(word)
163225

164226
if (
165227
len(normalized) <= 3
166-
and word.text.strip() != normalized
167-
and not word.text.strip().isalpha()
168-
and not _is_number_marker(word.text.strip(), normalized)
169-
and not _is_numeric_fragment(word.text.strip(), normalized)
228+
and text != normalized
229+
and not text.isalpha()
230+
and not _is_number_marker(text, normalized)
231+
and not _is_numeric_fragment(text, normalized)
232+
and not _is_roman_marker(text)
170233
):
171234
return True
172235

173-
if len(normalized) <= 2 and not _is_allowed_short_token(normalized) and not _is_number_marker(word.text.strip(), normalized) and not _is_numeric_fragment(word.text.strip(), normalized):
236+
if (
237+
len(normalized) <= 2
238+
and not _is_allowed_short_token(normalized)
239+
and not _is_number_marker(text, normalized)
240+
and not _is_numeric_fragment(text, normalized)
241+
):
174242
return True
175243

176244
if word.width <= max(4, int(word.height * 0.2)):
@@ -183,8 +251,13 @@ def _is_weak_edge_word(word: WordBox) -> bool:
183251
text = word.text.strip()
184252
normalized = _normalized_text(text)
185253
if not normalized:
254+
if text in _INLINE_PUNCTUATION_KEEP:
255+
return False
186256
return True
187257

258+
if _is_list_marker(text, normalized):
259+
return False
260+
188261
if word.confidence < 55.0:
189262
return True
190263

@@ -210,13 +283,94 @@ def _clean_block_words(words: list[WordBox]) -> TextBlock | None:
210283
kept = kept[1:]
211284
while len(kept) > 1 and _is_weak_edge_word(kept[-1]):
212285
kept = kept[:-1]
286+
kept = _drop_height_outlier_edges(kept)
213287
if not kept:
214288
return None
215289
if len(kept) == 1 and _is_edge_noise_word(kept[0]):
216290
return None
291+
for word in kept:
292+
normalised = normalize_roman_marker_text(word.text.strip())
293+
if normalised != word.text:
294+
word.text = normalised
217295
return TextBlock(words=kept)
218296

219297

298+
def _drop_height_outlier_edges(words: list[WordBox]) -> list[WordBox]:
    """Trim stray leading/trailing words that do not belong to the line.

    This catches OCR prefixes/suffixes such as a small `we` glued to the
    start of an otherwise large title block.

    Nothing is trimmed unless the line has at least 3 words, and trimming
    stops as soon as fewer than 3 words remain. An edge word is dropped when
    either rule below applies.

    Height outlier:
      * its normalised text is non-empty and at most 3 characters long (or
        is listed in `_COMMON_SHORT_WORDS`),
      * it is not a numeric or Roman-numeral list marker,
      * its confidence is < 90, and
      * its height is < 0.6x the median height of the other words.

    Case mismatch:
      * it is a purely alphabetic, all-lowercase token from
        `_COMMON_SHORT_WORDS` of at most 3 characters,
      * its confidence is < 90, and
      * at least `max(2, 60%)` of the other words are upper case.

    Returns the (possibly shortened) list; the input list is not mutated.
    """
    if len(words) < 3:
        return words

    def median_height(seq: list[WordBox]) -> float:
        # Upper-middle element for even-sized sequences.
        heights = sorted(w.height for w in seq)
        return float(heights[len(heights) // 2])

    def is_height_outlier(word: WordBox, others: list[WordBox]) -> bool:
        text = word.text.strip()
        normalized = _normalized_text(text)
        if not normalized:
            return False
        # List markers (`1.`, `IV.`, ...) are protected by every other
        # edge-trimming predicate in this module; keep them here too so a
        # slightly smaller marker is not stripped from the front of a line.
        if _is_list_marker(text, normalized):
            return False
        if len(normalized) > 3 and normalized not in _COMMON_SHORT_WORDS:
            return False
        if word.confidence >= 90.0:
            return False
        median = median_height(others)
        if median <= 0:
            return False
        return word.height < median * 0.6

    def is_case_mismatch_lead(word: WordBox, others: list[WordBox]) -> bool:
        """Drop a leading/trailing short lowercase word when the rest of the
        block is otherwise all-uppercase. This catches stray prefixes like
        `we UNDERSTANDING FROG MATING HABITS:` where Tesseract glued a tiny
        decorative scrap onto the front of a heading.
        """
        text = word.text.strip()
        normalized = _normalized_text(text)
        if not normalized:
            return False
        # Only consider short common-word tokens: never a long word, never
        # something with digits or punctuation.
        if len(normalized) > 3:
            return False
        if normalized not in _COMMON_SHORT_WORDS:
            return False
        if not text.isalpha() or text != text.lower():
            return False
        if word.confidence >= 90.0:
            return False
        # The remaining words must be predominantly uppercase, so a lowercase
        # leader genuinely doesn't fit.
        upper_words = [
            w for w in others
            if any(c.isupper() for c in w.text) and w.text == w.text.upper()
        ]
        return len(upper_words) >= max(2, int(len(others) * 0.6))

    # Trim from front
    while len(words) >= 3 and (
        is_height_outlier(words[0], words[1:])
        or is_case_mismatch_lead(words[0], words[1:])
    ):
        words = words[1:]
    # Trim from back
    while len(words) >= 3 and (
        is_height_outlier(words[-1], words[:-1])
        or is_case_mismatch_lead(words[-1], words[:-1])
    ):
        words = words[:-1]
    return words
372+
373+
220374
def _drop_overlapping_noise_words(words: list[WordBox]) -> list[WordBox]:
221375
kept: list[WordBox] = []
222376

@@ -270,14 +424,23 @@ def _filter_noise(blocks: list[TextBlock]) -> list[TextBlock]:
270424
# Skip single-character blocks that aren't alphanumeric
271425
if len(text) <= 1 and not text.isalnum():
272426
continue
273-
# Skip blocks that are only punctuation/whitespace/underscores
274-
if _NOISE_RE.match(text):
427+
# Single-token blocks that hold only inline punctuation we want to
428+
# keep (e.g. a standalone `&` or `+`) survive even though _NOISE_RE
429+
# would otherwise drop them.
430+
if not _NOISE_RE.match(text):
431+
pass
432+
elif text in _INLINE_PUNCTUATION_KEEP:
433+
pass
434+
else:
275435
continue
276436
# Skip blocks where estimated font is unreasonably large (graphic artifacts)
277437
if b.estimated_font_size_px > 100:
278438
continue
279439
if len(b.words) == 1:
280440
word = b.words[0]
441+
if _is_list_marker(text, normalized):
442+
filtered.append(b)
443+
continue
281444
if avg_conf < 60.0 and (len(normalized) <= 6 or text != normalized):
282445
continue
283446
if (

png2pptx/ocr.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -985,12 +985,41 @@ def _select_cluster_candidate(cluster: list[tuple[int, WordBox]]) -> WordBox | N
985985
),
986986
)
987987

988+
# When multiple same-text candidates share a cluster but one's bounding
989+
# box is dramatically taller than the others, prefer the smaller one.
990+
# Tesseract sometimes returns oversized bboxes that bleed into adjacent
991+
# graphics; the smaller candidates are usually the correct visual region.
992+
chosen = _prefer_smaller_same_text_candidate(chosen, best_group)
993+
988994
if len({pass_id for pass_id, _word in best_group}) == 1 and _is_suspicious_candidate(chosen):
989995
return None
990996

991997
return chosen
992998

993999

1000+
def _prefer_smaller_same_text_candidate(
    chosen: WordBox,
    group: list[tuple[int, WordBox]],
) -> WordBox:
    """Swap an unusually tall *chosen* box for a more compact group member.

    *group* holds `(pass_id, word)` pairs. The reference height is the
    upper-middle element of the sorted heights in *group*. *chosen* is kept
    unless its height exceeds 1.6x that reference. Otherwise the replacement
    is picked from the group members whose height is at most 1.4x the
    reference and whose confidence is no more than 12 points below
    *chosen*'s: the one with the highest confidence wins, ties broken by the
    larger box area. If no member qualifies, *chosen* is returned.
    """
    if len(group) < 2:
        return chosen

    boxes = [box for _pass_id, box in group]
    ordered_heights = sorted(box.height for box in boxes)
    reference = ordered_heights[len(ordered_heights) // 2]
    if reference <= 0:
        return chosen
    if chosen.height <= reference * 1.6:
        return chosen

    compact = [
        box for box in boxes
        if box.height <= reference * 1.4
        and box.confidence >= chosen.confidence - 12.0
    ]
    if not compact:
        return chosen

    def rank(box: WordBox) -> tuple:
        # Highest confidence first, then the larger area.
        return (box.confidence, box.width * box.height)

    return max(compact, key=rank)
1021+
1022+
9941023
def _candidate_group_score(group: list[tuple[int, WordBox]]) -> float:
9951024
pass_count = len({pass_id for pass_id, _word in group})
9961025
confidences = [word.confidence for _pass_id, word in group]

0 commit comments

Comments
 (0)