chrispydizzle
diff --git a/‎examples/frogs.png‎
7.13 MB b/‎examples/frogs.png‎
7.13 MB
diff --git a/‎examples/frogs.pptx‎
5.49 MB b/‎examples/frogs.pptx‎
5.49 MB
diff --git a/‎png2pptx/layout.py‎
Lines changed: 173 additions & 10 deletions b/‎png2pptx/layout.py‎
Lines changed: 173 additions & 10 deletions
diff --git a/‎png2pptx/ocr.py‎
Lines changed: 29 additions & 0 deletions b/‎png2pptx/ocr.py‎
Lines changed: 29 additions & 0 deletions
@@ -61,14 +61,27 @@ def group_into_blocks(
 
 _NOISE_RE = re.compile(r'^[\s_\-—–;:.,!?\'"()|\[\]{}]+$')
 _SYMBOLIC_CHARS = set("|[]{}\\/@~")
+_INLINE_PUNCTUATION_KEEP = {"&", "+", "%"}
 _COMMON_SHORT_WORDS = {
     "a", "ai", "al", "an", "as", "at", "be", "by", "do", "go", "he",
     "i", "if", "in", "is", "it", "ml", "no", "of", "ok", "on", "or",
     "so", "to", "ui", "up", "us", "ux", "we",
 }
 _COMMON_SHORT_UNITS = {"cm", "db", "ft", "gb", "hz", "kb", "kg", "km", "mb", "mhz", "mm", "ms", "tb"}
-_ALLOWED_ALL_CAPS_SHORT_WORDS = {"ai", "al", "km", "ml", "no", "of", "ok", "ui", "us", "ux"}
+# When a 2-letter word is in ALL CAPS, it's much more likely to be
+# legitimate (a heading word) than noise. The allowlist mirrors common
+# real English/abbreviation tokens that show up in headings.
+_ALLOWED_ALL_CAPS_SHORT_WORDS = {
+    "ai", "al", "an", "as", "at", "be", "by", "do", "go", "he", "if",
+    "in", "is", "it", "km", "ml", "no", "of", "ok", "on", "or", "so",
+    "to", "ui", "us", "ux", "we",
+}
 _SHORT_STANDALONE_ALLOWLIST = {"avg", "data", "high", "low", "time"}
+_ROMAN_NUMERAL_MARKER_RE = re.compile(r"^(?:[IVXLCM]{1,5}|[ivxlcm]{1,5})[.)]$")
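+# Tesseract regularly mis-reads `II.`/`III.`/`IIV.` etc. as `Il.`/`Ill.`/`IlV.`
+# (capital I followed by lowercase L). This looser pattern accepts 1-5 Roman
+# numeral letters in any mix of upper/lower case (lowercase `l` included),
+# requires at least one `I`/`i` somewhere in the token, and a trailing `.`/`)`.
+# Callers must re-validate the token after promoting `l` -> `I`.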
+_ROMAN_NUMERAL_OCR_CONFUSION_RE = re.compile(r"^(?=.*[Ii])[IiVvXxLlCcMm]{1,5}[.)]$")
 
 
 def _normalized_text(text: str) -> str:
@@ -80,6 +93,43 @@ def _is_number_marker(text: str, normalized: str) -> bool:
     return bool(normalized) and normalized.isdigit() and bool(re.fullmatch(r"\d+[.)]", text))
 
 
+def _is_roman_marker(text: str) -> bool:
+    """Return True if *text* looks like a Roman-numeral list marker.
+
+    A marker is a 1-5 letter Roman numeral, all upper- or all lowercase,
+    closed by `.` or `)` (e.g. `I.`, `IV.`, `iii)`). Tokens where Tesseract
+    read `l` in place of `I` (e.g. `Il.` for `II.`, `Ill.` for `III.`) are
+    accepted too, provided the token is a valid marker once every `l`/`L`
+    has been promoted to `I`.
+    """
+    strict = _ROMAN_NUMERAL_MARKER_RE
+    if strict.fullmatch(text) is not None:
+        return True
+    if _ROMAN_NUMERAL_OCR_CONFUSION_RE.fullmatch(text) is None:
+        return False
+
+    numeral, closer = text[:-1], text[-1]
+    # Only promote when the numeral carries a genuine I/i. The loose pattern's
+    # lookahead already requires one, so this is a defensive re-check.
+    if "I" in numeral or "i" in numeral:
+        numeral = numeral.translate(str.maketrans("lL", "II"))
+    # The promoted token must satisfy the strict (single-case) marker pattern.
+    return strict.fullmatch(numeral + closer) is not None
+
+
+def normalize_roman_marker_text(text: str) -> str:
+    """Return the canonical uppercase form of a Roman-numeral list marker.
+
+    If *text* is a Roman-numeral list marker (possibly with the `l` -> `I`
+    OCR confusion, e.g. `Il.` or `Ill.`), the uppercase marker is returned
+    (`II.`, `III.`). Any other text, including the empty string, is returned
+    unchanged.
+
+    The OCR-confusion pattern is deliberately loose, so the promoted token is
+    re-validated against the strict marker pattern before it is rewritten.
+    Without that check ordinary mixed-case words that happen to consist of
+    Roman-numeral letters would be corrupted (`Mill.` -> `MIII.`,
+    `Civil.` -> `CIVII.`).
+    """
+    if not text:
+        return text
+    if _ROMAN_NUMERAL_MARKER_RE.fullmatch(text):
+        return text.upper()
+    if _ROMAN_NUMERAL_OCR_CONFUSION_RE.fullmatch(text):
+        # Promote the mis-read `l`/`L` to `I`, keeping the closing `.`/`)`.
+        promoted = text[:-1].replace("l", "I").replace("L", "I") + text[-1]
+        # Same acceptance rule as _is_roman_marker: only a token that is a
+        # strict marker after promotion counts as a marker.
+        if _ROMAN_NUMERAL_MARKER_RE.fullmatch(promoted):
+            return promoted.upper()
+    return text
+
+
+def _is_list_marker(text: str, normalized: str) -> bool:
+    """Return True if *text* is a numeric (`1.`, `2)`) or Roman-numeral list marker.
+
+    *text* is the raw token and *normalized* its normalised form, as expected
+    by `_is_number_marker`.
+    """
+    if _is_number_marker(text, normalized):
+        return True
+    return _is_roman_marker(text)
+
+
 def _is_numeric_fragment(text: str, normalized: str) -> bool:
     if not normalized or not normalized.isdigit():
         return False
@@ -109,7 +159,12 @@ def _is_noise_word(word: WordBox) -> bool:
     if not text:
         return True
 
+    if _is_list_marker(text, normalized):
+        return False
+
     if not normalized:
+        if text in _INLINE_PUNCTUATION_KEEP:
+            return False
         return _contains_symbolic_noise(text) or word.width <= 3 or word.height <= 3
 
     if _contains_symbolic_noise(text):
@@ -122,7 +177,7 @@ def _is_noise_word(word: WordBox) -> bool:
             return word.confidence < 80.0 or word.width <= max(4, int(word.height * 0.2))
         return True
 
-    if len(normalized) == 2 and text.isupper() and normalized not in {"ai", "al", "ok"}:
+    if len(normalized) == 2 and text.isupper():
         return normalized not in _ALLOWED_ALL_CAPS_SHORT_WORDS
 
     if len(normalized) == 2 and text != text.lower() and not _is_allowed_short_token(normalized) and normalized not in {"ai", "al", "ok"}:
@@ -152,25 +207,38 @@ def _drop_inline_separator(word: WordBox) -> bool:
     text = word.text.strip()
     normalized = _normalized_text(text)
     if not normalized:
+        if text in _INLINE_PUNCTUATION_KEEP:
+            return False
         return text not in {"-", "—", "–"}
     return "|" in text or "~" in text
 
 
 def _is_edge_noise_word(word: WordBox) -> bool:
-    normalized = _normalized_text(word.text.strip())
+    text = word.text.strip()
+    normalized = _normalized_text(text)
+
+    if _is_list_marker(text, normalized):
+        return False
+
     if not normalized:
         return _drop_inline_separator(word)
 
     if (
         len(normalized) <= 3
-        and word.text.strip() != normalized
-        and not word.text.strip().isalpha()
-        and not _is_number_marker(word.text.strip(), normalized)
-        and not _is_numeric_fragment(word.text.strip(), normalized)
+        and text != normalized
+        and not text.isalpha()
+        and not _is_number_marker(text, normalized)
+        and not _is_numeric_fragment(text, normalized)
+        and not _is_roman_marker(text)
     ):
         return True
 
-    if len(normalized) <= 2 and not _is_allowed_short_token(normalized) and not _is_number_marker(word.text.strip(), normalized) and not _is_numeric_fragment(word.text.strip(), normalized):
+    if (
+        len(normalized) <= 2
+        and not _is_allowed_short_token(normalized)
+        and not _is_number_marker(text, normalized)
+        and not _is_numeric_fragment(text, normalized)
+    ):
         return True
 
     if word.width <= max(4, int(word.height * 0.2)):
@@ -183,8 +251,13 @@ def _is_weak_edge_word(word: WordBox) -> bool:
     text = word.text.strip()
     normalized = _normalized_text(text)
     if not normalized:
+        if text in _INLINE_PUNCTUATION_KEEP:
+            return False
         return True
 
+    if _is_list_marker(text, normalized):
+        return False
+
     if word.confidence < 55.0:
         return True
 
@@ -210,13 +283,94 @@ def _clean_block_words(words: list[WordBox]) -> TextBlock | None:
         kept = kept[1:]
     while len(kept) > 1 and _is_weak_edge_word(kept[-1]):
         kept = kept[:-1]
+    kept = _drop_height_outlier_edges(kept)
     if not kept:
         return None
     if len(kept) == 1 and _is_edge_noise_word(kept[0]):
         return None
+    for word in kept:
+        normalised = normalize_roman_marker_text(word.text.strip())
+        if normalised != word.text:
+            word.text = normalised
     return TextBlock(words=kept)
 
 
+def _drop_height_outlier_edges(words: list[WordBox]) -> list[WordBox]:
+    """Trim stray leading/trailing words that do not belong to the line.
+
+    This catches OCR prefixes/suffixes such as a small `we` glued to the
+    start of an otherwise large title block. Words are removed from the front
+    and then from the back, and only while at least 3 words remain, so short
+    lines are never reduced below two words and lines with fewer than 3 words
+    are returned untouched.
+
+    An edge word is dropped when either rule below applies.
+
+    Height rule:
+      * its normalised text is non-empty and at most 3 characters long,
+      * it is not a list marker (`1.`, `IV.`, `ii)`, ...),
+      * its confidence is < 90, and
+      * its height is < 0.6x the median height of the remaining words.
+
+    Case-mismatch rule:
+      * it is a purely alphabetic, all-lowercase token of at most 3
+        characters that appears in `_COMMON_SHORT_WORDS`,
+      * its confidence is < 90, and
+      * at least `max(2, int(0.6 * n))` of the `n` remaining words are
+        written entirely in uppercase.
+
+    Returns the (possibly shortened) list; the input list is not mutated.
+    """
+    if len(words) < 3:
+        return words
+
+    def median_height(seq: list[WordBox]) -> float:
+        # Upper median for even-sized sequences.
+        heights = sorted(w.height for w in seq)
+        return float(heights[len(heights) // 2])
+
+    def is_height_outlier(word: WordBox, others: list[WordBox]) -> bool:
+        text = word.text.strip()
+        normalized = _normalized_text(text)
+        if not normalized:
+            return False
+        # List markers are protected by every other noise filter in this
+        # module; keep them here too so `1.` / `ii)` are never trimmed just
+        # because their bounding box is shorter than the heading text.
+        if _is_list_marker(text, normalized):
+            return False
+        if len(normalized) > 3 and normalized not in _COMMON_SHORT_WORDS:
+            return False
+        if word.confidence >= 90.0:
+            return False
+        median = median_height(others)
+        if median <= 0:
+            return False
+        return word.height < median * 0.6
+
+    def is_case_mismatch_lead(word: WordBox, others: list[WordBox]) -> bool:
+        """Drop a leading/trailing short lowercase word when the rest of the
+        block is otherwise all-uppercase. This catches stray prefixes like
+        `we UNDERSTANDING FROG MATING HABITS:` where Tesseract glued a tiny
+        decorative scrap onto the front of a heading.
+        """
+        text = word.text.strip()
+        normalized = _normalized_text(text)
+        if not normalized:
+            return False
+        # Only consider short common-word tokens — never a long word, never
+        # something with digits.
+        if len(normalized) > 3:
+            return False
+        if normalized not in _COMMON_SHORT_WORDS:
+            return False
+        if not text.isalpha() or text != text.lower():
+            return False
+        if word.confidence >= 90.0:
+            return False
+        # The remaining words must be predominantly uppercase, so a lowercase
+        # leader genuinely doesn't fit.
+        upper_words = [w for w in others if any(c.isupper() for c in w.text) and w.text == w.text.upper()]
+        if len(upper_words) < max(2, int(len(others) * 0.6)):
+            return False
+        return True
+
+    # Trim from front
+    while len(words) >= 3 and (
+        is_height_outlier(words[0], words[1:])
+        or is_case_mismatch_lead(words[0], words[1:])
+    ):
+        words = words[1:]
+    # Trim from back
+    while len(words) >= 3 and (
+        is_height_outlier(words[-1], words[:-1])
+        or is_case_mismatch_lead(words[-1], words[:-1])
+    ):
+        words = words[:-1]
+    return words
+
+
 def _drop_overlapping_noise_words(words: list[WordBox]) -> list[WordBox]:
     kept: list[WordBox] = []
 
@@ -270,14 +424,23 @@ def _filter_noise(blocks: list[TextBlock]) -> list[TextBlock]:
         # Skip single-character blocks that aren't alphanumeric
         if len(text) <= 1 and not text.isalnum():
             continue
-        # Skip blocks that are only punctuation/whitespace/underscores
-        if _NOISE_RE.match(text):
+        # Single-token blocks that hold only inline punctuation we want to
+        # keep (e.g. a standalone `&` or `+`) survive even though _NOISE_RE
+        # would otherwise drop them.
+        if not _NOISE_RE.match(text):
+            pass
+        elif text in _INLINE_PUNCTUATION_KEEP:
+            pass
+        else:
             continue
         # Skip blocks where estimated font is unreasonably large (graphic artifacts)
         if b.estimated_font_size_px > 100:
             continue
         if len(b.words) == 1:
             word = b.words[0]
+            if _is_list_marker(text, normalized):
+                filtered.append(b)
+                continue
             if avg_conf < 60.0 and (len(normalized) <= 6 or text != normalized):
                 continue
             if (
 
@@ -985,12 +985,41 @@ def _select_cluster_candidate(cluster: list[tuple[int, WordBox]]) -> WordBox | N
         ),
     )
 
+    # When multiple same-text candidates share a cluster but one's bounding
+    # box is dramatically taller than the others, prefer the smaller one.
+    # Tesseract sometimes returns oversized bboxes that bleed into adjacent
+    # graphics; the smaller candidates are usually the correct visual region.
+    chosen = _prefer_smaller_same_text_candidate(chosen, best_group)
+
     if len({pass_id for pass_id, _word in best_group}) == 1 and _is_suspicious_candidate(chosen):
         return None
 
     return chosen
 
 
+def _prefer_smaller_same_text_candidate(
+    chosen: WordBox,
+    group: list[tuple[int, WordBox]],
+) -> WordBox:
+    """Swap an oversized *chosen* box for a normally sized one from *group*.
+
+    *group* holds `(pass_id, word)` pairs competing for the same slot. When
+    *chosen* is more than 1.6x taller than the group's median height, the
+    replacement is the most confident candidate (ties broken by larger area)
+    whose height is at most 1.4x the median and whose confidence is no more
+    than 12 points below *chosen*. If *chosen* is not oversized, or no such
+    candidate exists, *chosen* is returned unchanged.
+
+    The lower median is used for even-sized groups. With the upper median a
+    two-candidate group would take the taller box as its own reference, so an
+    oversized *chosen* could never be detected in a pair.
+    """
+    if len(group) < 2:
+        return chosen
+
+    heights = sorted(word.height for _pid, word in group)
+    # Lower median: index (n - 1) // 2 equals n // 2 for odd n and picks the
+    # smaller of the two middle values for even n.
+    median_h = heights[(len(heights) - 1) // 2]
+    if median_h <= 0 or chosen.height <= median_h * 1.6:
+        return chosen
+
+    candidates = [
+        word for _pid, word in group
+        if word.height <= median_h * 1.4
+        and word.confidence >= chosen.confidence - 12.0
+    ]
+    if not candidates:
+        return chosen
+
+    return max(candidates, key=lambda word: (word.confidence, word.width * word.height))
+
+
 def _candidate_group_score(group: list[tuple[int, WordBox]]) -> float:
     pass_count = len({pass_id for pass_id, _word in group})
     confidences = [word.confidence for _pass_id, word in group]