@@ -61,14 +61,27 @@ def group_into_blocks(
6161
6262_NOISE_RE = re .compile (r'^[\s_\-—–;:.,!?\'"()|\[\]{}]+$' )
6363_SYMBOLIC_CHARS = set ("|[]{}\\ /@~" )
64+ _INLINE_PUNCTUATION_KEEP = {"&" , "+" , "%" }
6465_COMMON_SHORT_WORDS = {
6566 "a" , "ai" , "al" , "an" , "as" , "at" , "be" , "by" , "do" , "go" , "he" ,
6667 "i" , "if" , "in" , "is" , "it" , "ml" , "no" , "of" , "ok" , "on" , "or" ,
6768 "so" , "to" , "ui" , "up" , "us" , "ux" , "we" ,
6869}
6970_COMMON_SHORT_UNITS = {"cm" , "db" , "ft" , "gb" , "hz" , "kb" , "kg" , "km" , "mb" , "mhz" , "mm" , "ms" , "tb" }
70- _ALLOWED_ALL_CAPS_SHORT_WORDS = {"ai" , "al" , "km" , "ml" , "no" , "of" , "ok" , "ui" , "us" , "ux" }
71+ # When a 2-letter word is in ALL CAPS, it's much more likely to be
72+ # legitimate (a heading word) than noise. The allowlist mirrors common
73+ # real English/abbreviation tokens that show up in headings.
74+ _ALLOWED_ALL_CAPS_SHORT_WORDS = {
75+ "ai" , "al" , "an" , "as" , "at" , "be" , "by" , "do" , "go" , "he" , "if" ,
76+ "in" , "is" , "it" , "km" , "ml" , "no" , "of" , "ok" , "on" , "or" , "so" ,
77+ "to" , "ui" , "us" , "ux" , "we" ,
78+ }
7179_SHORT_STANDALONE_ALLOWLIST = {"avg" , "data" , "high" , "low" , "time" }
80+ _ROMAN_NUMERAL_MARKER_RE = re .compile (r"^(?:[IVXLCM]{1,5}|[ivxlcm]{1,5})[.)]$" )
81+ # Tesseract regularly mis-reads `II.`/`III.`/`IIV.` etc. as `Il.`/`Ill.`/`IlV.`
82+ # (capital I followed by lowercase L). Recognise the body if every character
83+ # is a Roman numeral OR a lowercase `l`, and the trailing punctuation is `.`/`)`.
84+ _ROMAN_NUMERAL_OCR_CONFUSION_RE = re .compile (r"^(?=.*[Ii])[IiVvXxLlCcMm]{1,5}[.)]$" )
7285
7386
7487def _normalized_text (text : str ) -> str :
@@ -80,6 +93,43 @@ def _is_number_marker(text: str, normalized: str) -> bool:
8093 return bool (normalized ) and normalized .isdigit () and bool (re .fullmatch (r"\d+[.)]" , text ))
8194
8295
def _is_roman_marker(text: str) -> bool:
    """Return True when *text* looks like a Roman-numeral list marker.

    A marker is one to five Roman-numeral letters, all uppercase or all
    lowercase, followed by `.` or `)` (e.g. `I.`, `IV.`, `iii)`). The letters
    are not checked for forming a well-formed numeral.

    Also accepts common Tesseract OCR confusions where lowercase `l` was read
    instead of `I` (e.g. `Il.` -> `II.`, `Ill.` -> `III.`).
    """
    if _ROMAN_NUMERAL_MARKER_RE.fullmatch(text):
        return True
    # The confusion pattern only matches when the body contains at least one
    # I/i and ends in `.`/`)`, so `text` is non-empty past this point and no
    # separate "contains I" check is needed.
    if not _ROMAN_NUMERAL_OCR_CONFUSION_RE.fullmatch(text):
        return False
    # Undo the confusion (lowercase l -> I) and re-validate strictly. Uppercase
    # `L` is a numeral in its own right and is left alone; any other lowercase
    # letter keeps the body mixed-case, which the strict pattern rejects.
    promoted = text[:-1].replace("l", "I") + text[-1]
    return bool(_ROMAN_NUMERAL_MARKER_RE.fullmatch(promoted))
112+
113+
def normalize_roman_marker_text(text: str) -> str:
    """Return the normalised uppercase form of a Roman-numeral list marker.

    If *text* is a Roman-numeral list marker (possibly with `l`->`I` OCR
    confusion, e.g. `Il.` or `Ill.`), return it uppercased with the confused
    letters repaired (`II.`, `III.`). Otherwise return *text* unchanged.

    Only lowercase `l` is promoted to `I`; uppercase `L` is the numeral 50 and
    is preserved. The repaired text must itself pass the strict marker
    pattern, so mixed-case ordinary words that merely consist of
    Roman-numeral letters (`Civil.`, `Mill.`, `Mix.`) are returned untouched
    instead of being rewritten.
    """
    if not text:
        return text
    if _ROMAN_NUMERAL_MARKER_RE.fullmatch(text):
        return text.upper()
    if not _ROMAN_NUMERAL_OCR_CONFUSION_RE.fullmatch(text):
        return text
    # Repair the OCR confusion, then apply the same strict re-validation that
    # `_is_roman_marker` uses so both functions agree on what a marker is.
    promoted = text[:-1].replace("l", "I") + text[-1]
    if _ROMAN_NUMERAL_MARKER_RE.fullmatch(promoted):
        # A strict match that contains the promoted `I` is all-uppercase
        # already; upper() is kept only to make the contract explicit.
        return promoted[:-1].upper() + promoted[-1]
    return text
127+
128+
def _is_list_marker(text: str, normalized: str) -> bool:
    """Return True when *text* is a numeric or Roman-numeral list marker.

    *normalized* is the normalised form of *text*; it is only forwarded to
    the numeric-marker check.
    """
    if _is_number_marker(text, normalized):
        return True
    return _is_roman_marker(text)
131+
132+
83133def _is_numeric_fragment (text : str , normalized : str ) -> bool :
84134 if not normalized or not normalized .isdigit ():
85135 return False
@@ -109,7 +159,12 @@ def _is_noise_word(word: WordBox) -> bool:
109159 if not text :
110160 return True
111161
162+ if _is_list_marker (text , normalized ):
163+ return False
164+
112165 if not normalized :
166+ if text in _INLINE_PUNCTUATION_KEEP :
167+ return False
113168 return _contains_symbolic_noise (text ) or word .width <= 3 or word .height <= 3
114169
115170 if _contains_symbolic_noise (text ):
@@ -122,7 +177,7 @@ def _is_noise_word(word: WordBox) -> bool:
122177 return word .confidence < 80.0 or word .width <= max (4 , int (word .height * 0.2 ))
123178 return True
124179
125- if len (normalized ) == 2 and text .isupper () and normalized not in { "ai" , "al" , "ok" } :
180+ if len (normalized ) == 2 and text .isupper ():
126181 return normalized not in _ALLOWED_ALL_CAPS_SHORT_WORDS
127182
128183 if len (normalized ) == 2 and text != text .lower () and not _is_allowed_short_token (normalized ) and normalized not in {"ai" , "al" , "ok" }:
@@ -152,25 +207,38 @@ def _drop_inline_separator(word: WordBox) -> bool:
152207 text = word .text .strip ()
153208 normalized = _normalized_text (text )
154209 if not normalized :
210+ if text in _INLINE_PUNCTUATION_KEEP :
211+ return False
155212 return text not in {"-" , "—" , "–" }
156213 return "|" in text or "~" in text
157214
158215
159216def _is_edge_noise_word (word : WordBox ) -> bool :
160- normalized = _normalized_text (word .text .strip ())
217+ text = word .text .strip ()
218+ normalized = _normalized_text (text )
219+
220+ if _is_list_marker (text , normalized ):
221+ return False
222+
161223 if not normalized :
162224 return _drop_inline_separator (word )
163225
164226 if (
165227 len (normalized ) <= 3
166- and word .text .strip () != normalized
167- and not word .text .strip ().isalpha ()
168- and not _is_number_marker (word .text .strip (), normalized )
169- and not _is_numeric_fragment (word .text .strip (), normalized )
228+ and text != normalized
229+ and not text .isalpha ()
230+ and not _is_number_marker (text , normalized )
231+ and not _is_numeric_fragment (text , normalized )
232+ and not _is_roman_marker (text )
170233 ):
171234 return True
172235
173- if len (normalized ) <= 2 and not _is_allowed_short_token (normalized ) and not _is_number_marker (word .text .strip (), normalized ) and not _is_numeric_fragment (word .text .strip (), normalized ):
236+ if (
237+ len (normalized ) <= 2
238+ and not _is_allowed_short_token (normalized )
239+ and not _is_number_marker (text , normalized )
240+ and not _is_numeric_fragment (text , normalized )
241+ ):
174242 return True
175243
176244 if word .width <= max (4 , int (word .height * 0.2 )):
@@ -183,8 +251,13 @@ def _is_weak_edge_word(word: WordBox) -> bool:
183251 text = word .text .strip ()
184252 normalized = _normalized_text (text )
185253 if not normalized :
254+ if text in _INLINE_PUNCTUATION_KEEP :
255+ return False
186256 return True
187257
258+ if _is_list_marker (text , normalized ):
259+ return False
260+
188261 if word .confidence < 55.0 :
189262 return True
190263
@@ -210,13 +283,94 @@ def _clean_block_words(words: list[WordBox]) -> TextBlock | None:
210283 kept = kept [1 :]
211284 while len (kept ) > 1 and _is_weak_edge_word (kept [- 1 ]):
212285 kept = kept [:- 1 ]
286+ kept = _drop_height_outlier_edges (kept )
213287 if not kept :
214288 return None
215289 if len (kept ) == 1 and _is_edge_noise_word (kept [0 ]):
216290 return None
291+ for word in kept :
292+ normalised = normalize_roman_marker_text (word .text .strip ())
293+ if normalised != word .text :
294+ word .text = normalised
217295 return TextBlock (words = kept )
218296
219297
def _drop_height_outlier_edges(words: list[WordBox]) -> list[WordBox]:
    """Trim stray leading/trailing words that do not belong to the line.

    This catches OCR prefixes/suffixes such as a small `we` glued to the start
    of an otherwise large, all-caps title block. An edge word is removed when
    it is either a height outlier or a case-mismatched lead (see the nested
    helpers). Trimming stops as soon as fewer than 3 words remain, so at
    least two words always survive and lines shorter than 3 words are
    returned as-is.

    A height outlier must satisfy all of:
      * it is not a list marker (`1.`, `IV.`, ...),
      * its normalised text is a single character or a common short word
        (`we`, `of`, ...) — never a longer or uncommon token,
      * its confidence is < 90, and
      * its height is < 0.6x the median height of the other words.

    Returns the (possibly shortened) list; the input list is not mutated.
    """
    if len(words) < 3:
        return words

    def median_height(seq: list[WordBox]) -> float:
        # Upper-middle element for even-length input; good enough as a
        # robust "typical height" reference.
        heights = sorted(w.height for w in seq)
        return float(heights[len(heights) // 2])

    def is_height_outlier(word: WordBox, others: list[WordBox]) -> bool:
        text = word.text.strip()
        normalized = _normalized_text(text)
        if not normalized:
            return False
        # List markers are protected by the other edge filters; keep this
        # pass consistent so a small leading `1.` / `I.` is never trimmed.
        if _is_list_marker(text, normalized):
            return False
        # Only single characters or common short words are candidates. The
        # previous `len > 3 and not in ...` test let every token of up to
        # three characters through, contradicting the documented contract.
        if len(normalized) > 1 and normalized not in _COMMON_SHORT_WORDS:
            return False
        if word.confidence >= 90.0:
            return False
        median = median_height(others)
        if median <= 0:
            return False
        return word.height < median * 0.6

    def is_case_mismatch_lead(word: WordBox, others: list[WordBox]) -> bool:
        """Flag a short lowercase edge word when the rest is all-uppercase.

        This catches stray prefixes like
        `we UNDERSTANDING FROG MATING HABITS:` where Tesseract glued a tiny
        decorative scrap onto the front of a heading.
        """
        text = word.text.strip()
        normalized = _normalized_text(text)
        if not normalized:
            return False
        # Only consider short common-word tokens — never a long word, never
        # something with digits or punctuation.
        if len(normalized) > 3 or normalized not in _COMMON_SHORT_WORDS:
            return False
        if not text.isalpha() or text != text.lower():
            return False
        if word.confidence >= 90.0:
            return False
        # The remaining words must be predominantly uppercase (at least two
        # of them, and at least 60%), so a lowercase edge word genuinely
        # doesn't fit.
        upper_words = [
            w for w in others
            if any(c.isupper() for c in w.text) and w.text == w.text.upper()
        ]
        return len(upper_words) >= max(2, int(len(others) * 0.6))

    # Trim from the front.
    while len(words) >= 3 and (
        is_height_outlier(words[0], words[1:])
        or is_case_mismatch_lead(words[0], words[1:])
    ):
        words = words[1:]
    # Trim from the back.
    while len(words) >= 3 and (
        is_height_outlier(words[-1], words[:-1])
        or is_case_mismatch_lead(words[-1], words[:-1])
    ):
        words = words[:-1]
    return words
372+
373+
220374def _drop_overlapping_noise_words (words : list [WordBox ]) -> list [WordBox ]:
221375 kept : list [WordBox ] = []
222376
@@ -270,14 +424,23 @@ def _filter_noise(blocks: list[TextBlock]) -> list[TextBlock]:
270424 # Skip single-character blocks that aren't alphanumeric
271425 if len (text ) <= 1 and not text .isalnum ():
272426 continue
273- # Skip blocks that are only punctuation/whitespace/underscores
274- if _NOISE_RE .match (text ):
427+ # Single-token blocks that hold only inline punctuation we want to
428+ # keep (e.g. a standalone `&` or `+`) survive even though _NOISE_RE
429+ # would otherwise drop them.
430+ if not _NOISE_RE .match (text ):
431+ pass
432+ elif text in _INLINE_PUNCTUATION_KEEP :
433+ pass
434+ else :
275435 continue
276436 # Skip blocks where estimated font is unreasonably large (graphic artifacts)
277437 if b .estimated_font_size_px > 100 :
278438 continue
279439 if len (b .words ) == 1 :
280440 word = b .words [0 ]
441+ if _is_list_marker (text , normalized ):
442+ filtered .append (b )
443+ continue
281444 if avg_conf < 60.0 and (len (normalized ) <= 6 or text != normalized ):
282445 continue
283446 if (
0 commit comments