Skip to content

Commit d92bc11

Browse files
author
carlosacchi
committed
Merge feature/v0.16-fidelity-fixes: Fidelity-first capture and post-processing
1 parent: 81810bf · commit: d92bc11

4 files changed

Lines changed: 22 additions & 28 deletions

File tree

captiocr/config/constants.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@
2525

2626
# Capture Configuration
2727
DEFAULT_MIN_CAPTURE_INTERVAL = 3.0 # seconds
28-
DEFAULT_MAX_CAPTURE_INTERVAL = 4.0 # seconds (recall-first: tighter sampling)
28+
DEFAULT_MAX_CAPTURE_INTERVAL = 6.0 # seconds
2929
MAX_SIMILAR_CAPTURES = 1
30-
TEXT_SIMILARITY_THRESHOLD = 0.85 # Raw capture: recall-first (higher = less aggressive dedup)
30+
TEXT_SIMILARITY_THRESHOLD = 0.80 # Fidelity-first: match v0.12.3 default
3131
MIN_TEXT_LENGTH = 10 # Used only in post-processing delta extraction
3232
MIN_CAPTURE_AREA_SIZE = 70 # pixels
3333

@@ -38,8 +38,8 @@
3838
DEFAULT_INCREMENTAL_THRESHOLD = 0.7 # Overlap ratio for incremental detection (0.7 = 70%)
3939

4040
# Post-Processing Configuration (recall-first pipeline for processed files)
41-
POST_PROCESS_DEDUP_ENTER_THRESHOLD = 0.75 # Enter dedup mode when similarity >= this
42-
POST_PROCESS_DEDUP_EXIT_THRESHOLD = 0.60 # Exit dedup mode when similarity <= this
41+
POST_PROCESS_DEDUP_ENTER_THRESHOLD = 0.82 # Enter dedup mode when similarity >= this
42+
POST_PROCESS_DEDUP_EXIT_THRESHOLD = 0.55 # Exit dedup mode when similarity <= this
4343
POST_PROCESS_MIN_LENGTH_RATIO = 0.60 # No-downgrade rule: skip if new < this * previous length
4444
POST_PROCESS_MIN_NEW_WORDS = 3 # No-downgrade rule: minimum new words to accept a shorter frame
4545
POST_PROCESS_FRAME_CONSENSUS_WINDOW = 3 # Emit only when content appears in N-1 of N frames

captiocr/core/capture.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -252,35 +252,36 @@ def _capture_loop(self, language: str, caption_mode: bool) -> None:
252252
self.logger.debug(f"Raw OCR result (length: {len(raw_text)}): '{raw_text[:100]}...'")
253253
metrics['total_ocr_frames'] += 1
254254

255-
# Clean text (control chars/whitespace only — no semantic filtering)
256-
cleaned_text = self.text_processor.clean_text_raw(raw_text)
255+
# Dual-path: raw for file output, normalized for comparison
256+
raw_cleaned = self.text_processor.clean_text_raw(raw_text)
257+
normalized = self.text_processor.clean_text(raw_text)
257258

258-
if not cleaned_text or not cleaned_text.strip():
259+
if not raw_cleaned or not raw_cleaned.strip():
259260
metrics['dropped_empty'] += 1
260261
continue
261262

262263
# Skip UI overlay artifacts (Press ESC, Click and drag, etc.)
263-
if self.text_processor._is_ui_artifact(cleaned_text):
264+
if self.text_processor._is_ui_artifact(raw_cleaned):
264265
self.logger.debug("Skipped UI artifact frame")
265266
metrics['dropped_ui_artifact'] += 1
266267
continue
267268

268-
# Write full text if meaningfully different from last capture
269-
if self.text_processor.has_significant_new_content(cleaned_text, last_text):
269+
# Write full raw text if normalized version is meaningfully different
270+
if self.text_processor.has_significant_new_content(normalized, last_text):
270271
with open(self.output_file_path, 'a', encoding='utf-8') as f:
271272
timestamp = datetime.now().strftime('%H:%M:%S')
272-
f.write(f"[{timestamp}] {cleaned_text}\n")
273+
f.write(f"[{timestamp}] {raw_cleaned}\n")
273274

274-
last_text = cleaned_text
275-
self.text_history.append(cleaned_text)
275+
last_text = normalized
276+
self.text_history.append(raw_cleaned)
276277
similar_captures_count = 0
277278
capture_interval = self.capture_config.reset_interval()
278279
metrics['written_frames'] += 1
279280

280281
if self.on_text_captured:
281-
self.on_text_captured(cleaned_text)
282+
self.on_text_captured(raw_cleaned)
282283

283-
self.logger.debug(f"Captured: {cleaned_text[:80]}...")
284+
self.logger.debug(f"Captured: {raw_cleaned[:80]}...")
284285
else:
285286
# No significant change — slow down polling if screen is static
286287
similar_captures_count += 1

captiocr/core/text_processor.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,10 @@ def _is_gibberish_token(word: str) -> bool:
435435
if len(word) < 4:
436436
return False
437437

438+
# All-uppercase short words are almost always acronyms, not gibberish
439+
if word.isupper() and len(word) <= 6:
440+
return False
441+
438442
letters_only = ''.join(c for c in word if c.isalpha())
439443
if not letters_only:
440444
return False
@@ -504,14 +508,6 @@ def _clean_ocr_artifacts(self, text: str) -> str:
504508
for word in words:
505509
# Strip punctuation for checking but keep original if valid
506510
stripped = word.strip('.,!?;:()[]{}')
507-
# Remove isolated uppercase 2-3 letter OCR noise using repetition-based detection
508-
# Protect common valid words: I, OK, IT, IS, OR, AN, AM, AT, IF, OF, ON, US, NO
509-
if stripped and 2 <= len(stripped) <= 3 and stripped.isupper():
510-
letters = stripped.lower()
511-
has_vowel = any(c in 'aeiou' for c in letters)
512-
is_repeated = len(set(letters)) == 1
513-
if is_repeated or not has_vowel:
514-
continue
515511
if self._is_gibberish_token(stripped):
516512
continue
517513
cleaned_words.append(word)
@@ -520,9 +516,6 @@ def _clean_ocr_artifacts(self, text: str) -> str:
520516
# Remove stray percent signs not next to numbers
521517
text = re.sub(r'(?<!\d)\s*%\s*(?!\d)', ' ', text)
522518

523-
# Remove isolated single characters that aren't common words (I, a, A)
524-
text = re.sub(r'(?<!\w)(?![IaA])[a-zA-Z](?!\w)', '', text)
525-
526519
# Normalize whitespace
527520
text = re.sub(r'\s+', ' ', text).strip()
528521

version.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
0.15.4
2-
24/02/2026
1+
0.16.0
2+
25/02/2026
33
CaptiOCR
44
Author: Carlo Sacchi
55
Website: https://www.captiocr.com

0 commit comments

Comments
 (0)