Merge feature/v0.16-fidelity-fixes: Fidelity-first capture and post-processing

carlosacchi · carlosacchi · commit d92bc11918a6 · 2026-02-25T15:07:36.000Z
diff --git a/captiocr/config/constants.py b/captiocr/config/constants.py
@@ -25,9 +25,9 @@
 
 # Capture Configuration
 DEFAULT_MIN_CAPTURE_INTERVAL = 3.0  # seconds
-DEFAULT_MAX_CAPTURE_INTERVAL = 4.0  # seconds (recall-first: tighter sampling)
+DEFAULT_MAX_CAPTURE_INTERVAL = 6.0  # seconds
 MAX_SIMILAR_CAPTURES = 1
-TEXT_SIMILARITY_THRESHOLD = 0.85  # Raw capture: recall-first (higher = less aggressive dedup)
+TEXT_SIMILARITY_THRESHOLD = 0.80  # Fidelity-first: match v0.12.3 default
 MIN_TEXT_LENGTH = 10  # Used only in post-processing delta extraction
 MIN_CAPTURE_AREA_SIZE = 70  # pixels
 
@@ -38,8 +38,8 @@
 DEFAULT_INCREMENTAL_THRESHOLD = 0.7  # Percentage overlap for incremental detection (70%)
 
 # Post-Processing Configuration (recall-first pipeline for processed files)
-POST_PROCESS_DEDUP_ENTER_THRESHOLD = 0.75  # Enter dedup mode when similarity >= this
-POST_PROCESS_DEDUP_EXIT_THRESHOLD = 0.60  # Exit dedup mode when similarity <= this
+POST_PROCESS_DEDUP_ENTER_THRESHOLD = 0.82  # Enter dedup mode when similarity >= this
+POST_PROCESS_DEDUP_EXIT_THRESHOLD = 0.55  # Exit dedup mode when similarity <= this
 POST_PROCESS_MIN_LENGTH_RATIO = 0.60  # No-downgrade rule: skip if new < this * previous length
 POST_PROCESS_MIN_NEW_WORDS = 3  # No-downgrade rule: minimum new words to accept a shorter frame
 POST_PROCESS_FRAME_CONSENSUS_WINDOW = 3  # Emit only when content appears in N-1 of N frames
diff --git a/captiocr/core/capture.py b/captiocr/core/capture.py
@@ -252,35 +252,36 @@ def _capture_loop(self, language: str, caption_mode: bool) -> None:
                     self.logger.debug(f"Raw OCR result (length: {len(raw_text)}): '{raw_text[:100]}...'")
                     metrics['total_ocr_frames'] += 1
 
-                    # Clean text (control chars/whitespace only — no semantic filtering)
-                    cleaned_text = self.text_processor.clean_text_raw(raw_text)
+                    # Dual-path: raw for file output, normalized for comparison
+                    raw_cleaned = self.text_processor.clean_text_raw(raw_text)
+                    normalized = self.text_processor.clean_text(raw_text)
 
-                    if not cleaned_text or not cleaned_text.strip():
+                    if not raw_cleaned or not raw_cleaned.strip():
                         metrics['dropped_empty'] += 1
                         continue
 
                     # Skip UI overlay artifacts (Press ESC, Click and drag, etc.)
-                    if self.text_processor._is_ui_artifact(cleaned_text):
+                    if self.text_processor._is_ui_artifact(raw_cleaned):
                         self.logger.debug("Skipped UI artifact frame")
                         metrics['dropped_ui_artifact'] += 1
                         continue
 
-                    # Write full text if meaningfully different from last capture
-                    if self.text_processor.has_significant_new_content(cleaned_text, last_text):
+                    # Write full raw text if the normalized text is meaningfully different from the last capture
+                    if self.text_processor.has_significant_new_content(normalized, last_text):
                         with open(self.output_file_path, 'a', encoding='utf-8') as f:
                             timestamp = datetime.now().strftime('%H:%M:%S')
-                            f.write(f"[{timestamp}] {cleaned_text}\n")
+                            f.write(f"[{timestamp}] {raw_cleaned}\n")
 
-                        last_text = cleaned_text
-                        self.text_history.append(cleaned_text)
+                        last_text = normalized
+                        self.text_history.append(raw_cleaned)
                         similar_captures_count = 0
                         capture_interval = self.capture_config.reset_interval()
                         metrics['written_frames'] += 1
 
                         if self.on_text_captured:
-                            self.on_text_captured(cleaned_text)
+                            self.on_text_captured(raw_cleaned)
 
-                        self.logger.debug(f"Captured: {cleaned_text[:80]}...")
+                        self.logger.debug(f"Captured: {raw_cleaned[:80]}...")
                     else:
                         # No significant change — slow down polling if screen is static
                         similar_captures_count += 1
diff --git a/captiocr/core/text_processor.py b/captiocr/core/text_processor.py
@@ -435,6 +435,10 @@ def _is_gibberish_token(word: str) -> bool:
         if len(word) < 4:
             return False
 
+        # All-uppercase words of 4-6 characters are almost always acronyms, not gibberish
+        if word.isupper() and len(word) <= 6:
+            return False
+
         letters_only = ''.join(c for c in word if c.isalpha())
         if not letters_only:
             return False
@@ -504,14 +508,6 @@ def _clean_ocr_artifacts(self, text: str) -> str:
         for word in words:
             # Strip punctuation for checking but keep original if valid
             stripped = word.strip('.,!?;:()[]{}')
-            # Remove isolated uppercase 2-3 letter OCR noise using repetition-based detection
-            # Protect common valid words: I, OK, IT, IS, OR, AN, AM, AT, IF, OF, ON, US, NO
-            if stripped and 2 <= len(stripped) <= 3 and stripped.isupper():
-                letters = stripped.lower()
-                has_vowel = any(c in 'aeiou' for c in letters)
-                is_repeated = len(set(letters)) == 1
-                if is_repeated or not has_vowel:
-                    continue
             if self._is_gibberish_token(stripped):
                 continue
             cleaned_words.append(word)
@@ -520,9 +516,6 @@ def _clean_ocr_artifacts(self, text: str) -> str:
         # Remove stray percent signs not next to numbers
         text = re.sub(r'(?<!\d)\s*%\s*(?!\d)', ' ', text)
 
-        # Remove isolated single characters that aren't common words (I, a, A)
-        text = re.sub(r'(?<!\w)(?![IaA])[a-zA-Z](?!\w)', '', text)
-
         # Normalize whitespace
         text = re.sub(r'\s+', ' ', text).strip()
 
diff --git a/version.txt b/version.txt
@@ -1,5 +1,5 @@
-0.15.4
-24/02/2026
+0.16.0
+25/02/2026
 CaptiOCR
 Author: Carlo Sacchi
 Website: https://www.captiocr.com