Skip to content

Commit feb800d

Browse files
author
carlosacchi
committed
v0.16.1: End-of-stream flush for trailing captured frames
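Add end-of-stream flush logic to the fidelity pipeline so that trailing frames still in the buffer at the end of capture are emitted rather than silently dropped. The flush runs only when at least two frames remain buffered; the longest buffered frame is used as the flush candidate and is stamped with the last buffered timestamp. The flush candidate goes through the same no-downgrade, dedup, overlap, and sentence-split steps (4-7) as regular consensus blocks, ensuring consistent output quality.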
1 parent d92bc11 commit feb800d

2 files changed

Lines changed: 56 additions & 2 deletions

File tree

captiocr/core/text_processor.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,60 @@ def filter_duplicate_blocks_aggressive(self, text_blocks: List[Tuple[str, str]],
913913
prev_emitted_text = consensus
914914
stats['chunks_emitted'] += 1
915915

916+
# End-of-stream flush: when at least two trailing frames never reached consensus, emit the longest one
917+
if len(frame_buffer) >= 2:
918+
flush_text = max(frame_buffer, key=len)
919+
flush_ts = ts_buffer[-1]
920+
emit_flush = True
921+
922+
# Apply steps 4-7 to the flush candidate
923+
if prev_emitted_text:
924+
# Step 4: No-downgrade rule
925+
length_ratio = len(flush_text) / max(len(prev_emitted_text), 1)
926+
if length_ratio < min_length_ratio:
927+
prev_words = set(prev_emitted_text.lower().split())
928+
new_words_set = set(flush_text.lower().split()) - prev_words
929+
if len(new_words_set) < min_new_words:
930+
emit_flush = False
931+
932+
# Step 5: Hysteresis dedup
933+
if emit_flush:
934+
similarity = self.calculate_similarity(flush_text, prev_emitted_text)
935+
if similarity >= dedup_enter or (in_dedup_mode and similarity > dedup_exit):
936+
emit_flush = False
937+
938+
# Step 6: Prefix/suffix overlap dedup
939+
if emit_flush:
940+
prefix_end, suffix_start = self._find_overlap_boundary(prev_emitted_text, flush_text)
941+
original_words = flush_text.split()
942+
novel_words = original_words[prefix_end:suffix_start]
943+
if novel_words:
944+
flush_text = ' '.join(novel_words)
945+
if prefix_end > 0 or suffix_start < len(original_words):
946+
stats['merges_performed'] += 1
947+
else:
948+
emit_flush = False
949+
950+
# Step 7: Sentence splitting
951+
if emit_flush:
952+
sentences = self._split_into_sentences(flush_text, min_sentence_words)
953+
if not sentences:
954+
if len(flush_text.split()) >= min_sentence_words:
955+
sentences = [flush_text]
956+
else:
957+
emit_flush = False
958+
959+
if emit_flush:
960+
combined = '. '.join(sentences)
961+
if combined and not combined.endswith(('.', '!', '?')):
962+
combined += '.'
963+
result_blocks.append((flush_ts, combined))
964+
stats['chunks_emitted'] += 1
965+
self.logger.debug(
966+
f"End-of-stream flush: emitted trailing frame at {flush_ts} "
967+
f"({len(frame_buffer)} frames in buffer)"
968+
)
969+
916970
# Detect possible drops: gaps > 30s with dissimilar content
917971
for i in range(1, len(result_blocks)):
918972
prev_ts = result_blocks[i - 1][0]

version.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
0.16.0
2-
25/02/2026
1+
0.16.1
2+
03/03/2026
33
CaptiOCR
44
Author: Carlo Sacchi
55
Website: https://www.captiocr.com

0 commit comments

Comments
 (0)