1+ import hashlib
12import json
23import re
34import shutil
@@ -60,6 +61,9 @@ def is_zip_payload(chunk: bytes) -> bool:
6061
6162_HASH_MATCH_PREFIX = "hashmatch__"
6263
64+ # marks a subtitle that was already placed next to the video and later displaced, so it is not re-selected
65+ _TESTED_MARKER = "_tested"
66+
6367
6468def _cloudflare_block_reason (response : "CurlResponse" ) -> str :
6569 headers = response .headers
@@ -158,6 +162,24 @@ def _archive_listing(archive: zipfile.ZipFile) -> str:
158162 return "\n " .join (f" { info .filename } ({ info .file_size } bytes)" for info in archive .infolist ())
159163
160164
def content_hash(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a hexadecimal string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
167+
168+
def file_content_hash(path: Path) -> str:
    """Return the content hash of the file at ``path``.

    The file is read fully into memory and passed to ``content_hash``.
    """
    file_bytes = path.read_bytes()
    return content_hash(file_bytes)
171+
172+
def _resolve_extraction_target(destination: Path, filename: str, payload: bytes) -> Path | None:
    """Decide where ``payload`` should be written inside ``destination``.

    Only the final path component of ``filename`` is used for the target name.

    Returns:
        The plain target path when nothing exists there yet; ``None`` when the
        existing file has the same content hash as ``payload`` (nothing to write);
        otherwise the path produced by ``_next_available_path`` for the target's
        stem and suffix.
    """
    candidate = destination / Path(filename).name
    if candidate.exists():
        if file_content_hash(candidate) != content_hash(payload):
            # same name, different content: keep both by picking another path
            return _next_available_path(destination, candidate.stem, candidate.suffix)
        capture(f"Skipping {candidate.name} (identical copy already extracted)", level=LogLevel.DEBUG)
        return None
    return candidate
181+
182+
161183def _safe_extract_archive (archive : zipfile .ZipFile , dst : Path , hash_match : bool = False ) -> int :
162184 members = archive .infolist ()
163185 total_uncompressed = sum (info .file_size for info in members )
@@ -180,12 +202,15 @@ def _safe_extract_archive(archive: zipfile.ZipFile, dst: Path, hash_match: bool
180202 if member_path .suffix .lower () not in _SUBTITLE_EXTENSIONS :
181203 capture (f"Ignoring { member .filename } (not a subtitle file)" , level = LogLevel .DEBUG )
182204 continue
183- extracted_path = Path (archive .extract (member , dst ))
184- if hash_match and not extracted_path .name .startswith (_HASH_MATCH_PREFIX ):
185- renamed_path = extracted_path .with_name (f"{ _HASH_MATCH_PREFIX } { extracted_path .name } " )
186- extracted_path .rename (renamed_path )
187- extracted_path = renamed_path
188- _track (extracted_path )
205+ payload = archive .read (member )
206+ member_name = Path (member .filename ).name
207+ if hash_match and not member_name .startswith (_HASH_MATCH_PREFIX ):
208+ member_name = f"{ _HASH_MATCH_PREFIX } { member_name } "
209+ target = _resolve_extraction_target (dst , member_name , payload )
210+ if target is None :
211+ continue
212+ target .write_bytes (payload )
213+ _track (target )
189214 extracted_count += 1
190215 return extracted_count
191216
@@ -215,11 +240,17 @@ def extract_subtitle_by_id(subtitle_id: str, src: Path, dst: Path, extension: st
215240 return extracted_count
216241
217242
def _is_tested_subtitle(stem: str) -> bool:
    """Report whether a file stem carries the tested marker.

    True when ``stem`` ends with ``_TESTED_MARKER`` or contains the marker
    immediately followed by ``_v``.
    """
    if stem.endswith(_TESTED_MARKER):
        return True
    # NOTE(review): this is a substring match anywhere in the stem, presumably meant
    # for versioned copies of a tested file - confirm against _next_available_path.
    versioned_marker = f"{_TESTED_MARKER}_v"
    return versioned_marker in stem
245+
246+
def subtitle_files_in(directory: Path) -> list[Path]:
    """List the selectable subtitle files directly inside ``directory``, sorted.

    An entry is kept when it is a regular file, its lower-cased suffix is in
    ``_SUBTITLE_EXTENSIONS``, and its stem is not flagged by ``_is_tested_subtitle``.
    Returns an empty list when ``directory`` is not a directory.
    """

    def is_selectable(entry: Path) -> bool:
        if not entry.is_file():
            return False
        if entry.suffix.lower() not in _SUBTITLE_EXTENSIONS:
            return False
        return not _is_tested_subtitle(entry.stem)

    if not directory.is_dir():
        return []
    return sorted(filter(is_selectable, directory.iterdir()))
224255
225256
@@ -265,9 +296,21 @@ def move_all(src: Path, dst: Path) -> int:
265296 return moved_count
266297
267298
def move_and_replace(source_file: Path, destination_directory: Path) -> None:
    """Move ``source_file`` into ``destination_directory`` under its own name.

    Uses ``Path.replace``, then logs the move.
    """
    destination = destination_directory / source_file.name
    source_file.replace(destination)
    capture(f"Moving file: {source_file} -> {destination_directory}")
def move_best_next_to_video(
    source_file: Path, destination_directory: Path, video_stem: str, extraction_directory: Path
) -> None:
    """Place ``source_file`` in ``destination_directory`` named after the video.

    The target name is ``video_stem`` plus the suffix of ``source_file``. A file
    already present under that name is not overwritten: it is first moved into
    ``extraction_directory`` under a ``video_stem`` + ``_TESTED_MARKER`` name chosen
    by ``_next_available_path``. Both the preserved file and the new target are
    passed to ``_track``.

    Args:
        source_file: Subtitle to move; refused when its stem is flagged by
            ``_is_tested_subtitle``.
        destination_directory: Directory that receives the renamed subtitle.
        video_stem: Stem used for the target file name.
        extraction_directory: Directory that receives a displaced existing subtitle.

    Raises:
        OSError: If a move fails (for example, ``source_file`` does not exist).
    """
    if _is_tested_subtitle(source_file.stem):
        capture(f"Refusing to move already-tested subtitle: {source_file}", level=LogLevel.DEBUG)
        return
    target = destination_directory / f"{video_stem}{source_file.suffix}"
    if target.exists():
        if source_file.samefile(target):
            # The subtitle is already in place. Displacing "the existing file" would
            # move the source itself away and the final move would then fail.
            capture(f"Subtitle already in place: {target}", level=LogLevel.DEBUG)
            return
        preserved = _next_available_path(extraction_directory, f"{video_stem}{_TESTED_MARKER}", target.suffix)
        capture(f"Preserving existing subtitle: {target} -> {preserved}")
        # shutil.move falls back to copy + delete when the two directories are on
        # different filesystems, where Path.replace would raise OSError.
        shutil.move(target, preserved)
        _track(preserved)
    capture(f"Moving file: {source_file} -> {target}")
    shutil.move(source_file, target)
    _track(target)
271314
272315
273316def del_file_type (cwd : Path , extension : str ) -> None :
0 commit comments