1+ import json
12import logging
23import re
34import time
1011
1112logger = logging .getLogger (__name__ )
1213
14+
15+ def _normalize_group_name (group : str | None ) -> str :
16+ """Normalize group name for comparison by removing common separators."""
17+ if not group :
18+ return ""
19+ # Remove common separators (&, ×, _, -) and normalize to lowercase
20+ return re .sub (r"[&×_\-]" , "" , group ).lower ().strip ()
21+
22+
def _groups_are_similar(group1: str | None, group2: str | None) -> bool:
    """
    Check if two group names are similar enough to be considered the same group.

    Handles cases like:
    - "LoliHouse" vs "LoliHouse&动漫国字幕组"
    - "字幕组A" vs "字幕组A×字幕组B"

    Returns False when either name is missing or empty, and also when either
    name is empty after normalization (i.e. it consisted only of separator
    characters and/or whitespace) unless the raw names already contain one
    another.
    """
    if not group1 or not group2:
        return False

    # Raw substring match (one contains the other); this also covers equality.
    if group1 in group2 or group2 in group1:
        return True

    # Normalized comparison - check if core group names overlap
    norm1 = _normalize_group_name(group1)
    norm2 = _normalize_group_name(group2)

    # An empty string is a substring of every string, so a name that
    # normalizes to "" would otherwise be "similar" to any other group.
    if not norm1 or not norm2:
        return False

    return norm1 in norm2 or norm2 in norm1
42+
43+
def _get_aliases_list(bangumi: Bangumi) -> list[str]:
    """Decode ``bangumi.title_aliases`` (a JSON string) into a list.

    An empty list is returned when the field is unset/empty, when it cannot
    be decoded as JSON, or when the decoded value is not a list.
    """
    raw = bangumi.title_aliases
    if not raw:
        return []

    try:
        decoded = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # Corrupt or non-string payload: treat it as "no aliases".
        return []

    if isinstance(decoded, list):
        return decoded
    return []
53+
54+
def _set_aliases_list(bangumi: Bangumi, aliases: list[str]) -> None:
    """Store ``aliases`` on ``bangumi.title_aliases`` as a JSON array string.

    Duplicates are dropped, keeping the first occurrence of each alias in
    its original position. An empty list clears the field to ``None``.
    """
    if aliases:
        # dict keys keep insertion order, which gives an order-preserving dedupe.
        deduplicated = [*dict.fromkeys(aliases)]
        bangumi.title_aliases = json.dumps(deduplicated, ensure_ascii=False)
        return

    bangumi.title_aliases = None
63+
64+
1365# Module-level TTL cache for search_all results
1466_bangumi_cache : list [Bangumi ] | None = None
1567_bangumi_cache_time : float = 0
@@ -26,6 +78,91 @@ class BangumiDatabase:
    def __init__(self, session: Session):
        """Keep the database session that the methods of this class query and commit through."""
        self.session = session
2880
81+ def find_semantic_duplicate (self , data : Bangumi ) -> Optional [Bangumi ]:
82+ """
83+ Find existing bangumi that semantically matches the new one.
84+
85+ This handles cases where subtitle groups change naming mid-season.
86+ A semantic match requires:
87+ - Same official_title
88+ - Same dpi (resolution)
89+ - Same subtitle type
90+ - Same source
91+ - Similar group_name (one contains the other)
92+
93+ Returns the matching Bangumi if found, None otherwise.
94+ """
95+ statement = select (Bangumi ).where (
96+ and_ (
97+ Bangumi .official_title == data .official_title ,
98+ Bangumi .deleted == false (),
99+ )
100+ )
101+ candidates = self .session .execute (statement ).scalars ().all ()
102+
103+ for candidate in candidates :
104+ is_exact_duplicate = (
105+ candidate .title_raw == data .title_raw
106+ and candidate .group_name == data .group_name
107+ )
108+ if is_exact_duplicate :
109+ continue
110+
111+ is_semantic_match = (
112+ candidate .dpi == data .dpi
113+ and candidate .subtitle == data .subtitle
114+ and candidate .source == data .source
115+ and _groups_are_similar (candidate .group_name , data .group_name )
116+ )
117+ if is_semantic_match :
118+ logger .debug (
119+ f"[Database] Found semantic duplicate: '{ data .title_raw } ' matches "
120+ f"existing '{ candidate .title_raw } ' (official: { data .official_title } )"
121+ )
122+ return candidate
123+
124+ return None
125+
126+ def add_title_alias (self , bangumi_id : int , new_title_raw : str ) -> bool :
127+ """
128+ Add a new title_raw alias to an existing bangumi.
129+
130+ This allows a single bangumi entry to match multiple naming patterns.
131+ """
132+ bangumi = self .session .get (Bangumi , bangumi_id )
133+ if not bangumi :
134+ logger .warning (
135+ f"[Database] Cannot add alias: bangumi id { bangumi_id } not found"
136+ )
137+ return False
138+
139+ # Don't add if it's the same as the main title_raw
140+ if bangumi .title_raw == new_title_raw :
141+ return False
142+
143+ # Get existing aliases and add the new one
144+ aliases = _get_aliases_list (bangumi )
145+ if new_title_raw in aliases :
146+ return False # Already exists
147+
148+ aliases .append (new_title_raw )
149+ _set_aliases_list (bangumi , aliases )
150+
151+ self .session .add (bangumi )
152+ self .session .commit ()
153+ _invalidate_bangumi_cache ()
154+ logger .info (
155+ f"[Database] Added alias '{ new_title_raw } ' to bangumi '{ bangumi .official_title } ' "
156+ f"(id: { bangumi_id } )"
157+ )
158+ return True
159+
160+ def get_all_title_patterns (self , bangumi : Bangumi ) -> list [str ]:
161+ """Get all title patterns for matching (title_raw + all aliases)."""
162+ patterns = [bangumi .title_raw ]
163+ patterns .extend (_get_aliases_list (bangumi ))
164+ return patterns
165+
29166 def _is_duplicate (self , data : Bangumi ) -> bool :
30167 """Check if a bangumi rule already exists based on title_raw and group_name."""
31168 statement = select (Bangumi ).where (
@@ -43,6 +180,18 @@ def add(self, data: Bangumi) -> bool:
43180 f"[Database] Skipping duplicate: { data .official_title } ({ data .group_name } )"
44181 )
45182 return False
183+
184+ # Check for semantic duplicate (same anime, different naming pattern)
185+ semantic_match = self .find_semantic_duplicate (data )
186+ if semantic_match :
187+ # Add as alias instead of creating new entry
188+ self .add_title_alias (semantic_match .id , data .title_raw )
189+ logger .info (
190+ f"[Database] Merged '{ data .title_raw } ' as alias to existing "
191+ f"'{ semantic_match .title_raw } ' (official: { data .official_title } )"
192+ )
193+ return False # Return False since we didn't add a new entry
194+
46195 self .session .add (data )
47196 self .session .commit ()
48197 _invalidate_bangumi_cache ()
@@ -70,31 +219,54 @@ def add_all(self, datas: list[Bangumi]) -> int:
70219 else :
71220 existing = set ()
72221
73- # Filter out duplicates
222+ # Filter out exact duplicates
74223 to_add = [d for d in datas if (d .title_raw , d .group_name ) not in existing ]
75224
225+ # Check for semantic duplicates and add as aliases
226+ semantic_merged = 0
227+ really_to_add = []
228+ for d in to_add :
229+ semantic_match = self .find_semantic_duplicate (d )
230+ if semantic_match :
231+ # Add as alias instead of creating new entry
232+ self .add_title_alias (semantic_match .id , d .title_raw )
233+ semantic_merged += 1
234+ logger .info (
235+ f"[Database] Merged '{ d .title_raw } ' as alias to existing "
236+ f"'{ semantic_match .title_raw } ' (official: { d .official_title } )"
237+ )
238+ else :
239+ really_to_add .append (d )
240+
76241 # Also deduplicate within the batch itself
77242 seen = set ()
78243 unique_to_add = []
79- for d in to_add :
244+ for d in really_to_add :
80245 key = (d .title_raw , d .group_name )
81246 if key not in seen :
82247 seen .add (key )
83248 unique_to_add .append (d )
84249
85250 if not unique_to_add :
86- logger .debug (
87- f"[Database] All { len (datas )} bangumi already exist, skipping."
88- )
251+ if semantic_merged > 0 :
252+ logger .debug (
253+ f"[Database] { semantic_merged } bangumi merged as aliases, "
254+ f"rest were duplicates."
255+ )
256+ else :
257+ logger .debug (
258+ f"[Database] All { len (datas )} bangumi already exist, skipping."
259+ )
89260 return 0
90261
91262 self .session .add_all (unique_to_add )
92263 self .session .commit ()
93264 _invalidate_bangumi_cache ()
94- skipped = len (datas ) - len (unique_to_add )
95- if skipped > 0 :
265+ skipped = len (datas ) - len (unique_to_add ) - semantic_merged
266+ if skipped > 0 or semantic_merged > 0 :
96267 logger .debug (
97- f"[Database] Insert { len (unique_to_add )} bangumi, skipped { skipped } duplicates."
268+ f"[Database] Insert { len (unique_to_add )} bangumi, "
269+ f"skipped { skipped } duplicates, merged { semantic_merged } as aliases."
98270 )
99271 else :
100272 logger .debug (
@@ -186,33 +358,34 @@ def search_all(self) -> list[Bangumi]:
186358
187359 def search_id (self , _id : int ) -> Optional [Bangumi ]:
188360 statement = select (Bangumi ).where (Bangumi .id == _id )
189- result = self .session .execute (statement )
190- bangumi = result .scalar_one_or_none ()
361+ bangumi = self .session .execute (statement ).scalar_one_or_none ()
191362 if bangumi is None :
192363 logger .warning (f"[Database] Cannot find bangumi id: { _id } ." )
193364 return None
194- else :
195- logger .debug (f"[Database] Find bangumi id: { _id } ." )
196- return bangumi
365+ logger .debug (f"[Database] Find bangumi id: { _id } ." )
366+ return bangumi
197367
198368 def match_poster (self , bangumi_name : str ) -> str :
199369 statement = select (Bangumi ).where (
200370 func .instr (bangumi_name , Bangumi .official_title ) > 0
201371 )
202- result = self .session .execute (statement )
203- data = result .scalar_one_or_none ()
204- if data :
205- return data .poster_link
206- else :
207- return ""
372+ data = self .session .execute (statement ).scalar_one_or_none ()
373+ return data .poster_link if data else ""
208374
209375 def match_list (self , torrent_list : list , rss_link : str ) -> list :
210376 match_datas = self .search_all ()
211377 if not match_datas :
212378 return torrent_list
213379
214380 # Build index for O(1) lookup after regex match
215- title_index = {m .title_raw : m for m in match_datas }
381+ # Include both title_raw and all aliases
382+ title_index : dict [str , Bangumi ] = {}
383+ for m in match_datas :
384+ # Add main title_raw
385+ title_index [m .title_raw ] = m
386+ # Add all aliases
387+ for alias in _get_aliases_list (m ):
388+ title_index [alias ] = m
216389
217390 # Build compiled regex pattern for fast substring matching
218391 # Sort by length descending so longer (more specific) matches are found first
@@ -226,12 +399,16 @@ def match_list(self, torrent_list: list, rss_link: str) -> list:
226399 for torrent in torrent_list :
227400 match = title_regex .search (torrent .name )
228401 if match :
229- title_raw = match .group (0 )
230- match_data = title_index [title_raw ]
231- if rss_link not in match_data .rss_link and title_raw not in rss_updated :
402+ matched_title = match .group (0 )
403+ match_data = title_index [matched_title ]
404+ # Use the bangumi's main title_raw for rss_updated tracking
405+ if (
406+ rss_link not in match_data .rss_link
407+ and match_data .title_raw not in rss_updated
408+ ):
232409 match_data .rss_link += f",{ rss_link } "
233410 match_data .added = False
234- rss_updated .add (title_raw )
411+ rss_updated .add (match_data . title_raw )
235412 else :
236413 unmatched .append (torrent )
237414 # Batch commit all rss_link updates
@@ -244,20 +421,32 @@ def match_list(self, torrent_list: list, rss_link: str) -> list:
244421 return unmatched
245422
246423 def match_torrent (self , torrent_name : str ) -> Optional [Bangumi ]:
247- statement = (
248- select (Bangumi )
249- .where (
250- and_ (
251- func .instr (torrent_name , Bangumi .title_raw ) > 0 ,
252- Bangumi .deleted == false (),
253- )
254- )
255- # Prefer longer title_raw matches (more specific)
256- .order_by (func .length (Bangumi .title_raw ).desc ())
257- .limit (1 )
258- )
259- result = self .session .execute (statement )
260- return result .scalar_one_or_none ()
424+ """
425+ Match torrent name to a bangumi, checking both title_raw and title_aliases.
426+
427+ Returns the bangumi with the longest matching pattern for specificity.
428+ """
429+ match_datas = self .search_all ()
430+ if not match_datas :
431+ return None
432+
433+ best_match : Optional [Bangumi ] = None
434+ best_match_len = 0
435+
436+ for bangumi in match_datas :
437+ if bangumi .deleted :
438+ continue
439+
440+ # Check all patterns (title_raw + aliases)
441+ patterns = self .get_all_title_patterns (bangumi )
442+ for pattern in patterns :
443+ if pattern in torrent_name :
444+ # Prefer longer matches (more specific)
445+ if len (pattern ) > best_match_len :
446+ best_match = bangumi
447+ best_match_len = len (pattern )
448+
449+ return best_match
261450
262451 def not_complete (self ) -> list [Bangumi ]:
263452 condition = select (Bangumi ).where (
@@ -285,6 +474,7 @@ def disable_rule(self, _id: int):
285474 bangumi .deleted = True
286475 self .session .add (bangumi )
287476 self .session .commit ()
477+ _invalidate_bangumi_cache ()
288478 logger .debug (f"[Database] Disable rule { bangumi .title_raw } ." )
289479
290480 def search_rss (self , rss_link : str ) -> list [Bangumi ]:
0 commit comments