Skip to content

Commit 3f4f3a1

Browse files
EstrellaXD and claude committed
feat(database): add title alias system for mid-season naming changes
When subtitle groups change their naming convention mid-season (e.g., "LoliHouse" → "LoliHouse&动漫国"), AutoBangumi was creating duplicate entries. This adds a title alias system that:
- Detects semantic duplicates (same official_title, dpi, subtitle, source, and similar group name)
- Merges them as aliases instead of creating new entries
- Updates match_torrent() and match_list() to check aliases
- Adds title_aliases field to Bangumi model (JSON list)
- Includes migration v8 for the new column
- Adds 10 new tests for the feature
- Fixes cache invalidation bug in disable_rule()

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 0ba508c commit 3f4f3a1

4 files changed

Lines changed: 541 additions & 40 deletions

File tree

backend/src/module/database/bangumi.py

Lines changed: 228 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import logging
23
import re
34
import time
@@ -10,6 +11,57 @@
1011

1112
logger = logging.getLogger(__name__)
1213

14+
15+
def _normalize_group_name(group: str | None) -> str:
16+
"""Normalize group name for comparison by removing common separators."""
17+
if not group:
18+
return ""
19+
# Remove common separators (&, ×, _, -) and normalize to lowercase
20+
return re.sub(r"[&×_\-]", "", group).lower().strip()
21+
22+
23+
def _groups_are_similar(group1: str | None, group2: str | None) -> bool:
24+
"""
25+
Check if two group names are similar enough to be considered the same group.
26+
27+
Handles cases like:
28+
- "LoliHouse" vs "LoliHouse&动漫国字幕组"
29+
- "字幕组A" vs "字幕组A×字幕组B"
30+
"""
31+
if not group1 or not group2:
32+
return False
33+
34+
# Exact match or substring match (one contains the other)
35+
if group1 == group2 or group1 in group2 or group2 in group1:
36+
return True
37+
38+
# Normalized comparison - check if core group names overlap
39+
norm1 = _normalize_group_name(group1)
40+
norm2 = _normalize_group_name(group2)
41+
return norm1 in norm2 or norm2 in norm1
42+
43+
44+
def _get_aliases_list(bangumi: Bangumi) -> list[str]:
    """Get the list of title aliases from a bangumi's title_aliases JSON field.

    Returns an empty list when the field is unset, is not valid JSON, or does
    not decode to a JSON array.  Entries that are not non-empty strings are
    dropped so the result always honours the declared ``list[str]`` type.
    """
    raw = bangumi.title_aliases
    if not raw:
        return []
    try:
        decoded = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return []
    if not isinstance(decoded, list):
        return []
    # Aliases are used as substring patterns against torrent names: a
    # non-string entry would raise on ``alias in name`` and an empty string
    # would be contained in every name, so neither is a usable alias.
    return [alias for alias in decoded if isinstance(alias, str) and alias]
53+
54+
55+
def _set_aliases_list(bangumi: Bangumi, aliases: list[str]) -> None:
    """Store *aliases* on the bangumi's ``title_aliases`` field as JSON text.

    An empty list clears the field to ``None``.  Otherwise repeated entries
    are collapsed to their first occurrence (order kept) and the list is
    serialized without ASCII-escaping non-ASCII characters.
    """
    if aliases:
        # Dict keys keep insertion order, so this de-duplicates stably.
        deduplicated = list({alias: None for alias in aliases})
        bangumi.title_aliases = json.dumps(deduplicated, ensure_ascii=False)
        return
    bangumi.title_aliases = None
63+
64+
1365
# Module-level TTL cache for search_all results
1466
_bangumi_cache: list[Bangumi] | None = None
1567
_bangumi_cache_time: float = 0
@@ -26,6 +78,91 @@ class BangumiDatabase:
2678
def __init__(self, session: Session):
2779
self.session = session
2880

81+
def find_semantic_duplicate(self, data: Bangumi) -> Optional[Bangumi]:
    """
    Look up an existing rule that describes the same release as *data*.

    Subtitle groups sometimes change their naming mid-season; this finds the
    rule such a renamed release should be folded into.  A row qualifies when
    all of the following hold:
    - it is not deleted and has the same official_title
    - it is not the exact same rule (same title_raw and group_name)
    - dpi, subtitle and source are all equal
    - the group names are similar according to _groups_are_similar

    Returns the first qualifying Bangumi, or None when there is none.
    """
    same_title_query = select(Bangumi).where(
        and_(
            Bangumi.official_title == data.official_title,
            Bangumi.deleted == false(),
        )
    )
    wanted_attrs = (data.dpi, data.subtitle, data.source)

    for existing in self.session.execute(same_title_query).scalars().all():
        # The identical rule is an exact duplicate, not a semantic one.
        if (existing.title_raw, existing.group_name) == (
            data.title_raw,
            data.group_name,
        ):
            continue
        if (existing.dpi, existing.subtitle, existing.source) != wanted_attrs:
            continue
        if not _groups_are_similar(existing.group_name, data.group_name):
            continue

        logger.debug(
            f"[Database] Found semantic duplicate: '{data.title_raw}' matches "
            f"existing '{existing.title_raw}' (official: {data.official_title})"
        )
        return existing

    return None
125+
126+
def add_title_alias(self, bangumi_id: int, new_title_raw: str) -> bool:
    """
    Add a new title_raw alias to an existing bangumi.

    This allows a single bangumi entry to match multiple naming patterns.

    Args:
        bangumi_id: Primary key of the bangumi that receives the alias.
        new_title_raw: The alternative raw title to record.

    Returns:
        True if the alias was stored and committed.  False if the alias is
        empty, the bangumi does not exist, or the alias equals the main
        title_raw or an alias that is already recorded.
    """
    # An empty string is a substring of every torrent name, so it is useless
    # (and harmful) as a matching pattern; refuse it before touching the DB.
    if not new_title_raw:
        return False

    bangumi = self.session.get(Bangumi, bangumi_id)
    if not bangumi:
        logger.warning(
            f"[Database] Cannot add alias: bangumi id {bangumi_id} not found"
        )
        return False

    # Don't add if it's the same as the main title_raw
    if bangumi.title_raw == new_title_raw:
        return False

    # Get existing aliases and add the new one
    aliases = _get_aliases_list(bangumi)
    if new_title_raw in aliases:
        return False  # Already exists

    aliases.append(new_title_raw)
    _set_aliases_list(bangumi, aliases)

    self.session.add(bangumi)
    self.session.commit()
    # Cached search_all() results would otherwise not contain the new alias.
    _invalidate_bangumi_cache()
    logger.info(
        f"[Database] Added alias '{new_title_raw}' to bangumi '{bangumi.official_title}' "
        f"(id: {bangumi_id})"
    )
    return True
159+
160+
def get_all_title_patterns(self, bangumi: Bangumi) -> list[str]:
    """Return every matching pattern of *bangumi*: its title_raw first, then its aliases."""
    return [bangumi.title_raw, *_get_aliases_list(bangumi)]
165+
29166
def _is_duplicate(self, data: Bangumi) -> bool:
30167
"""Check if a bangumi rule already exists based on title_raw and group_name."""
31168
statement = select(Bangumi).where(
@@ -43,6 +180,18 @@ def add(self, data: Bangumi) -> bool:
43180
f"[Database] Skipping duplicate: {data.official_title} ({data.group_name})"
44181
)
45182
return False
183+
184+
# Check for semantic duplicate (same anime, different naming pattern)
185+
semantic_match = self.find_semantic_duplicate(data)
186+
if semantic_match:
187+
# Add as alias instead of creating new entry
188+
self.add_title_alias(semantic_match.id, data.title_raw)
189+
logger.info(
190+
f"[Database] Merged '{data.title_raw}' as alias to existing "
191+
f"'{semantic_match.title_raw}' (official: {data.official_title})"
192+
)
193+
return False # Return False since we didn't add a new entry
194+
46195
self.session.add(data)
47196
self.session.commit()
48197
_invalidate_bangumi_cache()
@@ -70,31 +219,54 @@ def add_all(self, datas: list[Bangumi]) -> int:
70219
else:
71220
existing = set()
72221

73-
# Filter out duplicates
222+
# Filter out exact duplicates
74223
to_add = [d for d in datas if (d.title_raw, d.group_name) not in existing]
75224

225+
# Check for semantic duplicates and add as aliases
226+
semantic_merged = 0
227+
really_to_add = []
228+
for d in to_add:
229+
semantic_match = self.find_semantic_duplicate(d)
230+
if semantic_match:
231+
# Add as alias instead of creating new entry
232+
self.add_title_alias(semantic_match.id, d.title_raw)
233+
semantic_merged += 1
234+
logger.info(
235+
f"[Database] Merged '{d.title_raw}' as alias to existing "
236+
f"'{semantic_match.title_raw}' (official: {d.official_title})"
237+
)
238+
else:
239+
really_to_add.append(d)
240+
76241
# Also deduplicate within the batch itself
77242
seen = set()
78243
unique_to_add = []
79-
for d in to_add:
244+
for d in really_to_add:
80245
key = (d.title_raw, d.group_name)
81246
if key not in seen:
82247
seen.add(key)
83248
unique_to_add.append(d)
84249

85250
if not unique_to_add:
86-
logger.debug(
87-
f"[Database] All {len(datas)} bangumi already exist, skipping."
88-
)
251+
if semantic_merged > 0:
252+
logger.debug(
253+
f"[Database] {semantic_merged} bangumi merged as aliases, "
254+
f"rest were duplicates."
255+
)
256+
else:
257+
logger.debug(
258+
f"[Database] All {len(datas)} bangumi already exist, skipping."
259+
)
89260
return 0
90261

91262
self.session.add_all(unique_to_add)
92263
self.session.commit()
93264
_invalidate_bangumi_cache()
94-
skipped = len(datas) - len(unique_to_add)
95-
if skipped > 0:
265+
skipped = len(datas) - len(unique_to_add) - semantic_merged
266+
if skipped > 0 or semantic_merged > 0:
96267
logger.debug(
97-
f"[Database] Insert {len(unique_to_add)} bangumi, skipped {skipped} duplicates."
268+
f"[Database] Insert {len(unique_to_add)} bangumi, "
269+
f"skipped {skipped} duplicates, merged {semantic_merged} as aliases."
98270
)
99271
else:
100272
logger.debug(
@@ -186,33 +358,34 @@ def search_all(self) -> list[Bangumi]:
186358

187359
def search_id(self, _id: int) -> Optional[Bangumi]:
    """Fetch the bangumi whose id equals *_id*.

    Logs a warning and returns None when no row matches; otherwise logs at
    debug level and returns the row.
    """
    row = self.session.execute(
        select(Bangumi).where(Bangumi.id == _id)
    ).scalar_one_or_none()
    if row is not None:
        logger.debug(f"[Database] Find bangumi id: {_id}.")
        return row
    logger.warning(f"[Database] Cannot find bangumi id: {_id}.")
    return None
197367

198368
def match_poster(self, bangumi_name: str) -> str:
    """Return the poster link of a bangumi whose official_title occurs in *bangumi_name*.

    Returns "" when no bangumi matches or the matching row has no poster link.
    """
    statement = select(Bangumi).where(
        func.instr(bangumi_name, Bangumi.official_title) > 0
    )
    # More than one row can satisfy the filter (several rules may share an
    # official_title), and scalar_one_or_none() raises MultipleResultsFound
    # in that case.  Take the first matching row instead.
    data = self.session.execute(statement).scalars().first()
    if data is None:
        return ""
    # Coalesce an unset poster_link so the declared ``str`` return type holds.
    return data.poster_link or ""
208374

209375
def match_list(self, torrent_list: list, rss_link: str) -> list:
210376
match_datas = self.search_all()
211377
if not match_datas:
212378
return torrent_list
213379

214380
# Build index for O(1) lookup after regex match
215-
title_index = {m.title_raw: m for m in match_datas}
381+
# Include both title_raw and all aliases
382+
title_index: dict[str, Bangumi] = {}
383+
for m in match_datas:
384+
# Add main title_raw
385+
title_index[m.title_raw] = m
386+
# Add all aliases
387+
for alias in _get_aliases_list(m):
388+
title_index[alias] = m
216389

217390
# Build compiled regex pattern for fast substring matching
218391
# Sort by length descending so longer (more specific) matches are found first
@@ -226,12 +399,16 @@ def match_list(self, torrent_list: list, rss_link: str) -> list:
226399
for torrent in torrent_list:
227400
match = title_regex.search(torrent.name)
228401
if match:
229-
title_raw = match.group(0)
230-
match_data = title_index[title_raw]
231-
if rss_link not in match_data.rss_link and title_raw not in rss_updated:
402+
matched_title = match.group(0)
403+
match_data = title_index[matched_title]
404+
# Use the bangumi's main title_raw for rss_updated tracking
405+
if (
406+
rss_link not in match_data.rss_link
407+
and match_data.title_raw not in rss_updated
408+
):
232409
match_data.rss_link += f",{rss_link}"
233410
match_data.added = False
234-
rss_updated.add(title_raw)
411+
rss_updated.add(match_data.title_raw)
235412
else:
236413
unmatched.append(torrent)
237414
# Batch commit all rss_link updates
@@ -244,20 +421,32 @@ def match_list(self, torrent_list: list, rss_link: str) -> list:
244421
return unmatched
245422

246423
def match_torrent(self, torrent_name: str) -> Optional[Bangumi]:
    """
    Find the bangumi rule that best matches *torrent_name*.

    Every non-deleted rule returned by search_all() is tested with all of its
    patterns (title_raw plus aliases); a pattern matches when it is a
    substring of the torrent name.  The rule owning the longest matching
    pattern wins, and on equal length the rule seen first is kept.

    Returns None when there are no rules or nothing matches.
    """
    rules = self.search_all()
    if not rules:
        return None

    winner: Optional[Bangumi] = None
    winner_len = 0

    for rule in rules:
        if rule.deleted:
            continue
        for pattern in self.get_all_title_patterns(rule):
            if pattern not in torrent_name:
                continue
            # Only a strictly longer (more specific) pattern replaces the
            # current winner, so earlier rules win ties.
            if len(pattern) <= winner_len:
                continue
            winner, winner_len = rule, len(pattern)

    return winner
261450

262451
def not_complete(self) -> list[Bangumi]:
263452
condition = select(Bangumi).where(
@@ -285,6 +474,7 @@ def disable_rule(self, _id: int):
285474
bangumi.deleted = True
286475
self.session.add(bangumi)
287476
self.session.commit()
477+
_invalidate_bangumi_cache()
288478
logger.debug(f"[Database] Disable rule {bangumi.title_raw}.")
289479

290480
def search_rss(self, rss_link: str) -> list[Bangumi]:

backend/src/module/database/combine.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
TABLE_MODELS: list[type[SQLModel]] = [Bangumi, RSSItem, Torrent, User, Passkey]
2424

2525
# Increment this when adding new migrations to MIGRATIONS list.
26-
CURRENT_SCHEMA_VERSION = 7
26+
CURRENT_SCHEMA_VERSION = 8
2727

2828
# Each migration is a tuple of (version, description, list of SQL statements).
2929
# Migrations are applied in order. A migration at index i brings the schema
@@ -96,6 +96,13 @@
9696
"ALTER TABLE bangumi ADD COLUMN suggested_episode_offset INTEGER DEFAULT NULL",
9797
],
9898
),
99+
(
100+
8,
101+
"add title_aliases for mid-season naming changes",
102+
[
103+
"ALTER TABLE bangumi ADD COLUMN title_aliases TEXT DEFAULT NULL",
104+
],
105+
),
99106
]
100107

101108

@@ -187,6 +194,10 @@ def run_migrations(self):
187194
columns = [col["name"] for col in inspector.get_columns("bangumi")]
188195
if "suggested_season_offset" in columns:
189196
needs_run = False
197+
if "bangumi" in tables and version == 8:
198+
columns = [col["name"] for col in inspector.get_columns("bangumi")]
199+
if "title_aliases" in columns:
200+
needs_run = False
190201
if needs_run:
191202
with self.engine.connect() as conn:
192203
for stmt in statements:

0 commit comments

Comments
 (0)