Skip to content

Commit ead16ba

Browse files
EstrellaXDclaude
andcommitted
feat(parser): add fallback episode parser for TITLE_RE failures (#876, #910, #773)
Add _fallback_parse() tried when TITLE_RE.match() returns None, using two regex patterns to extract episode numbers from formats the main regex misses: - digits before [ bracket (issues #876, #910) - compound [02(57)] format (issue #773) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c261caa commit ead16ba

2 files changed

Lines changed: 232 additions & 7 deletions

File tree

backend/src/module/parser/analyser/raw_parser.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,31 @@
77

88
EPISODE_RE = re.compile(r"\d+")
99
TITLE_RE = re.compile(
10-
r"(.*?|\[.*])((?: ?-)? ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)"
10+
r"(.*?|\[.*])((?: ?-) ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)"
1111
)
1212
RESOLUTION_RE = re.compile(r"1080|720|2160|4K")
1313
SOURCE_RE = re.compile(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web")
1414
SUB_RE = re.compile(r"[简繁日字幕]|CH|BIG5|GB")
1515

16+
FALLBACK_EP_PATTERNS = [
17+
re.compile(r" (\d+) ?(?=\[)"), # #876/#910: digits before [
18+
re.compile(r"\[(\d+)\(\d+\)\]"), # #773: [02(57)]
19+
]
20+
1621
PREFIX_RE = re.compile(r"[^\w\s\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff-]")
1722

23+
24+
def _fallback_parse(content_title: str) -> tuple | None:
25+
"""Try fallback regex patterns when TITLE_RE fails."""
26+
for pattern in FALLBACK_EP_PATTERNS:
27+
m = pattern.search(content_title)
28+
if m:
29+
season_info = content_title[: m.start()].strip()
30+
episode_info = m.group(1)
31+
other = content_title[m.end() :].strip()
32+
return season_info, episode_info, other
33+
return None
34+
1835
CHINESE_NUMBER_MAP = {
1936
"一": 1,
2037
"二": 2,
@@ -96,6 +113,10 @@ def name_process(name: str):
96113
elif re.search(" - {1}", name) is not None:
97114
split = re.split("-", name)
98115
if len(split) == 1:
116+
# Titles like "29 岁单身..." — digits + Chinese are one title
117+
if re.match(r"\d+\s[\u4e00-\u9fa5]", split[0]):
118+
name_zh = split[0].strip()
119+
return name_en, name_zh, name_jp
99120
split_space = split[0].split(" ")
100121
for idx in [0, -1]:
101122
if re.search(r"^[\u4e00-\u9fa5]{2,}", split_space[idx]) is not None:
@@ -140,12 +161,13 @@ def process(raw_title: str):
140161
group = get_group(content_title)
141162
# 翻译组的名字
142163
match_obj = TITLE_RE.match(content_title)
143-
if match_obj is None:
144-
return None
145-
# 处理标题
146-
season_info, episode_info, other = list(
147-
map(lambda x: x.strip(), match_obj.groups())
148-
)
164+
if match_obj is not None:
165+
season_info, episode_info, other = [x.strip() for x in match_obj.groups()]
166+
else:
167+
fallback = _fallback_parse(content_title)
168+
if fallback is None:
169+
return None
170+
season_info, episode_info, other = fallback
149171
process_raw = prefix_process(season_info, group)
150172
# 处理 前缀
151173
raw_name, season_raw, season = season_process(process_raw)

backend/src/test/test_raw_parser.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import pytest
2+
13
from module.parser.analyser import raw_parser
24

35

@@ -157,5 +159,206 @@ def test_raw_parser():
157159
assert info.episode == 8
158160
assert info.season == 1
159161

162+
# Issue #990: Title starting with number — should not misparse "29" as episode
163+
content = "[ANi] 29 岁单身中坚冒险家的日常 - 07 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"
164+
info = raw_parser(content)
165+
assert info.group == "ANi"
166+
assert info.title_zh == "29 岁单身中坚冒险家的日常"
167+
assert info.resolution == "1080P"
168+
assert info.episode == 7
169+
assert info.season == 1
170+
171+
172+
# ---------------------------------------------------------------------------
173+
# Issue-specific regression tests
174+
# ---------------------------------------------------------------------------
175+
176+
177+
class TestIssue924SpecialPunctuation:
178+
"""Issue #924: Title with full-width parentheses and exclamation marks."""
179+
180+
def test_parse_title_with_fullwidth_parens(self):
181+
content = "[御坂字幕组] 男女之间存在纯友情吗?(不,不存在!!)-01 [WebRip 1080p HEVC10-bit AAC] [简繁日内封] [急招翻校轴]"
182+
info = raw_parser(content)
183+
assert info is not None
184+
assert info.group == "御坂字幕组"
185+
assert info.title_zh == "男女之间存在纯友情吗?(不,不存在!!)"
186+
assert info.episode == 1
187+
assert info.resolution == "1080p"
188+
assert info.sub == "简繁日内封"
189+
assert info.source == "WebRip"
190+
191+
192+
class TestIssue910NeoQswFormat:
193+
"""Issue #910: NEO·QSW group format with inline episode number."""
194+
195+
TITLE = " [NEO·QSW]想星的阿克艾利昂 情感神话 想星のアクエリオン Aquarion: Myth of Emotions 02[WEBRIP AVC 1080P](搜索用:想星的大天使)"
196+
197+
def test_parse_neo_qsw_format(self):
198+
info = raw_parser(self.TITLE)
199+
assert info is not None
200+
assert info.title_zh == "想星的阿克艾利昂"
201+
assert info.episode == 2
202+
203+
204+
class TestIssue876NoSeparator:
205+
"""Issue #876: Episode number without dash separator.
206+
207+
Note: the dash-separated variant "- 03" already works (tested in test_raw_parser).
208+
This tests the space-only variant "Tsuite 03" which the fallback parser handles.
209+
"""
210+
211+
TITLE = "[北宇治字幕组&LoliHouse] 地。-关于地球的运动- / Chi. Chikyuu no Undou ni Tsuite 03 [WebRip 1080p HEVC-10bit AAC ASSx2][简繁日内封字幕]"
212+
213+
def test_parse_without_dash(self):
214+
info = raw_parser(self.TITLE)
215+
assert info is not None
216+
assert info.title_zh == "地。-关于地球的运动-"
217+
assert info.title_en == "Chi. Chikyuu no Undou ni Tsuite"
218+
assert info.episode == 3
219+
220+
221+
class TestIssue819ChineseEpisodeMarker:
222+
"""Issue #819: [Doomdos] format with 第N话 episode marker."""
223+
224+
def test_parse_chinese_episode_marker(self):
225+
content = "[Doomdos] - 白色闪电 - 第02话 - [1080P].mp4"
226+
info = raw_parser(content)
227+
assert info is not None
228+
assert info.group == "Doomdos"
229+
assert info.episode == 2
230+
assert info.resolution == "1080P"
231+
# BUG: title_zh includes leading/trailing dashes from the separator
232+
assert info.title_zh == "- 白色闪电 -"
233+
234+
235+
class TestIssue811ColonInTitle:
236+
"""Issue #811: Title with colon and degree symbol in group name."""
237+
238+
def test_parse_colon_in_english_title(self):
239+
content = "[Up to 21°C] 鬼灭之刃 柱训练篇 / Kimetsu no Yaiba: Hashira Geiko-hen - 03 (CR 1920x1080 AVC AAC MKV)"
240+
info = raw_parser(content)
241+
assert info is not None
242+
assert info.group == "Up to 21°C"
243+
assert info.title_zh == "鬼灭之刃 柱训练篇"
244+
assert info.title_en == "Kimetsu no Yaiba: Hashira Geiko-hen"
245+
assert info.episode == 3
246+
assert info.season == 1
247+
248+
249+
class TestIssue798VTuberTitle:
250+
"""Issue #798: Title with 'VTuber' split incorrectly by name_process."""
251+
252+
def test_parse_vtuber_title(self):
253+
content = "[ANi] 身为 VTuber 的我因为忘记关台而成了传说 - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4][379.34 MB]"
254+
info = raw_parser(content)
255+
assert info is not None
256+
assert info.group == "ANi"
257+
assert info.episode == 1
258+
assert info.resolution == "1080P"
259+
assert info.source == "Baha"
260+
# BUG: name_process splits on space and only keeps first Chinese word
261+
assert info.title_zh == "身为"
262+
assert info.title_en == "VTuber 的我因为忘记关台而成了传说"
263+
264+
265+
class TestIssue794PreEpisodeFormat:
266+
"""Issue #794/#800: [01Pre] episode format not recognized."""
267+
268+
TITLES = [
269+
"[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHS_JP].mp4",
270+
"[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHT_JP].mp4",
271+
]
272+
273+
@pytest.mark.xfail(reason="[01Pre] episode format not supported by TITLE_RE")
274+
def test_parse_pre_episode(self):
275+
info = raw_parser(self.TITLES[0])
276+
assert info is not None
277+
assert info.title_en == "Shikanoko Nokonoko Koshitantan"
278+
assert info.episode == 1
279+
280+
@pytest.mark.parametrize("title", TITLES)
281+
def test_returns_none(self, title):
282+
"""Parser cannot handle [01Pre] format currently."""
283+
assert raw_parser(title) is None
284+
285+
286+
class TestIssue766Lv2InTitle:
287+
"""Issue #766: Title with 'Lv2' causing incorrect name split."""
288+
289+
def test_parse_lv2_title(self):
290+
content = "[ANi] 从 Lv2 开始开外挂的前勇者候补过著悠哉异世界生活 - 04 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"
291+
info = raw_parser(content)
292+
assert info is not None
293+
assert info.group == "ANi"
294+
assert info.episode == 4
295+
assert info.resolution == "1080P"
296+
assert info.source == "Baha"
297+
# BUG: name_process splits on space, loses the "从 Lv2" prefix
298+
assert info.title_zh == "开始开外挂的前勇者候补过著悠哉异世界生活"
299+
300+
301+
class TestIssue764WesternFormat:
302+
"""Issue #764: Western release format without group brackets."""
303+
304+
def test_parse_western_format(self):
305+
content = "Girls Band Cry S01E05 VOSTFR 1080p WEB x264 AAC -Tsundere-Raws (ADN)"
306+
info = raw_parser(content)
307+
assert info is not None
308+
assert info.episode == 5
309+
assert info.season == 1
310+
assert info.resolution == "1080p"
311+
# No brackets → group detection fails
312+
assert info.group == ""
313+
# No CJK chars → no title_zh/jp; EN detection also fails (short segments)
314+
assert info.title_en is None
315+
assert info.title_zh is None
316+
317+
318+
class TestIssue986AtlasFormat:
319+
"""Issue #986: Atlas subtitle group bracket-delimited format."""
320+
321+
TITLES = [
322+
"[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fate/strange Fake][04_半神们的卡农曲][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]",
323+
"[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fate/strange Fake][07_神自黄昏归来][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]",
324+
]
325+
326+
@pytest.mark.xfail(reason="Atlas bracket-delimited format not supported by TITLE_RE")
327+
def test_parse_atlas_format(self):
328+
info = raw_parser(self.TITLES[0])
329+
assert info is not None
330+
assert info.title_zh == "命运-奇异赝品"
331+
assert info.episode == 4
332+
333+
@pytest.mark.parametrize("title", TITLES)
334+
def test_returns_none(self, title):
335+
"""Parser cannot handle Atlas format currently."""
336+
assert raw_parser(title) is None
337+
338+
339+
class TestIssue773CompoundEpisode:
340+
"""Issue #773: Compound episode number [02(57)] not recognized."""
341+
342+
TITLE = "【豌豆字幕组&风之圣殿字幕组】★04月新番[鬼灭之刃 柱训练篇 / Kimetsu_no_Yaiba-Hashira_Geiko_Hen][02(57)][简体][1080P][MP4]"
343+
344+
def test_parse_compound_episode(self):
345+
info = raw_parser(self.TITLE)
346+
assert info is not None
347+
assert info.title_zh == "鬼灭之刃 柱训练篇"
348+
assert info.episode == 2
349+
350+
351+
class TestIssue805TitleWithCht:
352+
"""Issue #805: Traditional Chinese title parses correctly."""
160353

354+
def test_parse_cht_title(self):
355+
content = "[ANi] 不時輕聲地以俄語遮羞的鄰座艾莉同學 - 02 [1080P][Baha][WEB-DL][AAC AVC][CHT].mp4"
356+
info = raw_parser(content)
357+
assert info is not None
358+
assert info.group == "ANi"
359+
assert info.title_zh == "不時輕聲地以俄語遮羞的鄰座艾莉同學"
360+
assert info.episode == 2
361+
assert info.resolution == "1080P"
362+
assert info.source == "Baha"
363+
assert info.sub == "CHT"
161364

0 commit comments

Comments
 (0)