Skip to content

Commit 7138b86

Browse files
committed
fix: fix some title parse errors
1 parent f41d494 commit 7138b86

4 files changed

Lines changed: 139 additions & 57 deletions

File tree

backend/src/module/models/bangumi.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,8 +30,6 @@ class BangumiBase(SQLModel):
3030
default=None, alias="bangumi_id", title="番剧Bangumi ID"
3131
)
3232
mikan_id: str | None = Field(default=None, alias="mikan_id", title="番剧Mikan ID")
33-
# filter: str = Field(default="", alias="filter", title="番剧过滤器")
34-
# 感觉是不是与 rss 表用外键关联更好
3533
rss_link: str = Field(default="", alias="rss_link", title="番剧RSS链接")
3634
poster_link: str = Field(default="", alias="poster_link", title="番剧海报链接")
3735
rule_name: str | None = Field(default=None, alias="rule_name", title="番剧规则名")
@@ -61,14 +59,15 @@ class Episode(BaseModel):
6159
title_en: str = Field(default="", alias="title_en", title="英文标题")
6260
title_zh: str = Field(default="", alias="title_zh", title="中文标题")
6361
title_jp: str = Field(default="", alias="title_jp", title="日文标题")
62+
title_romaji: str = Field(default="", alias="title_romaji", title="罗马音标题")
6463
season: int = Field(default=1, ge=0, alias="season", title="番剧季度")
6564
season_raw: str = Field(default="", alias="season_raw", title="番剧季度原名")
6665
episode: int = Field(default=0, ge=0, alias="episode", title="番剧集数")
6766
sub: str = Field(default="", alias="sub", title="字幕语言")
6867
sub_type: str = Field(default="", alias="sub_type", title="字幕类型")
6968
group: str = Field(default="", alias="group", title="字幕组")
7069
resolution: str = Field(default="", alias="resolution", title="分辨率")
71-
source: str = Field(default="", alias="source", title="来源")
70+
source: str = Field(default="", alias="source", title="视频来源")
7271
audio_info: list[str] = Field(
7372
default_factory=list, alias="audio_info", title="音频信息"
7473
)

backend/src/module/parser/analyser/meta_parser.py

Lines changed: 46 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
from module.models import Episode
66

7-
from . import patterns as p
7+
from module.parser.analyser import patterns as p
88

99
logger = logging.getLogger(__name__)
1010

@@ -38,25 +38,27 @@ class TitleMetaParser:
3838
"""
3939

4040
def __init__(self) -> None:
41-
self.raw_title = ""
42-
self.title = ""
43-
self.token = []
41+
self.raw_title: str = ""
42+
self.title: str = ""
43+
self.token: list[str] = []
4444

4545
def process_title(self) -> None:
4646
"""预处理标题,统一格式"""
47+
# title 里面可能有"\n"
4748
self.title = self.title.replace("\n", " ")
48-
4949
# 如果以【开头
50+
#
5051
if self.title.startswith("【"):
5152
translation_table = str.maketrans("【】", "[]")
5253
self.title = self.title.translate(translation_table)
5354
self.title = self.title.strip()
54-
self.title += "_"
55+
self.title += "/"
5556

5657
def parser(self, title: str) -> Episode:
5758
self.raw_title = title
5859
self.title = title
5960
self.process_title()
61+
group = self.get_group_info()
6062
source_info = self.get_source_info()
6163
resolution_info = self.get_resolution_info()
6264
audio_info = self.get_audio_info()
@@ -76,6 +78,8 @@ def parser(self, title: str) -> Episode:
7678
temp_title = self.title
7779
if "/[]" in temp_title:
7880
parts = temp_title.split("/[]")
81+
# /[] 代表可信的集数或季度, 所以可以认为其后面是与集数无关的信息
82+
# 暂时没有哪个组把集数放前面
7983
if len(parts) > 1:
8084
temp_title = "[]".join(parts[:-1])
8185
self.token = re.split(r"[\[\]]", temp_title)
@@ -85,7 +89,8 @@ def parser(self, title: str) -> Episode:
8589
name_jp,
8690
) = self.name_process()
8791

88-
group = self.get_group()
92+
if not group:
93+
group = self.get_group()
8994
source = source_info[0] if source_info else ""
9095
sub = sub_language
9196
resolution = resolution_info[0] if resolution_info else ""
@@ -106,7 +111,7 @@ def parser(self, title: str) -> Episode:
106111
video_info=video_info,
107112
)
108113

109-
def findall_sub_title(self, pattern: re.Pattern, sym: str = "[]") -> Any:
114+
def findall_sub_title(self, pattern: re.Pattern[str], sym: str = "[]") -> list[str]:
110115
"""查找并替换标题中的模式"""
111116
ans = re.findall(pattern, self.title)
112117
if ans:
@@ -115,6 +120,13 @@ def findall_sub_title(self, pattern: re.Pattern, sym: str = "[]") -> Any:
115120
ans = re.findall(pattern, self.raw_title)
116121
return ans
117122

123+
def get_group_info(self) -> str:
124+
"""获取字幕组信息"""
125+
group_info = self.findall_sub_title(p.GROUP_RE)
126+
# 用& 合并多个字幕组信息
127+
group_info = "&".join(group_info).strip()
128+
return group_info
129+
118130
def get_episode_info(self) -> tuple[Any, bool, Any, bool]:
119131
"""获取剧集和季度信息"""
120132
episode_info = self.findall_sub_title(p.EPISODE_PATTERN, sym="/[]")
@@ -140,7 +152,6 @@ def parser_episode(self, episode_info: Any, episode_is_trusted: bool) -> int:
140152
un_trusted_episode_list.append(
141153
self.episode_info_to_episode(un_trusted_episode)
142154
)
143-
144155
# 所有的集数一致
145156
if all(x == un_trusted_episode_list[0] for x in un_trusted_episode_list):
146157
return un_trusted_episode_list[0]
@@ -208,7 +219,9 @@ def name_process(self) -> tuple[str, str, str]:
208219

209220
# 简化 token 过滤逻辑
210221
max_len = min(10, len(self.token))
211-
self.token = [token for token in self.token[:max_len] if len(token.strip()) > 1]
222+
self.token = [
223+
token.strip() for token in self.token[:max_len] if len(token.strip()) > 1
224+
]
212225

213226
self.token = self.token[:5]
214227
token_priority = [len(s) for s in self.token]
@@ -278,19 +291,19 @@ def get_group(self) -> str:
278291
return group
279292
return ""
280293

281-
def get_video_info(self) -> Any:
294+
def get_video_info(self) -> list[str]:
282295
"""获取视频格式信息"""
283296
return self.findall_sub_title(p.VIDEO_TYPE_PATTERN)
284297

285-
def get_resolution_info(self) -> Any:
298+
def get_resolution_info(self) -> list[str]:
286299
"""获取分辨率信息"""
287300
return self.findall_sub_title(p.RESOLUTION_RE)
288301

289-
def get_source_info(self) -> Any:
302+
def get_source_info(self) -> list[str]:
290303
"""获取视频来源信息"""
291304
return self.findall_sub_title(p.SOURCE_RE)
292305

293-
def get_unuseful_info(self) -> Any:
306+
def get_unuseful_info(self) -> list[str]:
294307
"""获取无用信息"""
295308
return self.findall_sub_title(p.UNUSEFUL_RE)
296309

@@ -365,23 +378,23 @@ def is_point_5(title: str) -> bool:
365378
# title = "前辈是男孩子 (2024) S01E02.mp4"
366379
# title = "[SBSUB][CONAN][1082][V2][1080P][AVC_AAC][CHS_JP](C1E4E331).mp4"
367380
# title = "海盗战记 S01E01.zh-tw.ass"
368-
title = "[百冬练习组&LoliHouse] BanG Dream! 少女乐团派对!☆PICO FEVER! / Garupa Pico: Fever! - 26 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕][END] [101.69 MB]"
381+
# title = "[百冬练习组&LoliHouse] BanG Dream! 少女乐团派对!☆PICO FEVER! / Garupa Pico: Fever! - 26 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕][END] [101.69 MB]"
369382
# title ="【喵萌奶茶屋】★04月新番★[夏日重现/Summer Time Rendering][11][1080p][繁日双语][招募翻译]"
370383
# title = "【失眠搬运组】放学后失眠的你-Kimi wa Houkago Insomnia - 06 [bilibili - 1080p AVC1 CHS-JP].mp4"
371384
# title = "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHS_JP].mp4"
372385
# title = "[Doomdos] - 白色闪电 - 第02话 - [1080P].mp4"
373-
# title = "Doomdos] -凡人修仙传-第107话-[1080P].mp"
386+
# title = "[Doomdos] -凡人修仙传-第107话-[1080P].mp"
374387
# title = "[豌豆字幕组&风之圣殿字幕组】★04月新番[鬼灭之刃 柱训练篇 / Kimetsu_no_Yaiba-Hashira_Geiko_Hen][02(57)][简体][1080P][MP4]"
375388
# title = "迷宮飯 08/[TOC] Delicious in Dungeon [08][1080P][AVC AAC][CHT][MP4].mp4"
376389
# title = "[喵萌奶茶屋&LoliHouse] 葬送的芙莉莲 / Sousou no Frieren - 06 [WebRip 1080p HEVC-10bit AAC][简繁日内封字幕]"
377390
# title = "[LoliHouse] Ore wa Subete wo Parry suru - 05 [WebRip 1080p HEVC-10bit AAC SRTx2]"
378-
# title = " [LoliHouse] 我要【招架】一切 ~反误解的世界最强想成为冒险者~ / Ore wa Subete wo Parry suru - 05 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕] [复制磁连]"
391+
title = " [LoliHouse] 我要【招架】一切 ~反误解的世界最强想成为冒险者~ / Ore wa Subete wo Parry suru - 05 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕] [复制磁连]"
379392
# title = "北宇治字幕组] 夜晚的水母不会游泳 / Yoru no Kurage wa Oyogenai [01-12 修正合集][WebRip][HEVC_AAC][简繁日内封] [复制磁连]"
380393
# title = "[北宇治字组&霜庭云花Sub&氢气烤肉架]【我推的孩子】/【Oshi no Ko】[18][WebRip][HEVC_AAC][繁日内嵌]"
381394
# # print(re.findall(RESOLUTION_RE,title))
382-
title = (
383-
"[织梦字幕组][尼尔:机械纪元 NieR Automata Ver1.1a][02集][1080P][AVC][简日双语]"
384-
)
395+
# title = (
396+
# "[织梦字幕组][尼尔:机械纪元 NieR Automata Ver1.1a][02集][1080P][AVC][简日双语]"
397+
# )
385398
# title = "[ANi] Bakemonogatari / 物语系列 第外季&第怪季 - 06.5 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4][ANi] Bakemonogatari / 物语系列 第外季&第怪季 - 06.5 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4][217.2 MB]"
386399
# title = "ANi] 我獨自升級 - 07.5 [1080P][Baha][WEB-DL][AAC AVC][CHT].mp4"
387400
# title = "[NEO·QSW]古莲泰沙U グレンダイザーU Grendizer U 02[WEBRIP AVC 1080P](搜索用:巨灵神/克雷飞天神)"
@@ -391,28 +404,29 @@ def is_point_5(title: str) -> bool:
391404
# )
392405
# title = "物语系列 S05E06.5.mp4 "
393406
# title = " 【幻月字幕组】【24年日剧】【直到破坏了丈夫的家庭】【第7话】【1080P】【中日双语】.mp4"
394-
title = "[LoliHouse] 2.5次元的诱惑 / 2.5-jigen no Ririsa - 01 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕][LoliHouse] 2.5次元的诱惑 / 2.5-jigen no Ririsa - 01 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕][609.59 MB]"
407+
# title = "[LoliHouse] 2.5次元的诱惑 / 2.5-jigen no Ririsa - 01 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕][LoliHouse] 2.5次元的诱惑 / 2.5-jigen no Ririsa - 01 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕][609.59 MB]"
395408
# title = "[ANi] Re:从零开始的异世界生活 第三季 - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4] [复制磁连]"
396409
# title = "[AnimeRep] 蓝箱 / 青之箱 / Blue Box / Ao no Hako- 02 [1080p][简中内嵌]"
397410
# title = "[ANi] Kekkon Surutte Hontou desu ka / 听说你们要结婚!? - 03 [1080P][baha][WEB-DL][AAC AVC][CHT][MP4]"
398-
# title = "[DBD-Raws][败犬女主太多了!/Make Heroine ga Oosugiru!/负けヒロインが多すぎる!][07-08TV+特典映像][BOX4][1080P][BDRip][HEVC-10bit][FLACx2][MKV] [复制磁连]"
411+
title = "[DBD-Raws][败犬女主太多了!/Make Heroine ga Oosugiru!/负けヒロインが多すぎる!][07-08TV+特典映像][BOX4][1080P][BDRip][HEVC-10bit][FLACx2][MKV] [复制磁连]"
399412
# title = "[漫猫字幕组&猫恋汉化组] 败犬女主太多了/Make Heroine ga Oosugiru (01-12Fin WEBRIP 1080p AVC AAC MP4 2024年7月 简中) [复制磁连]"
400-
title = "[北宇治字幕组&LoliHouse] 地。-关于地球的运动- / Chi. Chikyuu no Undou ni Tsuite 03 [WebRip 1080p HEVC-10bit AAC ASSx2][简繁日内封字幕] [复制磁连]"
413+
# title = "[北宇治字幕组&LoliHouse] 地。-关于地球的运动- / Chi. Chikyuu no Undou ni Tsuite 03 [WebRip 1080p HEVC-10bit AAC ASSx2][简繁日内封字幕] [复制磁连]"
401414
# title = "[Lilith-Raws] Boku no Kokoro no Yabai Yatsu - 01 [Baha][WEB-DL][1080p][AVC AAC][CHT][MP4].mp4"
402415
# title = "[LoliHouse] 关于我转生变成史莱姆这档事 第三季 / Tensei Shitara Slime Datta Ken 3rd Season - 17.5(65.5) [WebRip 1080p HEVC-10bit AAC][简繁内封字幕] [复制磁连]"
403416
# title = "水星的魔女(2022) S00E19.mp4"
404417
# title = "[Billion Meta Lab] 终末列车寻往何方 Shuumatsu Torein Dokoe Iku [12][1080][HEVC 10bit][简繁日内封][END]"
405-
# title = " 幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"
418+
# title = "幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"
406419
# title = "【1月】超超超超超喜欢你的100个女朋友 第二季 07.mp4"
407420
# print(is_vd(title))
408421
# print(is_point_5(title))
409422
# title = "[云歌字幕组][Re:从零开始的异世界生活 第三季 袭击篇][01][HEVC][x265 10bit][1080p][简日双语][招募校对] [复制磁连]"
410-
# title = "NEO·QSW]想星的阿克艾利昂 情感神话 想星のアクエリオン Aquarion: Myth of Emotions 02[WEBRIP AVC 1080P](搜索用:想星的大天使)"
423+
# title = "[NEO·QSW]想星的阿克艾利昂 情感神话 想星のアクエリオン Aquarion: Myth of Emotions 02[WEBRIP AVC 1080P](搜索用:想星的大天使)"
411424
# title = "[SBSUBJ[CONAN][1155][WEBRIP][1080P1[AVC_AAC][CHT_JP](8D4F664C).mp4"
412425
# title = "[TOC] 最弱技能《果实大师》 ~关于能无限食用技能果实(吃了就会死)这件事~ 09 [1080P][AVC AAC][CHT][MP4] [复制磁连]"
413426
# title = "[ANi] 离开 A 级队伍的我,和从前的弟子往迷宫深处迈进 - 08 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4] [复制磁连]"
414427
# title = "【喵萌奶茶屋】★04月新番★[夏日重现/Summer Time Rendering][11][1080p][繁日双语][招募翻译]"
415428
# title = "海盗战记 (2019) S01E01.mp4"
429+
# title = "somethime error"
416430
# print(title)
417431
# print(re.findall(EPISODE_PATTERN, title))
418432
# print(re.findall(SEASON_RE, title))
@@ -433,9 +447,13 @@ def is_point_5(title: str) -> bool:
433447
# title = (
434448
# "[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]"
435449
# )
436-
title = "负けヒロインが多すぎる! (JPBD Vol.1-6 Remux) 败犬女主太多了! 败北女角太多了! Make Heroine ga Oosugiru! Toooooo Many Losing Heroines! [复制磁连]"
437-
title = "海盗战记 S01E01.SC.ass"
438-
title = "[LoliHouse] 2.5次元的诱惑 / 2.5-jigen no Ririsa - 01 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕].mkv"
450+
# title = "负けヒロインが多すぎる! (JPBD Vol.1-6 Remux) 败犬女主太多了! 败北女角太多了! Make Heroine ga Oosugiru! Toooooo Many Losing Heroines! [复制磁连]"
451+
# title = "海盗战记 S01E01.SC.ass"
452+
# title = "[LoliHouse] 2.5次元的诱惑 / 2.5-jigen no Ririsa - 01 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕].mkv"
453+
# title = "[桜都字幕组&7³ACG] 摇曳露营 第3季/ゆるキャン△ SEASON3/Yuru Camp S03 | 01-12+New Anime 01-03 [简繁字幕] BDrip 1080p AV1 OPUS 2.0 [复制磁连]"
454+
#
455+
#
456+
# print(re.findall(p.GROUP_RE, title))
439457
res = raw_parser(title)
440458
for k, v in res.__dict__.items():
441459
print(f"{k}: {v}")

0 commit comments

Comments
 (0)