Skip to content

Commit 1550b75

Browse files
committed
perf: precompile anime metadata regexes
1 parent b7f6ee1 commit 1550b75

1 file changed

Lines changed: 34 additions & 17 deletions

File tree

app/core/meta/metaanime.py

Lines changed: 34 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -11,13 +11,32 @@
1111
from app.schemas.types import MediaType
1212

1313

14+
BRACKET_TITLE_RE = re.compile(r'\[(.+?)]')
15+
RESOURCE_PIX_X_RE = re.compile(r'x', re.IGNORECASE)
16+
RESOURCE_PIX_SPLIT_RE = re.compile(r'[Xx]')
17+
ANIME_MARK_RE = re.compile(r"新番|月?番|[日美国][漫剧]")
18+
ANIME_PREFIX_RE = re.compile(r".*番.|.*[日美国][漫剧].")
19+
CATEGORY_TAG_RE = re.compile(
20+
r"[动漫画纪录片电影视连续剧集日美韩中港台海外亚洲华语大陆综艺原盘高清]{2,}|TV|Animation|Movie|Documentar|Anime",
21+
re.IGNORECASE,
22+
)
23+
LEADING_BRACKET_BLOCK_RE = re.compile(r"^[^]]*]")
24+
FILE_SIZE_RE = re.compile(r'[0-9.]+\s*[MGT]i?B(?![A-Z]+)', re.IGNORECASE)
25+
TV_EPISODE_BRACKET_RE = re.compile(r"\[TV\s+(\d{1,4})", re.IGNORECASE)
26+
FOUR_K_BRACKET_RE = re.compile(r'\[4k]', re.IGNORECASE)
27+
NUMERIC_BRACKET_RE = re.compile(r"\[\d+", re.IGNORECASE)
28+
MIXED_CHINESE_TOKEN_RE = re.compile(r'[\d|#::\-()()\u4e00-\u9fff]')
29+
30+
1431
class MetaAnime(MetaBase):
1532
"""
1633
识别动漫
1734
"""
1835
_anime_no_words = ['CHS&CHT', 'MP4', 'GB MP4', 'WEB-DL']
1936
_name_nostring_re = r"S\d{2}\s*-\s*S\d{2}|S\d{2}|\s+S\d{1,2}|EP?\d{2,4}\s*-\s*EP?\d{2,4}|EP?\d{2,4}|\s+EP?\d{1,4}|\s+GB"
2037
_fps_re = r"(\d{2,3})(?=FPS)"
38+
_name_nostring_pattern = re.compile(_name_nostring_re, re.IGNORECASE)
39+
_fps_pattern = re.compile(r"(%s)" % _fps_re, re.IGNORECASE)
2140

2241
def __init__(self, title: str, subtitle: str = None, isfile: bool = False):
2342
super().__init__(title, subtitle, isfile)
@@ -38,7 +57,7 @@ def __init__(self, title: str, subtitle: str = None, isfile: bool = False):
3857
if anitopy_info:
3958
name = anitopy_info.get("anime_title")
4059
if not name or name in self._anime_no_words or (len(name) < 5 and not StringUtils.is_chinese(name)):
41-
name_match = re.search(r'\[(.+?)]', title)
60+
name_match = BRACKET_TITLE_RE.search(title)
4261
if name_match and name_match.group(1):
4362
name = name_match.group(1).strip()
4463
# 拆份中英文名称
@@ -81,9 +100,9 @@ def __init__(self, title: str, subtitle: str = None, isfile: bool = False):
81100
if self.cn_name:
82101
_, self.cn_name, _, _, _, _ = StringUtils.get_keyword(self.cn_name)
83102
if self.cn_name:
84-
self.cn_name = re.sub(r'%s' % self._name_nostring_re, '', self.cn_name, flags=re.IGNORECASE).strip()
103+
self.cn_name = self._name_nostring_pattern.sub('', self.cn_name).strip()
85104
if self.en_name:
86-
self.en_name = re.sub(r'%s' % self._name_nostring_re, '', self.en_name, flags=re.IGNORECASE).strip().title()
105+
self.en_name = self._name_nostring_pattern.sub('', self.en_name).strip().title()
87106
self._name = StringUtils.str_title(self.en_name)
88107
# 年份
89108
year = anitopy_info.get("anime_year")
@@ -154,8 +173,8 @@ def __init__(self, title: str, subtitle: str = None, isfile: bool = False):
154173
if isinstance(self.resource_pix, list):
155174
self.resource_pix = self.resource_pix[0]
156175
if self.resource_pix:
157-
if re.search(r'x', self.resource_pix, re.IGNORECASE):
158-
self.resource_pix = re.split(r'[Xx]', self.resource_pix)[-1] + "p"
176+
if RESOURCE_PIX_X_RE.search(self.resource_pix):
177+
self.resource_pix = RESOURCE_PIX_SPLIT_RE.split(self.resource_pix)[-1] + "p"
159178
else:
160179
self.resource_pix = self.resource_pix.lower()
161180
if str(self.resource_pix).isdigit():
@@ -191,7 +210,7 @@ def __init_anime_fps(self, anitopy_info: dict, original_title: str):
191210
"""
192211
从原始标题中提取帧率信息,与MetaVideo保持完全一致的实现
193212
"""
194-
re_res = re.search(rf"({self._fps_re})", original_title, re.IGNORECASE)
213+
re_res = self._fps_pattern.search(original_title)
195214
if re_res:
196215
fps_value = None
197216
if re_res.group(1): # FPS格式
@@ -211,23 +230,21 @@ def __prepare_title(title: str):
211230
# 所有【】换成[]
212231
title = title.replace("【", "[").replace("】", "]").strip()
213232
# 截掉xx番剧漫
214-
match = re.search(r"新番|月?番|[日美国][漫剧]", title)
233+
match = ANIME_MARK_RE.search(title)
215234
if match and match.span()[1] < len(title) - 1:
216-
title = re.sub(".*番.|.*[日美国][漫剧].", "", title)
235+
title = ANIME_PREFIX_RE.sub("", title)
217236
elif match:
218237
title = title[:title.rfind('[')]
219238
# 截掉分类
220239
first_item = title.split(']')[0]
221-
if first_item and re.search(r"[动漫画纪录片电影视连续剧集日美韩中港台海外亚洲华语大陆综艺原盘高清]{2,}|TV|Animation|Movie|Documentar|Anime",
222-
zhconv_convert(first_item, "zh-hans"),
223-
re.IGNORECASE):
224-
title = re.sub(r"^[^]]*]", "", title).strip()
240+
if first_item and CATEGORY_TAG_RE.search(zhconv_convert(first_item, "zh-hans")):
241+
title = LEADING_BRACKET_BLOCK_RE.sub("", title).strip()
225242
# 去掉大小
226-
title = re.sub(r'[0-9.]+\s*[MGT]i?B(?![A-Z]+)', "", title, flags=re.IGNORECASE)
243+
title = FILE_SIZE_RE.sub("", title)
227244
# 将TVxx改为xx
228-
title = re.sub(r"\[TV\s+(\d{1,4})", r"[\1", title, flags=re.IGNORECASE)
245+
title = TV_EPISODE_BRACKET_RE.sub(r"[\1", title)
229246
# 将4K转为2160p
230-
title = re.sub(r'\[4k]', '2160p', title, flags=re.IGNORECASE)
247+
title = FOUR_K_BRACKET_RE.sub('2160p', title)
231248
# 处理/分隔的中英文标题
232249
names = title.split("]")
233250
if len(names) > 1 and title.find("- ") == -1:
@@ -246,8 +263,8 @@ def __prepare_title(title: str):
246263
titles.append("%s%s" % (left_char, name.split("/")[0].strip()))
247264
elif name:
248265
if StringUtils.is_chinese(name) and not StringUtils.is_all_chinese(name):
249-
if not re.search(r"\[\d+", name, re.IGNORECASE):
250-
name = re.sub(r'[\d|#::\-()()\u4e00-\u9fff]', '', name).strip()
266+
if not NUMERIC_BRACKET_RE.search(name):
267+
name = MIXED_CHINESE_TOKEN_RE.sub('', name).strip()
251268
if not name or name.strip().isdigit():
252269
continue
253270
if name == '[':

0 commit comments

Comments
 (0)