Skip to content

Commit fec1097

Browse files
committed
fix: prevent duplicate Audiences unread messages
1 parent 737bcb5 commit fec1097

2 files changed

Lines changed: 428 additions & 15 deletions

File tree

app/modules/indexer/parser/nexus_audiences.py

Lines changed: 144 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,15 @@
55

66
from lxml import etree
77

8+
from app.log import logger
89
from app.modules.indexer.parser import SiteSchema
910
from app.modules.indexer.parser.nexus_php import NexusPhpSiteUserInfo
1011
from app.utils.string import StringUtils
1112

1213

1314
class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo):
1415
schema = SiteSchema.NexusAudiences
16+
__UNKNOWN_UNREAD_COUNT = 99999
1517

1618
def __init__(self, *args, **kwargs):
1719
"""
@@ -21,6 +23,7 @@ def __init__(self, *args, **kwargs):
2123
self._user_mail_unread_page = self.__build_unread_mailbox_page(box=1)
2224
self._sys_mail_unread_page = None
2325
self.__next_mail_page = 1
26+
self.__seen_unread_message_links = set()
2427

2528
def _parse_message_unread(self, html_text):
2629
"""
@@ -44,6 +47,8 @@ def _parse_message_unread(self, html_text):
4447
if unread is not None:
4548
self.message_unread = unread
4649
return
50+
if message_tools:
51+
return
4752
finally:
4853
if html is not None:
4954
del html
@@ -63,14 +68,113 @@ def _parse_message_unread_links(self, html_text: str, msg_links: list):
6368
'//tr[.//img[contains(concat(" ", normalize-space(@class), " "), " unreadpm ") '
6469
'or @alt="Unread" or @title="未读"]]/td/a[contains(@href, "viewmessage")]/@href'
6570
)
66-
msg_links.extend(message_links)
67-
next_page = self.__build_next_unread_mailbox_page(bool(message_links))
71+
new_message_links = self.__filter_new_message_links(message_links)
72+
if message_links and not new_message_links:
73+
logger.warn(f"{self._site_name} 未读消息页只发现重复消息链接,停止后续翻页")
74+
msg_links.extend(new_message_links)
75+
next_page = self.__build_next_unread_mailbox_page(
76+
self.__should_fetch_next_unread_page(new_message_links)
77+
)
6878
finally:
6979
if html is not None:
7080
del html
7181

7282
return next_page
7383

84+
def _pase_unread_msgs(self):
    """
    Parse Audiences unread messages, avoiding duplicate notifications caused by
    abnormal pagination as well as notifications whose detail page parsed empty.

    Unread list pages are walked starting from ``self._user_mail_unread_page``
    until the list-page parser stops returning a next page. Each collected
    message link is then fetched and parsed; non-empty ``(head, date, content)``
    tuples are appended to ``self.message_unread_contents``.
    """
    # Start from a clean pagination / de-duplication state on every run.
    self.__reset_unread_message_parse_state()

    unread_msg_links = []
    if self.message_unread > 0 or self.message_read_force:
        # The first page is always requested; later pages only while the
        # list-page parser keeps handing back a next-page link.
        page = self._user_mail_unread_page
        while True:
            page = self.__parse_unread_message_list_page(
                link=page,
                unread_msg_links=unread_msg_links
            )
            if not page:
                break

    # Replace a placeholder or missing unread count with the number of links
    # that were actually collected.
    count_is_unknown = self.message_unread == self.__UNKNOWN_UNREAD_COUNT
    if count_is_unknown or (unread_msg_links and not self.message_unread):
        self.message_unread = len(unread_msg_links)

    for msg_link in unread_msg_links:
        logger.debug(f"{self._site_name} 信息链接 {msg_link}")
        detail_page = self._get_page_content(
            urljoin(self._base_url, msg_link),
            params=self._mail_content_params,
            headers=self._mail_content_headers
        )
        head, date, content = self._parse_message_content(detail_page)
        logger.debug(f"{self._site_name} 标题 {head} 时间 {date} 内容 {content}")
        if not self.__is_empty_message_content(head, date, content):
            self.message_unread_contents.append((head, date, content))
            continue
        # Nothing usable was parsed from the detail page: skip the notification.
        logger.warn(f"{self._site_name} 信息链接 {msg_link} 解析结果为空,跳过消息通知")
118+
119+
def __parse_unread_message_list_page(self, link: str, unread_msg_links: list):
    """
    Fetch and parse a single page of the Audiences unread message list.

    :param link: list page link, resolved against ``self._base_url``; a falsy
                 value means there is nothing to fetch
    :param unread_msg_links: list handed to ``_parse_message_unread_links`` to
                             receive the message links found on the page
    :return: whatever ``_parse_message_unread_links`` returns for the page, or
             None when no link was given
    """
    if link:
        page_html = self._get_page_content(
            url=urljoin(self._base_url, link),
            params=self._mail_unread_params,
            headers=self._mail_unread_headers
        )
        return self._parse_message_unread_links(page_html, unread_msg_links)
    return None
133+
134+
def __reset_unread_message_parse_state(self):
    """
    Reset the Audiences unread-message pagination state so that a reused parser
    does not carry over the previous page number or de-duplication set.
    """
    # Pagination restarts from page 1.
    self.__next_mail_page = 1
    # Empty the existing set in place rather than rebinding a new one.
    seen_links = self.__seen_unread_message_links
    seen_links.clear()
140+
141+
def __filter_new_message_links(self, message_links: list) -> list:
    """
    Drop message detail links that Audiences' abnormal pagination returned again.

    Links are compared by their absolute form (joined with ``self._base_url``);
    every link seen for the first time is recorded in the de-duplication set.

    :param message_links: links extracted from one unread list page
    :return: the links not seen before, in their original form and order
    """
    seen_links = self.__seen_unread_message_links
    fresh_links = []
    for message_link in message_links:
        absolute_link = urljoin(self._base_url, message_link)
        if absolute_link not in seen_links:
            seen_links.add(absolute_link)
            fresh_links.append(message_link)
    return fresh_links
153+
154+
def __should_fetch_next_unread_page(self, new_message_links: list) -> bool:
    """
    Decide whether the next page of the Audiences unread list should be requested.

    :param new_message_links: links on the current page that were not seen before
    :return: False when the page produced no new links or the expected unread
             count has already been reached, True otherwise
    """
    # The count check is only evaluated when the page yielded new links.
    return bool(new_message_links) and not self.__has_reached_expected_unread_count()
161+
162+
def __has_reached_expected_unread_count(self) -> bool:
    """
    Tell whether paging can stop because the unread count reported by the
    Audiences top bar has been reached.

    :return: True only when forced reading is off, ``self.message_unread`` is a
             positive count other than the unknown-count placeholder, and at
             least that many distinct links have been recorded
    """
    if self.message_read_force:
        # Forced reading never stops early on the count.
        return False
    if not self.message_unread > 0:
        return False
    if not self.message_unread != self.__UNKNOWN_UNREAD_COUNT:
        # The placeholder carries no usable target.
        return False
    return len(self.__seen_unread_message_links) >= self.message_unread
170+
171+
@staticmethod
def __is_empty_message_content(head, date, content) -> bool:
    """
    Tell whether a parsed message detail is completely empty, so that a page
    that failed to parse is not turned into a notification made of None values.

    :param head: parsed message title
    :param date: parsed message time
    :param content: parsed message body
    :return: True when every part is None or blank once converted to text
    """
    for part in (head, date, content):
        if part is None:
            continue
        if str(part).strip():
            # A single non-blank part is enough to keep the message.
            return False
    return True
177+
74178
@classmethod
75179
def __build_unread_mailbox_page(cls, box: int) -> str:
76180
"""
@@ -208,26 +312,29 @@ def __parse_inbox_unread(self, message_link):
208312
"""
209313
从 Audiences 收件箱入口提取未读数。
210314
"""
211-
inbox_texts = [
315+
for inbox_text in [
212316
message_link.get("title"),
213317
message_link.get("aria-label"),
214-
*message_link.xpath(
215-
'.//*[contains(@class, "site-userbar__compact-tool-badge--unread") '
216-
'or contains(@class, "site-userbar__compact-tool-badge")]/text()'
217-
)
218-
]
318+
]:
319+
unread = self.__extract_inbox_unread_pair(inbox_text)
320+
if unread is not None:
321+
return unread
219322

220-
for inbox_text in inbox_texts:
221-
unread = self.__extract_inbox_unread(inbox_text)
323+
for inbox_text in message_link.xpath(
324+
'.//*[contains(@class, "site-userbar__compact-tool-badge--unread")]/text()'):
325+
unread = self.__extract_inbox_unread_badge(inbox_text)
222326
if unread is not None:
223327
return unread
224328

329+
if self.__has_inbox_unread_marker(message_link):
330+
return self.__UNKNOWN_UNREAD_COUNT
331+
225332
return None
226333

227334
@staticmethod
228-
def __extract_inbox_unread(text: str):
335+
def __extract_inbox_unread_pair(text: str):
229336
"""
230-
Audiences 收件箱角标格式为 总数/未读数,例如 1749/172。
337+
Audiences 总数/未读数格式中提取未读数,例如 1749/172。
231338
"""
232339
if not text:
233340
return None
@@ -240,11 +347,35 @@ def __extract_inbox_unread(text: str):
240347
if inbox_count:
241348
return StringUtils.str_int(inbox_count.group(2))
242349

243-
single_count = re.search(r"收件箱\s*(\d[\d,]*)", text)
350+
return None
351+
352+
@staticmethod
def __extract_inbox_unread_badge(text: str):
    """
    Extract the unread count from an explicit unread badge, so that a plain
    inbox total is not mistaken for an unread count.

    The pair-format extractor is tried first. Otherwise the text is accepted
    only when, after whitespace normalisation, it consists solely of a number
    (digits with optional comma separators).

    :param text: text content of the unread badge
    :return: the unread count, or None when the text is empty or not a number
    """
    pair_unread = NexusAudiencesSiteUserInfo.__extract_inbox_unread_pair(text)
    if pair_unread is not None:
        return pair_unread

    if not text:
        return None

    # Treat non-breaking spaces as ordinary whitespace, collapse runs and trim.
    normalized = re.sub(r"\s+", " ", text.replace("\xa0", " ")).strip()
    number_only = re.fullmatch(r"(\d[\d,]*)", normalized)
    return StringUtils.str_int(number_only.group(1)) if number_only else None
247368

369+
@staticmethod
def __has_inbox_unread_marker(message_link) -> bool:
    """
    Tell whether the inbox entry carries an unread marker, for the case where
    there is an unread state but no reliable count.

    :param message_link: inbox link element; its ``get`` and ``xpath`` methods
                         are used
    :return: True when the element's class contains the has-unread modifier or
             a descendant carries the unread badge class
    """
    css_classes = message_link.get("class") or ""
    has_unread_class = "site-userbar__compact-tool--has-unread" in css_classes
    # The descendant lookup only runs when the link itself is not flagged.
    return has_unread_class or bool(
        message_link.xpath('.//*[contains(@class, "site-userbar__compact-tool-badge--unread")]')
    )
378+
248379
def _parse_seeding_pages(self):
249380
if not self._torrent_seeding_page:
250381
return

0 commit comments

Comments
 (0)