55
66from lxml import etree
77
8+ from app .log import logger
89from app .modules .indexer .parser import SiteSchema
910from app .modules .indexer .parser .nexus_php import NexusPhpSiteUserInfo
1011from app .utils .string import StringUtils
1112
1213
1314class NexusAudiencesSiteUserInfo (NexusPhpSiteUserInfo ):
1415 schema = SiteSchema .NexusAudiences
16+ __UNKNOWN_UNREAD_COUNT = 99999
1517
1618 def __init__ (self , * args , ** kwargs ):
1719 """
@@ -21,6 +23,7 @@ def __init__(self, *args, **kwargs):
2123 self ._user_mail_unread_page = self .__build_unread_mailbox_page (box = 1 )
2224 self ._sys_mail_unread_page = None
2325 self .__next_mail_page = 1
26+ self .__seen_unread_message_links = set ()
2427
2528 def _parse_message_unread (self , html_text ):
2629 """
@@ -44,6 +47,8 @@ def _parse_message_unread(self, html_text):
4447 if unread is not None :
4548 self .message_unread = unread
4649 return
50+ if message_tools :
51+ return
4752 finally :
4853 if html is not None :
4954 del html
@@ -63,14 +68,113 @@ def _parse_message_unread_links(self, html_text: str, msg_links: list):
6368 '//tr[.//img[contains(concat(" ", normalize-space(@class), " "), " unreadpm ") '
6469 'or @alt="Unread" or @title="未读"]]/td/a[contains(@href, "viewmessage")]/@href'
6570 )
66- msg_links .extend (message_links )
67- next_page = self .__build_next_unread_mailbox_page (bool (message_links ))
71+ new_message_links = self .__filter_new_message_links (message_links )
72+ if message_links and not new_message_links :
73+ logger .warn (f"{ self ._site_name } 未读消息页只发现重复消息链接,停止后续翻页" )
74+ msg_links .extend (new_message_links )
75+ next_page = self .__build_next_unread_mailbox_page (
76+ self .__should_fetch_next_unread_page (new_message_links )
77+ )
6878 finally :
6979 if html is not None :
7080 del html
7181
7282 return next_page
7383
def _pase_unread_msgs(self):
    """
    Parse unread Audiences messages.

    Walks the unread mailbox list page by page, then fetches every
    collected message link and appends its ``(head, date, content)``
    tuple to ``self.message_unread_contents``. Messages whose parsed
    detail is completely empty are logged and skipped so that no empty
    notification is produced.
    """
    self.__reset_unread_message_parse_state()
    collected_links = []
    if self.message_unread > 0 or self.message_read_force:
        # The list-page parser returns a falsy value when there is no
        # further page to request, which ends the loop.
        page_link = self._user_mail_unread_page
        while page_link:
            page_link = self.__parse_unread_message_list_page(
                link=page_link,
                unread_msg_links=collected_links,
            )

    # Replace the placeholder (or a missing count) with the number of
    # links that were actually collected.
    count_is_placeholder = self.message_unread == self.__UNKNOWN_UNREAD_COUNT
    if count_is_placeholder or (collected_links and not self.message_unread):
        self.message_unread = len(collected_links)

    for link in collected_links:
        logger.debug(f"{self._site_name} 信息链接 {link} ")
        detail_page = self._get_page_content(
            urljoin(self._base_url, link),
            params=self._mail_content_params,
            headers=self._mail_content_headers,
        )
        head, date, content = self._parse_message_content(detail_page)
        logger.debug(f"{self._site_name} 标题 {head} 时间 {date} 内容 {content} ")
        if self.__is_empty_message_content(head, date, content):
            logger.warn(f"{self._site_name} 信息链接 {link} 解析结果为空,跳过消息通知")
            continue
        self.message_unread_contents.append((head, date, content))
118+
def __parse_unread_message_list_page(self, link: str, unread_msg_links: list):
    """
    Fetch one page of the Audiences unread-message list and parse it.

    :param link: list-page link, resolved against ``self._base_url``
    :param unread_msg_links: list handed on to ``_parse_message_unread_links``
    :return: ``None`` when ``link`` is empty, otherwise whatever
             ``_parse_message_unread_links`` returns
    """
    if link:
        page_html = self._get_page_content(
            url=urljoin(self._base_url, link),
            params=self._mail_unread_params,
            headers=self._mail_unread_headers,
        )
        return self._parse_message_unread_links(page_html, unread_msg_links)
    return None
133+
def __reset_unread_message_parse_state(self):
    """
    Reset the Audiences unread-mailbox paging state.

    Puts the next-page counter back to 1 and empties the set of already
    seen message links, so a reused parser does not carry over the
    previous run's page number or de-duplication set.
    """
    self.__next_mail_page = 1
    seen_links = self.__seen_unread_message_links
    # Cleared in place (not rebound) so the same set object stays in use.
    seen_links.clear()
140+
def __filter_new_message_links(self, message_links: list) -> list:
    """
    Drop message-detail links that were already returned by an earlier page.

    Links are compared by their absolute form (joined with
    ``self._base_url``); the original, unjoined link text is what gets
    returned. Every newly accepted link is recorded in the seen-set.

    :param message_links: links extracted from one list page
    :return: the links not seen before, in their original order
    """
    fresh_links = []
    for raw_link in message_links:
        absolute_link = urljoin(self._base_url, raw_link)
        if absolute_link not in self.__seen_unread_message_links:
            self.__seen_unread_message_links.add(absolute_link)
            fresh_links.append(raw_link)
    return fresh_links
153+
def __should_fetch_next_unread_page(self, new_message_links: list) -> bool:
    """
    Decide whether the next Audiences unread-list page should be requested.

    :param new_message_links: links from the current page that were not seen before
    :return: ``False`` when the page yielded no new links or the expected
             unread count has already been reached, ``True`` otherwise
    """
    if new_message_links:
        return not self.__has_reached_expected_unread_count()
    return False
161+
def __has_reached_expected_unread_count(self) -> bool:
    """
    Tell whether as many links were collected as the known unread count.

    Never true when forced reading is enabled, when the unread count is
    not positive, or when it equals the "unknown" placeholder value.
    """
    if self.message_read_force:
        return False
    expected = self.message_unread
    if not expected > 0:
        return False
    if expected == self.__UNKNOWN_UNREAD_COUNT:
        return False
    return len(self.__seen_unread_message_links) >= expected
170+
@staticmethod
def __is_empty_message_content(head, date, content) -> bool:
    """
    Tell whether a parsed message detail carries no text at all.

    A field counts as empty when it is ``None`` or its string form is
    blank after stripping whitespace.

    :return: ``True`` only when all three fields are empty
    """
    for part in (head, date, content):
        if part is not None and str(part).strip():
            return False
    return True
177+
74178 @classmethod
75179 def __build_unread_mailbox_page (cls , box : int ) -> str :
76180 """
@@ -208,26 +312,29 @@ def __parse_inbox_unread(self, message_link):
208312 """
209313 从 Audiences 收件箱入口提取未读数。
210314 """
211- inbox_texts = [
315+ for inbox_text in [
212316 message_link .get ("title" ),
213317 message_link .get ("aria-label" ),
214- * message_link .xpath (
215- './/*[contains(@class, "site-userbar__compact-tool-badge--unread") '
216- 'or contains(@class, "site-userbar__compact-tool-badge")]/text()'
217- )
218- ]
318+ ]:
319+ unread = self .__extract_inbox_unread_pair (inbox_text )
320+ if unread is not None :
321+ return unread
219322
220- for inbox_text in inbox_texts :
221- unread = self .__extract_inbox_unread (inbox_text )
323+ for inbox_text in message_link .xpath (
324+ './/*[contains(@class, "site-userbar__compact-tool-badge--unread")]/text()' ):
325+ unread = self .__extract_inbox_unread_badge (inbox_text )
222326 if unread is not None :
223327 return unread
224328
329+ if self .__has_inbox_unread_marker (message_link ):
330+ return self .__UNKNOWN_UNREAD_COUNT
331+
225332 return None
226333
227334 @staticmethod
228- def __extract_inbox_unread (text : str ):
335+ def __extract_inbox_unread_pair (text : str ):
229336 """
230- Audiences 收件箱角标格式为 总数/未读数 ,例如 1749/172。
337+ 从 Audiences 总数/未读数格式中提取未读数 ,例如 1749/172。
231338 """
232339 if not text :
233340 return None
@@ -240,11 +347,35 @@ def __extract_inbox_unread(text: str):
240347 if inbox_count :
241348 return StringUtils .str_int (inbox_count .group (2 ))
242349
243- single_count = re .search (r"收件箱\s*(\d[\d,]*)" , text )
350+ return None
351+
@staticmethod
def __extract_inbox_unread_badge(text: str):
    """
    Extract the unread count from the text of an explicit unread badge.

    The pair-format extractor is tried first. Failing that, the text is
    accepted only when, after whitespace normalisation, it consists
    entirely of digits and commas, so that other text containing a
    number is not taken for an unread count.

    :param text: badge text, may be empty or ``None``
    :return: the unread count, or ``None`` when nothing can be extracted
    """
    pair_unread = NexusAudiencesSiteUserInfo.__extract_inbox_unread_pair(text)
    if pair_unread is not None:
        return pair_unread

    if not text:
        return None
    normalized = re.sub(r"\s+", " ", text.replace("\xa0 ", " ")).strip()
    digits_only = re.fullmatch(r"(\d[\d,]*)", normalized)
    return StringUtils.str_int(digits_only.group(1)) if digits_only else None
247368
@staticmethod
def __has_inbox_unread_marker(message_link) -> bool:
    """
    Tell whether the inbox entry is flagged as unread.

    The entry counts as flagged when its own ``class`` attribute contains
    the has-unread modifier, or when any descendant element carries the
    unread badge class.

    :param message_link: element exposing ``get`` and ``xpath``
                         (presumably an lxml element; confirm against the caller)
    """
    css_classes = message_link.get("class") or ""
    if "site-userbar__compact-tool--has-unread" not in css_classes:
        unread_badges = message_link.xpath('.//*[contains(@class, "site-userbar__compact-tool-badge--unread")]')
        return bool(unread_badges)
    return True
378+
248379 def _parse_seeding_pages (self ):
249380 if not self ._torrent_seeding_page :
250381 return
0 commit comments