fix: 定时任务超时问题优化

SXP-Simon · SXP-Simon · commit 05836f64f2a9 · 2026-04-01T21:14:00.000+08:00
Fixes #141 定时分析任务间隔启动，放宽超时配置限制
diff --git a/_conf_schema.json b/_conf_schema.json
@@ -507,14 +507,20 @@
             "max_concurrent_llm": {
                 "type": "int",
                 "description": "最大 LLM 请求并发数（API 闸口）",
-                "default": 2,
+                "default": 3,
                 "hint": "限制同时发起的 AI 分析请求数量。主要用于【遵守 API 频率限制（RPM）】，避免因为请求过快导致 API 被封号或报错。"
             },
             "max_concurrent_t2i": {
                 "type": "int",
                 "description": "最大 T2I渲染并发数",
                 "default": 1,
                 "hint": "限制同时开启的 T2I 渲染进程数。"
+            },
+            "stagger_seconds": {
+                "type": "int",
+                "description": "多群分析交错间隔（秒）",
+                "default": 10,
+                "hint": "当同时启动多个群组的定时分析任务时，每个任务之间的延迟间隔（秒），建议根据实际的 LLM 服务速度填写，过短可能导致请求堆积，过长则分析效率降低。"
             }
         }
     }
diff --git a/main.py b/main.py
@@ -49,6 +49,7 @@
 from .src.shared.trace_context import TraceContext, TraceLogFilter
 from .src.utils.logger import logger
 from .src.utils.pdf_utils import PDFInstaller
+from .src.utils.resilience import GlobalRateLimiter
 
 
 class GroupDailyAnalysis(Star):
@@ -148,6 +149,9 @@ def __init__(self, context: Context, config: AstrBotConfig):
             plugin_instance=self,
         )
 
+        # 同步全局限流并进行初始化配置
+        GlobalRateLimiter.get_instance(self.config_manager.get_llm_max_concurrent())
+
         self._initialized = False
         self._terminating = False  # 生命周期标志
         self._init_lock = asyncio.Lock()
diff --git a/src/infrastructure/analysis/utils/llm_utils.py b/src/infrastructure/analysis/utils/llm_utils.py
@@ -6,7 +6,7 @@
 import asyncio
 
 from ....utils.logger import logger
-from ....utils.resilience import CircuitBreaker, global_llm_rate_limiter
+from ....utils.resilience import CircuitBreaker, GlobalRateLimiter
 from .structured_output_schema import JSONObject
 
 _circuit_breakers = {}
@@ -273,7 +273,7 @@ async def call_provider_with_retry(
             # 使用全局限流器 + 熔断器记录
             # 超时由 Provider 内部控制，无需外层 wait_for
             try:
-                async with global_llm_rate_limiter:
+                async with GlobalRateLimiter.get_instance().semaphore:
                     llm_kwargs: dict[str, object] = {
                         "chat_provider_id": provider_id,
                         "prompt": prompt,
diff --git a/src/infrastructure/config/config_manager.py b/src/infrastructure/config/config_manager.py
@@ -387,6 +387,10 @@ def get_t2i_max_concurrent(self) -> int:
         """获取全局图片渲染（T2I）最大并发数"""
         return self._get_group("performance").get("max_concurrent_t2i", 1)
 
+    def get_stagger_seconds(self) -> int:
+        """获取多群分析任务启动时的交错间隔（秒）"""
+        return self._get_group("performance").get("stagger_seconds", 2)
+
     def set_max_concurrent_tasks(self, count: int):
         """设置自动分析最大并发数"""
         self._ensure_group("performance")["max_concurrent_groups"] = count
diff --git a/src/infrastructure/scheduler/auto_scheduler.py b/src/infrastructure/scheduler/auto_scheduler.py
@@ -376,10 +376,17 @@ async def dispatch_group(gid, pid, mode):
                         )
 
             tasks = []
-            for gid, pid, mode in all_targets:
+            stagger = self.config_manager.get_stagger_seconds() or 2
+            # 针对定时大任务加入交错等待，减少瞬间峰值延迟
+            for idx, (gid, pid, mode) in enumerate(all_targets):
                 if self._terminating:
                     logger.info("检测到插件正在停止，取消后续任务创建")
                     break
+
+                # 为前几个任务添加微小的启动间隔，均匀分散 API 压力
+                if idx > 0 and stagger > 0:
+                    await asyncio.sleep(stagger)
+
                 task = asyncio.create_task(
                     dispatch_group(gid, pid, mode),
                     name=f"report_{mode}_{gid}",
@@ -418,13 +425,13 @@ async def _perform_auto_analysis_for_group_with_timeout(
     ):
         """为指定群执行自动分析（带超时控制）"""
         try:
-            # 为每个群聊设置独立的超时时间（20分钟）
+            # 为每个群聊设置独立的超时时间，适当放宽到 30 分钟以支持大型批次
             await asyncio.wait_for(
                 self._perform_auto_analysis_for_group(group_id, target_platform_id),
-                timeout=1200,
+                timeout=1800,
             )
         except asyncio.TimeoutError:
-            logger.error(f"群 {group_id} 分析超时（20分钟），跳过该群分析")
+            logger.error(f"群 {group_id} 分析超时（30分钟），跳过该群分析")
         except Exception as e:
             logger.error(f"群 {group_id} 分析任务执行失败: {e}")
 
@@ -664,7 +671,7 @@ async def _perform_incremental_final_report_for_group_with_timeout(
                 self._perform_incremental_final_report_for_group(
                     group_id, target_platform_id
                 ),
-                timeout=1200,
+                timeout=1800,
             )
 
             # 判定是否需要触发回退 (例如：无增量数据等)
@@ -684,7 +691,7 @@ async def _perform_incremental_final_report_for_group_with_timeout(
             return result
 
         except asyncio.TimeoutError:
-            logger.error(f"群 {group_id} 最终报告超时（20分钟）")
+            logger.error(f"群 {group_id} 最终报告超时（30分钟）")
             if self.config_manager.get_incremental_fallback_enabled():
                 logger.warning(f"群 {group_id} 增量报告超时，正在回退到传统全量分析...")
                 return await self._fallback_to_traditional(group_id, target_platform_id)
diff --git a/src/utils/resilience.py b/src/utils/resilience.py
@@ -111,40 +111,50 @@ class GlobalRateLimiter:
     _instance: "GlobalRateLimiter | None" = None
     _semaphore: asyncio.Semaphore | None = None
 
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
     @classmethod
-    def get_instance(cls, max_concurrency: int = 3) -> "GlobalRateLimiter":
+    def get_instance(cls, max_concurrency: int | None = None) -> "GlobalRateLimiter":
         """
         获取或创建限流器单例。
 
         Args:
-            max_concurrency (int): 允许的最大并发行数
+            max_concurrency (int, optional): 允许的最大并发数。如果提供且与当前不同，则重置信号量。
 
         Returns:
             GlobalRateLimiter: 唯一实例
         """
-        if cls._instance is None:
-            cls._instance = cls()
-            cls._semaphore = asyncio.Semaphore(max_concurrency)
-        elif (
-            cls._semaphore is not None and cls._semaphore._value != max_concurrency  # type: ignore
+        instance = cls()
+        if max_concurrency is not None:
+            instance.reconfigure(max_concurrency)
+        elif cls._semaphore is None:
+            # 默认兜底
+            cls._semaphore = asyncio.Semaphore(3)
+        return instance
+
+    def reconfigure(self, max_concurrency: int):
+        """重新配置并发上限。注意：这会替换信号量对象。"""
+        if self._semaphore is None or (
+            hasattr(self._semaphore, "_value")
+            and self._semaphore._value != max_concurrency  # type: ignore
         ):
-            # 如果请求的并发数发生变化，重新创建信号量
+            old_val = (
+                getattr(self._semaphore, "_value", "None")
+                if self._semaphore
+                else "None"
+            )
             logger.info(
-                f"GlobalRateLimiter 重新配置：{cls._semaphore._value} -> {max_concurrency}"
+                f"GlobalRateLimiter 重新配置并发上限：{old_val} -> {max_concurrency}"
             )
-            cls._semaphore = asyncio.Semaphore(max_concurrency)
-        return cls._instance
+            self.__class__._semaphore = asyncio.Semaphore(max_concurrency)
 
     @property
     def semaphore(self) -> asyncio.Semaphore:
         """返回核心的异步信号量对象。"""
         if self._semaphore is None:
-            # 兜底：若直接通过属性访问则初始化默认值
-            self._semaphore = asyncio.Semaphore(3)
+            self.__class__._semaphore = asyncio.Semaphore(3)
+        assert self._semaphore is not None
         return self._semaphore
-
-
-# 导出默认实例：用于 LLM 调用的全局限流
-global_llm_rate_limiter: asyncio.Semaphore = GlobalRateLimiter.get_instance(
-    max_concurrency=3
-).semaphore