@@ -197,36 +197,124 @@ def load_env_key(env_path: str, key: str) -> str:
197197 return ''
198198 return ''
199199
def estimate_tokens(text: str) -> int:
    """Roughly estimate the token count of *text*.

    Heuristic: each Chinese character counts as one token; the remaining
    whitespace-separated (mostly English) words count as roughly one
    token per three words.

    Args:
        text: Arbitrary text, possibly mixing Chinese and English.

    Returns:
        A non-negative integer token estimate.
    """
    # CJK Unified Ideographs range U+4E00..U+9FFF.
    chinese_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
    # str.split() with no argument already splits on any whitespace,
    # including newlines, so no replace('\n', ' ') is needed.
    # Subtracting the Chinese char count approximates the number of
    # non-Chinese words; clamp at 0 because Chinese text has few
    # whitespace-delimited words, which previously drove the difference
    # negative and shrank the estimate.
    english_words = max(0, len(text.split()) - chinese_chars)
    return chinese_chars + english_words // 3
205+
def split_content_by_tokens(content: str, max_tokens: int) -> list:
    """Split *content* into chunks whose estimated token count stays
    within *max_tokens*.

    Splitting happens only on line boundaries, so a single line whose
    estimate already exceeds the budget still becomes its own
    (oversized) chunk.

    Args:
        content: Full text to split.
        max_tokens: Approximate per-chunk token budget.

    Returns:
        A list of chunk strings, each joined with newlines.
    """
    chunks = []
    buffer = []
    budget_used = 0

    for line in content.split('\n'):
        cost = estimate_tokens(line)
        # Start a new chunk when this line would blow the budget,
        # but never emit an empty chunk.
        if buffer and budget_used + cost > max_tokens:
            chunks.append('\n'.join(buffer))
            buffer = [line]
            budget_used = cost
        else:
            buffer.append(line)
            budget_used += cost

    # Flush whatever remains in the buffer.
    if buffer:
        chunks.append('\n'.join(buffer))

    return chunks
227+
def call_deepseek_chat(api_key: str, system_prompt: str, user_content: str,
                       model: str = 'deepseek-chat', base_url: str = 'https://api.deepseek.com',
                       timeout_sec: int = 60, stream: bool = False, max_tokens: int = 0,
                       chunk_size: int = 0) -> str:
    """Call an OpenAI-compatible Chat Completions API, with optional
    chunked processing, and return the response text.

    When *chunk_size* > 0 and the estimated token count of *user_content*
    exceeds it, the content is split into line-aligned chunks, each chunk
    is analyzed in turn, and a final request integrates the per-chunk
    analyses into one report.

    Args:
        api_key: Bearer token for the API.
        system_prompt: System message content.
        user_content: User payload (operations data to analyze).
        model: Model identifier.
        base_url: API base URL (with or without a trailing /v1).
        timeout_sec: Per-request timeout in seconds.
        stream: If True, stream tokens to stdout while aggregating.
        max_tokens: Advertised context limit; currently informational
            only — chunking is driven solely by *chunk_size*.
        chunk_size: Per-chunk token budget; 0 disables chunking.

    Returns:
        The model's final response text ('' on parse failure).
    """
    total_tokens = estimate_tokens(user_content)

    if chunk_size > 0 and total_tokens > chunk_size:
        print(f"内容过长({total_tokens} tokens),启用分块处理...")
        chunks = split_content_by_tokens(user_content, chunk_size)
        print(f"分为 {len(chunks)} 块处理")

        # Per-chunk assistant responses; these (not the raw chunks) are
        # what gets carried forward as conversation context.  Re-sending
        # every raw chunk on each call would defeat the chunking and can
        # exceed the very context window that triggered it.
        analyses = []

        def _history_messages() -> list:
            # Rebuild a compact alternating user/assistant history from
            # the analyses collected so far.
            history = [{'role': 'system', 'content': system_prompt}]
            for j, prev in enumerate(analyses, 1):
                history.append({'role': 'user', 'content': f"(第{j}部分数据已提供)"})
                history.append({'role': 'assistant', 'content': prev})
            return history

        for i, chunk in enumerate(chunks, 1):
            print(f"\n=== 处理第 {i}/{len(chunks)} 块 ===")

            messages = _history_messages()
            if i == 1:
                chunk_prompt = f"请分析以下运营数据(第{i}部分,共{len(chunks)}部分):\n\n{chunk}"
            else:
                chunk_prompt = f"继续分析运营数据(第{i}部分,共{len(chunks)}部分):\n\n{chunk}"
            messages.append({'role': 'user', 'content': chunk_prompt})

            response = _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)
            analyses.append(response)

        # Final integration pass over the collected analyses only.
        final_prompt = f"基于前面 {len(chunks)} 部分的分析,请生成最终的完整运营分析报告,遵循之前的Markdown格式要求。"
        messages = _history_messages()
        messages.append({'role': 'user', 'content': final_prompt})

        print("\n=== 生成最终整合报告 ===")
        return _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)

    # Content fits in one request: single call.
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_content},
    ]
    return _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)
285+
def _single_api_call(messages: list, model: str, base_url: str, api_key: str,
                     timeout_sec: int, stream: bool) -> str:
    """Perform a single Chat Completions request.

    Args:
        messages: Full message list (system/user/assistant dicts).
        model: Model identifier to request.
        base_url: API base URL; a trailing '/v1' is accepted as-is.
        api_key: Bearer token.
        timeout_sec: Request timeout in seconds.
        stream: If True, use SSE streaming and echo tokens to stdout.

    Returns:
        The assistant's text content ('' on parse failure).
    """
    # Works with or without a trailing /v1: the endpoint path is simply
    # appended to the normalized base (the previous if/else had two
    # identical branches — dead code, collapsed here).
    url = f"{base_url.rstrip('/')}/chat/completions"
    headers = {
        'Content-Type': 'application/json; charset=utf-8',
        'Authorization': f'Bearer {api_key}',
        'Accept': 'application/json; charset=utf-8',
    }
    payload = {
        'model': model,
        'messages': messages,
        'stream': bool(stream),
    }
    # Encode explicitly: posting a non-ASCII str body lets http.client
    # fall back to latin-1 and raise UnicodeEncodeError on Chinese text.
    body = json.dumps(payload, ensure_ascii=False).encode('utf-8')

    if stream:
        # Stream SSE lines to the terminal while aggregating the text.
        with requests.post(url, headers=headers, data=body, timeout=timeout_sec, stream=True) as r:
            r.raise_for_status()
            r.encoding = 'utf-8'  # force decoding of the SSE stream
            full_text_parts: List[str] = []
            # With r.encoding set, decode_unicode=True always yields str,
            # so no bytes fallback is needed here.
            for line in r.iter_lines(decode_unicode=True):
                if not line:
                    continue
                if line.startswith('data: '):
                    data_str = line[len('data: '):].strip()
                    if data_str == '[DONE]':
                        break
                    try:
                        obj = json.loads(data_str)
                        delta = obj.get('choices', [{}])[0].get('delta', {}).get('content', '')
                        if delta:
                            print(delta, end='', flush=True)
                            full_text_parts.append(delta)
                    except Exception as e:
                        print(f"\n[DEBUG] JSON解析错误: {e}, 原始数据: {data_str[:100]}")
                        continue
            print()  # trailing newline after the streamed output
            return ''.join(full_text_parts)
    else:
        resp = requests.post(url, headers=headers, data=body, timeout=timeout_sec)
        resp.raise_for_status()
        resp.encoding = 'utf-8'  # force decoding
        try:
            data = resp.json()
            # json.loads always yields str for JSON strings; no bytes
            # decode step is required on the extracted content.
            return data.get('choices', [{}])[0].get('message', {}).get('content', '')
        except Exception as e:
            print(f"[DEBUG] 响应解析错误: {e}")
            print(f"[DEBUG] 原始响应: {resp.text[:200]}")
            return ""
249352
250353def full_analysis (input_file : str , processed_dir : str , report_dir : str ,
251354 * , enable_ai : bool = False ,
@@ -290,18 +393,66 @@ def full_analysis(input_file: str, processed_dir: str, report_dir: str,
290393 cfg_timeout = int (env_timeout )
291394 except Exception :
292395 pass
396+ # 新增:支持 LMStudio 本地服务
397+ cfg_max_tokens = 0
398+ cfg_chunk_size = 0
399+ if model and model .lower ().startswith ('lmstudio' ):
400+ cfg_base_url = base_url or load_env_key (env_path , 'LMSTUDIO_BASE_URL' ) or cfg_base_url
401+ # 如果模型名是 lmstudio,则从环境变量读取实际模型名
402+ if model .lower () == 'lmstudio' :
403+ env_model = load_env_key (env_path , 'LMSTUDIO_MODEL_NAME' )
404+ if env_model :
405+ model = env_model
406+ api_key = api_key or load_env_key (env_path , 'LMSTUDIO_API_KEY' )
407+ try :
408+ lm_timeout = load_env_key (env_path , 'LMSTUDIO_TIMEOUT_SEC' )
409+ if lm_timeout :
410+ cfg_timeout = int (lm_timeout )
411+ except Exception :
412+ pass
413+
414+ # 读取token限制配置
415+ try :
416+ lm_max_tokens = load_env_key (env_path , 'LMSTUDIO_MAX_TOKENS' )
417+ if lm_max_tokens :
418+ cfg_max_tokens = int (lm_max_tokens )
419+
420+ lm_chunk_size = load_env_key (env_path , 'LMSTUDIO_CHUNK_SIZE' )
421+ if lm_chunk_size :
422+ cfg_chunk_size = int (lm_chunk_size )
423+ except Exception :
424+ pass
425+
426+ print (f"使用 LMStudio 配置: { cfg_base_url } , 模型: { model } " )
427+ if cfg_max_tokens > 0 :
428+ print (f"上下文限制: { cfg_max_tokens } tokens, 分块大小: { cfg_chunk_size } tokens" )
293429 if not api_key :
294430 print ("未在环境或 .env 中找到 DEEPSEEK_API_KEY,跳过AI摘要生成。" )
295431 return True
296432 try :
297433 ai_text = call_deepseek_chat (api_key , system_prompt , user_content ,
298434 model = model , base_url = cfg_base_url ,
299- timeout_sec = cfg_timeout , stream = stream )
435+ timeout_sec = cfg_timeout , stream = stream ,
436+ max_tokens = cfg_max_tokens , chunk_size = cfg_chunk_size )
300437 ai_dir = os .path .dirname (ai_output_path )
301438 if ai_dir :
302439 ensure_dir (ai_dir )
303440 with open (ai_output_path , 'w' , encoding = 'utf-8' ) as f :
304- f .write (ai_text or '' )
441+ # 确保写入文件的内容是正确的UTF-8编码
442+ if ai_text :
443+ # 如果内容包含乱码,尝试修复
444+ try :
445+ # 检测并修复可能的编码问题
446+ if isinstance (ai_text , bytes ):
447+ ai_text = ai_text .decode ('utf-8' , errors = 'ignore' )
448+ # 移除可能的控制字符
449+ ai_text = '' .join (char for char in ai_text if ord (char ) >= 32 or char in '\n \r \t ' )
450+ f .write (ai_text )
451+ except Exception as e :
452+ print (f"[WARNING] 文件写入编码错误: { e } " )
453+ f .write (str (ai_text ))
454+ else :
455+ f .write ('' )
305456 print (f"AI 摘要报告已生成: { ai_output_path } " )
306457 except Exception as e :
307458 print (f"DeepSeek API 调用失败: { e } " )
@@ -316,12 +467,12 @@ def main():
316467 parser .add_argument ('--output-dir' , type = str , default = 'processed_data' , help = '预处理输出目录,默认 processed_data' )
317468 parser .add_argument ('--report-dir' , type = str , default = 'output' , help = 'Markdown 报告输出目录,默认 output' )
318469 # AI 分析相关
319- parser .add_argument ('--ai' , action = 'store_true' , help = '启用 DeepSeek API 生成运营摘要 ' )
470+ parser .add_argument ('--ai' , action = 'store_true' , help = '启用 AI 摘要( DeepSeek 或本地 LMStudio 端点) ' )
320471 parser .add_argument ('--system-prompt' , type = str , default = 'agent/REPORT_ANALYST_SYSTEM_PROMPT.md' , help = '系统提示词路径' )
321472 parser .add_argument ('--env-file' , type = str , default = '.env' , help = '包含 DEEPSEEK_API_KEY 的 .env 文件路径' )
322473 parser .add_argument ('--ai-output' , type = str , default = 'output/ops_summary.md' , help = 'AI 摘要输出文件路径' )
323- parser .add_argument ('--ai-model' , type = str , default = 'deepseek-chat' , help = 'DeepSeek 模型名称,默认 deepseek-chat' )
324- parser .add_argument ('--ai-base-url' , type = str , default = None , help = 'DeepSeek Base URL,默认从 .env 读取或 https ://api.deepseek.com ' )
474+ parser .add_argument ('--ai-model' , type = str , default = 'deepseek-chat' , help = 'AI 模型名称,如 deepseek-chat、lmstudio(从环境变量读取)或具体模型名 ' )
475+ parser .add_argument ('--ai-base-url' , type = str , default = None , help = 'AI Base URL,可指向 DeepSeek 或本地 LMStudio,例如 http ://localhost:1234/v1 ' )
325476 parser .add_argument ('--ai-timeout' , type = int , default = 60 , help = 'DeepSeek 请求超时秒,默认 60,可用 .env 的 DEEPSEEK_TIMEOUT_SEC 覆盖' )
326477 parser .add_argument ('--ai-stream' , action = 'store_true' , help = '启用流式输出,实时打印模型生成内容' )
327478
0 commit comments