Skip to content

Commit 98144fb

Browse files
committed
update_v0.2_20250921
1 parent 1b6dc2f commit 98144fb

File tree

6 files changed

+210
-24
lines changed

6 files changed

+210
-24
lines changed

.env.example.txt

Lines changed: 0 additions & 6 deletions
This file was deleted.

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,26 @@ python run_analysis.py --full --ai --ai-stream
580580

581581
# 自定义AI分析参数
582582
python run_analysis.py --full --ai --ai-model deepseek-chat --ai-timeout 120 --ai-base-url https://api.deepseek.com/v1
583+
584+
# ========================
585+
# 本地 LMStudio 端点示例
586+
# ========================
587+
# 方法1:使用环境变量配置(推荐)
588+
# 1) 配置 .env 文件:
589+
# LMSTUDIO_BASE_URL=http://localhost:1234/v1
590+
# LMSTUDIO_MODEL_NAME=deepseek-r1-distill-llama-8b@q8_0
591+
# LMSTUDIO_TIMEOUT_SEC=180
592+
# LMSTUDIO_MAX_TOKENS=4096 # 模型上下文窗口限制
593+
# LMSTUDIO_CHUNK_SIZE=3000 # 分块处理大小(留出输出空间)
594+
# 2) 运行分析
595+
python run_analysis.py --full --ai --ai-model lmstudio --ai-stream
596+
597+
# 方法2:命令行直接指定
598+
python run_analysis.py --full --ai \
599+
--ai-model deepseek-r1-distill-llama-8b@q8_0 \
600+
--ai-base-url http://localhost:1234/v1 \
601+
--ai-timeout 180 \
602+
--ai-stream
583603
```
584604

585605
### Python API使用

env.example

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# ===============
2+
# DeepSeek 配置示例
3+
# ===============
4+
DEEPSEEK_API_KEY=your_deepseek_api_key_here
5+
DEEPSEEK_BASE_URL=https://api.deepseek.com
6+
DEEPSEEK_TIMEOUT_SEC=180
7+
8+
# =====================
9+
# LMStudio 本地模型配置示例
10+
# =====================
11+
# 如果使用 LMStudio 本地 OpenAI 兼容端点,请配置以下变量
12+
LMSTUDIO_BASE_URL=http://localhost:1234/v1
13+
# LMStudio 中加载的模型名称,例如:deepseek-r1-distill-llama-8b@q8_0
14+
LMSTUDIO_MODEL_NAME=deepseek-r1-distill-llama-8b@q8_0
15+
# 本地服务通常不需要 API KEY,如果需要请填写
16+
LMSTUDIO_API_KEY=1234
17+
LMSTUDIO_TIMEOUT_SEC=180
18+
# 本地模型上下文窗口限制(token数),用于分块处理
19+
LMSTUDIO_MAX_TOKENS=4096
20+
# 每块的最大输入token数(留出输出空间)
21+
LMSTUDIO_CHUNK_SIZE=3000

input/.gitkeep

Whitespace-only changes.

processed_data/.gitkeep

Whitespace-only changes.

run_analysis.py

Lines changed: 169 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -197,36 +197,124 @@ def load_env_key(env_path: str, key: str) -> str:
197197
return ''
198198
return ''
199199

200+
def estimate_tokens(text: str) -> int:
    """Roughly estimate the token count of *text*.

    Heuristic: every CJK character (U+4E00..U+9FFF) counts as one token,
    and the remaining whitespace-separated words contribute about one
    token per three words.

    Fix: the word term is clamped at zero. The original subtraction
    (whitespace-split word count minus CJK char count) went negative for
    mostly-Chinese text, and negative floor division then *reduced* the
    estimate (e.g. "你好" estimated as 1 instead of 2).
    """
    chinese_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
    # Word count minus the CJK chars approximates the number of non-CJK
    # words; clamp so a CJK-heavy input can never lower the estimate.
    english_words = max(len(text.replace('\n', ' ').split()) - chinese_chars, 0)
    return chinese_chars + english_words // 3
205+
206+
def split_content_by_tokens(content: str, max_tokens: int) -> list:
    """Split *content* into chunks whose estimated token count stays
    within *max_tokens*, cutting only at line boundaries.

    A single line whose own estimate exceeds the budget still becomes a
    chunk of its own, so no input line is ever dropped.
    """
    chunks = []
    buffer = []       # lines accumulated for the chunk being built
    budget_used = 0   # estimated tokens already held in the buffer

    for row in content.split('\n'):
        cost = estimate_tokens(row)
        # Flush when adding this line would overflow the budget; never
        # flush an empty buffer (an oversized first line stays put).
        if buffer and budget_used + cost > max_tokens:
            chunks.append('\n'.join(buffer))
            buffer = [row]
            budget_used = cost
        else:
            buffer.append(row)
            budget_used += cost

    if buffer:
        chunks.append('\n'.join(buffer))
    return chunks
227+
200228
def call_deepseek_chat(api_key: str, system_prompt: str, user_content: str,
                       model: str = 'deepseek-chat', base_url: str = 'https://api.deepseek.com',
                       timeout_sec: int = 60, stream: bool = False, max_tokens: int = 0,
                       chunk_size: int = 0) -> str:
    """Call an OpenAI-compatible Chat Completions API and return the text.

    When ``chunk_size`` > 0 and the estimated token count of
    ``user_content`` exceeds it, the content is split into line-aligned
    chunks that are sent one by one while carrying the accumulated
    user/assistant turns, followed by a final consolidation request.

    Args:
        api_key: Bearer token (may be a dummy value for local servers
            such as LMStudio).
        system_prompt: System message prepended to every request.
        user_content: The data payload to analyze.
        model: Model identifier passed through to the API.
        base_url: API root, with or without a trailing ``/v1``.
        timeout_sec: Per-request timeout in seconds.
        stream: Stream tokens to stdout while aggregating them.
        max_tokens: Accepted for interface compatibility; currently
            unused here — the context limit is enforced via chunk_size.
        chunk_size: Max estimated input tokens per chunk; 0 disables
            chunking.

    Returns:
        The model reply (for chunked input, the final consolidated
        report).
    """
    # Hoisted: the original evaluated estimate_tokens(user_content) twice.
    est = estimate_tokens(user_content)

    if chunk_size > 0 and est > chunk_size:
        print(f"内容过长({est} tokens),启用分块处理...")
        chunks = split_content_by_tokens(user_content, chunk_size)
        print(f"分为 {len(chunks)} 块处理")

        # Running user/assistant turns so later chunks see earlier
        # analysis. (The original also collected every response in an
        # `all_responses` list that was never read — dead code, removed.)
        conversation_history = []

        for i, chunk in enumerate(chunks, 1):
            print(f"\n=== 处理第 {i}/{len(chunks)} 块 ===")

            messages = [{'role': 'system', 'content': system_prompt}]
            messages.extend(conversation_history)

            if i == 1:
                chunk_prompt = f"请分析以下运营数据(第{i}部分,共{len(chunks)}部分):\n\n{chunk}"
            else:
                chunk_prompt = f"继续分析运营数据(第{i}部分,共{len(chunks)}部分):\n\n{chunk}"
            messages.append({'role': 'user', 'content': chunk_prompt})

            response = _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)

            conversation_history.extend([
                {'role': 'user', 'content': chunk_prompt},
                {'role': 'assistant', 'content': response}
            ])

        # Final pass: ask the model to merge the per-chunk analyses into
        # one report.
        final_prompt = f"基于前面 {len(chunks)} 部分的分析,请生成最终的完整运营分析报告,遵循之前的Markdown格式要求。"
        messages = [{'role': 'system', 'content': system_prompt}]
        messages.extend(conversation_history)
        messages.append({'role': 'user', 'content': final_prompt})

        print(f"\n=== 生成最终整合报告 ===")
        return _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)

    # Content fits in one request: single round trip.
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_content}
    ]
    return _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)
285+
286+
def _single_api_call(messages: list, model: str, base_url: str, api_key: str,
287+
timeout_sec: int, stream: bool) -> str:
288+
"""执行单次API调用"""
204289
# 兼容带/不带v1,自动规范化
205290
base = base_url.rstrip('/')
206291
if base.endswith('/v1'):
207292
url = f"{base}/chat/completions"
208293
else:
209294
url = f"{base}/chat/completions"
210295
headers = {
211-
'Content-Type': 'application/json',
212-
'Authorization': f'Bearer {api_key}'
296+
'Content-Type': 'application/json; charset=utf-8',
297+
'Authorization': f'Bearer {api_key}',
298+
'Accept': 'application/json; charset=utf-8'
213299
}
214300
payload = {
215301
'model': model,
216-
'messages': [
217-
{ 'role': 'system', 'content': system_prompt },
218-
{ 'role': 'user', 'content': user_content }
219-
],
302+
'messages': messages,
220303
'stream': bool(stream)
221304
}
222305
if stream:
223306
# 流式打印到终端,同时聚合内容
224-
with requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout_sec, stream=True) as r:
307+
with requests.post(url, headers=headers, data=json.dumps(payload, ensure_ascii=False), timeout=timeout_sec, stream=True) as r:
225308
r.raise_for_status()
309+
r.encoding = 'utf-8' # 强制设置编码
226310
full_text_parts: List[str] = []
227311
for line in r.iter_lines(decode_unicode=True):
228312
if not line:
229313
continue
314+
# 确保line是正确编码的字符串
315+
if isinstance(line, bytes):
316+
line = line.decode('utf-8', errors='ignore')
317+
230318
if line.startswith('data: '):
231319
data_str = line[len('data: '):].strip()
232320
if data_str == '[DONE]':
@@ -235,17 +323,32 @@ def call_deepseek_chat(api_key: str, system_prompt: str, user_content: str,
235323
obj = json.loads(data_str)
236324
delta = obj.get('choices', [{}])[0].get('delta', {}).get('content', '')
237325
if delta:
326+
# 确保delta是正确的UTF-8字符串
327+
if isinstance(delta, bytes):
328+
delta = delta.decode('utf-8', errors='ignore')
238329
print(delta, end='', flush=True)
239330
full_text_parts.append(delta)
240-
except Exception:
331+
except Exception as e:
332+
print(f"\n[DEBUG] JSON解析错误: {e}, 原始数据: {data_str[:100]}")
241333
continue
242334
print() # 换行
243335
return ''.join(full_text_parts)
244336
else:
245-
resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout_sec)
337+
resp = requests.post(url, headers=headers, data=json.dumps(payload, ensure_ascii=False), timeout=timeout_sec)
246338
resp.raise_for_status()
247-
data = resp.json()
248-
return data.get('choices', [{}])[0].get('message', {}).get('content', '')
339+
resp.encoding = 'utf-8' # 强制设置编码
340+
341+
try:
342+
data = resp.json()
343+
content = data.get('choices', [{}])[0].get('message', {}).get('content', '')
344+
# 确保返回的内容是正确编码
345+
if isinstance(content, bytes):
346+
content = content.decode('utf-8', errors='ignore')
347+
return content
348+
except Exception as e:
349+
print(f"[DEBUG] 响应解析错误: {e}")
350+
print(f"[DEBUG] 原始响应: {resp.text[:200]}")
351+
return ""
249352

250353
def full_analysis(input_file: str, processed_dir: str, report_dir: str,
251354
*, enable_ai: bool = False,
@@ -290,18 +393,66 @@ def full_analysis(input_file: str, processed_dir: str, report_dir: str,
290393
cfg_timeout = int(env_timeout)
291394
except Exception:
292395
pass
396+
# 新增:支持 LMStudio 本地服务
397+
cfg_max_tokens = 0
398+
cfg_chunk_size = 0
399+
if model and model.lower().startswith('lmstudio'):
400+
cfg_base_url = base_url or load_env_key(env_path, 'LMSTUDIO_BASE_URL') or cfg_base_url
401+
# 如果模型名是 lmstudio,则从环境变量读取实际模型名
402+
if model.lower() == 'lmstudio':
403+
env_model = load_env_key(env_path, 'LMSTUDIO_MODEL_NAME')
404+
if env_model:
405+
model = env_model
406+
api_key = api_key or load_env_key(env_path, 'LMSTUDIO_API_KEY')
407+
try:
408+
lm_timeout = load_env_key(env_path, 'LMSTUDIO_TIMEOUT_SEC')
409+
if lm_timeout:
410+
cfg_timeout = int(lm_timeout)
411+
except Exception:
412+
pass
413+
414+
# 读取token限制配置
415+
try:
416+
lm_max_tokens = load_env_key(env_path, 'LMSTUDIO_MAX_TOKENS')
417+
if lm_max_tokens:
418+
cfg_max_tokens = int(lm_max_tokens)
419+
420+
lm_chunk_size = load_env_key(env_path, 'LMSTUDIO_CHUNK_SIZE')
421+
if lm_chunk_size:
422+
cfg_chunk_size = int(lm_chunk_size)
423+
except Exception:
424+
pass
425+
426+
print(f"使用 LMStudio 配置: {cfg_base_url}, 模型: {model}")
427+
if cfg_max_tokens > 0:
428+
print(f"上下文限制: {cfg_max_tokens} tokens, 分块大小: {cfg_chunk_size} tokens")
293429
if not api_key:
294430
print("未在环境或 .env 中找到 DEEPSEEK_API_KEY,跳过AI摘要生成。")
295431
return True
296432
try:
297433
ai_text = call_deepseek_chat(api_key, system_prompt, user_content,
298434
model=model, base_url=cfg_base_url,
299-
timeout_sec=cfg_timeout, stream=stream)
435+
timeout_sec=cfg_timeout, stream=stream,
436+
max_tokens=cfg_max_tokens, chunk_size=cfg_chunk_size)
300437
ai_dir = os.path.dirname(ai_output_path)
301438
if ai_dir:
302439
ensure_dir(ai_dir)
303440
with open(ai_output_path, 'w', encoding='utf-8') as f:
304-
f.write(ai_text or '')
441+
# 确保写入文件的内容是正确的UTF-8编码
442+
if ai_text:
443+
# 如果内容包含乱码,尝试修复
444+
try:
445+
# 检测并修复可能的编码问题
446+
if isinstance(ai_text, bytes):
447+
ai_text = ai_text.decode('utf-8', errors='ignore')
448+
# 移除可能的控制字符
449+
ai_text = ''.join(char for char in ai_text if ord(char) >= 32 or char in '\n\r\t')
450+
f.write(ai_text)
451+
except Exception as e:
452+
print(f"[WARNING] 文件写入编码错误: {e}")
453+
f.write(str(ai_text))
454+
else:
455+
f.write('')
305456
print(f"AI 摘要报告已生成: {ai_output_path}")
306457
except Exception as e:
307458
print(f"DeepSeek API 调用失败: {e}")
@@ -316,12 +467,12 @@ def main():
316467
parser.add_argument('--output-dir', type=str, default='processed_data', help='预处理输出目录,默认 processed_data')
317468
parser.add_argument('--report-dir', type=str, default='output', help='Markdown 报告输出目录,默认 output')
318469
# AI 分析相关
319-
parser.add_argument('--ai', action='store_true', help='启用 DeepSeek API 生成运营摘要')
470+
parser.add_argument('--ai', action='store_true', help='启用 AI 摘要(DeepSeek 或本地 LMStudio 端点)')
320471
parser.add_argument('--system-prompt', type=str, default='agent/REPORT_ANALYST_SYSTEM_PROMPT.md', help='系统提示词路径')
321472
parser.add_argument('--env-file', type=str, default='.env', help='包含 DEEPSEEK_API_KEY 的 .env 文件路径')
322473
parser.add_argument('--ai-output', type=str, default='output/ops_summary.md', help='AI 摘要输出文件路径')
323-
parser.add_argument('--ai-model', type=str, default='deepseek-chat', help='DeepSeek 模型名称,默认 deepseek-chat')
324-
parser.add_argument('--ai-base-url', type=str, default=None, help='DeepSeek Base URL,默认从 .env 读取或 https://api.deepseek.com')
474+
parser.add_argument('--ai-model', type=str, default='deepseek-chat', help='AI 模型名称, deepseek-chat、lmstudio(从环境变量读取)或具体模型名')
475+
parser.add_argument('--ai-base-url', type=str, default=None, help='AI Base URL,可指向 DeepSeek 或本地 LMStudio,例如 http://localhost:1234/v1')
325476
parser.add_argument('--ai-timeout', type=int, default=60, help='DeepSeek 请求超时秒,默认 60,可用 .env 的 DEEPSEEK_TIMEOUT_SEC 覆盖')
326477
parser.add_argument('--ai-stream', action='store_true', help='启用流式输出,实时打印模型生成内容')
327478

0 commit comments

Comments
 (0)