-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_analysis.py
More file actions
579 lines (518 loc) · 25.3 KB
/
run_analysis.py
File metadata and controls
579 lines (518 loc) · 25.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
#!/usr/bin/env python3
"""
小馨宝运营数据分析完整流程
使用方法:
1. 数据预处理:python run_analysis.py --preprocess
2. 月度分析:python run_analysis.py --analyze-monthly
3. 完整流程:python run_analysis.py --full
新增:
- 支持 --input-file 自定义输入(默认 input/chat_logs.csv,兼容 input/filtered_data.csv)
- 支持 --output-dir 自定义输出目录(默认 processed_data)
- 完整流程会额外生成 Markdown 报告到 output/analysis_report.md(可用 --report-dir 指定目录)
"""
import argparse
import os
import sys
import json
import requests
from typing import List, Dict
from data_preprocessor import XiaoXinBaoDataProcessor
from monthly_analyzer import process_all_months
# 尝试导入 LogParser,假设在同级目录
try:
from log_parser import LogParser
except ImportError:
LogParser = None
try:
from visualizer import generate_all_plots
except ImportError:
generate_all_plots = None
def resolve_input_file(cli_input: str = None) -> str:
    """Resolve the analysis input file path.

    Priority order:
    1) the --input-file CLI argument
    2) input/chat_logs.csv
    3) input/filtered_data.csv
    4) legacy root-directory files (backward compatibility)
    5) input/xyanb.yaml raw log file

    Returns '' when nothing is found.
    """
    if cli_input:
        return cli_input
    # Probe the known locations in priority order; first existing path wins.
    candidates = (
        'input/chat_logs.csv',       # preferred input/ location
        'input/filtered_data.csv',
        'chat_logs.csv',             # legacy root-level fallbacks
        'filtered_data.csv',
        'input/xyanb.yaml',          # raw log file, parsed later by LogParser
    )
    for path in candidates:
        if os.path.exists(path):
            return path
    return ''
def ensure_dir(path: str) -> None:
    """Create *path* (including parents) if missing; no-op for an empty path.

    Calls ``os.makedirs(..., exist_ok=True)`` directly instead of the previous
    check-then-create pattern, which had a TOCTOU race (the directory could be
    created by another process between ``exists()`` and ``makedirs()``).
    """
    if path:
        os.makedirs(path, exist_ok=True)
def preprocess_data(input_file: str, output_dir: str, output_format: str = 'csv') -> bool:
    """Run the preprocessing stage: load, clean and persist the raw data.

    Raw log files (.yaml/.log) are first converted to the standard CSV input
    via LogParser. Returns True on success, False when the input is missing,
    parsing fails, or the processor cannot load the data.
    """
    print("=== 开始数据预处理 ===")
    if not input_file or not os.path.exists(input_file):
        print("未找到输入文件。请在 input/ 目录放置 chat_logs.csv 或 use --input-file 指定。")
        return False
    # Log-file inputs are parsed into the standard CSV location first.
    if input_file.endswith(('.yaml', '.log')):
        print(f"检测到日志文件: {input_file},尝试解析...")
        if LogParser is None:
            print("错误: 找不到 LogParser 模块,无法解析日志文件")
            return False
        df = LogParser(input_file).parse()
        if df.empty:
            print("日志解析结果为空")
            return False
        parsed_csv = 'input/chat_logs.csv'
        df.to_csv(parsed_csv, index=False, encoding='utf-8')
        print(f"日志已解析并保存为标准CSV输入: {parsed_csv}")
        input_file = parsed_csv  # continue the pipeline from the parsed CSV
    processor = XiaoXinBaoDataProcessor(input_file)
    if not processor.load_data():
        print("数据加载失败")
        return False
    print("原始数据形状:", processor.df.shape)
    # Cleaning pipeline: column names -> timestamps -> content -> categories.
    processor.clean_column_names()
    print("列名已清洗")
    ok_rows = processor.parse_timestamp()
    print(f"有效时间戳: {ok_rows}/{len(processor.df)}")
    mean_len = processor.extract_dialogue_content()
    print(f"平均对话长度: {mean_len:.2f}字符")
    print("用户类型分布:", processor.categorize_users())
    print("情感分布:", processor.analyze_sentiment())
    # Persist the processed data and echo the summary statistics.
    ensure_dir(output_dir)
    stats = processor.save_processed_data(output_dir, format=output_format)
    print("\n=== 处理完成 ===")
    print("摘要统计:")
    for name, val in stats.items():
        print(f" {name}: {val}")
    return True
def run_monthly_analysis(processed_dir: str) -> List[Dict]:
    """Run the per-month analysis over the preprocessed data directory.

    Returns the list of monthly report dicts, or [] when the directory is
    missing or any step of the analysis raises.
    """
    print("=== 开始月度分析 ===")
    if not os.path.exists(processed_dir):
        print("请先运行数据预处理")
        return []
    try:
        reports = process_all_months(processed_dir)
        print(f"\n=== 分析完成,共处理 {len(reports)} 个月的数据 ===")
        # Echo the headline metrics for every month.
        for rep in reports:
            stats = rep['basic_metrics']
            print(f"\n{rep['month']}:")
            print(f" 对话数: {stats['total_dialogues']}")
            print(f" 洞察: {'; '.join(rep['insights'])}")
            print(f" 建议: {'; '.join(rep['recommendations'][:2])}")
        return reports
    except Exception as exc:
        print(f"分析失败: {exc}")
        return []
def render_markdown_report(reports: List[Dict], summary_json_path: str) -> str:
    """Render the monthly analysis results as one Markdown document."""

    def top_terms(items: List[Dict]) -> str:
        # "term(count)" list, capped at the 15 most relevant entries.
        return ', '.join(f"{t['term']}({t['count']})" for t in items[:15])

    out: List[str] = ['# 小馨宝运营分析报告', '', '## 月度概览', '']
    for rep in reports:
        metrics = rep.get('basic_metrics', {})
        out.extend([f"### {rep.get('month', '')}", ''])
        out.append(f'- 总对话数: {metrics.get("total_dialogues", 0)}')
        out.append(f'- 唯一用户数: {metrics.get("unique_users", 0)}')
        out.append(f'- 平均对话长度: {metrics.get("avg_dialogue_length", 0)}')
        span = metrics.get('date_range', {})
        if span:
            out.append(f'- 时间范围: {span.get("start", "")} ~ {span.get("end", "")}')
        notes = rep.get('insights', [])
        if notes:
            out.append('- 关键洞察: ' + '; '.join(notes))
        advice = rep.get('recommendations', [])
        if advice:
            out.append('- 建议: ' + '; '.join(advice[:3]))
        out.append('')
        # Optional section: time-of-day / weekday / date distribution.
        when = rep.get('time_distribution', {})
        if when:
            out.extend(['#### 时间分布', ''])
            out.append(f'- 按小时: {when.get("by_hour", {})}')
            out.append(f'- 按星期: {when.get("by_weekday", {})} (0=周一)')
            out.append(f'- 按日期: {list(when.get("by_date", {}).items())[:10]} ...')
            out.append('')
        # Optional section: estimated dialogue turns.
        turns = rep.get('estimated_turns', {})
        if turns:
            out.extend(['#### 轮次估计', ''])
            out.append(f'- 平均轮次: {turns.get("avg_turns", 0):.2f}')
            out.append(f'- 轮次分布: {turns.get("distribution", {})}')
            out.append('')
        # Optional section: keyword / n-gram summary.
        kw = rep.get('keywords', {})
        if kw:
            out.extend(['#### 关键词与短语', ''])
            out.append(f'- 关键词: {top_terms(kw.get("unigrams", []))}')
            out.append(f'- 2-gram: {top_terms(kw.get("bigrams", []))}')
            out.append(f'- 3-gram: {top_terms(kw.get("trigrams", []))}')
            out.append('')
    # Footer pointing at the raw summary JSON, only when it exists on disk.
    if os.path.exists(summary_json_path):
        out.extend(['## 数据摘要', '', f'> 详见 `{summary_json_path}`', ''])
    return '\n'.join(out)
def load_env_key(env_path: str, key: str) -> str:
    """Look up *key*: the process environment wins; otherwise parse the .env file.

    Returns '' when the key is absent, or the file is missing/unreadable.
    """
    from_env = os.getenv(key)
    if from_env:
        return from_env
    if not (env_path and os.path.exists(env_path)):
        return ''
    try:
        with open(env_path, 'r', encoding='utf-8') as fh:
            for raw in fh:
                entry = raw.strip()
                # Skip blanks, comments and lines without an assignment.
                if not entry or entry.startswith('#') or '=' not in entry:
                    continue
                name, _, value = entry.partition('=')
                if name.strip() == key:
                    # Drop surrounding quotes, matching shell-style .env values.
                    return value.strip().strip('"').strip("'")
    except Exception:
        return ''
    return ''
def estimate_tokens(text: str) -> int:
    """Roughly estimate the token count of *text*.

    Heuristic: every CJK character in the U+4E00..U+9FFF range counts as one
    token; the remaining (non-Chinese) text is whitespace-split into words and
    every three words count as one token, keeping the original calibration.

    Fix: the previous implementation computed the English word count as
    ``len(text.split()) - chinese_chars``, which mixes units (whitespace
    tokens vs. characters) and could go negative for Chinese-heavy text,
    skewing the chunking budget. Here CJK characters are blanked out before
    the word count is taken, so the estimate is always >= 0.
    """
    chinese_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
    # Replace CJK chars with spaces so splitting counts only non-Chinese words.
    non_chinese = ''.join(' ' if '\u4e00' <= c <= '\u9fff' else c for c in text)
    english_words = len(non_chinese.split())
    return chinese_chars + english_words // 3
def split_content_by_tokens(content: str, max_tokens: int) -> list:
    """Split *content* on line boundaries into chunks of roughly *max_tokens*.

    Lines are never broken; a chunk is flushed as soon as appending the next
    line would exceed the budget. A single oversized line still becomes its
    own chunk.
    """
    chunks: list = []
    buf: list = []
    buf_tokens = 0
    for row in content.split('\n'):
        cost = estimate_tokens(row)
        # Flush only a non-empty buffer, so one huge line is not dropped.
        if buf and buf_tokens + cost > max_tokens:
            chunks.append('\n'.join(buf))
            buf = [row]
            buf_tokens = cost
        else:
            buf.append(row)
            buf_tokens += cost
    if buf:
        chunks.append('\n'.join(buf))
    return chunks
def call_deepseek_chat(api_key: str, system_prompt: str, user_content: str,
                       model: str = 'deepseek-chat', base_url: str = 'https://api.deepseek.com',
                       timeout_sec: int = 60, stream: bool = False, max_tokens: int = 0,
                       chunk_size: int = 0) -> str:
    """Call the AI Chat Completions API and return the generated text.

    When ``chunk_size`` > 0 and ``user_content`` is estimated to exceed it, a
    map-reduce flow is used: each chunk is analyzed in its own independent
    request, then one final request merges all per-chunk analyses into a
    single report. Otherwise a single request is made.

    NOTE(review): ``max_tokens`` is accepted but never used in this body —
    confirm whether it should cap the request payload.
    """
    # Chunked (map-reduce) path for oversized content.
    if chunk_size > 0 and estimate_tokens(user_content) > chunk_size:
        print(f"内容过长({estimate_tokens(user_content)} tokens),启用分块处理...")
        # Token cost of the system prompt itself.
        system_tokens = estimate_tokens(system_prompt)
        # Reserve room per chunk: system prompt + output budget + wrapper text.
        available_tokens = chunk_size - system_tokens - 500
        if available_tokens <= 0:
            # System prompt alone blows the budget: fall back to a short one.
            print(f"警告:system_prompt 太长({system_tokens} tokens),使用简化提示")
            system_prompt = "你是一个专业的运营数据分析师,请分析以下数据并生成报告。"
            system_tokens = estimate_tokens(system_prompt)
            available_tokens = chunk_size - system_tokens - 500
        chunks = split_content_by_tokens(user_content, available_tokens)
        print(f"分为 {len(chunks)} 块处理(每块约 {available_tokens} tokens)")
        all_responses = []
        # Map step: each chunk is a fresh conversation, so no history accumulates.
        for i, chunk in enumerate(chunks, 1):
            print(f"\n=== 处理第 {i}/{len(chunks)} 块 ===")
            chunk_prompt = f"请分析以下运营数据(第{i}部分,共{len(chunks)}部分),提取关键信息和洞察:\n\n{chunk}"
            messages = [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': chunk_prompt}
            ]
            response = _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)
            all_responses.append(response)
        # Reduce step: merge all per-chunk analyses into one final report.
        print(f"\n=== 整合 {len(chunks)} 个分析结果 ===")
        combined_analysis = "\n\n---\n\n".join([f"## 第{i}部分分析\n{resp}" for i, resp in enumerate(all_responses, 1)])
        final_prompt = f"基于以下各部分的分析结果,生成一份完整的运营分析报告(Markdown格式):\n\n{combined_analysis}"
        # The merged prompt may itself overflow; degrade to 10-line summaries.
        if estimate_tokens(final_prompt) > available_tokens:
            print(f"警告:整合结果太长,使用摘要方式")
            summaries = []
            for i, resp in enumerate(all_responses, 1):
                summary_lines = resp.split('\n')[:10]  # keep only the first 10 lines
                summaries.append(f"第{i}部分摘要:\n" + '\n'.join(summary_lines))
            final_prompt = f"基于以下摘要,生成完整的运营分析报告:\n\n" + "\n\n".join(summaries)
        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': final_prompt}
        ]
        final_response = _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)
        return final_response
    else:
        # Single-shot path: the content fits in one request.
        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_content}
        ]
        return _single_api_call(messages, model, base_url, api_key, timeout_sec, stream)
def _single_api_call(messages: list, model: str, base_url: str, api_key: str,
                     timeout_sec: int, stream: bool) -> str:
    """Perform one chat-completions HTTP request and return the reply text.

    Supports both streaming (SSE ``data: ...`` lines, printed live and
    aggregated) and non-streaming JSON responses. Raises on HTTP errors via
    ``raise_for_status``; response-parsing failures log a debug line and
    yield '' (streaming skips the bad event and continues).
    """
    # Normalize the base URL so it works with or without a trailing /v1.
    base = base_url.rstrip('/')
    if base.endswith('/v1'):
        url = f"{base}/chat/completions"
    else:
        url = f"{base}/v1/chat/completions"
    headers = {
        'Content-Type': 'application/json; charset=utf-8',
        'Authorization': f'Bearer {api_key}',
        'Accept': 'application/json; charset=utf-8'
    }
    payload = {
        'model': model,
        'messages': messages,
        'stream': bool(stream)
    }
    if stream:
        # Streaming: print deltas to the terminal while aggregating the full text.
        with requests.post(url, headers=headers, json=payload, timeout=timeout_sec, stream=True) as r:
            r.raise_for_status()
            r.encoding = 'utf-8'  # force the response encoding
            full_text_parts: List[str] = []
            for line in r.iter_lines(decode_unicode=True):
                if not line:
                    continue
                # Defensive: iter_lines may still yield bytes in some cases.
                if isinstance(line, bytes):
                    line = line.decode('utf-8', errors='ignore')
                if line.startswith('data: '):
                    data_str = line[len('data: '):].strip()
                    if data_str == '[DONE]':
                        # SSE terminator: the stream is complete.
                        break
                    try:
                        obj = json.loads(data_str)
                        if isinstance(obj, dict):
                            delta = obj.get('choices', [{}])[0].get('delta', {}).get('content', '')
                            if delta:
                                # Defensive: ensure the delta is proper UTF-8 text.
                                if isinstance(delta, bytes):
                                    delta = delta.decode('utf-8', errors='ignore')
                                print(delta, end='', flush=True)
                                full_text_parts.append(delta)
                        else:
                            # Non-dict JSON payload (e.g. LMStudio plain-text info).
                            print(f"\n[DEBUG] 非标准响应对象: {type(obj)} - {str(obj)[:100]}")
                    except Exception as e:
                        # Skip malformed SSE events instead of aborting the stream.
                        print(f"\n[DEBUG] JSON解析错误: {e}, 原始数据: {data_str[:100]}")
                        continue
            print()  # final newline after the streamed output
            return ''.join(full_text_parts)
    else:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout_sec)
        resp.raise_for_status()
        resp.encoding = 'utf-8'  # force the response encoding
        try:
            data = resp.json()
            content = data.get('choices', [{}])[0].get('message', {}).get('content', '')
            # Defensive: ensure the returned content is text, not bytes.
            if isinstance(content, bytes):
                content = content.decode('utf-8', errors='ignore')
            return content
        except Exception as e:
            print(f"[DEBUG] 响应解析错误: {e}")
            print(f"[DEBUG] 原始响应: {resp.text[:200]}")
            return ""
def full_analysis(input_file: str, processed_dir: str, report_dir: str,
                  *, enable_ai: bool = False,
                  system_prompt_path: str = 'agent/REPORT_ANALYST_SYSTEM_PROMPT.md',
                  env_path: str = '.env',
                  ai_output_path: str = 'output/ops_summary.md',
                  model: str = 'deepseek-chat',
                  base_url: str = '',
                  timeout_sec: int = 60,
                  stream: bool = True,
                  output_format: str = 'csv') -> bool:
    """Run the full pipeline: preprocess -> monthly analysis -> plots ->
    Markdown report, plus an optional AI-generated operations summary.

    Returns False when preprocessing or monthly analysis fails. Failures in
    plotting or the AI summary are reported but do not fail the run (the
    function still returns True once the Markdown report exists).
    """
    print("=== 开始完整分析流程 ===")
    # 1. Preprocess the raw input into processed_dir.
    ok = preprocess_data(input_file, processed_dir, output_format=output_format)
    if not ok:
        return False
    # 2. Per-month analysis over the processed data.
    reports = run_monthly_analysis(processed_dir)
    if not reports:
        return False
    # 3. Generate visualization plots (best-effort; visualizer is optional).
    if generate_all_plots:
        try:
            generate_all_plots(processed_dir, report_dir)
        except Exception as e:
            print(f"可视化生成失败: {e}")
    else:
        print("Warning: visualizer module not found, skipping plots.")
    # 4. Render and write the Markdown report.
    ensure_dir(report_dir)
    md_text = render_markdown_report(reports, os.path.join(processed_dir, 'summary.json'))
    report_path = os.path.join(report_dir, 'analysis_report.md')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(md_text)
    print(f"Markdown 报告已生成: {report_path}")
    # 5. Optional: AI-generated operations summary from the Markdown report.
    if enable_ai:
        print("=== 调用 DeepSeek API 生成运营摘要 ===")
        if not os.path.exists(system_prompt_path):
            # No system prompt -> skip the AI step but the run still succeeds.
            print(f"未找到系统提示词: {system_prompt_path}")
            return True
        with open(system_prompt_path, 'r', encoding='utf-8') as f:
            system_prompt = f.read()
        with open(report_path, 'r', encoding='utf-8') as f:
            user_content = f.read()
        api_key = load_env_key(env_path, 'DEEPSEEK_API_KEY')
        # Resolve BASE_URL / TIMEOUT: explicit argument wins over .env values.
        cfg_base_url = base_url or load_env_key(env_path, 'DEEPSEEK_BASE_URL') or 'https://api.deepseek.com'
        cfg_timeout = timeout_sec
        try:
            env_timeout = load_env_key(env_path, 'DEEPSEEK_TIMEOUT_SEC')
            if env_timeout:
                cfg_timeout = int(env_timeout)
        except Exception:
            pass
        # LMStudio local-endpoint support: model names starting with 'lmstudio'
        # switch base URL, API key, timeout and token limits to LMSTUDIO_* vars.
        cfg_max_tokens = 0
        cfg_chunk_size = 0
        if model and model.lower().startswith('lmstudio'):
            cfg_base_url = base_url or load_env_key(env_path, 'LMSTUDIO_BASE_URL') or cfg_base_url
            # A bare 'lmstudio' model name means: read the real model name from env.
            if model.lower() == 'lmstudio':
                env_model = load_env_key(env_path, 'LMSTUDIO_MODEL_NAME')
                if env_model:
                    model = env_model
            api_key = api_key or load_env_key(env_path, 'LMSTUDIO_API_KEY')
            try:
                lm_timeout = load_env_key(env_path, 'LMSTUDIO_TIMEOUT_SEC')
                if lm_timeout:
                    cfg_timeout = int(lm_timeout)
            except Exception:
                pass
            # Token-limit configuration driving the chunked-processing path.
            try:
                lm_max_tokens = load_env_key(env_path, 'LMSTUDIO_MAX_TOKENS')
                if lm_max_tokens:
                    cfg_max_tokens = int(lm_max_tokens)
                lm_chunk_size = load_env_key(env_path, 'LMSTUDIO_CHUNK_SIZE')
                if lm_chunk_size:
                    cfg_chunk_size = int(lm_chunk_size)
            except Exception:
                pass
            print(f"使用 LMStudio 配置: {cfg_base_url}, 模型: {model}")
            if cfg_max_tokens > 0:
                print(f"上下文限制: {cfg_max_tokens} tokens, 分块大小: {cfg_chunk_size} tokens")
        # Require a usable API configuration (DeepSeek key, or an LMStudio model).
        if not api_key and not (model and model.lower().startswith('lmstudio')):
            print("未在环境或 .env 中找到 DEEPSEEK_API_KEY,跳过AI摘要生成。")
            return True
        try:
            ai_text = call_deepseek_chat(api_key, system_prompt, user_content,
                                         model=model, base_url=cfg_base_url,
                                         timeout_sec=cfg_timeout, stream=stream,
                                         max_tokens=cfg_max_tokens, chunk_size=cfg_chunk_size)
            ai_dir = os.path.dirname(ai_output_path)
            if ai_dir:
                ensure_dir(ai_dir)
            with open(ai_output_path, 'w', encoding='utf-8') as f:
                # Make sure the content written out is valid UTF-8 text.
                if ai_text:
                    try:
                        # Repair possible encoding issues before writing.
                        if isinstance(ai_text, bytes):
                            ai_text = ai_text.decode('utf-8', errors='ignore')
                        # Strip control characters (keep \n, \r, \t).
                        ai_text = ''.join(char for char in ai_text if ord(char) >= 32 or char in '\n\r\t')
                        f.write(ai_text)
                    except Exception as e:
                        print(f"[WARNING] 文件写入编码错误: {e}")
                        f.write(str(ai_text))
                else:
                    f.write('')
            print(f"AI 摘要报告已生成: {ai_output_path}")
        except Exception as e:
            # AI failure is non-fatal: the Markdown report was already written.
            print(f"DeepSeek API 调用失败: {e}")
    return True
def main():
    """Parse CLI arguments and dispatch to the requested pipeline stage."""
    parser = argparse.ArgumentParser(description='小馨宝运营数据分析工具')
    parser.add_argument('--preprocess', action='store_true', help='仅数据预处理')
    parser.add_argument('--analyze-monthly', action='store_true', help='仅月度分析')
    parser.add_argument('--full', action='store_true', help='完整流程')
    parser.add_argument('--input-file', type=str, default=None, help='输入CSV,默认自动查找 input/chat_logs.csv 或 input/filtered_data.csv')
    parser.add_argument('--output-dir', type=str, default='processed_data', help='预处理输出目录,默认 processed_data')
    parser.add_argument('--output-format', type=str, default='csv', choices=['csv', 'yaml'], help='输出格式 (csv/yaml)')
    parser.add_argument('--report-dir', type=str, default='output', help='Markdown 报告输出目录,默认 output')
    # AI-analysis options
    parser.add_argument('--ai', action='store_true', help='启用 AI 摘要(DeepSeek 或本地 LMStudio 端点)')
    parser.add_argument('--system-prompt', type=str, default='agent/REPORT_ANALYST_SYSTEM_PROMPT.md', help='系统提示词路径')
    parser.add_argument('--env-file', type=str, default='.env', help='包含 DEEPSEEK_API_KEY 的 .env 文件路径')
    parser.add_argument('--ai-output', type=str, default='output/ops_summary.md', help='AI 摘要输出文件路径')
    parser.add_argument('--ai-model', type=str, default='deepseek-chat', help='AI 模型名称,如 deepseek-chat、lmstudio(从环境变量读取)或具体模型名')
    parser.add_argument('--ai-base-url', type=str, default=None, help='AI Base URL,可指向 DeepSeek 或本地 LMStudio,例如 http://localhost:1234/v1')
    parser.add_argument('--ai-timeout', type=int, default=60, help='DeepSeek 请求超时秒,默认 60,可用 .env 的 DEEPSEEK_TIMEOUT_SEC 覆盖')
    parser.add_argument('--ai-stream', action='store_true', help='启用流式输出,实时打印模型生成内容')
    opts = parser.parse_args()

    source = resolve_input_file(opts.input_file)
    data_dir = opts.output_dir
    md_dir = opts.report_dir
    # Shared keyword set for the full pipeline (used by both --full and default).
    ai_kwargs = dict(
        enable_ai=opts.ai,
        system_prompt_path=opts.system_prompt,
        env_path=opts.env_file,
        ai_output_path=opts.ai_output,
        model=opts.ai_model,
        base_url=(opts.ai_base_url or ''),
        timeout_sec=opts.ai_timeout,
        stream=opts.ai_stream,
        output_format=opts.output_format,
    )
    if opts.preprocess:
        preprocess_data(source, data_dir, output_format=opts.output_format)
    elif opts.analyze_monthly:
        run_monthly_analysis(data_dir)
    else:
        # --full explicitly, or no mode flag at all: run the whole pipeline.
        full_analysis(source, data_dir, md_dir, **ai_kwargs)
if __name__ == "__main__":
    # CLI entry point: run only when executed directly, not on import.
    main()