# export_for_gemini.py
# -*- coding: utf-8 -*-
"""Create Gemini-friendly chunks from direct_qisi_scraper.py JSONL output."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Any
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load every non-blank line of a JSONL file as a parsed JSON value.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The parsed values in file order. Lines that are empty or contain
        only whitespace are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    rows = []
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # str.strip() does not remove and json.loads() refuses to parse.
    with path.open("r", encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Keep the exception type, but say where in the file the
                # bad record lives so it can be found and repaired.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return rows
def compact_row(row: dict[str, Any]) -> str:
    """Render one record as a Markdown-style text block.

    The block is a ``###`` heading (creation time and title), four metadata
    lines (id, url, type/is_column, engagement counters), a blank line, the
    stripped body text and a trailing newline.

    Args:
        row: Record to render. Missing keys render as ``None``, except
            ``created_at`` (empty string) and the four counters (``0``).

    Returns:
        The rendered block, ending in a single newline.
    """
    # A falsy or missing title falls back to a label built from the id.
    heading = row.get("title")
    if not heading:
        heading = f"Status {row.get('id')}"

    # Prefer "text"; fall back to "description", then to an empty body.
    content = row.get("text") or row.get("description") or ""

    counter_fields = (
        ("like", "like_count"),
        ("reply", "reply_count"),
        ("retweet", "retweet_count"),
        ("fav", "fav_count"),
    )
    counters = ", ".join(
        f"{label}={row.get(key, 0)}" for label, key in counter_fields
    )

    return (
        f"### {row.get('created_at', '')} | {heading}\n"
        f"id: {row.get('id')}\n"
        f"url: {row.get('url')}\n"
        f"type: {row.get('type')} | is_column: {row.get('is_column')}\n"
        f"engagement: {counters}\n"
        "\n"
        f"{content.strip()}\n"
    )
def export_chunks(rows: list[dict[str, Any]], out_dir: Path, max_chars: int) -> None:
    """Split the rendered rows into Markdown chunk files plus a manifest.

    Every chunk starts with the same fixed header. Rows are rendered with
    ``compact_row`` and appended in order; a chunk is written out before a
    row that would push it past ``max_chars``, unless the chunk holds only
    the header (so one oversized row still gets a chunk of its own).

    Files written into ``out_dir`` (created if missing):
    ``gemini_chunk_NNN.md`` for each chunk, ``manifest.json`` listing each
    chunk's file name and character count, and whatever
    ``write_prompt_files`` produces.

    Args:
        rows: Records to export, in output order.
        out_dir: Destination directory.
        max_chars: Character limit per chunk, header included.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    preamble = (
        "# 启四说雪球内容分析语料\n\n"
        "分析建议:按时间线提取投资框架、策略演化、标的偏好、风险控制、"
        "可转债/ETF/股票相关规则,并区分长期原则与短期市场判断。\n\n"
    )
    manifest_entries = []

    def flush(text: str) -> None:
        # Chunk numbers are 1-based and follow the number already written.
        sequence = len(manifest_entries) + 1
        target = out_dir / f"gemini_chunk_{sequence:03d}.md"
        target.write_text(text, encoding="utf-8")
        manifest_entries.append({"file": target.name, "chars": len(text)})

    buffer = preamble
    for record in rows:
        rendered = compact_row(record)
        would_overflow = len(buffer) + len(rendered) > max_chars
        # Never flush a header-only buffer: that would emit an empty chunk.
        if would_overflow and buffer != preamble:
            flush(buffer)
            buffer = preamble
        buffer += rendered

    # Write whatever is left after the last row.
    if buffer != preamble:
        flush(buffer)

    manifest_path = out_dir / "manifest.json"
    manifest_path.write_text(
        json.dumps(manifest_entries, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    write_prompt_files(out_dir, rows, manifest_entries)
def write_prompt_files(out_dir: Path, rows: list[dict[str, Any]], manifest: list[dict[str, Any]]) -> None:
    """Write the analysis prompt and the expected-output schema.

    Creates two UTF-8 files in ``out_dir``: ``GEMINI_PROMPT.md``, a fixed
    prompt with the record and chunk counts filled in, and
    ``analysis_schema.json``, a fixed JSON outline of the requested output.

    Args:
        out_dir: Existing directory to write into.
        rows: Exported records; only their count is used.
        manifest: Chunk manifest entries; only their count is used.
    """
    record_count = len(rows)
    chunk_count = len(manifest)
    prompt_text = f"""# Gemini 分析提示词
你将分析一组来自雪球用户“启四说”(qisi / UID 9199129225)的时间线语料。
语料规模:
- 记录数:{record_count}
- 分片数:{chunk_count}
请按以下目标分析:
1. 提取他的投资体系:资产类别、轮动逻辑、择时/择券/择股标准、仓位和风险控制。
2. 区分长期原则、阶段性判断、每日/短期交易提示,避免把短期观点误当成长期规则。
3. 汇总可转债、ETF、股票、指数、宏观/市场情绪相关的可执行规则。
4. 找出反复出现的关键词、指标、约束条件、风险事件和失败教训。
5. 按时间线梳理策略是否演化,以及哪些观点后来被修正。
6. 输出可验证引用:每条结论尽量附 `id`、日期、标题或 URL。
7. 不要提供投资建议;只做内容归纳、模式抽取和证据整理。
建议输出结构:
- Executive Summary
- 投资框架总览
- 策略模块拆解
- 风险控制与例外条件
- 时间线演化
- 高频标的/资产类别
- 可回测规则草案
- 证据表
- 不确定/需要人工复核的点
"""
    prompt_path = out_dir / "GEMINI_PROMPT.md"
    prompt_path.write_text(prompt_text, encoding="utf-8")

    # The schema is assembled from named parts; insertion order is kept so
    # the serialized JSON lists the keys in this order.
    evidence_entry = {
        "id": "number|string",
        "date": "string",
        "url": "string",
        "quote_or_summary": "string",
    }
    strategy_module = {
        "name": "string",
        "description": "string",
        "asset_classes": ["string"],
        "rules": ["string"],
        "risk_controls": ["string"],
        "evidence": [evidence_entry],
    }
    timeline_entry = {
        "period": "string",
        "change": "string",
        "evidence_ids": ["number|string"],
    }
    backtest_candidate = {
        "rule": "string",
        "required_data": ["string"],
        "ambiguities": ["string"],
        "evidence_ids": ["number|string"],
    }
    outline = {
        "summary": "string",
        "strategy_modules": [strategy_module],
        "timeline_evolution": [timeline_entry],
        "backtest_rule_candidates": [backtest_candidate],
        "open_questions": ["string"],
    }
    schema_path = out_dir / "analysis_schema.json"
    schema_path.write_text(
        json.dumps(outline, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
def parse_args() -> argparse.Namespace:
    """Parse the script's command-line arguments.

    Returns:
        A namespace with ``jsonl`` (required positional path string),
        ``out_dir`` (string, empty when not given) and ``max_chars``
        (int, 180000 when not given).
    """
    cli = argparse.ArgumentParser(description="Chunk Xueqiu JSONL for Gemini.")
    cli.add_argument("jsonl", help="Path to timeline_clean.jsonl")

    # Optional flags, declared as data and registered in one loop.
    option_specs = (
        ("--out-dir", {"default": "", "help": "Default: <jsonl parent>/gemini"}),
        ("--max-chars", {"type": int, "default": 180000}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
def main() -> int:
    """Run the export: parse arguments, load the JSONL, write the chunks.

    When no output directory is given, a ``gemini`` directory next to the
    input file is used.

    Returns:
        Always ``0``.
    """
    options = parse_args()
    source = Path(options.jsonl)

    # An empty --out-dir means "use the default next to the input file".
    if options.out_dir:
        target_dir = Path(options.out_dir)
    else:
        target_dir = source.parent / "gemini"

    records = load_jsonl(source)
    export_chunks(records, target_dir, options.max_chars)
    print(f"exported {len(records)} rows to {target_dir}")
    return 0
# Run the exporter only when this file is executed as a script; the value
# returned by main() is handed to SystemExit as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())