Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@ result = await service.retrieve(
"categories": [...], # Relevant topic areas (auto-prioritized)
"items": [...], # Specific memory facts
"resources": [...], # Original sources for traceability
"graph_nodes": [...], # Graph-enhanced context (if enabled)
"next_step_query": "..." # Predicted follow-up context
}
```
Expand All @@ -487,6 +488,35 @@ result = await service.retrieve(
- `where={"agent_id__in": ["1", "2"]}` - Multi-agent coordination
- Omit `where` for global context awareness

#### Graph-Enhanced Retrieval

MemU can optionally build a **knowledge graph** from stored memories, enabling retrieval that follows semantic relationships between concepts — not just vector similarity.

```python
service = MemoryService(
retrieve_config={
"method": "rag",
"graph": {
"enabled": True, # Enable graph recall alongside vector search
"weight": 0.3, # Score fusion: 70% vector + 30% graph
"max_nodes": 6, # Max graph nodes per query
},
},
# ... other config
)
```

When enabled, the retrieve pipeline runs a **dual-path graph recall**:
1. **Precise path**: Vector / full-text-search (FTS) seed nodes → community expansion → breadth-first (BFS) walk → Personalized PageRank (PPR)
2. **Generalized path**: Community representatives → shallow walk → PPR

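Results are fused with vector retrieval using configurable weights (`α * vector_score + β * graph_ppr`, where `β` is the `weight` setting and `α = 1 - weight`, so `weight: 0.3` yields 70% vector + 30% graph), giving you both direct semantic matches and structurally related context.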

The graph store supports:
- **Personalized PageRank** for query-relevant ranking
- **Label Propagation** for automatic community detection
- **Global PageRank** for baseline node importance

---

## 💡 Proactive Scenarios
Expand Down
282 changes: 282 additions & 0 deletions experiment_graph_recall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
"""
对比实验: 纯向量召回 vs 图增强召回
baseline: pgvector cosine top-10 (memory_items),种子向量 = gm_nodes FTS命中节点的均值
graph: gm_nodes FTS种子 → PPR 扩展 → 关联 memory_items

策略: Ollama 可能未运行,改用 gm_nodes 里关键词 FTS 找种子节点,
取其 embedding 均值作为查询向量,对 memory_items 做 cosine 搜索。
这样 baseline 和 graph 共用同一 embedding 空间,对比公平。
"""

import json
from collections import defaultdict

import psycopg2
import psycopg2.extras

# ─── Configuration ────────────────────────────────────────────────────────────
# Connection string handed to psycopg2.connect().
# NOTE(review): credentials are hard-coded here; consider reading them from the environment.
PG_DSN = "dbname=memu user=postgres password=postgres host=localhost"
# Value bound to the `user_id = %s` filter of every query in this script.
USER_ID = "boris"

# Each query: (display label, [keywords used to find seed nodes in gm_nodes])
QUERIES = [
    ("量化交易 ShinkaEvolve 选股", ["ShinkaEvolve", "quant", "选股", "量化", "scanner"]),
    ("memU 记忆系统 图增强", ["memU", "graph", "memory", "PPR", "recall"]),
    ("FSC full-self-coding agent", ["FSC", "full-self-coding", "agent", "executor"]),
    ("Gitea 前端美化 液态玻璃", ["Gitea", "液态玻璃", "frontend", "glass", "NAS"]),
    ("SUPER 硬件 H11DSI 内存", ["SUPER", "H11DSI", "hardware", "memory", "512GB"]),
]


# ─── Utility functions ────────────────────────────────────────────────────────

def vec_literal(v: list[float]) -> str:
    """Render *v* as a bracketed, comma-separated literal with six decimals per component.

    Example: ``[0.1, 2]`` becomes ``"[0.100000,2.000000]"``. The queries in this
    file bind the result as a parameter and cast it with ``::vector``.
    """
    components = []
    for value in v:
        components.append(format(value, ".6f"))
    return "[{}]".format(",".join(components))


# ─── Find seed nodes in gm_nodes by keyword (ILIKE) and average their embeddings ───

def get_seed_embedding(cur, keywords: list[str]) -> tuple[list[float] | None, list[dict]]:
"""用关键词 ILIKE 从 gm_nodes 找种子节点,返回其 embedding 均值 + 节点列表。"""
conditions = " OR ".join(["(name ILIKE %s OR description ILIKE %s OR content ILIKE %s)"] * len(keywords))
params = []
for kw in keywords:
params.extend([f"%{kw}%"] * 3)
params.append(USER_ID)

cur.execute(
f"""SELECT id, name, description, embedding::text as emb_text
FROM gm_nodes
WHERE ({conditions}) AND user_id = %s AND status = 'active' AND embedding IS NOT NULL
LIMIT 10""",
params,
)
rows = cur.fetchall()
if not rows:
return None, []

# 解析 embedding 字符串 → float list
def parse_vec(s: str) -> list[float]:
return [float(x) for x in s.strip("[]").split(",")]

vecs = [parse_vec(r["emb_text"]) for r in rows]
dim = len(vecs[0])
mean_vec = [sum(v[i] for v in vecs) / len(vecs) for i in range(dim)]
seeds = [{"id": r["id"], "name": r["name"]} for r in rows]
return mean_vec, seeds


# ─── Baseline: vector search directly over memory_items ──────────────────────

def baseline_recall(cur, query_vec: list[float], topk: int = 10) -> list[dict]:
    """Return the *topk* ``memory_items`` of ``USER_ID`` nearest to *query_vec*.

    Rows are ordered by the ``<=>`` distance to the query vector. Each result is
    a dict with ``id``, ``text`` (the summary cut to 120 characters) and
    ``score`` (``1 - distance`` as a float).
    """
    literal = vec_literal(query_vec)
    # The vector literal is bound twice: once for the score, once for ordering.
    cur.execute(
        """
SELECT id, summary,
1 - (embedding <=> %s::vector) AS score
FROM memory_items
WHERE user_id = %s
AND embedding IS NOT NULL
ORDER BY embedding <=> %s::vector
LIMIT %s
""",
        (literal, USER_ID, literal, topk),
    )
    hits = []
    for row in cur.fetchall():
        hits.append(
            {
                "id": row["id"],
                "text": row["summary"][:120],
                "score": float(row["score"]),
            }
        )
    return hits


# ─── Graph-enhanced recall ────────────────────────────────────────────────────

def _walk_neighbourhood(adj, seeds, depth):
    """Return the seeds plus every node within *depth* undirected hops of them."""
    reached = set(seeds)
    frontier = set(seeds)
    for _ in range(depth):
        frontier = {nb for nid in frontier for nb in adj.get(nid, ())} - reached
        if not frontier:
            break
        reached |= frontier
    return reached


def _personalized_pagerank(adj, node_ids, seeds, damping, iterations):
    """Power-iterate Personalized PageRank over *node_ids*, teleporting uniformly to *seeds*."""
    teleport = 1.0 / len(seeds)
    seed_set = set(seeds)
    rank = {nid: (teleport if nid in seed_set else 0.0) for nid in node_ids}
    for _ in range(iterations):
        # Restart mass goes to the seeds only; every other node starts the round at 0.
        nxt = {nid: ((1 - damping) * teleport if nid in seed_set else 0.0) for nid in node_ids}
        dangling = 0.0
        for nid in node_ids:
            nbrs = adj.get(nid)  # .get, so a defaultdict is not grown by reads
            if not nbrs:
                dangling += rank[nid]
                continue
            share = damping * (rank[nid] / len(nbrs))
            for nb in nbrs:
                nxt[nb] += share
        # Rank held by nodes without edges is handed back to the seeds.
        if dangling > 0:
            for sid in seeds:
                nxt[sid] += damping * dangling * teleport
        rank = nxt
    return rank


def graph_recall(
    cur,
    query_vec: list[float],
    topk_seed: int = 5,
    max_walk: int = 10,
    *,
    walk_depth: int = 2,
    damping: float = 0.85,
    iterations: int = 20,
) -> list[dict]:
    """Recall graph nodes related to *query_vec* via a BFS walk ranked by PPR.

    Steps:
      1. Pick the *topk_seed* active ``gm_nodes`` of ``USER_ID`` nearest to
         *query_vec* as seeds.
      2. Load all active nodes and the edges between them as an undirected graph.
      3. Collect the nodes within *walk_depth* hops of the seeds.
      4. Run Personalized PageRank over the whole graph, restarting at the seeds.
      5. Keep the *max_walk* collected nodes with the highest PPR score, breaking
         ties by node id so the output is reproducible across runs.
      6. Load name / description / content for those nodes.

    Args:
        cur: Open DB cursor whose rows can be indexed by column name.
        query_vec: Query embedding.
        topk_seed: Number of seed nodes.
        max_walk: Maximum number of nodes returned.
        walk_depth: BFS depth from the seeds.
        damping: PPR damping factor.
        iterations: Number of PPR power iterations.

    Returns:
        Dicts with ``id``, ``name``, ``text`` (``"[name] description content"``
        cut to 120 characters) and ``ppr`` (score rounded to 6 decimals), best
        first. Empty when there are no seeds.
    """
    # 1. Vector seeds: nearest gm_nodes to the query vector.
    vlit = vec_literal(query_vec)
    cur.execute(
        """
        SELECT id
        FROM gm_nodes
        WHERE user_id = %s
          AND status = 'active'
          AND embedding IS NOT NULL
        ORDER BY embedding <=> %s::vector
        LIMIT %s
        """,
        (USER_ID, vlit, topk_seed),
    )
    seed_ids = [r["id"] for r in cur.fetchall()]
    if not seed_ids:
        return []

    # 2. Load the whole graph (nodes + edges); edges touching inactive nodes are dropped.
    cur.execute("SELECT id FROM gm_nodes WHERE user_id = %s AND status = 'active'", (USER_ID,))
    all_node_ids = {r["id"] for r in cur.fetchall()}

    cur.execute("SELECT from_id, to_id FROM gm_edges WHERE user_id = %s", (USER_ID,))
    adj: dict[str, set[str]] = defaultdict(set)
    for r in cur.fetchall():
        src, dst = r["from_id"], r["to_id"]
        if src in all_node_ids and dst in all_node_ids:
            adj[src].add(dst)
            adj[dst].add(src)

    valid_seeds = [s for s in seed_ids if s in all_node_ids]
    if not valid_seeds:
        return []

    # 3. BFS walk from the seeds.
    candidate_ids = _walk_neighbourhood(adj, valid_seeds, walk_depth) & all_node_ids

    # 4. PPR over the full graph, personalised on the seeds.
    rank = _personalized_pagerank(adj, all_node_ids, valid_seeds, damping, iterations)

    # 5. Best-ranked walked nodes; the id tie-break removes set-order dependence.
    ranked = sorted(candidate_ids, key=lambda n: (-rank.get(n, 0.0), str(n)))[:max_walk]
    if not ranked:
        return []

    # 6. Load node content for the selected ids.
    placeholders = ",".join(["%s"] * len(ranked))
    cur.execute(
        f"SELECT id, name, description, content FROM gm_nodes WHERE id IN ({placeholders})",
        ranked,
    )
    node_map = {r["id"]: r for r in cur.fetchall()}

    results = []
    for nid in ranked:
        n = node_map.get(nid)
        if n is None:
            continue
        text = f"[{n['name']}] {n['description'] or ''} {n['content'] or ''}".strip()
        results.append({
            "id": nid,
            "name": n["name"],
            "text": text[:120],
            "ppr": round(rank.get(nid, 0.0), 6),
        })
    return results


# ─── Map graph nodes back to memory_items (node-name substring match on summary) ───

def find_related_memories(cur, node_names: list[str], topk: int = 5) -> list[dict]:
    """Find ``memory_items`` of ``USER_ID`` whose summary mentions a node name.

    Uses a case-insensitive substring match (``summary ILIKE '%name%'``) rather
    than full-text search, so it does not depend on any FTS configuration.
    Only the first five non-empty names are used.

    Args:
        cur: Open DB cursor whose rows can be indexed by column name.
        node_names: Graph node names to look for; empty/None entries are ignored.
        topk: Maximum number of rows returned.

    Returns:
        Dicts with ``id`` and ``text`` (summary cut to 120 characters); empty
        when no usable name is given.
    """
    # An empty name would become ILIKE '%%' and match every row; None would
    # become the literal pattern '%None%'. Drop both before building the query.
    names = [name for name in node_names if name][:5]
    if not names:
        return []

    conditions = " OR ".join(["summary ILIKE %s"] * len(names))
    params = [f"%{name}%" for name in names]
    params.extend([USER_ID, topk])
    # Only placeholder text is interpolated; every value, LIMIT included, is bound.
    cur.execute(
        f"""SELECT id, summary FROM memory_items
        WHERE ({conditions}) AND user_id = %s
        LIMIT %s""",
        params,
    )
    return [{"id": r["id"], "text": r["summary"][:120]} for r in cur.fetchall()]


# ─── Main program ─────────────────────────────────────────────────────────────

def _compare_query(cur, query_label, keywords):
    """Run baseline and graph recall for one query and print the comparison."""
    print("=" * 70)
    print(f"查询: 【{query_label}】")
    print(f"关键词: {keywords}")
    print("=" * 70)

    print(" → 从 gm_nodes 找种子节点...", end="", flush=True)
    qvec, seed_nodes = get_seed_embedding(cur, keywords)
    if qvec is None:
        print(" 无命中,跳过")
        return
    print(f" 命中 {len(seed_nodes)} 个节点: {[n['name'] for n in seed_nodes]}")

    # Baseline: pure vector search over memory_items.
    base_results = baseline_recall(cur, qvec, topk=10)
    base_ids = {r["id"] for r in base_results}

    # Graph: PPR-ranked nodes, then the memory_items whose summary mentions them.
    graph_results = graph_recall(cur, qvec, topk_seed=5, max_walk=10)
    graph_node_names = [r["name"] for r in graph_results]
    graph_memories = find_related_memories(cur, graph_node_names, topk=10)
    graph_mem_ids = {r["id"] for r in graph_memories}

    # What the graph path surfaced that the baseline did not, and what both found.
    new_ids = graph_mem_ids - base_ids
    overlap_ids = graph_mem_ids & base_ids

    print("\n [BASELINE top-10 memory_items]")
    for i, r in enumerate(base_results, 1):
        print(f" {i:2}. (score={r['score']:.3f}) {r['text'][:100]}")

    print("\n [GRAPH 激活的节点 (PPR top-10 from gm_nodes)]")
    if graph_results:
        for i, r in enumerate(graph_results, 1):
            print(f" {i:2}. (ppr={r['ppr']:.5f}) [{r['name']}] {r['text'][:80]}")
    else:
        print(" (无结果)")

    print(f"\n [GRAPH → 关联 memory_items (共 {len(graph_memories)} 条)]")
    for r in graph_memories:
        marker = "★NEW" if r["id"] in new_ids else " =="
        print(f" {marker} {r['text'][:100]}")

    print("\n [差异统计]")
    print(f" baseline: {len(base_results)} 条")
    print(f" graph关联memories: {len(graph_memories)} 条")
    print(f" 重叠: {len(overlap_ids)} 条")
    print(f" 图独占新增 (★): {len(new_ids)} 条")

    if new_ids:
        print("\n [图独占新增详情]")
        for r in graph_memories:
            if r["id"] in new_ids:
                print(f" → {r['text'][:110]}")

    print()


def main():
    """Connect to PostgreSQL and print the baseline-vs-graph comparison for every query.

    The connection and cursor are closed even if a query raises.
    """
    print(f"连接 PG: {PG_DSN}")
    conn = psycopg2.connect(PG_DSN)
    try:
        conn.autocommit = True
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            # The original comment describes this as triggering pgvector type
            # registration; in effect it fails early if the `vector` type is missing.
            cur.execute("SELECT NULL::vector")

            print(f"Embedding: gm_nodes 关键词 FTS 种子均值向量\n{'='*70}\n")

            for query_label, keywords in QUERIES:
                _compare_query(cur, query_label, keywords)
        finally:
            cur.close()
    finally:
        conn.close()
    print("实验完成。")


if __name__ == "__main__":
    main()
27 changes: 27 additions & 0 deletions prepare_pr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import subprocess
import sys

def run_cmd(cmd):
    """Echo *cmd*, run it through the shell, and exit the process if it fails.

    On a non-zero exit status the status is printed and passed to ``sys.exit``;
    on success the function simply returns.
    """
    print(f"Running: {cmd}")
    exit_code = subprocess.run(cmd, shell=True).returncode
    if exit_code == 0:
        return
    print(f"Error: Command failed with exit code {exit_code}")
    sys.exit(exit_code)

def main():
    """Run the PR preparation commands in order, stopping at the first failure.

    The sequence: create the feature branch from main, cherry-pick three
    commits, run the graph store tests, push the branch, and open the pull
    request with ``gh``.
    """
    create_pr = """gh pr create --title "feat: graph-enhanced retrieval with PPR and community detection" --body "Adds graph-enhanced retrieval that layers a knowledge graph (Personalized PageRank + Label Propagation + community detection) on top of existing vector search for more contextual memory recall. Core changes: GraphStore repository (CRUD + PPR + LPA + dual-path recall), retrieve pipeline WorkflowStep, configurable score fusion (alpha*vector + beta*graph), Alembic migration for gm_* tables. Configuration: New RetrieveGraphConfig (enabled, weight, max_nodes) — disabled by default, zero impact on existing users. Testing: 30 unit tests covering CRUD, PPR, LPA, community merge, score fusion, config validation. Files changed: 12 files, ~1500 lines added. Known limitations (pre-existing): ddl_mode=\\"validate\\" still runs Alembic upgrade; migration hard-codes user_id scope column. Breaking changes: None — graph retrieval is opt-in." --repo NevaMind-AI/memU --head feat/graph-enhanced-retrieval --base main"""
    commands = (
        "git checkout main",
        "git checkout -b feat/graph-enhanced-retrieval",
        "git cherry-pick ac90183 40044e9 2893ed0",
        ".venv/Scripts/python -m pytest tests/test_graph_store.py -v --tb=short",
        "git push origin feat/graph-enhanced-retrieval",
        create_pr,
    )

    # run_cmd exits the process on failure, so later commands never run after an error.
    for command in commands:
        run_cmd(command)

    print("Successfully completed all PR preparation steps.")


if __name__ == "__main__":
    main()
18 changes: 18 additions & 0 deletions prepare_pr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# Prepare and open the graph-enhanced retrieval pull request:
# branch from main, cherry-pick the commits, run the tests, push, create the PR.
# Abort on the first failing command.
set -e

BRANCH="feat/graph-enhanced-retrieval"
REPO="NevaMind-AI/memU"
PR_TITLE="feat: graph-enhanced retrieval with PPR and community detection"
PR_BODY="Adds graph-enhanced retrieval that layers a knowledge graph (Personalized PageRank + Label Propagation + community detection) on top of existing vector search for more contextual memory recall. Core changes: GraphStore repository (CRUD + PPR + LPA + dual-path recall), retrieve pipeline WorkflowStep, configurable score fusion (alpha*vector + beta*graph), Alembic migration for gm_* tables. Configuration: New RetrieveGraphConfig (enabled, weight, max_nodes) — disabled by default, zero impact on existing users. Testing: 30 unit tests covering CRUD, PPR, LPA, community merge, score fusion, config validation. Files changed: 12 files, ~1500 lines added. Known limitations (pre-existing): ddl_mode=\"validate\" still runs Alembic upgrade; migration hard-codes user_id scope column. Breaking changes: None — graph retrieval is opt-in."

echo "Starting PR preparation for memU graph-enhanced retrieval..."

# Create the feature branch from main and bring over the three commits.
git checkout main
git checkout -b "$BRANCH"
git cherry-pick ac90183 40044e9 2893ed0

# Run the graph store tests before anything is pushed.
./.venv/Scripts/python -m pytest tests/test_graph_store.py -v --tb=short

git push origin "$BRANCH"

gh pr create --title "$PR_TITLE" \
  --body "$PR_BODY" \
  --repo "$REPO" --head "$BRANCH" --base main

echo "PR preparation completed successfully."
Loading