Skip to content

Commit 0c80ea2

Browse files
authored
Catchup 14: Add agent faiss provenance (#48)
* Refactor graph routing/validator glue into focused helpers
* Refactor step-2 routing and plan migration helpers
* Add FAISS provenance manifests for index builders
1 parent 05a5a18 commit 0c80ea2

6 files changed

Lines changed: 732 additions & 129 deletions

File tree

database/scripts/build_all_indices.py

Lines changed: 218 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
import argparse
2626
import json
2727
import logging
28+
import subprocess
2829
import sys
30+
from datetime import UTC, datetime
2931
from pathlib import Path
3032
from typing import Any
3133
from unittest.mock import Mock
@@ -40,6 +42,153 @@
4042

4143
logger = logging.getLogger(__name__)
4244

45+
46+
def _project_root() -> Path:
    """Return the directory two levels above this script's own directory.

    The script path is resolved to an absolute path first, so the result does
    not depend on the current working directory.
    """
    script_path = Path(__file__).resolve()
    ancestors = script_path.parents
    # parents[0] is the containing directory; index 2 is two levels above it.
    return ancestors[2]
48+
49+
50+
def _iso_utc_now() -> str:
51+
return datetime.now(UTC).isoformat()
52+
53+
54+
def _safe_git_rev_parse_head(source_dir: Path | None) -> str | None:
55+
if source_dir is None:
56+
return None
57+
try:
58+
result = subprocess.run(
59+
["git", "rev-parse", "HEAD"],
60+
cwd=source_dir,
61+
capture_output=True,
62+
text=True,
63+
check=True,
64+
)
65+
except (subprocess.CalledProcessError, FileNotFoundError, OSError):
66+
return None
67+
commit = result.stdout.strip()
68+
return commit or None
69+
70+
71+
def _load_dependencies_commit(solver: str | None, dependencies_path: Path | None = None) -> str | None:
72+
if not solver:
73+
return None
74+
dep_path = dependencies_path or (_project_root() / ".dependencies.json")
75+
try:
76+
payload = json.loads(dep_path.read_text(encoding="utf-8"))
77+
except (FileNotFoundError, json.JSONDecodeError, OSError):
78+
return None
79+
repos = payload.get("repos")
80+
if not isinstance(repos, dict):
81+
return None
82+
entry = repos.get(solver) or repos.get(solver.lower())
83+
if not isinstance(entry, dict):
84+
return None
85+
commit = entry.get("commit")
86+
return commit if isinstance(commit, str) and commit.strip() else None
87+
88+
89+
def _extract_embedding_metadata(embedder: Any) -> tuple[str | None, str | None, int | None]:
90+
config = None
91+
if hasattr(embedder, "service"):
92+
config = getattr(embedder.service, "config", None)
93+
94+
provider: str | None = None
95+
model_name: str | None = None
96+
dimension: int | None = None
97+
if config is not None:
98+
provider = getattr(config, "embedding_provider", None) or provider
99+
model_name = getattr(config, "faiss_embedding_model", None) or model_name
100+
raw_dim = (
101+
getattr(config, "embedding_dimension", None)
102+
or getattr(config, "faiss_embedding_dimension", None)
103+
)
104+
if isinstance(raw_dim, int):
105+
dimension = raw_dim
106+
107+
return provider, model_name, dimension
108+
109+
110+
def _write_provenance_file(
    *,
    output_dir: Path,
    filename: str,
    embedder: Any,
    solver: str | None,
    level: str,
    source_dir: Path | None,
) -> tuple[Path, dict[str, Any]]:
    """Write a JSON provenance manifest describing one index build.

    Args:
        output_dir: Directory that receives the manifest; created if missing.
        filename: Name of the manifest file inside ``output_dir``.
        embedder: Object passed to ``_extract_embedding_metadata``.
        solver: Solver identifier recorded in the manifest and used for the
            dependencies-commit lookup; may be ``None``.
        level: Index level label recorded in the manifest.
        source_dir: Directory whose git HEAD is recorded as ``repo_commit``;
            ``None`` records no commit.

    Returns:
        ``(manifest_path, payload)``: the path written and the dict that was
        serialized into it.
    """
    embedding_provider, embedding_model, embedding_dimension = _extract_embedding_metadata(embedder)
    generated_at = _iso_utc_now()
    repo_commit = _safe_git_rev_parse_head(source_dir)
    dependencies_commit = _load_dependencies_commit(solver)

    record: dict[str, Any] = {
        "version": "1",
        "generated_at": generated_at,
        "embedding_model": embedding_model,
        "embedding_provider": embedding_provider,
        "embedding_dimension": embedding_dimension,
        "solver": solver,
        "level": level,
        "repo_commit": repo_commit,
        "build_script": "build_all_indices.py",
        "dependencies_commit": dependencies_commit,
    }

    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / filename
    serialized = json.dumps(record, indent=2, sort_keys=True)
    target.write_text(serialized, encoding="utf-8")
    return target, record
136+
137+
138+
def _session_key(level: str | None, solver: str | None) -> str:
139+
return f"{level or ''}::{solver or '__global__'}"
140+
141+
142+
def _load_session_entries(session_manifest_path: Path) -> list[dict[str, Any]]:
    """Read the ``entries`` list from an existing build-session manifest.

    The read is best-effort: a missing, unreadable, non-UTF-8, malformed, or
    unexpectedly shaped file yields an empty list instead of an exception.

    Args:
        session_manifest_path: Path of the JSON manifest to read.

    Returns:
        The dict items of the manifest's ``entries`` list, in file order.
        Non-dict items are dropped.
    """
    try:
        payload = json.loads(session_manifest_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised when the file is not valid UTF-8.
        return []

    # The top-level JSON value may legally be a list, scalar or null; only an
    # object can hold the "entries" key.
    if not isinstance(payload, dict):
        return []

    entries = payload.get("entries")
    if not isinstance(entries, list):
        return []

    return [entry for entry in entries if isinstance(entry, dict)]
155+
156+
157+
def _write_build_session_manifest(
    *,
    root_output_dir: Path,
    new_entries: list[dict[str, Any]],
) -> Path:
    """Merge ``new_entries`` into ``build_session_manifest.json`` and rewrite it.

    Entries already in the manifest are kept unless a new entry has the same
    ``_session_key`` (level + solver), in which case the new entry replaces
    it. The merged entries are written sorted by level, solver and
    manifest path.

    Args:
        root_output_dir: Directory holding the session manifest; created if
            missing.
        new_entries: Entries produced by the current run.

    Returns:
        Path of the manifest file that was written.
    """
    session_path = root_output_dir / "build_session_manifest.json"
    previous = _load_session_entries(session_path)

    # Later items overwrite earlier ones with the same key, so entries from
    # this run take precedence over those loaded from disk.
    by_key: dict[str, dict[str, Any]] = {
        _session_key(item.get("level"), item.get("solver")): item
        for item in (*previous, *new_entries)
    }

    def _ordering(item: dict[str, Any]) -> tuple[str, str, str]:
        return (
            str(item.get("level") or ""),
            str(item.get("solver") or ""),
            str(item.get("manifest_path") or ""),
        )

    ordered = list(by_key.values())
    ordered.sort(key=_ordering)

    document = {
        "version": "1",
        "generated_at": _iso_utc_now(),
        "build_script": "build_all_indices.py",
        "entries": ordered,
    }
    root_output_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(document, indent=2, sort_keys=True)
    session_path.write_text(serialized, encoding="utf-8")
    return session_path
191+
43192
def _resolve_config_class(repo_path: Path | None = None, code_name: str | None = None):
44193
configs = list(discover_code_configs())
45194

@@ -395,15 +544,81 @@ def main() -> None:
395544

396545
# Build requested levels
397546
total_indices = 0
547+
session_entries: list[dict[str, Any]] = []
398548

399549
if args.level in ['0', 'all']:
400-
total_indices += build_level0(args.output / 'level0', embedder)
550+
level0_dir = args.output / 'level0'
551+
built = build_level0(level0_dir, embedder)
552+
total_indices += built
553+
if built > 0:
554+
manifest_path, payload = _write_provenance_file(
555+
output_dir=level0_dir,
556+
filename="faiss_provenance.json",
557+
embedder=embedder,
558+
solver=None,
559+
level="0",
560+
source_dir=None,
561+
)
562+
logger.debug(f"Provenance manifest written to: {manifest_path}")
563+
session_entries.append(
564+
{
565+
"manifest_path": manifest_path.relative_to(args.output).as_posix(),
566+
**payload,
567+
}
568+
)
401569

402570
if args.level in ['1', 'all']:
403-
total_indices += build_level1(repo_root, args.output / 'level1', embedder, config_class)
571+
level1_dir = args.output / 'level1'
572+
built = build_level1(repo_root, level1_dir, embedder, config_class)
573+
total_indices += built
574+
if built > 0:
575+
solver_slug = str(solver_name).lower()
576+
filename = f"{solver_slug}_faiss_provenance.json"
577+
manifest_path, payload = _write_provenance_file(
578+
output_dir=level1_dir,
579+
filename=filename,
580+
embedder=embedder,
581+
solver=solver_slug,
582+
level="1",
583+
source_dir=repo_root,
584+
)
585+
logger.debug(f"Provenance manifest written to: {manifest_path}")
586+
session_entries.append(
587+
{
588+
"manifest_path": manifest_path.relative_to(args.output).as_posix(),
589+
**payload,
590+
}
591+
)
404592

405593
if args.level in ['2', 'all']:
406-
total_indices += build_level2(repo_root, args.output / 'level2', embedder, config_class)
594+
level2_dir = args.output / 'level2'
595+
built = build_level2(repo_root, level2_dir, embedder, config_class)
596+
total_indices += built
597+
if built > 0:
598+
solver_slug = str(solver_name).lower()
599+
filename = f"{solver_slug}_faiss_provenance.json"
600+
manifest_path, payload = _write_provenance_file(
601+
output_dir=level2_dir,
602+
filename=filename,
603+
embedder=embedder,
604+
solver=solver_slug,
605+
level="2",
606+
source_dir=repo_root,
607+
)
608+
logger.debug(f"Provenance manifest written to: {manifest_path}")
609+
session_entries.append(
610+
{
611+
"manifest_path": manifest_path.relative_to(args.output).as_posix(),
612+
**payload,
613+
}
614+
)
615+
616+
if session_entries:
617+
session_manifest = _write_build_session_manifest(
618+
root_output_dir=args.output,
619+
new_entries=session_entries,
620+
)
621+
logger.debug(f"Build session manifest written to: {session_manifest}")
407622

408623
logger.debug(f"""
409624
╔══════════════════════════════════════════════════════════════════════╗

0 commit comments

Comments
 (0)