|
25 | 25 | import argparse |
26 | 26 | import json |
27 | 27 | import logging |
| 28 | +import subprocess |
28 | 29 | import sys |
| 30 | +from datetime import UTC, datetime |
29 | 31 | from pathlib import Path |
30 | 32 | from typing import Any |
31 | 33 | from unittest.mock import Mock |
|
40 | 42 |
|
41 | 43 | logger = logging.getLogger(__name__) |
42 | 44 |
|
| 45 | + |
| 46 | +def _project_root() -> Path: |
| 47 | + return Path(__file__).resolve().parents[2] |
| 48 | + |
| 49 | + |
| 50 | +def _iso_utc_now() -> str: |
| 51 | + return datetime.now(UTC).isoformat() |
| 52 | + |
| 53 | + |
| 54 | +def _safe_git_rev_parse_head(source_dir: Path | None) -> str | None: |
| 55 | + if source_dir is None: |
| 56 | + return None |
| 57 | + try: |
| 58 | + result = subprocess.run( |
| 59 | + ["git", "rev-parse", "HEAD"], |
| 60 | + cwd=source_dir, |
| 61 | + capture_output=True, |
| 62 | + text=True, |
| 63 | + check=True, |
| 64 | + ) |
| 65 | + except (subprocess.CalledProcessError, FileNotFoundError, OSError): |
| 66 | + return None |
| 67 | + commit = result.stdout.strip() |
| 68 | + return commit or None |
| 69 | + |
| 70 | + |
| 71 | +def _load_dependencies_commit(solver: str | None, dependencies_path: Path | None = None) -> str | None: |
| 72 | + if not solver: |
| 73 | + return None |
| 74 | + dep_path = dependencies_path or (_project_root() / ".dependencies.json") |
| 75 | + try: |
| 76 | + payload = json.loads(dep_path.read_text(encoding="utf-8")) |
| 77 | + except (FileNotFoundError, json.JSONDecodeError, OSError): |
| 78 | + return None |
| 79 | + repos = payload.get("repos") |
| 80 | + if not isinstance(repos, dict): |
| 81 | + return None |
| 82 | + entry = repos.get(solver) or repos.get(solver.lower()) |
| 83 | + if not isinstance(entry, dict): |
| 84 | + return None |
| 85 | + commit = entry.get("commit") |
| 86 | + return commit if isinstance(commit, str) and commit.strip() else None |
| 87 | + |
| 88 | + |
| 89 | +def _extract_embedding_metadata(embedder: Any) -> tuple[str | None, str | None, int | None]: |
| 90 | + config = None |
| 91 | + if hasattr(embedder, "service"): |
| 92 | + config = getattr(embedder.service, "config", None) |
| 93 | + |
| 94 | + provider: str | None = None |
| 95 | + model_name: str | None = None |
| 96 | + dimension: int | None = None |
| 97 | + if config is not None: |
| 98 | + provider = getattr(config, "embedding_provider", None) or provider |
| 99 | + model_name = getattr(config, "faiss_embedding_model", None) or model_name |
| 100 | + raw_dim = ( |
| 101 | + getattr(config, "embedding_dimension", None) |
| 102 | + or getattr(config, "faiss_embedding_dimension", None) |
| 103 | + ) |
| 104 | + if isinstance(raw_dim, int): |
| 105 | + dimension = raw_dim |
| 106 | + |
| 107 | + return provider, model_name, dimension |
| 108 | + |
| 109 | + |
def _write_provenance_file(
    *,
    output_dir: Path,
    filename: str,
    embedder: Any,
    solver: str | None,
    level: str,
    source_dir: Path | None,
) -> tuple[Path, dict[str, Any]]:
    """Write a JSON provenance record for one index build.

    The record holds the embedding settings taken from ``embedder``, the
    solver and level, a UTC timestamp, the git HEAD of ``source_dir`` and the
    commit recorded for the solver in the dependencies file.

    Args:
        output_dir: Directory to write into; created if it does not exist.
        filename: Name of the JSON file inside ``output_dir``.
        embedder: Object the embedding metadata is extracted from.
        solver: Solver name, or None for a solver-independent build.
        level: Index level label stored in the record.
        source_dir: Directory whose git HEAD is recorded, or None.

    Returns:
        ``(path_of_written_file, record_dict)``.
    """
    emb_provider, emb_model, emb_dimension = _extract_embedding_metadata(embedder)
    timestamp = _iso_utc_now()
    head_commit = _safe_git_rev_parse_head(source_dir)
    pinned_commit = _load_dependencies_commit(solver)

    record = {
        "version": "1",
        "generated_at": timestamp,
        "embedding_model": emb_model,
        "embedding_provider": emb_provider,
        "embedding_dimension": emb_dimension,
        "solver": solver,
        "level": level,
        "repo_commit": head_commit,
        "build_script": "build_all_indices.py",
        "dependencies_commit": pinned_commit,
    }

    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / filename
    serialized = json.dumps(record, indent=2, sort_keys=True)
    target.write_text(serialized, encoding="utf-8")
    return target, record
| 136 | + |
| 137 | + |
| 138 | +def _session_key(level: str | None, solver: str | None) -> str: |
| 139 | + return f"{level or ''}::{solver or '__global__'}" |
| 140 | + |
| 141 | + |
| 142 | +def _load_session_entries(session_manifest_path: Path) -> list[dict[str, Any]]: |
| 143 | + try: |
| 144 | + payload = json.loads(session_manifest_path.read_text(encoding="utf-8")) |
| 145 | + except (FileNotFoundError, json.JSONDecodeError, OSError): |
| 146 | + return [] |
| 147 | + entries = payload.get("entries") |
| 148 | + if not isinstance(entries, list): |
| 149 | + return [] |
| 150 | + normalized: list[dict[str, Any]] = [] |
| 151 | + for entry in entries: |
| 152 | + if isinstance(entry, dict): |
| 153 | + normalized.append(entry) |
| 154 | + return normalized |
| 155 | + |
| 156 | + |
def _write_build_session_manifest(
    *,
    root_output_dir: Path,
    new_entries: list[dict[str, Any]],
) -> Path:
    """Merge ``new_entries`` into ``build_session_manifest.json`` and rewrite it.

    Entries already in the manifest are kept unless a new entry has the same
    (level, solver) key, in which case the new entry replaces it. The merged
    entries are written sorted by level, solver and manifest path.

    Args:
        root_output_dir: Directory holding the manifest; created if missing.
        new_entries: Entries produced by the current build.

    Returns:
        Path of the manifest file that was written.
    """
    manifest_path = root_output_dir / "build_session_manifest.json"
    previous = _load_session_entries(manifest_path)

    # Later records win, so new entries override stored ones with the same key.
    by_key: dict[str, dict[str, Any]] = {}
    for record in (*previous, *new_entries):
        by_key[_session_key(record.get("level"), record.get("solver"))] = record

    def _ordering(item: dict[str, Any]) -> tuple[str, str, str]:
        return (
            str(item.get("level") or ""),
            str(item.get("solver") or ""),
            str(item.get("manifest_path") or ""),
        )

    ordered = list(by_key.values())
    ordered.sort(key=_ordering)

    document = {
        "version": "1",
        "generated_at": _iso_utc_now(),
        "build_script": "build_all_indices.py",
        "entries": ordered,
    }
    root_output_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(document, indent=2, sort_keys=True)
    manifest_path.write_text(serialized, encoding="utf-8")
    return manifest_path
| 191 | + |
43 | 192 | def _resolve_config_class(repo_path: Path | None = None, code_name: str | None = None): |
44 | 193 | configs = list(discover_code_configs()) |
45 | 194 |
|
@@ -395,15 +544,81 @@ def main() -> None: |
395 | 544 |
|
396 | 545 | # Build requested levels |
397 | 546 | total_indices = 0 |
| 547 | + session_entries: list[dict[str, Any]] = [] |
398 | 548 |
|
399 | 549 | if args.level in ['0', 'all']: |
400 | | - total_indices += build_level0(args.output / 'level0', embedder) |
| 550 | + level0_dir = args.output / 'level0' |
| 551 | + built = build_level0(level0_dir, embedder) |
| 552 | + total_indices += built |
| 553 | + if built > 0: |
| 554 | + manifest_path, payload = _write_provenance_file( |
| 555 | + output_dir=level0_dir, |
| 556 | + filename="faiss_provenance.json", |
| 557 | + embedder=embedder, |
| 558 | + solver=None, |
| 559 | + level="0", |
| 560 | + source_dir=None, |
| 561 | + ) |
| 562 | + logger.debug(f"Provenance manifest written to: {manifest_path}") |
| 563 | + session_entries.append( |
| 564 | + { |
| 565 | + "manifest_path": manifest_path.relative_to(args.output).as_posix(), |
| 566 | + **payload, |
| 567 | + } |
| 568 | + ) |
401 | 569 |
|
402 | 570 | if args.level in ['1', 'all']: |
403 | | - total_indices += build_level1(repo_root, args.output / 'level1', embedder, config_class) |
| 571 | + level1_dir = args.output / 'level1' |
| 572 | + built = build_level1(repo_root, level1_dir, embedder, config_class) |
| 573 | + total_indices += built |
| 574 | + if built > 0: |
| 575 | + solver_slug = str(solver_name).lower() |
| 576 | + filename = f"{solver_slug}_faiss_provenance.json" |
| 577 | + manifest_path, payload = _write_provenance_file( |
| 578 | + output_dir=level1_dir, |
| 579 | + filename=filename, |
| 580 | + embedder=embedder, |
| 581 | + solver=solver_slug, |
| 582 | + level="1", |
| 583 | + source_dir=repo_root, |
| 584 | + ) |
| 585 | + logger.debug(f"Provenance manifest written to: {manifest_path}") |
| 586 | + session_entries.append( |
| 587 | + { |
| 588 | + "manifest_path": manifest_path.relative_to(args.output).as_posix(), |
| 589 | + **payload, |
| 590 | + } |
| 591 | + ) |
404 | 592 |
|
405 | 593 | if args.level in ['2', 'all']: |
406 | | - total_indices += build_level2(repo_root, args.output / 'level2', embedder, config_class) |
| 594 | + level2_dir = args.output / 'level2' |
| 595 | + built = build_level2(repo_root, level2_dir, embedder, config_class) |
| 596 | + total_indices += built |
| 597 | + if built > 0: |
| 598 | + solver_slug = str(solver_name).lower() |
| 599 | + filename = f"{solver_slug}_faiss_provenance.json" |
| 600 | + manifest_path, payload = _write_provenance_file( |
| 601 | + output_dir=level2_dir, |
| 602 | + filename=filename, |
| 603 | + embedder=embedder, |
| 604 | + solver=solver_slug, |
| 605 | + level="2", |
| 606 | + source_dir=repo_root, |
| 607 | + ) |
| 608 | + logger.debug(f"Provenance manifest written to: {manifest_path}") |
| 609 | + session_entries.append( |
| 610 | + { |
| 611 | + "manifest_path": manifest_path.relative_to(args.output).as_posix(), |
| 612 | + **payload, |
| 613 | + } |
| 614 | + ) |
| 615 | + |
| 616 | + if session_entries: |
| 617 | + session_manifest = _write_build_session_manifest( |
| 618 | + root_output_dir=args.output, |
| 619 | + new_entries=session_entries, |
| 620 | + ) |
| 621 | + logger.debug(f"Build session manifest written to: {session_manifest}") |
407 | 622 |
|
408 | 623 | logger.debug(f""" |
409 | 624 | ╔══════════════════════════════════════════════════════════════════════╗ |
|
0 commit comments