Skip to content

Commit b45d8fb

Browse files
authored
Merge pull request #24 from offx-zinth/master
Master
2 parents 87cfd96 + f69cd8d commit b45d8fb

8 files changed

Lines changed: 268 additions & 8 deletions

File tree

ERROR

Whitespace-only changes.

docker-compose.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,13 @@ services:
66
NEO4J_dbms_security_procedures_unrestricted: apoc.*
77
NEO4J_dbms_security_procedures_allowlist: apoc.*
88
ports:
9+
<<<<<<< HEAD
10+
- "7475:7474" # Host 7475 maps to Container 7474
11+
- "7688:7687" # Host 7688 maps to Container 7687
12+
=======
913
- "7474:7474"
1014
- "7687:7687"
15+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
1116
volumes:
1217
- neo4j_data:/data
1318
- neo4j_logs:/logs
@@ -49,4 +54,8 @@ volumes:
4954
neo4j_data:
5055
neo4j_logs:
5156
chroma_data:
52-
smp_data:
57+
<<<<<<< HEAD
58+
smp_data:
59+
=======
60+
smp_data:
61+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ dependencies = [
2121
"tree-sitter-typescript>=0.23",
2222
"python-dotenv>=1.0",
2323
"structlog>=24.0",
24+
<<<<<<< HEAD
25+
"chromadb",
26+
=======
27+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
2428
]
2529

2630
[project.optional-dependencies]

smp/engine/embedding.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
"""Embedding service using NVIDIA NIM or OpenAI."""
2+
3+
from __future__ import annotations
4+
5+
import os
6+
from typing import Any
7+
8+
import httpx
9+
10+
from smp.logging import get_logger
11+
12+
log = get_logger(__name__)
13+
14+
15+
class EmbeddingService:
    """Generate text embeddings via NVIDIA NIM or OpenAI.

    Both providers are called through the same request shape: a POST to
    ``/embeddings`` (relative to the base URL) with ``{"input": ..., "model": ...}``
    and a bearer token. The provider name only selects which defaults and
    which API-key environment variable take precedence.

    Resolution order for each setting is: explicit constructor argument,
    then environment variable, then the provider's built-in default.

    Args:
        provider: ``"nvidia"`` or ``"openai"``. Any other value is accepted at
            construction time (NVIDIA defaults are used) and rejected with
            ``ValueError`` when ``embed``/``embed_batch`` is called.
        api_key: Bearer token. Falls back to the provider's own environment
            variable first, then to the other provider's variable.
        model: Model name. Falls back to ``EMBEDDING_MODEL``.
        base_url: API root. Falls back to ``EMBEDDING_BASE_URL``.
        dimension: Declared vector size exposed via ``dimension``. It is not
            sent to the API and not checked against returned vectors.
    """

    # Per-provider defaults. ``key_envs`` lists the API-key environment
    # variables in lookup order, so the selected provider's key always wins
    # when several keys are present in the environment.
    _PROVIDER_DEFAULTS = {
        "nvidia": {
            "key_envs": ("NVIDIA_NIM_API_KEY", "OPENAI_API_KEY"),
            "model": "nvidia/nv-embed-qa-4",
            "base_url": "https://integrate.api.nvidia.com/v1",
        },
        "openai": {
            "key_envs": ("OPENAI_API_KEY", "NVIDIA_NIM_API_KEY"),
            "model": "text-embedding-3-small",
            "base_url": "https://api.openai.com/v1",
        },
    }

    def __init__(
        self,
        provider: str = "nvidia",
        api_key: str | None = None,
        model: str | None = None,
        base_url: str | None = None,
        dimension: int = 768,
    ) -> None:
        # Unknown providers keep the historical NVIDIA defaults; they are
        # still rejected later, when an embedding is actually requested.
        defaults = self._PROVIDER_DEFAULTS.get(provider, self._PROVIDER_DEFAULTS["nvidia"])
        env_key = next(
            (os.environ[name] for name in defaults["key_envs"] if os.environ.get(name)),
            "",
        )

        self._provider = provider
        self._api_key = api_key or env_key
        self._model = model or os.environ.get("EMBEDDING_MODEL", defaults["model"])
        self._base_url = base_url or os.environ.get("EMBEDDING_BASE_URL", defaults["base_url"])
        self._dimension = dimension
        self._client: httpx.AsyncClient | None = None

    async def connect(self) -> None:
        """Create the HTTP client; an already-open client is closed first."""
        if self._client is not None:
            # Avoid leaking the previous connection pool on repeated connect().
            await self._client.aclose()
        self._client = httpx.AsyncClient(
            base_url=self._base_url,
            headers={"Authorization": f"Bearer {self._api_key}"},
            timeout=60.0,
        )
        log.info("embedding_service_connected", provider=self._provider, model=self._model)

    async def close(self) -> None:
        """Close the HTTP client if one is open. Safe to call repeatedly."""
        if self._client:
            await self._client.aclose()
            self._client = None

    @property
    def dimension(self) -> int:
        """Declared embedding dimension, as passed to the constructor."""
        return self._dimension

    async def embed(self, text: str) -> list[float]:
        """Generate the embedding for a single text.

        Raises:
            RuntimeError: If ``connect()`` has not been called.
            ValueError: If the configured provider is unknown.
        """
        if self._client is None:
            raise RuntimeError("EmbeddingService not connected")

        if self._provider == "nvidia":
            return await self._embed_nvidia(text)
        if self._provider == "openai":
            return await self._embed_openai(text)
        raise ValueError(f"Unknown provider: {self._provider}")

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for several texts, in the same order as ``texts``.

        An empty ``texts`` list returns ``[]`` without issuing a request.

        Raises:
            RuntimeError: If ``connect()`` has not been called.
            ValueError: If the configured provider is unknown.
        """
        if self._client is None:
            raise RuntimeError("EmbeddingService not connected")

        if self._provider == "nvidia":
            return await self._embed_batch_nvidia(texts)
        if self._provider == "openai":
            return await self._embed_batch_openai(texts)
        raise ValueError(f"Unknown provider: {self._provider}")

    async def _request_embeddings(self, inputs: str | list[str]) -> list[list[float]]:
        """POST ``inputs`` to ``/embeddings`` and return the vectors in input order."""
        if self._client is None:
            raise RuntimeError("EmbeddingService not connected")
        response = await self._client.post(
            "/embeddings",
            json={"input": inputs, "model": self._model},
        )
        response.raise_for_status()
        items = response.json()["data"]
        # Honour the per-item "index" when the response carries one instead of
        # trusting list order; the sort is stable, so items without an index
        # keep the order the server returned.
        ordered = sorted(items, key=lambda item: item.get("index", 0))
        return [item["embedding"] for item in ordered]

    async def _embed_nvidia(self, text: str) -> list[float]:
        """Single-text embedding through the NVIDIA endpoint."""
        return (await self._request_embeddings(text))[0]

    async def _embed_batch_nvidia(self, texts: list[str]) -> list[list[float]]:
        """Batch embedding through the NVIDIA endpoint."""
        if not texts:
            return []
        return await self._request_embeddings(texts)

    async def _embed_openai(self, text: str) -> list[float]:
        """Single-text embedding through the OpenAI endpoint."""
        return (await self._request_embeddings(text))[0]

    async def _embed_batch_openai(self, texts: list[str]) -> list[list[float]]:
        """Batch embedding through the OpenAI endpoint."""
        if not texts:
            return []
        return await self._request_embeddings(texts)
115+
116+
117+
def create_embedding_service() -> EmbeddingService:
    """Create an embedding service configured from environment variables.

    Reads ``EMBEDDING_PROVIDER`` (default ``"nvidia"``), ``EMBEDDING_MODEL``,
    ``EMBEDDING_BASE_URL`` and ``EMBEDDING_DIMENSION`` (default ``768``).
    The API key comes from ``NVIDIA_NIM_API_KEY`` / ``OPENAI_API_KEY``; the
    variable matching the selected provider is consulted first, so having
    both keys set does not send the wrong credential.

    Raises:
        ValueError: If ``EMBEDDING_DIMENSION`` is set but is not an integer.
    """
    provider = os.getenv("EMBEDDING_PROVIDER", "nvidia")

    # Prefer the key that belongs to the chosen provider; the other key stays
    # as a fallback so single-key setups keep working as before.
    if provider == "openai":
        key_envs = ("OPENAI_API_KEY", "NVIDIA_NIM_API_KEY")
    else:
        key_envs = ("NVIDIA_NIM_API_KEY", "OPENAI_API_KEY")
    api_key = next((value for name in key_envs if (value := os.getenv(name))), None)

    # An empty EMBEDDING_DIMENSION (e.g. a blank line in a .env file) is
    # treated as unset rather than crashing inside int("").
    raw_dimension = os.getenv("EMBEDDING_DIMENSION") or "768"
    try:
        dimension = int(raw_dimension)
    except ValueError:
        raise ValueError(f"EMBEDDING_DIMENSION must be an integer, got {raw_dimension!r}") from None

    return EmbeddingService(
        provider=provider,
        api_key=api_key,
        model=os.getenv("EMBEDDING_MODEL"),
        base_url=os.getenv("EMBEDDING_BASE_URL"),
        dimension=dimension,
    )

smp/engine/enricher.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,33 @@
1+
<<<<<<< HEAD
2+
"""Static semantic enricher with optional LLM-based embedding."""
3+
=======
14
"""Static semantic enricher — AST-based extraction.
25
36
Extracts docstrings, inline comments, decorators, type annotations,
47
and computes source hashes purely from the AST.
58
No LLM or embedding generation.
69
"""
10+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
711

812
from __future__ import annotations
913

1014
import hashlib
1115
from datetime import UTC, datetime
16+
<<<<<<< HEAD
17+
from typing import TYPE_CHECKING
18+
=======
19+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
1220

1321
from smp.core.models import GraphNode
1422
from smp.engine.interfaces import SemanticEnricher as SemanticEnricherInterface
1523
from smp.logging import get_logger
1624

25+
<<<<<<< HEAD
26+
if TYPE_CHECKING:
27+
from smp.engine.embedding import EmbeddingService
28+
29+
=======
30+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
1731
log = get_logger(__name__)
1832

1933

@@ -24,15 +38,28 @@ def _compute_source_hash(name: str, file_path: str, start: int, end: int, signat
2438

2539

2640
class StaticSemanticEnricher(SemanticEnricherInterface):
41+
<<<<<<< HEAD
42+
"""Static AST-based semantic enricher with optional embedding support."""
43+
44+
def __init__(self, embedding_service: EmbeddingService | None = None) -> None:
45+
=======
2746
"""Static AST-based semantic enricher. No LLM, no embeddings."""
2847

2948
def __init__(self) -> None:
49+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
3050
self._enrichment_counts: dict[str, int] = {
3151
"enriched": 0,
3252
"skipped": 0,
3353
"no_metadata": 0,
3454
"failed": 0,
3555
}
56+
<<<<<<< HEAD
57+
self._embedding_service = embedding_service
58+
59+
def set_embedding_service(self, service: EmbeddingService) -> None:
60+
self._embedding_service = service
61+
=======
62+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
3663

3764
async def enrich_node(
3865
self,
@@ -83,9 +110,28 @@ async def enrich_batch(
83110
enriched.append(result)
84111
return enriched
85112

113+
<<<<<<< HEAD
114+
@property
115+
def has_llm(self) -> bool:
116+
"""Check if LLM-based embedding is available."""
117+
return self._embedding_service is not None
118+
119+
async def embed(self, text: str) -> list[float]:
120+
"""Generate embedding using the embedding service if available."""
121+
if self._embedding_service is None:
122+
return []
123+
return await self._embedding_service.embed(text)
124+
125+
async def embed_batch(self, texts: list[str]) -> list[list[float]]:
126+
"""Generate embeddings for multiple texts."""
127+
if self._embedding_service is None:
128+
return [[] for _ in texts]
129+
return await self._embedding_service.embed_batch(texts)
130+
=======
86131
async def embed(self, text: str) -> list[float]:
87132
"""No-op embedding — static enricher does not use vectors."""
88133
return []
134+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
89135

90136
def get_counts(self) -> dict[str, int]:
91137
"""Return enrichment statistics."""
@@ -94,4 +140,8 @@ def get_counts(self) -> dict[str, int]:
94140
def reset_counts(self) -> None:
95141
"""Reset enrichment counters."""
96142
for key in self._enrichment_counts:
143+
<<<<<<< HEAD
144+
self._enrichment_counts[key] = 0
145+
=======
97146
self._enrichment_counts[key] = 0
147+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d

smp/engine/interfaces.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,13 @@ async def enrich_batch(self, nodes: list[GraphNode], force: bool = False) -> lis
5555
async def embed(self, text: str) -> list[float]:
5656
"""No-op for static enricher."""
5757

58+
<<<<<<< HEAD
59+
@abc.abstractmethod
60+
async def embed_batch(self, texts: list[str]) -> list[list[float]]:
61+
"""Generate embeddings for multiple texts."""
62+
63+
=======
64+
>>>>>>> 87cfd9650622e51c4c94d43d490450a82a87ad3d
5865

5966
class QueryEngine(abc.ABC):
6067
"""High-level query interface over the memory store."""

0 commit comments

Comments
 (0)