Skip to content

Commit 35c73d4

Browse files
authored
Merge pull request #72 from namtroi/phase5/dev
Phase5/dev
2 parents 41a7a47 + f452a73 commit 35c73d4

71 files changed

Lines changed: 4421 additions & 2118 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

apps/ai-worker/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,6 @@ lxml>=5.1.0
3636
markdownify>=0.11.6
3737
ebooklib>=0.18
3838
openpyxl>=3.1.2
39+
40+
# Phase 5: Hybrid Embeddings (dense + sparse)
41+
fastembed>=0.4.0

apps/ai-worker/src/embedder.py

Lines changed: 0 additions & 60 deletions
This file was deleted.
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# apps/ai-worker/src/hybrid_embedder.py
2+
"""
3+
HybridEmbedder: Dense + Sparse vector generation.
4+
5+
Phase 5: Uses fastembed for both dense (BGE) and sparse (BM25) embeddings.
6+
Replaces sentence-transformers with a single, unified embedding approach.
7+
"""
8+
9+
from dataclasses import dataclass
10+
from typing import List, Optional
11+
12+
import structlog
13+
14+
logger = structlog.get_logger()
15+
16+
17+
@dataclass
class SparseVector:
    """Sparse vector representation for BM25-style search."""

    # Positions of the non-zero entries.
    indices: List[int]
    # Weights of the non-zero entries; presumably values[i] belongs to
    # indices[i] -- confirm against the sparse model's output format.
    values: List[float]
23+
24+
25+
@dataclass
class HybridVector:
    """Combined dense + sparse vector for hybrid search."""

    dense: List[float]  # 384 floats (BGE-small)
    sparse: SparseVector  # Variable length
31+
32+
33+
class HybridEmbedder:
    """
    Generates both dense and sparse embeddings using fastembed.

    - Dense: BAAI/bge-small-en-v1.5 (384 dimensions)
    - Sparse: Qdrant/bm25 (BM25-based sparse vectors)

    The class is a singleton: every ``HybridEmbedder()`` call returns the
    same instance, and the models are loaded only while they are missing.
    """

    DENSE_MODEL_NAME = "BAAI/bge-small-en-v1.5"
    SPARSE_MODEL_NAME = "Qdrant/bm25"

    # Token-count heuristic used by get_token_counts().
    TOKENS_PER_WORD = 1.3
    MAX_TOKENS = 512  # model max

    _instance: Optional["HybridEmbedder"] = None
    _dense_model = None
    _sparse_model = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every HybridEmbedder() call, so only load when a
        # model is missing. Both models are checked so that a failed load
        # is retried on the next construction instead of leaving the
        # singleton half-initialised.
        if self._dense_model is None or self._sparse_model is None:
            self._load_models()

    def _load_models(self):
        """Load both embedding models.

        The models are stored on the instance only after both have been
        constructed, so an exception while loading either one leaves the
        embedder in its "not loaded" state.
        """
        from fastembed import SparseTextEmbedding, TextEmbedding

        logger.info("loading_hybrid_embedding_models")

        # Dense model - same as before (BAAI/bge-small-en-v1.5)
        logger.info("loading_dense_model", model=self.DENSE_MODEL_NAME)
        dense_model = TextEmbedding(self.DENSE_MODEL_NAME)

        # Sparse model - BM25 for keyword matching
        logger.info("loading_sparse_model", model=self.SPARSE_MODEL_NAME)
        sparse_model = SparseTextEmbedding(self.SPARSE_MODEL_NAME)

        self._dense_model = dense_model
        self._sparse_model = sparse_model

        logger.info("hybrid_embedding_models_loaded")

    def embed(self, texts: List[str]) -> "List[HybridVector]":
        """
        Generate hybrid (dense + sparse) embeddings for texts.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of HybridVector containing dense and sparse vectors, one
            per input text. An empty input returns an empty list without
            touching the models.

        Raises:
            Exception: Any error raised while embedding is logged as
                "hybrid_embedding_failed" and re-raised unchanged.
        """
        if not texts:
            return []

        try:
            # Generate both embedding types
            dense_embeddings = list(self._dense_model.embed(texts))
            sparse_embeddings = list(self._sparse_model.embed(texts))

            # Combine into HybridVector, converting array outputs to plain
            # Python lists.
            return [
                HybridVector(
                    dense=dense.tolist(),
                    sparse=SparseVector(
                        indices=sparse.indices.tolist(),
                        values=sparse.values.tolist(),
                    ),
                )
                for dense, sparse in zip(dense_embeddings, sparse_embeddings)
            ]

        except Exception as e:
            logger.error("hybrid_embedding_failed", error=str(e))
            raise

    def embed_dense_only(self, texts: List[str]) -> List[List[float]]:
        """
        Generate only dense embeddings (backward compatibility).

        Args:
            texts: List of text strings to embed.

        Returns:
            List of dense vectors (384 floats each).
        """
        if not texts:
            return []

        return [vector.tolist() for vector in self._dense_model.embed(texts)]

    def get_token_counts(self, texts: List[str]) -> List[int]:
        """
        Estimate token counts for texts.

        Uses a simple heuristic based on whitespace-separated word count.
        For more accurate counts, use the dense model's tokenizer.

        Args:
            texts: List of text strings to measure.

        Returns:
            One estimated token count per text, capped at MAX_TOKENS.
        """
        if not texts:
            return []

        # Simple estimation: ~1.3 tokens per word, i.e. ~0.75 words per
        # token (typical for English). This is faster than running the
        # tokenizer for each call.
        return [
            min(int(len(text.split()) * self.TOKENS_PER_WORD), self.MAX_TOKENS)
            for text in texts
        ]
141+
142+
143+
# Singleton instance, created lazily by get_hybrid_embedder().
_hybrid_embedder: Optional[HybridEmbedder] = None


def get_hybrid_embedder() -> HybridEmbedder:
    """Get singleton HybridEmbedder instance.

    The instance is constructed on the first call and cached in the
    module-level ``_hybrid_embedder`` variable for later calls.
    """
    global _hybrid_embedder
    if _hybrid_embedder is None:
        _hybrid_embedder = HybridEmbedder()
    return _hybrid_embedder

apps/ai-worker/src/main.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
from .callback import send_callback
1919
from .config import settings
20+
from .hybrid_embedder import HybridEmbedder
2021
from .logging_config import configure_logging, get_logger
2122
from .metrics import MetricsCollector
2223
from .models import (
@@ -70,21 +71,52 @@ async def readiness_check():
7071

7172
@app.post("/embed", response_model=EmbedResponse)
7273
async def embed_texts(request: EmbedRequest):
73-
"""Generate embeddings for a list of texts."""
74-
from .embedder import Embedder
74+
"""Generate dense-only embeddings for a list of texts (backward compatibility)."""
7575

7676
if not request.texts:
7777
return EmbedResponse(embeddings=[])
7878

7979
try:
80-
embedder = Embedder()
81-
embeddings = embedder.embed(request.texts)
80+
embedder = HybridEmbedder()
81+
embeddings = embedder.embed_dense_only(request.texts)
8282
return EmbedResponse(embeddings=embeddings)
8383
except Exception as e:
8484
logger.exception("embed_error", error=str(e))
8585
raise HTTPException(status_code=500, detail=str(e))
8686

8787

88+
@app.post("/embed/query")
async def embed_query(request: dict):
    """
    Generate hybrid embeddings for a search query.

    Expects a JSON object with a non-empty string under the "text" key.
    Returns both dense (384d) and sparse (BM25) vectors for Qdrant hybrid
    search, wrapped in a HybridEmbedResponse.

    Raises:
        HTTPException: 400 when "text" is missing, empty, or not a string;
            500 when embedding fails or produces no vectors.
    """
    from .models import HybridEmbedResponse, SparseVectorModel

    text = request.get("text", "")
    if not text:
        raise HTTPException(status_code=400, detail="text is required")
    if not isinstance(text, str):
        # The body is an unvalidated dict, so reject non-string payloads
        # here instead of letting the embedder fail with a 500.
        raise HTTPException(status_code=400, detail="text must be a string")

    try:
        embedder = HybridEmbedder()
        vectors = embedder.embed([text])

        if not vectors:
            raise HTTPException(status_code=500, detail="Failed to generate embeddings")

        query_vector = vectors[0]
        return HybridEmbedResponse(
            dense=query_vector.dense,
            sparse=SparseVectorModel(
                indices=query_vector.sparse.indices,
                values=query_vector.sparse.values,
            ),
        )
    except HTTPException:
        # Already a deliberate HTTP error; pass it through untouched rather
        # than logging it as unexpected and re-wrapping its message.
        raise
    except Exception as e:
        logger.exception("embed_query_error", error=str(e))
        raise HTTPException(status_code=500, detail=str(e)) from e
118+
119+
88120
@app.post("/process", response_model=ProcessResponse)
89121
async def process_document(request: ProcessRequest):
90122
"""

apps/ai-worker/src/models.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,33 @@ class EmbedResponse(BaseModel):
114114
"""Response with generated embeddings."""
115115

116116
embeddings: List[List[float]]
117+
118+
119+
# Phase 5: Hybrid Embedding Models for Query
120+
121+
122+
class SparseVectorModel(BaseModel):
    """Sparse vector for BM25-style keyword matching."""

    # Positions of the non-zero entries.
    indices: List[int]
    # Weights of the non-zero entries; presumably aligned with `indices`
    # element by element -- confirm against the embedder output.
    values: List[float]
127+
128+
129+
class HybridVectorModel(BaseModel):
    """Combined dense + sparse vector for hybrid search."""

    # Dense embedding as a flat list of floats.
    dense: List[float]
    # Sparse (indices/values) embedding.
    sparse: SparseVectorModel
134+
135+
136+
class HybridEmbedRequest(BaseModel):
    """Request to generate hybrid embeddings for search query."""

    # Query text to embed.
    text: str
140+
141+
142+
class HybridEmbedResponse(BaseModel):
    """Response with hybrid embeddings for search."""

    # Dense embedding of the query.
    dense: List[float]
    # Sparse (indices/values) embedding of the query.
    sparse: SparseVectorModel

apps/ai-worker/src/pipeline.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .chunkers.document_chunker import DocumentChunker
88
from .chunkers.presentation_chunker import PresentationChunker
99
from .chunkers.tabular_chunker import TabularChunker
10-
from .embedder import Embedder
10+
from .hybrid_embedder import HybridEmbedder
1111
from .logging_config import get_logger
1212
from .models import ProfileConfig
1313
from .quality.analyzer import QualityAnalyzer
@@ -48,7 +48,8 @@ def __init__(self, config: Optional[ProfileConfig] = None):
4848
penalty_per_flag=self.config.qualityPenaltyPerFlag,
4949
)
5050

51-
self.embedder = Embedder()
51+
# Phase 5: Hybrid embedder (dense + sparse)
52+
self.embedder = HybridEmbedder()
5253

5354
def _strip_breadcrumb_prefix(self, content: str) -> str:
5455
"""Remove breadcrumb prefix (> Chapter > Section) from content."""
@@ -291,15 +292,22 @@ def run(
291292
chunk["metadata"]["chunkType"] = category
292293
chunk["index"] = i
293294

294-
# 4. Generate embeddings and token counts (with timing)
295+
# 4. Generate hybrid embeddings and token counts (with timing)
295296
texts = [c["content"] for c in chunks]
296297
embed_start = time.time()
297-
embeddings = self.embedder.embed(texts)
298+
hybrid_vectors = self.embedder.embed(texts)
298299
token_counts = self.embedder.get_token_counts(texts)
299300
embedding_time_ms = int((time.time() - embed_start) * 1000)
300301

301302
for i, chunk in enumerate(chunks):
302-
chunk["embedding"] = embeddings[i]
303+
# Phase 5: Hybrid vector format for Qdrant
304+
chunk["vector"] = {
305+
"dense": hybrid_vectors[i].dense,
306+
"sparse": {
307+
"indices": hybrid_vectors[i].sparse.indices,
308+
"values": hybrid_vectors[i].sparse.values,
309+
},
310+
}
303311
chunk["metadata"]["tokenCount"] = token_counts[i]
304312

305313
logger.info(
-222 KB
Binary file not shown.

apps/ai-worker/tests/regression/test_existing_formats.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,13 +96,15 @@ class TestEmbeddingDimensions:
9696
"""Regression tests for embedding dimensions."""
9797

9898
def test_embedding_dimensions_unchanged(self):
99-
"""Embeddings should still be 384 dimensions."""
100-
from src.embedder import Embedder
99+
"""HybridEmbedder should return 384 dense dimensions."""
100+
from src.hybrid_embedder import HybridEmbedder
101101

102-
embedder = Embedder()
103-
embedding = embedder.embed("Test text")
102+
embedder = HybridEmbedder()
103+
result = embedder.embed(["Test text"])
104104

105-
assert len(embedding) == 384
105+
# HybridEmbedder returns list of HybridVector
106+
assert len(result) == 1
107+
assert len(result[0].dense) == 384
106108

107109

108110
class TestChunkStructure:

0 commit comments

Comments
 (0)