Skip to content

Commit 1d548fc

Browse files
authored
Merge pull request #16 from ClipABit/pr/search-module-4
Pr/search module 4
2 parents 6f45f5a + 5d4f4d1 commit 1d548fc

File tree

3 files changed

+206
-0
lines changed

3 files changed

+206
-0
lines changed

backend/search/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
"""
Search module for semantic search using CLIP embeddings and Pinecone.
"""

# Re-export the package's public API so callers can write
# `from search import TextEmbedder, Searcher` directly.
from search.embedder import TextEmbedder
from search.searcher import Searcher

__all__ = ["TextEmbedder", "Searcher"]

backend/search/embedder.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""
Text Embedding module using CLIP model.

Provides text-to-vector conversion using OpenAI's CLIP model
for semantic search capabilities.
"""
# Fix: this string previously appeared AFTER the imports, where it is a
# no-op expression statement rather than the module docstring. Placing it
# first makes it the real `__doc__` (visible to help() and tooling).

import logging
from typing import Union, List

import numpy as np
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer

# NOTE(review): calling basicConfig at import time configures the root
# logger as a side effect; kept for behavioral compatibility, but library
# modules normally leave logging configuration to the application.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
16+
17+
18+
class TextEmbedder:
    """
    CLIP-based text embedder for semantic search.

    Turns text queries into 512-dimensional, L2-normalized embeddings
    using OpenAI's CLIP text encoder (ViT-B/32 by default).

    Only the projected text tower (CLIPTextModelWithProjection) is loaded,
    avoiding the cost of the full CLIP model with its vision encoder.

    Usage:
        embedder = TextEmbedder()
        vector = embedder.embed_text("woman on a train")
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        """
        Initialize the text embedder.

        Args:
            model_name: HuggingFace model identifier for CLIP.
                        Defaults to "openai/clip-vit-base-patch32".
        """
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Deferred until the first embed_text() call (lazy loading).
        self.model = None
        self.tokenizer = None

        logger.info(f"TextEmbedder initialized (device: {self.device})")

    def _load_model(self):
        """Lazy load the CLIP text model on first use."""
        if self.model is not None:
            return  # already loaded
        logger.info(f"Loading CLIP text model: {self.model_name}")
        self.tokenizer = CLIPTokenizer.from_pretrained(self.model_name)
        self.model = CLIPTextModelWithProjection.from_pretrained(self.model_name).to(self.device)
        self.model.eval()
        logger.info("CLIP text model loaded successfully")

    def embed_text(self, text: Union[str, List[str]]) -> np.ndarray:
        """
        Generate embeddings for text input(s).

        Args:
            text: Single text string or list of text strings

        Returns:
            numpy array of embeddings (512-d, L2-normalized)
            Shape: (512,) for single text, (N, 512) for batch
        """
        self._load_model()

        # Normalize to a batch; a bare string becomes a one-element list.
        batch = [text] if isinstance(text, str) else text

        # Tokenize; 77 is CLIP's maximum sequence length.
        tokenized = self.tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77
        ).to(self.device)

        with torch.no_grad():
            # text_embeds is already projected into the joint CLIP space.
            features = self.model(**tokenized).text_embeds
            # L2 normalize so dot product equals cosine similarity.
            features = features / features.norm(p=2, dim=-1, keepdim=True)

        embeddings = features.cpu().numpy()

        # A one-row result collapses to a flat (512,) vector, matching the
        # single-input convention.
        return embeddings[0] if len(embeddings) == 1 else embeddings

backend/search/searcher.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
"""
2+
Semantic Searcher using Pinecone vector database.
3+
4+
Coordinates text embedding and vector search to find semantically
5+
similar content.
6+
"""
7+
8+
import logging
9+
from typing import List, Dict, Any
10+
11+
from database.pinecone_connector import PineconeConnector
12+
from search.embedder import TextEmbedder
13+
14+
logging.basicConfig(level=logging.INFO)
15+
logger = logging.getLogger(__name__)
16+
17+
18+
class Searcher:
    """
    High-level semantic search coordinator.

    Wires a TextEmbedder (query -> vector) to a PineconeConnector
    (vector -> nearest matches) behind a single search() call.

    Usage:
        searcher = Searcher(api_key="...", index_name="chunks-index")
        results = searcher.search("woman on a train", top_k=3)
    """

    def __init__(
        self,
        api_key: str,
        index_name: str,
        namespace: str = "__default__"
    ):
        """
        Initialize searcher with Pinecone connection.

        Args:
            api_key: Pinecone API key
            index_name: Name of Pinecone index to search
            namespace: Optional namespace for partitioning data
        """
        self.embedder = TextEmbedder()
        self.connector = PineconeConnector(api_key=api_key, index_name=index_name)
        self.namespace = namespace

        logger.info(
            f"Searcher initialized (index={index_name}, namespace='{namespace}')"
        )

    @property
    def device(self) -> str:
        """Get the device being used for embeddings (cpu/cuda)."""
        return self.embedder.device

    def search(
        self,
        query: str,
        top_k: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Search for semantically similar content.

        Args:
            query: Natural language search query
            top_k: Number of results to return (default: 5)

        Returns:
            List of matches with scores and metadata, sorted by similarity

        Example:
            results = searcher.search("cooking in kitchen", top_k=3)
            for result in results:
                print(result['score'], result['metadata'])
        """
        logger.info(f"Searching for: '{query}' (top_k={top_k})")

        # Embed the query text, then ask Pinecone for its nearest vectors.
        query_vector = self.embedder.embed_text(query)
        matches = self.connector.query_chunks(
            query_embedding=query_vector,
            namespace=self.namespace,
            top_k=top_k
        )

        # Flatten each raw match into a plain dict with safe defaults.
        results = [
            {
                'id': match.get('id'),
                'score': match.get('score', 0.0),
                'metadata': match.get('metadata', {})
            }
            for match in matches
        ]

        logger.info(f"Found {len(results)} results")
        return results

0 commit comments

Comments
 (0)