Commit 1e309ac

Merge pull request #11 from ClipABit/videoembed
Integrated embedding logic into the process/upload video pipeline
2 parents: 267b60b + 6f095f1

File tree: 3 files changed, +107 −4 lines changed

backend/embeddings/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -1 +1,5 @@
-# Make embeddings a proper Python package
+# Make embeddings a proper Python package
+
+from .embedder import VideoEmbedder
+
+__all__ = ["VideoEmbedder"]
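
With the re-export in place, callers can import the embedder from the package root rather than reaching into the submodule. A minimal sketch of what the change enables:

    # The package root now re-exports the class that __all__ advertises.
    from embeddings import VideoEmbedder

    embedder = VideoEmbedder()  # constructing it eagerly loads the CLIP model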

backend/embeddings/embedder.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import torch
+import numpy as np
+from PIL import Image
+from transformers import (
+    CLIPModel,
+    CLIPProcessor
+)
+
+
+class VideoEmbedder:
+    """
+    A class to handle video embedding generation using various models.
+    """
+    def __init__(self):
+        self._device = "cuda" if torch.cuda.is_available() else "cpu"
+        self._clip_model = None
+        self._clip_processor = None
+        self._get_clip_model()
+
+    def _get_clip_model(self):
+        """Lazily load and return CLIP model + processor."""
+        if self._clip_model is None or self._clip_processor is None:
+            print("Loading CLIP model into memory...")
+            self._clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(
+                self._device
+            )
+            self._clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        return self._clip_model, self._clip_processor
+
+    def _generate_clip_embedding(self, frames, num_frames: int = 8) -> torch.Tensor:
+        """
+        Generate a single embedding for a video chunk by averaging the normalized
+        embeddings of sampled frames using the OpenAI CLIP model.
+        Args:
+            frames: Decoded frames for the chunk, as a (T, H, W, 3) uint8 array.
+            num_frames (int): Number of frames to sample evenly across the video.
+
+        Returns:
+            torch.Tensor: A single, normalized embedding tensor for the video chunk.
+        """
+
+        # Fetch the preloaded model and processor
+        model, processor = self._get_clip_model()
+
+        # Cap num_frames at the available frame count, then sample evenly across the video
+        num_frames = min(num_frames, frames.shape[0])
+        frame_indices = np.linspace(0, frames.shape[0] - 1, num_frames).astype(int)
+        sampled_frames = [Image.fromarray(frames[idx]) for idx in frame_indices]
+
+        # Resize the frames to the model's standard input dimensions and normalize
+        # pixel values to the ranges of the data the model was trained on.
+        inputs = processor(images=sampled_frames, return_tensors="pt", size=224).to(self._device)
+
+        with torch.no_grad():
+            frame_features = model.get_image_features(**inputs)
+            frame_features = frame_features / frame_features.norm(p=2, dim=-1, keepdim=True)
+
+        video_embedding = frame_features.mean(dim=0)
+        video_embedding = video_embedding / video_embedding.norm(p=2, dim=-1, keepdim=True)
+
+
+        return video_embedding.cpu()
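
For reference, _generate_clip_embedding expects a chunk's frames as a stacked (num_frames, height, width, 3) uint8 array: it reads frames.shape[0] and converts each sampled frame with Image.fromarray. A minimal usage sketch with dummy data (the random frames below are stand-ins for real decoded video):

    import numpy as np

    from embeddings import VideoEmbedder

    # 16 fake 240x320 RGB frames; a real caller passes decoded video frames
    frames = np.random.randint(0, 256, size=(16, 240, 320, 3), dtype=np.uint8)

    embedder = VideoEmbedder()
    embedding = embedder._generate_clip_embedding(frames, num_frames=8)
    print(embedding.shape)  # torch.Size([512]) for clip-vit-base-patch32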

backend/main.py

Lines changed: 39 additions & 3 deletions
@@ -48,6 +48,7 @@ def startup(self):
 
         # Import classes here
         from preprocessing.preprocessor import Preprocessor
+        from embeddings.embedder import VideoEmbedder
         from database.pinecone_connector import PineconeConnector
         from database.job_store_connector import JobStoreConnector
         from database.r2_connector import R2Connector
@@ -81,6 +82,7 @@ def startup(self):
         # Instantiate classes
 
         self.preprocessor = Preprocessor(min_chunk_duration=1.0, max_chunk_duration=10.0, scene_threshold=13.0)
+        self.video_embedder = VideoEmbedder()
         self.pinecone_connector = PineconeConnector(api_key=PINECONE_API_KEY, index_name=PINECONE_CHUNKS_INDEX)
         self.job_store = JobStoreConnector(dict_name="clipabit-jobs")
         self.r2_connector = R2Connector(account_id=R2_ACCOUNT_ID,
@@ -128,11 +130,45 @@ async def process_video(self, video_bytes: bytes, filename: str, job_id: str):
         # Prepare chunk details for response (without frame arrays)
         chunk_details = []
         for chunk in processed_chunks:
+            embedding = self.video_embedder._generate_clip_embedding(chunk["frames"], num_frames=8)
+
+            logger.info(f"[Job {job_id}] Generated CLIP embedding for chunk {chunk['chunk_id']}")
+            logger.info(f"[Job {job_id}] Upserting embedding for chunk {chunk['chunk_id']} to Pinecone...")
+
+
+            # 1. Handle timestamp_range (list of two numbers -> two flat keys)
+            if 'timestamp_range' in chunk['metadata']:
+                start_time, end_time = chunk['metadata'].pop('timestamp_range')
+                chunk['metadata']['start_time_s'] = start_time
+                chunk['metadata']['end_time_s'] = end_time
+
+            # 2. Handle file_info (nested dict -> flat keys)
+            if 'file_info' in chunk['metadata']:
+                file_info = chunk['metadata'].pop('file_info')
+                for key, value in file_info.items():
+                    chunk['metadata'][f'file_{key}'] = value
+
+            # 3. Final check: remove nulls (optional but good practice);
+            # Pinecone rejects metadata keys with null values.
+            keys_to_delete = [k for k, v in chunk['metadata'].items() if v is None]
+            for k in keys_to_delete:
+                del chunk['metadata'][k]
+
+
+            self.pinecone_connector.upsert_chunk(
+                chunk_id=chunk['chunk_id'],
+                chunk_embedding=embedding.numpy(),
+                namespace="test",
+                metadata=chunk['metadata']
+            )
+
             chunk_details.append({
                 "chunk_id": chunk['chunk_id'],
                 "metadata": chunk['metadata'],
-                "memory_mb": chunk['memory_mb']
+                "memory_mb": chunk['memory_mb'],
             })
+
+        # TODO: Upload processed data to S3
 
         result = {
             "job_id": job_id,
@@ -143,7 +179,7 @@ async def process_video(self, video_bytes: bytes, filename: str, job_id: str):
             "total_frames": total_frames,
             "total_memory_mb": total_memory,
             "avg_complexity": avg_complexity,
-            "chunk_details": chunk_details
+            "chunk_details": chunk_details,
         }
 
         logger.info(f"[Job {job_id}] Finished processing {filename}")
@@ -219,7 +255,7 @@ async def upload(self, file: UploadFile = None):
             "message": "Video uploaded successfully, processing in background"
         }
 
-    @modal.fastapi_endpoint(method="POST")
+    @modal.fastapi_endpoint(method="GET")
     async def search(self, query: str):
         """Search endpoint - accepts a text query and returns semantic search results."""
         logger.info(f"[Search] Query: {query}")
