1+ """Face repository: detection, embedding and incremental clustering utilities.
2+
3+ This module provides the FaceRepository class which wraps face detection
4+ and embedding (via DeepFace) and clustering (via Nearest-Neighbor + Threshold).
5+
6+ Responsibilities:
7+ - detect faces in images and compute embeddings.
8+ - for each face, find the face_id in database that it belongs to (or create a new face_id).
9+ - add new face embeddings to Pinecone vector database, with metadata for face ID and chunk ID.
10+ """
11+
12+ # from incdbscan import IncrementalDBSCAN
13+ from deepface import DeepFace
14+ import numpy as np
15+ # from sklearn.cluster import *
16+ from .face import Face
17+ import logging
18+ from database import PineconeConnector
19+ import uuid
20+
# Configure the root logger when this module is imported so that records at
# INFO level and above are emitted, then create the module's own named logger
# that the rest of the file logs through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
23+
class FaceRepository:
    """Detect faces, embed them and assign each one to a persistent face ID.

    Detection and embedding are delegated to ``DeepFace.represent``. Identity
    assignment is a nearest-neighbour lookup against a Pinecone index: the
    top-1 match for each new embedding is queried, and when its score is
    above ``threshold`` the new face reuses that match's ``face_id``;
    otherwise a fresh UUID is generated. Each embedding is then upserted with
    ``face_id`` and ``chunk_id`` metadata.

    Attributes:
        pinecone_connector (PineconeConnector): Connector to Pinecone vector database.
        detector_backend (str): Backend to use for face detection (default: "mtcnn").
        embedding_model_name (str): Model name to use for face embedding (default: "ArcFace").
        enforce_detection (bool): Whether to enforce face detection (default: True).
        align (bool): Whether to align faces before embedding (default: True).
        threshold (float): Score a match must exceed to be treated as the
            same face (default: 0.35).
        cluster_example_face (dict[int, Face]): Example face per cluster.
            Initialised empty; no method of this class populates it.

    Class attributes:
        all_detector_backends (list[str]): List of all supported detector backends.
        all_embed_models (list[str]): List of all supported embedding models.
    """

    # list of all detector backends used for face detection
    all_detector_backends = [
        "mtcnn",
    ]

    # list of embedding models (informational)
    all_embed_models = [
        "ArcFace",
    ]

    def __init__(
        self,
        pinecone_api_key: str,
        index_name: str,
        detector_backend="mtcnn",
        embedding_model_name="ArcFace",
        enforce_detection=True,
        align=True,
        threshold=0.35,
    ):
        """Store the DeepFace settings and construct the Pinecone connector.

        Args:
            pinecone_api_key (str): API key passed to ``PineconeConnector``.
            index_name (str): Index name passed to ``PineconeConnector``.
            detector_backend (str): Face detector backend for DeepFace.
            embedding_model_name (str): Embedding model name for DeepFace.
            enforce_detection (bool): Forwarded to ``DeepFace.represent``.
            align (bool): Forwarded to ``DeepFace.represent``.
            threshold (float): Minimum match score (exclusive) for reusing an
                existing face ID.
        """
        # parameters for face detection and embedding with deepface
        self.detector_backend = detector_backend
        self.embedding_model_name = embedding_model_name
        self.enforce_detection = enforce_detection
        self.align = align
        self.threshold = threshold  # threshold for face recognition matching

        self.pinecone_connector = PineconeConnector(
            api_key=pinecone_api_key, index_name=index_name
        )

        # example face per cluster, keyed by cluster id
        self.cluster_example_face: dict[int, Face] = {}

        logger.debug(
            "FaceRepository: Initialized FaceRepository with detector_backend=%s, "
            "embedding_model_name=%s, enforce_detection=%s, "
            "align=%s, index_name=%s, threshold=%s",
            detector_backend,
            embedding_model_name,
            enforce_detection,
            align,
            index_name,
            threshold,
        )

    def _detect_and_embed(self, img):
        """Detect faces in `img` and compute embeddings.

        Args:
            img (str | np.ndarray): Path to the image or an image as a NumPy array.

        Returns:
            list[Face]: One ``Face`` per detection that could be converted.
            Empty when ``DeepFace.represent`` raises (the error is logged).

        Notes:
            Each item returned by ``DeepFace.represent`` is read for its
            ``embedding`` and ``facial_area`` (``x``, ``y``, ``w``, ``h``)
            keys and turned into a ``Face`` via ``Face.from_original_image``.
            A detection that fails to convert is logged and skipped; it does
            not abort the remaining detections.
        """
        try:
            representations = DeepFace.represent(
                img_path=img,
                model_name=self.embedding_model_name,
                detector_backend=self.detector_backend,
                enforce_detection=self.enforce_detection,
                align=self.align,
            )
        except Exception as e:
            logger.error(
                "FaceRepository: Error during face detection and embedding on image %s: %s\n"
                " returning empty face list.",
                img,
                e,
            )
            return []

        faces: list[Face] = []
        for rep in representations:
            try:
                area = rep["facial_area"]
                face = Face.from_original_image(
                    embedding=np.array(rep["embedding"]),
                    orig_image=img,
                    bbox=(area["x"], area["y"], area["w"], area["h"]),
                )
            except Exception as e:
                logger.error(
                    "FaceRepository: Error creating Face object from representation %s "
                    "on image %s: %s\n skipping this face.",
                    rep,
                    img,
                    e,
                )
            else:
                faces.append(face)

        return faces

    def _upsert_face_embedding(
        self,
        face_ids_count: dict,
        namespace: str,
        face_id: str,
        chunk_id: str,
        face_embedding: np.ndarray,
    ):
        """Upsert a face embedding into the Pinecone index.

        The vector is stored under a freshly generated UUID; ``face_id`` and
        ``chunk_id`` travel as metadata. On success, ``face_ids_count`` is
        updated in place.

        Args:
            face_ids_count (dict): Dictionary mapping face IDs to their counts.
                Mutated: the entry for ``face_id`` is incremented on success.
            namespace (str): Namespace to upsert the face embedding into.
            face_id (str): Unique identifier for the face.
            chunk_id (str): Unique identifier for the clip chunk.
            face_embedding (np.ndarray): The face embedding to upsert.

        Returns:
            The value returned by ``upsert_chunk`` (truthy on success), or
            ``False`` when the connector raised.
        """
        try:
            success = self.pinecone_connector.upsert_chunk(
                chunk_id=str(uuid.uuid4()),
                chunk_embedding=face_embedding,
                namespace=namespace,
                metadata={"face_id": face_id, "chunk_id": chunk_id},
            )
        except Exception as e:
            logger.error(
                "FaceRepository: Error upserting face embedding for face_id %s in chunk_id %s: %s",
                face_id,
                chunk_id,
                e,
            )
            return False

        if success:
            face_ids_count[face_id] = face_ids_count.get(face_id, 0) + 1
        return success

    # add a list of faces to the cluster
    def add_faces(self, namespace: str, chunk_id: str, faces: "list[Face]"):
        """Assign a face ID to each face and store its embedding in Pinecone.

        For every face, the top-1 match in ``namespace`` is queried. When its
        score is above ``self.threshold`` the face reuses the match's
        ``face_id``; when there is no match, or the score is not above the
        threshold, a new UUID face ID is generated. The embedding is then
        upserted with ``face_id`` and ``chunk_id`` metadata.

        Args:
            namespace (str): Namespace to query and upsert the face embeddings in.
            chunk_id (str): Unique identifier for the clip chunk.
            faces (list[Face]): List of Face objects to add.

        Returns:
            dict: A dictionary mapping face IDs to the number of times they
            appear in this chunk. Only successful upserts are counted. Empty
            when the embeddings cannot be read from ``faces``.
        """
        logger.debug(
            "FaceRepository: Adding %d faces to clustering for chunk_id %s.",
            len(faces),
            chunk_id,
        )

        try:
            face_embeddings = [f.embedding for f in faces]
        except Exception as e:
            logger.error(
                "FaceRepository: Error extracting embeddings from faces for chunk_id %s: %s\n"
                " returning empty result.",
                chunk_id,
                e,
            )
            # Same type as the success path so callers can always treat the
            # result as a dict.
            return {}

        # dict where key = face_id, value = number of times appear in this chunk
        face_ids_count: dict = {}

        for embedding in face_embeddings:
            # find closest face from pinecone vector db
            matches = self.pinecone_connector.query_chunks(
                query_embedding=embedding,
                namespace=namespace,
                top_k=1,
            )
            logger.debug("FaceRepository: Nearest-neighbour query result: %s", matches)

            face_id = None
            if matches:
                best_match = matches[0]
                # if score above threshold, group new embedding into existing cluster
                if best_match["score"] > self.threshold:
                    # Check for a missing ID *before* stringifying:
                    # str(None) would yield the bogus face ID "None".
                    matched_id = best_match["metadata"].get("face_id")
                    if matched_id is None:
                        logger.error(
                            "FaceRepository: Best match from Pinecone for chunk_id %s "
                            "has no face_id in metadata. Skipping.",
                            chunk_id,
                        )
                        continue
                    face_id = str(matched_id)

            if face_id is None:
                # no match, or match too weak: insert as new cluster
                face_id = str(uuid.uuid4())
                kind = "new face embedding for new_id"
            else:
                kind = "face embedding for existing face_id"

            upsert_success = self._upsert_face_embedding(
                face_ids_count=face_ids_count,
                namespace=namespace,
                face_id=face_id,
                chunk_id=chunk_id,
                face_embedding=embedding,
            )
            if not upsert_success:
                logger.error(
                    "FaceRepository: Failed to upsert %s %s in chunk_id %s.",
                    kind,
                    face_id,
                    chunk_id,
                )

        return face_ids_count

    def add_images(self, namespace: str, chunk_id: str, img_lst: list):
        """Detect faces and add their embeddings for a list of images.

        This is a convenience wrapper that runs detection+embedding for each
        image in ``img_lst`` and then calls :meth:`add_faces` once with all
        detected faces.

        Args:
            namespace (str): Namespace to query and upsert the face embeddings in.
            chunk_id (str): Unique identifier for the clip chunk the images
                belong to.
            img_lst (list[str|np.ndarray]): List of image paths or NumPy arrays.

        Returns:
            dict: A dictionary mapping face IDs to the number of times they
            appear in this chunk.
        """
        logger.debug(
            "FaceRepository: Adding frame images for chunk_id %s for facial recognition, "
            "number of images: %d",
            chunk_id,
            len(img_lst),
        )
        if not img_lst:
            # Not fatal: add_faces is still called with no faces.
            logger.warning(
                "FaceRepository: empty img_lst provided for chunk_id %s", chunk_id
            )

        embedded_faces = []
        for img in img_lst:
            embedded_faces.extend(self._detect_and_embed(img))

        face_cluster = self.add_faces(namespace, chunk_id, embedded_faces)
        logger.debug(
            "FaceRepository: Completed processing %d images for chunk_id %s",
            len(img_lst),
            chunk_id,
        )

        return face_cluster
0 commit comments