 from pathlib import Path
 
 import faiss
+import numpy as np
 from datasets import Dataset
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 
 from delphi.config import CacheConfig
 from delphi.logger import logger
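 
+# Helpers to build, save, and query a FAISS semantic index over a dataset's
+# "text" column, embedded with sentence-transformers.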
 
 
-def get_neighbors_by_id(index: faiss.IndexIDMap, vector_id: int, k: int = 10):
-    # First reconstruct the vector for the given ID
-    vector = index.reconstruct(vector_id)
-
-    # Reshape to match FAISS expectations (needs a 2D array)
-    vector = vector.reshape(1, -1)
-
-    # Search for nearest neighbors (k + 1 since it will find itself)
-    distances, neighbor_ids = index.search(vector, k + 1)
-
-    # Remove the first result (which will be the query vector itself)
-    return distances[0][1:], neighbor_ids[0][1:]
+def get_neighbors(model, index, query: str, k: int = 1000):
+    """Return the k nearest neighbors of a text query as (distances, ids)."""
+    # Embed the query with the same model that was used to build the index.
+    q_embedding = model.encode([query])
+    # index.search returns a tuple of (L2 distances, match indices), each of
+    # shape (n_queries, k), so a single query's results live in row 0. A text
+    # query is not itself stored in the index, so there is no self-match to
+    # drop.
+    distances, neighbor_ids = index.search(q_embedding, k=k)
+    return distances[0], neighbor_ids[0]
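 
+# Example (sketch; `dataset` and `cfg` are assumed to exist, with `index`
+# built by build_semantic_index over dataset["text"]):
+#   model = SentenceTransformer(cfg.faiss_embedding_model, device="cuda")
+#   distances, ids = get_neighbors(model, index, "solar power", k=5)
+#   matches = [dataset["text"][int(j)] for j in ids]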
2724
2825
2926def get_index_path (base_path : Path , cfg : CacheConfig ):
@@ -46,124 +43,40 @@ def save_index(index: faiss.IndexFlatL2, base_path: Path, cfg: CacheConfig):
     json.dump(
         {
             "index_path": str(index_path),
-            "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
+            "embedding_model": cfg.faiss_embedding_model,
         },
         f,
     )
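 
+# A minimal sketch of the matching load path (assumed; it is not shown in
+# this hunk), where `meta_path` is a hypothetical path to the JSON written
+# above:
+#   with open(meta_path) as f:
+#       meta = json.load(f)
+#   if meta["embedding_model"] == cfg.faiss_embedding_model:
+#       index = faiss.read_index(meta["index_path"])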
 
 
-# def split_text(text: str, cfg: CacheConfig):
-#     splitter = RecursiveCharacterTextSplitter(
-#         chunk_size=cfg.ctx_len, chunk_overlap=cfg.ctx_len // 4
-#     )
-#     return splitter.split_text(text)
-
-
-def split_text(text: str, cfg: CacheConfig):
-    splitter = RecursiveCharacterTextSplitter(
-        chunk_size=cfg.ctx_len,
-        chunk_overlap=cfg.ctx_len // 4,
-        length_function=lambda x: 1 + len(x) // 4,
-    )
-    return splitter.split_text(text)
-
-
-def build_semantic_index(data: Dataset, cfg: CacheConfig):
+def build_semantic_index(data: Dataset, cfg: CacheConfig, batch_size: int = 1024):
     """
-    Build a semantic index of the token sequences.
+    Build a semantic index, assuming the rows of data["text"] are already of
+    appropriate length.
     """
 
     model = SentenceTransformer(cfg.faiss_embedding_model, device="cuda")
-    d = next(model.parameters()).dtype
+    # Embedding width of the sentence-transformer, read from its pooling module.
+    d = model[1].word_embedding_dimension
 
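+    # HNSW parameters: M caps each node's out-degree in the graph, while
+    # efConstruction and efSearch set how many candidates are explored at
+    # build time and query time (higher means better recall but slower).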
     index = faiss.IndexHNSWFlat(d, cfg.faiss_hnsw_config["M"])
     index.hnsw.efConstruction = cfg.faiss_hnsw_config["efConstruction"]
     index.hnsw.efSearch = cfg.faiss_hnsw_config["efSearch"]
 
-    data["text"]
-    breakpoint()
-
-    # index_tokenizer = AutoTokenizer.from_pretrained(
-    #     'sentence-transformers/all-MiniLM-L6-v2')
-    # index_model = AutoModel.from_pretrained(
-    #     'sentence-transformers/all-MiniLM-L6-v2').to("cuda")
-
-    # index_tokens = chunk_and_tokenize(data, index_tokenizer,
-    #                                   max_seq_len=cfg.ctx_len, text_key=cfg.dataset_row)
-    # index_tokens = index_tokens["input_ids"]
-    # index_tokens = assert_type(Tensor, index_tokens)
-
-    # token_embeddings = index_model(index_tokens[:2].to("cuda")).last_hidden_state
-
-    # base_index = faiss.IndexFlatL2(token_embeddings.shape[-1])
-    # index = faiss.IndexIDMap(base_index)
-
-    # batch_size = 512
-    # dataloader = DataLoader(index_tokens, batch_size=batch_size)  # type: ignore
-
-    # from tqdm import tqdm
-    # with torch.no_grad():
-    #     for batch_idx, batch in enumerate(tqdm(dataloader)):
-    #         batch = batch.to("cuda")
-    #         token_embeddings = index_model(batch).last_hidden_state
-    #         sentence_embeddings = token_embeddings.mean(dim=1)
-    #         sentence_embeddings = sentence_embeddings.cpu().numpy().astype(np.float32)
-
-    #         ids = np.arange(batch_idx * batch_size, batch_idx * batch_size + len(batch))
-    #         index.add_with_ids(sentence_embeddings, ids)
-
-    return None
117- # """
118- # Build a semantic index of the token sequences.
119- # """
120-
121- # model = SentenceTransformer(cfg.faiss_embedding_model, device="cuda")
122- # d = next(model.parameters()).dtype
123-
124- # text = data['text']
125- # chunks = []
126- # for t in text:
127- # chunks.extend(split_text(t, cfg))
128-
129- # breakpoint()
130- # index = faiss.IndexHNSWFlat(d, cfg.faiss_hnsw_config["M"])
131- # index.metric_type = faiss.METRIC_L2
132- # index.hnsw.efConstruction = cfg.faiss_hnsw_config["efConstruction"]
133- # index.hnsw.efSearch = cfg.faiss_hnsw_config["efSearch"]
134-
135- # index_tokenizer = AutoTokenizer.from_pretraine
136- # d('sentence-transformers/all-MiniLM-L6-v2')
137- # index_model = AutoModel.from_pretrained('sentence-transform
138- # ers/all-MiniLM-L6-v2').to("cuda")
139-
140- # index_tokens = chunk_and_tokenize(data, index_tokenizer,
141- # max_seq_len=cfg.ctx_len, text_key=cfg.dataset_row)
142- # index_tokens = index_tokens["input_ids"]
143- # index_tokens = assert_type(Tensor, index_tokens)
144-
145- # token_embeddings = index_model(index_tokens[:2].to("cuda")).last_hidden_state
146-
147- # base_index = faiss.IndexFlatL2(token_embeddings.shape[-1])
148- # index = faiss.IndexIDMap(base_index)
149-
150- # batch_size = 512
151- # dataloader = DataLoader(index_tokens, batch_size=batch_size) # type: ignore
152-
153- # from tqdm import tqdm
154- # with torch.no_grad():
155- # for batch_idx, batch in enumerate(tqdm(dataloader)):
156- # batch = batch.to("cuda")
157- # token_embeddings = index_model(batch).last_hidden_state
158- # sentence_embeddings = token_embeddings.mean(dim=1)
159- # sentence_embeddings = sentence_embeddings.cpu().numpy()
160- # .astype(np.float32)
161-
162- # ids = np.arange(batch_idx * batch_size, batch_idx * batch_size
163- # + len(batch))
164- # index.add_with_ids(sentence_embeddings, ids)
165-
166- # return None
+    text_data = data["text"]
+
+    embeddings = []
+    for i in range(0, len(text_data), batch_size):
+        logger.info(f"Embedding rows {i}:{i + batch_size} of {len(text_data)}")
+        batch = text_data[i : i + batch_size]
+        batch_embeddings = model.encode(
+            batch, batch_size=batch_size, device="cuda", convert_to_numpy=True
+        )
+        embeddings.append(batch_embeddings)
+
+    # Stack the per-batch (batch, d) arrays into a single (n_rows, d) matrix.
+    embeddings = np.vstack(embeddings)
+
+    index.add(embeddings)  # type: ignore
+
+    return index
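 
+# Putting it together (sketch; assumes `dataset` is a Dataset whose "text"
+# rows are pre-chunked and `base_path` is the cache directory):
+#   index = build_semantic_index(dataset, cfg)
+#   save_index(index, base_path, cfg)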
 
 
 def build_or_load_index(data: Dataset, base_path: Path, cfg: CacheConfig):