-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark_orig.py
More file actions
83 lines (62 loc) · 2.67 KB
/
benchmark_orig.py
File metadata and controls
83 lines (62 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from my_rag_new import SQLDocumentStore, FAISSDatabase
import pandas as pd
import re
import numpy as np
import torch
# --- Ingestion: load precomputed documents and embeddings from a CSV file
# --- into the SQL document store and the FAISS vector index.
sql_store = SQLDocumentStore(db_path="CUHK/documents.db")
faiss_db = FAISSDatabase(db_type="withIndexIDMap", db_dimension=768)
# sql_store.clear_database("CUHK/documents.db") # figure out why this does not work


def _parse_embedding(cell):
    """Parse a CSV cell like "[0.1, 0.2, ...]" into a 1-D float64 ndarray.

    Replaces the deprecated ``np.fromstring(text, sep=',')`` call.
    """
    return np.array(cell[1:-1].split(','), dtype=np.float64)


# Read CSV file
df = pd.read_csv('generations5.csv')

indexer = 0  # monotonically increasing doc id, shared by the SQL store and FAISS
for index, row in df.iterrows():
    # Column layout (positional): 1=description, 3=description embedding,
    # 5-9=additional documents, 10-14=their embeddings.
    # NOTE(review): inferred from usage — confirm against the CSV header.
    description = row.iloc[1]
    documents = [description, row.iloc[5], row.iloc[6], row.iloc[7], row.iloc[8], row.iloc[9]]
    embeddings = [
        _parse_embedding(row.iloc[3]),
        _parse_embedding(row.iloc[10]),
        _parse_embedding(row.iloc[11]),
        _parse_embedding(row.iloc[12]),
        _parse_embedding(row.iloc[13]),
        _parse_embedding(row.iloc[14]),
    ]
    doc_ids = []
    kept_embeddings = []
    # Walk documents and embeddings in lockstep so a skipped NaN document also
    # drops its embedding.  (The original kept all six embeddings regardless,
    # desynchronising the FAISS rows from the SQL doc_ids.)
    for text, emb in zip(documents, embeddings):
        if pd.notna(text):  # Avoid adding NaN values
            sql_store.add_document(indexer, text)
            doc_ids.append(indexer)
            kept_embeddings.append(emb)
            indexer += 1
    # Stack into one ndarray first: torch.Tensor over a list of ndarrays is
    # slow and warns; dtype float32 matches the original torch.Tensor result.
    embedding_tensor = torch.tensor(np.asarray(kept_embeddings), dtype=torch.float32)
    print(len(embedding_tensor), len(doc_ids), index)
    print(doc_ids)
    faiss_db.add_documents(embedding_tensor, doc_ids)

faiss_db.save_index("CUHK/faiss.index")
def extract_UUID(text):
    """Return every UUID-shaped substring (8-4-4-4-12 hex groups) in *text*."""
    pattern = re.compile(
        r'\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}'
        r'-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b'
    )
    return pattern.findall(text)
def benchmark():
    """Run the retrieval benchmark over every row of the global ``df``.

    For each question (column 2), searches the FAISS index with the
    precomputed question embedding (column 4), fetches the top-ranked
    document from the SQL store, and counts a hit when the UUIDs extracted
    from the query equal the UUIDs extracted from that top document.

    Returns:
        tuple: (accuracy, correct, num_elements) where accuracy is
        correct / num_elements.
    """
    correct = 0
    num_elements = len(df)
    for index, row in df.iterrows():
        query = row.iloc[2]
        # Parse "[v1, v2, ...]" into a float vector (np.fromstring is deprecated).
        query_embedding = np.array(row.iloc[4][1:-1].split(','), dtype=np.float64)
        distances, indices = faiss_db.search(query_embedding.reshape(1, -1), k=5)
        document_ids = list(indices[0])
        # Only the top-1 document decides correctness; the other four results
        # are printed below purely for manual inspection.
        document = sql_store.fetch_document(int(document_ids[0]))
        query_uuid = extract_UUID(query)
        document_uuid = extract_UUID(document)
        print(f'Query: {query} \nQuery Results: \n\n ')
        for i in range(5):
            idx_doc = int(document_ids[i])
            print(f"Result {i+1}: {indices[0][i]} (distance: {distances[0][i]}) (document: {sql_store.fetch_document(idx_doc)} \n ")
        if query_uuid == document_uuid:
            correct += 1
    return (correct / num_elements), correct, num_elements
if __name__ == "__main__":
benchmark_nums = benchmark()
print("Benchmark score: " + str(benchmark_nums[0]) + "\n" + "Correct: " + str(benchmark_nums[1]) + " Trials: " + str(benchmark_nums[2]))