forked from DataTalksClub/llm-zoomcamp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprep.py
More file actions
106 lines (83 loc) · 3.14 KB
/
prep.py
File metadata and controls
106 lines (83 loc) · 3.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import requests
import pandas as pd
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from dotenv import load_dotenv
from db import init_db
# Load variables from a local .env file so the os.getenv calls below pick up
# deployment-specific configuration.
load_dotenv()
# Service endpoint, embedding model, and index name all come from the
# environment; any of them is None if the variable is unset — TODO confirm
# the .env file always defines all three.
ELASTIC_URL = os.getenv("ELASTIC_URL_LOCAL")
MODEL_NAME = os.getenv("MODEL_NAME")
INDEX_NAME = os.getenv("INDEX_NAME")
# Upstream repository that hosts the documents and ground-truth data;
# ?raw=1 is appended to fetch raw file contents instead of the HTML page.
BASE_URL = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main"
def fetch_documents():
    """Download the course documents (with ids) from the upstream repo.

    Returns:
        The parsed JSON payload (a list of document dicts).

    Raises:
        requests.HTTPError: if the server answers with a non-2xx status.
        requests.Timeout: if the download takes longer than 30 seconds.
    """
    print("Fetching documents...")
    relative_url = "03-vector-search/eval/documents-with-ids.json"
    docs_url = f"{BASE_URL}/{relative_url}?raw=1"
    # Bound the request and fail fast on HTTP errors; otherwise an error
    # page (or a hung connection) would surface later as a confusing
    # JSON-decode failure.
    docs_response = requests.get(docs_url, timeout=30)
    docs_response.raise_for_status()
    documents = docs_response.json()
    print(f"Fetched {len(documents)} documents")
    return documents
def fetch_ground_truth(course="machine-learning-zoomcamp"):
    """Download the ground-truth evaluation data for one course.

    Args:
        course: course slug to filter the CSV on. Defaults to
            "machine-learning-zoomcamp", preserving the original behavior
            for existing callers.

    Returns:
        list[dict]: one record per ground-truth row for the given course.
    """
    print("Fetching ground truth data...")
    relative_url = "03-vector-search/eval/ground-truth-data.csv"
    ground_truth_url = f"{BASE_URL}/{relative_url}?raw=1"
    df_ground_truth = pd.read_csv(ground_truth_url)
    # Keep only the rows belonging to the requested course.
    df_ground_truth = df_ground_truth[df_ground_truth.course == course]
    ground_truth = df_ground_truth.to_dict(orient="records")
    print(f"Fetched {len(ground_truth)} ground truth records")
    return ground_truth
def load_model():
    """Instantiate and return the sentence-embedding model named by MODEL_NAME."""
    print(f"Loading model: {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME)
    return model
def setup_elasticsearch():
    """Create a fresh Elasticsearch index for the course documents.

    Drops any pre-existing index of the same name, then recreates it with
    text/keyword fields plus a 384-dimension cosine dense-vector field that
    will hold the combined question+text embedding.

    Returns:
        Elasticsearch: a client connected to ELASTIC_URL.
    """
    print("Setting up Elasticsearch...")
    es_client = Elasticsearch(ELASTIC_URL)

    # Field mappings: plain text fields for search, keyword fields for exact
    # filtering, and the dense vector used for kNN similarity.
    field_properties = {
        "text": {"type": "text"},
        "section": {"type": "text"},
        "question": {"type": "text"},
        "course": {"type": "keyword"},
        "id": {"type": "keyword"},
        "question_text_vector": {
            "type": "dense_vector",
            "dims": 384,
            "index": True,
            "similarity": "cosine",
        },
    }
    index_settings = {
        "settings": {"number_of_shards": 1, "number_of_replicas": 0},
        "mappings": {"properties": field_properties},
    }

    # Start from a clean slate: remove the old index if it exists, recreate it.
    es_client.indices.delete(index=INDEX_NAME, ignore_unavailable=True)
    es_client.indices.create(index=INDEX_NAME, body=index_settings)
    print(f"Elasticsearch index '{INDEX_NAME}' created")
    return es_client
def index_documents(es_client, documents, model):
    """Embed every document and index it into Elasticsearch.

    Each document dict is mutated in place: a 'question_text_vector' field
    is added containing the embedding of "<question> <text>", and the
    document is then written to INDEX_NAME.

    Args:
        es_client: connected Elasticsearch client.
        documents: iterable of document dicts with 'question' and 'text' keys.
        model: sentence-embedding model exposing .encode().
    """
    print("Indexing documents...")
    for document in tqdm(documents):
        combined = document["question"] + " " + document["text"]
        document["question_text_vector"] = model.encode(combined).tolist()
        es_client.index(index=INDEX_NAME, document=document)
    print(f"Indexed {len(documents)} documents")
def main():
    """Run the full pipeline: fetch data, embed, index, and init the app DB."""
    # Comment out the section below (down to the blank line before
    # "Initializing database...") if you only want to init the db or do not
    # want to re-index.
    print("Starting the indexing process...")
    docs = fetch_documents()
    # NOTE(review): ground truth is downloaded here but not used by this
    # function — presumably kept warm for other consumers; confirm.
    ground_truth = fetch_ground_truth()
    embedding_model = load_model()
    es = setup_elasticsearch()
    index_documents(es, docs, embedding_model)

    print("Initializing database...")
    init_db()
    print("Indexing process completed successfully!")


if __name__ == "__main__":
    main()