1515from llama_index .core import VectorStoreIndex , get_response_synthesizer
1616from llama_index .core .retrievers import VectorIndexRetriever
1717from llama_index .core .query_engine import RetrieverQueryEngine
18+ from llama_index .core import Settings
1819
1920
2021# one extra dep
2728STORE_DIR = "store"
2829SOURCE_DOCUMENT = "source document"
2930
31+ Settings .chunk_size = 3000
32+ Settings .chunk_overlap = 50
33+
3034
3135def create_store (model ):
3236 OLLAMA_MODEL = model .replace ("Ollama:" , "" ) if model is not LLM_GPT4o else None
3337
34- json_inputs = glob .glob ("processed/json/*/*.json" )
35- # print_(json_inputs)
38+ json_inputs = glob .glob ("processed/json/papers/*.json" )
3639
3740 documents = []
3841 for json_file in json_inputs :
39- print_ ("Adding %s" % json_file )
42+ print_ ("Adding file to document store: %s" % json_file )
4043
4144 with open (json_file , encoding = "utf-8" ) as f :
4245 doc_model = json .load (f )
@@ -60,36 +63,33 @@ def create_store(model):
6063 if len (all_text ) == 0 :
6164 all_text = " "
6265 # print_(f'---------------------\n{all_text}\n---------------------')
63- src_info = (
64- f"WormAtlas Handbook: [{ title } , Section { section } ]({ src_page } )"
65- )
66+ src_type = "Publication"
67+ if "wormatlas" in json_file :
68+ src_type = "WormAtlas Handbook"
69+ src_info = f"{ src_type } : [{ title } , Section { section } ]({ src_page } )"
6670 doc = Document (text = all_text , metadata = {SOURCE_DOCUMENT : src_info })
6771 documents .append (doc )
6872
69- if "-test" in sys .argv :
70- print_ ("Finishing before section requiring OPENAI_API_KEY..." )
71-
72- else :
73- print_ ("Creating a vector store index for %s" % model )
73+ print_ ("Creating a vector store index for %s" % model )
7474
75- STORE_SUBFOLDER = ""
75+ STORE_SUBFOLDER = ""
7676
77- if OLLAMA_MODEL is not None :
78- ollama_embedding = OllamaEmbedding (
79- model_name = OLLAMA_MODEL ,
80- )
81- STORE_SUBFOLDER = "/%s" % OLLAMA_MODEL .replace (":" , "_" )
77+ if OLLAMA_MODEL is not None :
78+ ollama_embedding = OllamaEmbedding (
79+ model_name = OLLAMA_MODEL ,
80+ )
81+ STORE_SUBFOLDER = "/%s" % OLLAMA_MODEL .replace (":" , "_" )
8282
83- # create an index from the parsed markdown
84- index = VectorStoreIndex .from_documents (
85- documents , embed_model = ollama_embedding , show_progress = True
86- )
87- else :
88- index = VectorStoreIndex .from_documents (documents )
83+ # create an index from the parsed markdown
84+ index = VectorStoreIndex .from_documents (
85+ documents , embed_model = ollama_embedding , show_progress = True
86+ )
87+ else :
88+ index = VectorStoreIndex .from_documents (documents )
8989
90- print_ ("Persisting vector store index" )
90+ print_ ("Persisting vector store index" )
9191
92- index .storage_context .persist (persist_dir = STORE_DIR + STORE_SUBFOLDER )
92+ index .storage_context .persist (persist_dir = STORE_DIR + STORE_SUBFOLDER )
9393
9494
9595def load_index (model ):
@@ -147,7 +147,7 @@ def get_query_engine(index_reloaded, model, similarity_top_k=4):
147147
148148 # create a query engine for the index
149149 if OLLAMA_MODEL is not None :
150- llm = Ollama (model = OLLAMA_MODEL )
150+ llm = Ollama (model = OLLAMA_MODEL , request_timeout = 60.0 )
151151
152152 ollama_embedding = OllamaEmbedding (
153153 model_name = OLLAMA_MODEL ,
@@ -159,6 +159,7 @@ def get_query_engine(index_reloaded, model, similarity_top_k=4):
159159 refine_template = refine_template ,
160160 embed_model = ollama_embedding ,
161161 )
162+ # print(dir(query_engine.retriever))
162163
163164 query_engine .retriever .similarity_top_k = similarity_top_k
164165
@@ -184,27 +185,10 @@ def get_query_engine(index_reloaded, model, similarity_top_k=4):
184185 return query_engine
185186
186187
187- def process_query (response , model ):
188+ def process_query (query , model , verbose = False ):
189+ print_ ("Processing query: %s" % query )
188190 response = query_engine .query (query )
189191
190- """
191- import pprint as pp
192-
193- print(type(response))
194- print(dir(response))
195-
196- print("------")
197- pp.pprint(response.metadata)
198- print("------")
199-
200- for sn in response.source_nodes:
201- print(" -- ")
202- print(f' - {sn.score}: {sn.metadata['source document']}')
203- pp.pprint(sn)
204- print("------")
205- pp.pprint(response.response)
206- print("------")"""
207-
208192 response_text = str (response )
209193
210194 if "<think>" in response_text : # Give deepseek a fighting chance...
@@ -217,7 +201,14 @@ def process_query(response, model):
217201 cutoff = 0.2
218202 files_used = []
219203 for sn in response .source_nodes :
220- # print(sn)
204+ if verbose :
205+ print_ ("===================================" )
206+ # print(dir(sn))
207+ print_ (sn .metadata ["source document" ])
208+ print_ ("-------" )
209+ print_ ("Length of selection below: %i" % len (sn .text ))
210+ print_ (sn .text )
211+
221212 sd = sn .metadata ["source document" ]
222213
223214 if sd not in files_used :
@@ -244,10 +235,10 @@ def process_query(response, model):
244235
245236 llm_ver = get_llm_from_argv (sys .argv )
246237
247- if "-q" not in sys .argv :
248- create_store (llm_ver )
249-
250238 if "-test" not in sys .argv :
239+ if "-q" not in sys .argv :
240+ create_store (llm_ver )
241+
251242 index_reloaded = load_index (llm_ver )
252243 query_engine = get_query_engine (index_reloaded , llm_ver )
253244
@@ -265,11 +256,18 @@ def process_query(response, model):
265256 "When was the first metazoan genome sequenced? Answer only with the year.","""
266257
267258 queries = [
268- "The NeuroPAL transgene is amazing. Give me some examples of fluorophores in it." ,
269259 "What is the main function of cell pair AVB?" ,
270- "What can you tell me about Alan Coulson?" ,
271260 "In what year was William Shakespeare born? " ,
261+ "Tell me about the egg laying apparatus in C. elegans" ,
262+ "Tell me briefly about the neuronal control of C. elegans locomotion and the influence of monoamines." ,
263+ "What can you tell me about Alan Coulson?" ,
264+ "The NeuroPAL transgene is amazing. Give me some examples of fluorophores in it." ,
272265 ]
266+ """queries = [
267+ "What can you tell me about Alan Coulson?",
268+ ]"""
269+
270+ print_ ("Processing %i queries" % len (queries ))
273271
274272 for query in queries :
275273 process_query (query , llm_ver )
0 commit comments