Skip to content

Commit 0cccc77

Browse files
authored
Merge pull request #21 from openworm/development
Update tests
2 parents 1609c84 + 5dc8316 commit 0cccc77

File tree

4 files changed

+93
-73
lines changed

4 files changed

+93
-73
lines changed

openworm_ai/graphrag/GraphRAG_test.py

Lines changed: 49 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from llama_index.core import VectorStoreIndex, get_response_synthesizer
1616
from llama_index.core.retrievers import VectorIndexRetriever
1717
from llama_index.core.query_engine import RetrieverQueryEngine
18+
from llama_index.core import Settings
1819

1920

2021
# one extra dep
@@ -27,16 +28,18 @@
2728
STORE_DIR = "store"
2829
SOURCE_DOCUMENT = "source document"
2930

31+
Settings.chunk_size = 3000
32+
Settings.chunk_overlap = 50
33+
3034

3135
def create_store(model):
3236
OLLAMA_MODEL = model.replace("Ollama:", "") if model is not LLM_GPT4o else None
3337

34-
json_inputs = glob.glob("processed/json/*/*.json")
35-
# print_(json_inputs)
38+
json_inputs = glob.glob("processed/json/papers/*.json")
3639

3740
documents = []
3841
for json_file in json_inputs:
39-
print_("Adding %s" % json_file)
42+
print_("Adding file to document store: %s" % json_file)
4043

4144
with open(json_file, encoding="utf-8") as f:
4245
doc_model = json.load(f)
@@ -60,36 +63,33 @@ def create_store(model):
6063
if len(all_text) == 0:
6164
all_text = " "
6265
# print_(f'---------------------\n{all_text}\n---------------------')
63-
src_info = (
64-
f"WormAtlas Handbook: [{title}, Section {section}]({src_page})"
65-
)
66+
src_type = "Publication"
67+
if "wormatlas" in json_file:
68+
src_type = "WormAtlas Handbook"
69+
src_info = f"{src_type}: [{title}, Section {section}]({src_page})"
6670
doc = Document(text=all_text, metadata={SOURCE_DOCUMENT: src_info})
6771
documents.append(doc)
6872

69-
if "-test" in sys.argv:
70-
print_("Finishing before section requiring OPENAI_API_KEY...")
71-
72-
else:
73-
print_("Creating a vector store index for %s" % model)
73+
print_("Creating a vector store index for %s" % model)
7474

75-
STORE_SUBFOLDER = ""
75+
STORE_SUBFOLDER = ""
7676

77-
if OLLAMA_MODEL is not None:
78-
ollama_embedding = OllamaEmbedding(
79-
model_name=OLLAMA_MODEL,
80-
)
81-
STORE_SUBFOLDER = "/%s" % OLLAMA_MODEL.replace(":", "_")
77+
if OLLAMA_MODEL is not None:
78+
ollama_embedding = OllamaEmbedding(
79+
model_name=OLLAMA_MODEL,
80+
)
81+
STORE_SUBFOLDER = "/%s" % OLLAMA_MODEL.replace(":", "_")
8282

83-
# create an index from the parsed markdown
84-
index = VectorStoreIndex.from_documents(
85-
documents, embed_model=ollama_embedding, show_progress=True
86-
)
87-
else:
88-
index = VectorStoreIndex.from_documents(documents)
83+
# create an index from the parsed markdown
84+
index = VectorStoreIndex.from_documents(
85+
documents, embed_model=ollama_embedding, show_progress=True
86+
)
87+
else:
88+
index = VectorStoreIndex.from_documents(documents)
8989

90-
print_("Persisting vector store index")
90+
print_("Persisting vector store index")
9191

92-
index.storage_context.persist(persist_dir=STORE_DIR + STORE_SUBFOLDER)
92+
index.storage_context.persist(persist_dir=STORE_DIR + STORE_SUBFOLDER)
9393

9494

9595
def load_index(model):
@@ -147,7 +147,7 @@ def get_query_engine(index_reloaded, model, similarity_top_k=4):
147147

148148
# create a query engine for the index
149149
if OLLAMA_MODEL is not None:
150-
llm = Ollama(model=OLLAMA_MODEL)
150+
llm = Ollama(model=OLLAMA_MODEL, request_timeout=60.0)
151151

152152
ollama_embedding = OllamaEmbedding(
153153
model_name=OLLAMA_MODEL,
@@ -159,6 +159,7 @@ def get_query_engine(index_reloaded, model, similarity_top_k=4):
159159
refine_template=refine_template,
160160
embed_model=ollama_embedding,
161161
)
162+
# print(dir(query_engine.retriever))
162163

163164
query_engine.retriever.similarity_top_k = similarity_top_k
164165

@@ -184,27 +185,10 @@ def get_query_engine(index_reloaded, model, similarity_top_k=4):
184185
return query_engine
185186

186187

187-
def process_query(response, model):
188+
def process_query(query, model, verbose=False):
189+
print_("Processing query: %s" % query)
188190
response = query_engine.query(query)
189191

190-
"""
191-
import pprint as pp
192-
193-
print(type(response))
194-
print(dir(response))
195-
196-
print("------")
197-
pp.pprint(response.metadata)
198-
print("------")
199-
200-
for sn in response.source_nodes:
201-
print(" -- ")
202-
print(f' - {sn.score}: {sn.metadata['source document']}')
203-
pp.pprint(sn)
204-
print("------")
205-
pp.pprint(response.response)
206-
print("------")"""
207-
208192
response_text = str(response)
209193

210194
if "<think>" in response_text: # Give deepseek a fighting chance...
@@ -217,7 +201,14 @@ def process_query(response, model):
217201
cutoff = 0.2
218202
files_used = []
219203
for sn in response.source_nodes:
220-
# print(sn)
204+
if verbose:
205+
print_("===================================")
206+
# print(dir(sn))
207+
print_(sn.metadata["source document"])
208+
print_("-------")
209+
print_("Length of selection below: %i" % len(sn.text))
210+
print_(sn.text)
211+
221212
sd = sn.metadata["source document"]
222213

223214
if sd not in files_used:
@@ -244,10 +235,10 @@ def process_query(response, model):
244235

245236
llm_ver = get_llm_from_argv(sys.argv)
246237

247-
if "-q" not in sys.argv:
248-
create_store(llm_ver)
249-
250238
if "-test" not in sys.argv:
239+
if "-q" not in sys.argv:
240+
create_store(llm_ver)
241+
251242
index_reloaded = load_index(llm_ver)
252243
query_engine = get_query_engine(index_reloaded, llm_ver)
253244

@@ -265,11 +256,18 @@ def process_query(response, model):
265256
"When was the first metazoan genome sequenced? Answer only with the year.","""
266257

267258
queries = [
268-
"The NeuroPAL transgene is amazing. Give me some examples of fluorophores in it.",
269259
"What is the main function of cell pair AVB?",
270-
"What can you tell me about Alan Coulson?",
271260
"In what year was William Shakespeare born? ",
261+
"Tell me about the egg laying apparatus in C. elegans",
262+
"Tell me briefly about the neuronal control of C. elegans locomotion and the influence of monoamines.",
263+
"What can you tell me about Alan Coulson?",
264+
"The NeuroPAL transgene is amazing. Give me some examples of fluorophores in it.",
272265
]
266+
"""queries = [
267+
"What can you tell me about Alan Coulson?",
268+
]"""
269+
270+
print_("Processing %i queries" % len(queries))
273271

274272
for query in queries:
275273
process_query(query, llm_ver)

openworm_ai/quiz/QuizMaster.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
4747
last_question = None
4848

4949
indexing = ["1", "2", "3", "4"]
50-
50+
5151
for line in response.split("\n"):
5252
if len(line.strip()) > 0:
5353
if "QUESTION" in line or line[-1] == "?":
@@ -87,6 +87,7 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
8787
# quiz_json = "openworm_ai/quiz/samples/GPT4o_10questions.json"
8888

8989
quiz_json = "openworm_ai/quiz/samples/GPT4o_100questions.json"
90+
quiz_json = "openworm_ai/quiz/samples/GPT4o_100questions_celegans.json"
9091

9192
quiz = MultipleChoiceQuiz.from_file(quiz_json)
9293

@@ -122,9 +123,10 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
122123

123124
from openworm_ai.utils.llms import ask_question_get_response
124125

125-
resp = ask_question_get_response(
126+
orig_resp = ask_question_get_response(
126127
full_question, llm_ver, print_question=False
127128
).strip()
129+
resp = orig_resp
128130

129131
if "<think>" in resp: # Give deepseek a fighting chance...
130132
resp = (
@@ -133,6 +135,8 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
133135
resp = resp.replace("\n", " ").strip()
134136
guess = resp[-1]
135137
else:
138+
if "\n" in resp:
139+
resp = resp.split("\n")[0]
136140
guess = resp.split(":")[0].strip()
137141
if " " in guess:
138142
guess = guess[0]
@@ -143,7 +147,7 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
143147
if guess in presented_answers:
144148
g = presented_answers[guess]
145149
else:
146-
g = "%s (cannot be interpreted!)" % guess
150+
g = "[%s] [[%s]] (this cannot be interpreted!)" % (guess, orig_resp)
147151
print(
148152
f" >> {qi}) Is their guess of ({g}) for ({q}) correct (right answer: {correct_text})? {correct_guess}"
149153
)

openworm_ai/quiz/quiz_all.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,29 @@
22
import time
33
import random
44
import datetime
5+
56
from openworm_ai.utils.llms import (
67
LLM_OLLAMA_LLAMA32_1B,
7-
LLM_GPT4o,
8-
LLM_GEMINI,
9-
LLM_CLAUDE37,
10-
LLM_GPT35,
11-
LLM_OLLAMA_PHI4,
12-
LLM_OLLAMA_GEMMA2,
13-
LLM_OLLAMA_GEMMA,
14-
LLM_OLLAMA_QWEN,
15-
LLM_OLLAMA_TINYLLAMA,
8+
LLM_OLLAMA_LLAMA32_3B,
9+
# LLM_GPT4o,
10+
# LLM_GEMINI,
11+
# LLM_CLAUDE37,
12+
# LLM_GPT35,
13+
# LLM_OLLAMA_PHI4,
14+
# LLM_OLLAMA_GEMMA2,
15+
# LLM_OLLAMA_GEMMA,
16+
# LLM_OLLAMA_QWEN,
17+
# LLM_OLLAMA_TINYLLAMA,
1618
ask_question_get_response,
1719
)
20+
21+
1822
from openworm_ai.quiz.Templates import (
1923
ASK_Q,
2024
) # Ensure this matches the correct import path
2125

22-
iteration_per_day = 1
2326
field = "celegans" # general/science/celegans
27+
iteration_per_day = 3
2428
current_date = datetime.datetime.now().strftime("%d-%m-%y")
2529
SOURCE_QUESTIONS_FILE = "openworm_ai/quiz/samples/GPT4o_100questions_celegans.json"
2630
OUTPUT_FILENAME = f"llm_scores_{field}_{current_date}_{iteration_per_day}.json"
@@ -35,16 +39,17 @@ def load_llms():
3539
"""Loads only the selected LLMs (currently the Ollama Llama 3.2 1B and 3B models)."""
3640
llms = [
3741
LLM_OLLAMA_LLAMA32_1B,
38-
LLM_GPT4o,
39-
LLM_GEMINI,
40-
LLM_CLAUDE37,
41-
LLM_GPT35,
42-
LLM_OLLAMA_PHI4,
43-
LLM_OLLAMA_GEMMA2,
42+
LLM_OLLAMA_LLAMA32_3B,
43+
# LLM_GPT4o,
44+
#####LLM_GEMINI,
45+
####LLM_CLAUDE37,
46+
###LLM_GPT35,
47+
##LLM_OLLAMA_PHI4,
48+
# LLM_OLLAMA_GEMMA2,
4449
# LLM_OLLAMA_DEEPSEEK - unable to answer A-D (too few params?),
45-
LLM_OLLAMA_GEMMA,
46-
LLM_OLLAMA_QWEN,
47-
LLM_OLLAMA_TINYLLAMA,
50+
# LLM_OLLAMA_GEMMA,
51+
# LLM_OLLAMA_QWEN,
52+
# LLM_OLLAMA_TINYLLAMA,
4853
# LLM_OLLAMA_FALCON2 - 'only an assistant with no access to external resources',
4954
# LLM_OLLAMA_CODELLAMA - understands only a fraction of the questions, doesn't understand prompts
5055
] # Defined constants

openworm_ai/utils/llms.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@
2525
LLM_CMD_LINE_ARGS["-o-l32"] = LLM_OLLAMA_LLAMA32
2626
LLM_OLLAMA_LLAMA32_1B = "Ollama:llama3.2:1b"
2727
LLM_CMD_LINE_ARGS["-o-l321b"] = LLM_OLLAMA_LLAMA32_1B
28+
29+
LLM_OLLAMA_LLAMA32_3B = "Ollama:llama3.2:3b"
30+
LLM_CMD_LINE_ARGS["-o-l323b"] = LLM_OLLAMA_LLAMA32_3B
31+
2832
LLM_OLLAMA_MISTRAL = "Ollama:mistral"
2933
LLM_CMD_LINE_ARGS["-o-m"] = LLM_OLLAMA_MISTRAL
3034
LLM_OLLAMA_TINYLLAMA = "Ollama:tinyllama"
@@ -45,6 +49,10 @@
4549
LLM_CMD_LINE_ARGS["-qw"] = LLM_OLLAMA_QWEN
4650
LLM_OLLAMA_CODELLAMA = "Ollama:codellama:latest"
4751
LLM_OLLAMA_FALCON2 = "Ollama:falcon2:latest"
52+
LLM_OLLAMA_FALCON2 = "Ollama:falcon2:latest"
53+
54+
LLM_OLLAMA_OLMO2_7B = "Ollama:olmo2:7b"
55+
LLM_CMD_LINE_ARGS["-o-olmo27b"] = LLM_OLLAMA_OLMO2_7B
4856

4957
OPENAI_LLMS = [LLM_GPT35, LLM_GPT4, LLM_GPT4o]
5058

@@ -70,6 +78,7 @@
7078
LLM_OLLAMA_QWEN,
7179
LLM_OLLAMA_CODELLAMA,
7280
LLM_OLLAMA_FALCON2,
81+
LLM_OLLAMA_OLMO2_7B,
7382
)
7483

7584

@@ -186,6 +195,7 @@ def get_llm(llm_ver, temperature):
186195

187196
elif llm_ver in [
188197
LLM_OLLAMA_LLAMA32_1B,
198+
LLM_OLLAMA_LLAMA32_3B,
189199
LLM_OLLAMA_MISTRAL,
190200
LLM_OLLAMA_TINYLLAMA,
191201
LLM_OLLAMA_PHI3,
@@ -197,6 +207,7 @@ def get_llm(llm_ver, temperature):
197207
LLM_OLLAMA_QWEN,
198208
LLM_OLLAMA_CODELLAMA,
199209
LLM_OLLAMA_FALCON2,
210+
LLM_OLLAMA_OLMO2_7B,
200211
]:
201212
from langchain_ollama.llms import OllamaLLM
202213

@@ -336,6 +347,8 @@ def ask_question_get_response(
336347
import sys
337348

338349
question = "What is the most common type of neuron in the brain?"
350+
question = "Why is the worm C. elegans important to scientists?"
351+
question = "Tell me briefly about the neuronal control of C. elegans locomotion and the influence of monoamines."
339352

340353
llm_ver = get_llm_from_argv(sys.argv)
341354

0 commit comments

Comments
 (0)