diff --git a/openworm_ai/quiz/QuizMaster.py b/openworm_ai/quiz/QuizMaster.py index c6291b7..c360e65 100644 --- a/openworm_ai/quiz/QuizMaster.py +++ b/openworm_ai/quiz/QuizMaster.py @@ -1,11 +1,29 @@ from openworm_ai.quiz.QuizModel import MultipleChoiceQuiz, Question, Answer - from openworm_ai.utils.llms import ask_question_get_response from openworm_ai.utils.llms import get_llm_from_argv +from openworm_ai.utils.llms import get_llm +from openworm_ai.utils.llms import LLM_CLAUDE37 +from openworm_ai.utils.llms import LLM_OLLAMA_GEMMA2 +from openworm_ai.utils.llms import LLM_OLLAMA_PHI4 +from openworm_ai.utils.llms import get_anthropic_key + +from llama_index.core import Document, VectorStoreIndex +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.embeddings.ollama import OllamaEmbedding +from typing import List, Optional + import random from enum import Enum +import json + +from langchain_core.prompts import PromptTemplate +from langchain_core.output_parsers import StrOutputParser + +from langchain_openai import OpenAIEmbeddings +import numpy as np + indexing = ["A", "B", "C", "D"] @@ -13,6 +31,21 @@ "QuizScope", [("GeneralKnowledge", 1), ("Science", 2), ("CElegans", 3)] ) +def get_default_critic_llm_ver(): + """ + Choose the default critic model: + - If an Anthropic key is available → use Claude 3.7 Sonnet + - Otherwise → fall back to a local Ollama model (gemma2) + """ + try: + key = get_anthropic_key() + except Exception: + key = None + + if key: + return LLM_CLAUDE37 + else: + return LLM_OLLAMA_GEMMA2 def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0): suffix = None @@ -74,6 +107,380 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0): % (llm_ver.replace(":", "_"), num_questions, suffix) ) +def _extract_json_array(text: str) -> str: + """ + Extract the first top-level JSON array from the LLM response. + This is defensive in case the model adds extra text or code fences. 
+ """ + start = text.find("[") + end = text.rfind("]") + if start == -1 or end == -1 or end <= start: + raise ValueError("Could not find a JSON array in the LLM response.") + return text[start : end + 1] + +def _is_valid_mcq_item(item: dict) -> bool: + """ + Basic sanity-check for a generated MCQ item. + + Expects schema like: + { + "question": "stem", + "options": [{"label": "A", "text": "..."}, ...], + "correct_label": "A", + ... + } + + Returns True if it looks usable, False otherwise. + """ + try: + # Question text + q = item.get("question", "") + if not isinstance(q, str) or not q.strip(): + return False + + # Options + options = item.get("options") + if not isinstance(options, list) or len(options) < 2: + return False + + labels = set() + for opt in options: + if not isinstance(opt, dict): + return False + label = opt.get("label") + text = opt.get("text") + if not isinstance(label, str) or not label.strip(): + return False + if not isinstance(text, str) or not text.strip(): + return False + labels.add(label) + + # Correct label + correct = item.get("correct_label") + if not isinstance(correct, str) or correct not in labels: + return False + + return True + except Exception: + return False + + +def score_question_with_critic(item, llm_ver_critic=None, temperature=0.0): + """ + Use a separate LLM (critic) to score a single MCQ item. + + item: { + "question": "string", + "options": [...], + "correct_label": "A" + } + + Returns: (score: float, comment: None) + """ + # 1) Decide which critic model to use + if llm_ver_critic is None: + llm_ver_critic = get_default_critic_llm_ver() + + # 2) Represent the MCQ as JSON so the critic sees the exact structure + mcq_json_str = json.dumps(item, ensure_ascii=False, indent=2) + + # 3) Build the critic prompt + # IMPORTANT: no literal { ... } JSON examples here, to avoid PromptTemplate treating + # them as variables. We just describe the format in words. 
+ critic_prompt = """ +You are an expert evaluator of multiple-choice questions. + +You will be given ONE MCQ in JSON format: +- "question": the question text +- "options": array of answers (A–D) +- "correct_label": the intended correct option. + +Evaluate the QUALITY of the question on: +1. Clarity (is the wording precise?) +2. Unambiguity (is there ONLY one correct answer?) +3. Factual correctness (is the correct answer truly correct?) +4. Distractor quality (are wrong answers plausible but incorrect?) +5. Appropriateness (no trick wording or opinion-based content). + +Return a single integer score from 0 to 100: +- 90–100: excellent +- 70–89: good +- 50–69: borderline +- <50: poor. + +Your output MUST be a valid JSON object with a single field "score" whose value is an integer. +Do not include any other keys or any extra text. +For example, if you think the quality is 87, your output should be a JSON object with "score": 87. + +Here is the MCQ to evaluate: + +{mcq_json} +""".strip() + + # 4) Use LangChain to run the critic model + prompt = PromptTemplate( + template=critic_prompt, + input_variables=["mcq_json"], + ) + + try: + llm = get_llm(llm_ver_critic, temperature) + chain = prompt | llm | StrOutputParser() + resp = chain.invoke({"mcq_json": mcq_json_str}).strip() + except Exception as e: + print(f"⚠ Critic LLM call failed for model {llm_ver_critic}: {e}") + # Neutral default score if critic fails + return 50.0, None + + # 5) Parse the critic response as JSON + try: + start = resp.find("{") + end = resp.rfind("}") + if start == -1 or end == -1 or end <= start: + raise ValueError("No JSON object found in critic response.") + + json_str = resp[start : end + 1] + obj = json.loads(json_str) + + score = float(obj.get("score", 50.0)) + return score, None + except Exception as e: + print("⚠ Failed to parse critic response as JSON:") + print(resp) + print(e) + return 50.0, None + + +# VectorStore-based embedding dedup (RAG-style) + +def question_to_text(item: dict) 
-> str: + """ + Turn a question item into a single text blob, analogous to how the RAG + script builds `all_text` for each section. + + item schema (from GENERATE_Q_JSON): + { + "question": "stem", + "options": [{"label": "A", "text": "..."}, ...], + "correct_label": "A", + ... + } + """ + stem = item.get("question", "").strip() + options = item.get("options", []) + + parts = [stem] + for opt in options: + label = opt.get("label", "") + text = opt.get("text", "") + parts.append(f"{label}. {text}") + + return " ".join(parts).strip() + + +def get_embed_model_for_llm(llm_ver: str): + """ + Mirror the RAG script logic: + + In RAG, for Ollama models they do: + OLLAMA_MODEL = model.replace("Ollama:", "") if model is not LLM_GPT4o else None + ollama_embedding = OllamaEmbedding(model_name=OLLAMA_MODEL) if OLLAMA_MODEL else None + VectorStoreIndex.from_documents(documents, embed_model=ollama_embedding) + + Here: + - If llm_ver starts with 'Ollama:' → use OllamaEmbedding(model_name=...) + - Else → use OpenAIEmbedding (LlamaIndex's OpenAI stack). + """ + if llm_ver.startswith("Ollama:"): + ollama_model = llm_ver.replace("Ollama:", "") + print(f"[Step 7] Using OllamaEmbedding for VectorStoreIndex: {ollama_model}") + return OllamaEmbedding(model_name=ollama_model) + else: + print("[Step 7] Using OpenAIEmbedding for VectorStoreIndex") + return OpenAIEmbedding() + + +def build_question_index(questions: List[dict], llm_ver: str) -> VectorStoreIndex: + """ + Build an in-memory VectorStoreIndex over the questions, analogous to how + the RAG script builds an index over WormAtlas sections. 
+ """ + docs: List[Document] = [] + for idx, q in enumerate(questions): + text = question_to_text(q) + # Store the index of the question in metadata so we can map back + docs.append(Document(text=text, metadata={"qid": idx})) + + embed_model = get_embed_model_for_llm(llm_ver) + index = VectorStoreIndex.from_documents(docs, embed_model=embed_model) + return index + +def deduplicate_questions_with_index(questions: List[dict], llm_ver: str, similarity_threshold: float = 0.9, max_items: Optional[int] = None) -> List[dict]: + """ + Dedup using VectorStoreIndex + + - Build a VectorStoreIndex over all questions. + - For each question (in the given order, which we make 'best first' via critic), + query the index with its own text. + - If any ALREADY-KEPT question appears among the top similar results + with score >= similarity_threshold, we treat this as a duplicate/overlap. + - Otherwise, we keep it. + """ + if not questions: + return [] + + index = build_question_index(questions, llm_ver) + retriever = index.as_retriever(similarity_top_k=5) + + kept_indices: List[int] = [] + + for idx, q in enumerate(questions): + if max_items is not None and len(kept_indices) >= max_items: + break + + text = question_to_text(q) + results = retriever.retrieve(text) + + is_dup = False + # Check if this question is too similar to any ALREADY-KEPT question + for node_with_score in results: + # Node metadata should include our "qid" + meta = node_with_score.metadata or {} + other_id = meta.get("qid") + score = node_with_score.score + + # Skip self-match + if other_id == idx: + continue + + # If we already kept this other question and similarity is high → duplicate + if other_id in kept_indices and score is not None and score >= similarity_threshold: + is_dup = True + break + + if not is_dup: + kept_indices.append(idx) + + # Return questions in the original (already-sorted) order, but filtered + return [questions[i] for i in kept_indices] + + + + + + + + + + + + +def 
generate_quiz_json(num_questions, llm_ver, quiz_scope, temperature=0.2): + """ + Generate a MultipleChoiceQuiz using JSON-based prompts instead of free-text parsing. + + num_questions = desired final number of questions. + Internally we over-generate (2x) so that later we can filter/score/dedup. + Right now we simply take the first num_questions as a placeholder. + """ + if quiz_scope == QuizScope.CElegans: + from openworm_ai.quiz.TemplatesCelegans import GENERATE_Q_JSON as GENERATE_Q + suffix = "_celegans_v2" + elif quiz_scope == QuizScope.Science: + # Only if/when you add a science JSON template + from openworm_ai.quiz.TemplatesScience import GENERATE_Q_JSON as GENERATE_Q + suffix = "_science_v2" + elif quiz_scope == QuizScope.GeneralKnowledge: + # Only if/when you add a general JSON template + from openworm_ai.quiz.Templates import GENERATE_Q_JSON as GENERATE_Q + suffix = "_general_v2" + else: + raise ValueError(f"Unsupported quiz scope: {quiz_scope}") + + # Over-generate so we can filter / dedup later + raw_n = num_questions * 3 + + prompt = GENERATE_Q.replace("", str(raw_n)) + + # Ask LLM + raw = ask_question_get_response(prompt, llm_ver, temperature) + + try: + json_str = _extract_json_array(raw) + data = json.loads(json_str) + except Exception: + print("⚠ Failed to parse JSON from LLM. Raw output (first 500 chars):") + print(raw[:500]) + raise + + # Filter out malformed items before critic/dedup + original_len = len(data) + data = [item for item in data if _is_valid_mcq_item(item)] + if len(data) < original_len: + print( + f"⚠ Filtered out {original_len - len(data)} invalid MCQ items " + f"({len(data)} remain)" + ) + + if not data: + raise ValueError("No valid MCQ items after validation; aborting.") + + + # In theory data length should be raw_n; in practice we guard. + # For now, we just take the first num_questions; later we'll insert critic + dedup. + # In theory data length should be raw_n; in practice we guard. 
+ # Now: score each question with a critic and keep the top N. + + critic_llm_ver = get_default_critic_llm_ver() + print(f"Using critic model {critic_llm_ver} to score {len(data)} questions") + + scored_items = [] + for idx, item in enumerate(data): + score, _ = score_question_with_critic(item, llm_ver_critic=critic_llm_ver) + item["_critic_score"] = score + print(f" [Critic] Q{idx}: score={score:.1f}") + scored_items.append(item) + + # Sort by critic score (highest first) and select the top num_questions + scored_items.sort(key=lambda x: x.get("_critic_score", 0.0), reverse=True) + # Step 7: deduplicate using a VectorStoreIndex, mimicking the RAG pattern + try: + selected_items = deduplicate_questions_with_index( + scored_items, + llm_ver=llm_ver, + similarity_threshold=0.9, # tweak based on what you see + max_items=num_questions, + ) + print( + f"[Step 7] Selected {len(selected_items)} questions after VectorStore " + f"dedup (target={num_questions})" + ) + except Exception as e: + print( + f"⚠ [Step 7] VectorStore-based dedup failed, " + f"falling back to simple top-{num_questions} slice: {e}" + ) + selected_items = scored_items[:num_questions] + + quiz = MultipleChoiceQuiz( + title=f"{llm_ver.replace(':', '_')}_{num_questions}questions{suffix}", + source=f"Generated by {llm_ver}, temperature: {temperature}, mode: JSON_v2_raw{raw_n}", + ) + + # Convert each JSON object to Question/Answer objects + for item in selected_items: + stem = item["question"].strip() + q_obj = Question(question=stem) + + # Our quiz model uses indices "1", "2", "3", "4" + for i, opt in enumerate(item["options"]): + text = opt["text"].strip() + is_correct = (opt["label"] == item["correct_label"]) + q_obj.answers.append(Answer(str(i + 1), text, is_correct)) + + quiz.questions.append(q_obj) + + return quiz if __name__ == "__main__": import sys @@ -128,18 +535,33 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0): ).strip() resp = orig_resp - if "" in resp: # Give 
deepseek a fighting chance... - resp = ( - resp[0 : resp.index("")] + resp[resp.index("") + 8 :] - ) - resp = resp.replace("\n", " ").strip() - guess = resp[-1] - else: - if "\n" in resp: - resp = resp.split("\n")[0] - guess = resp.split(":")[0].strip() - if " " in guess: - guess = guess[0] + # Handle models that include chain-of-thought with ... + if "" in resp: + try: + before = resp[: resp.index("")] + after = resp[resp.index("") + len("") :] + resp = (before + "\n" + after).strip() + except ValueError: + # If tags are malformed, fall back to original + resp = orig_resp + + # Take the first non-empty line + first_line = resp.splitlines()[0].strip() if resp else "" + + # Look for the first A/B/C/D in that line + guess = None + for ch in first_line: + if ch in ["A", "B", "C", "D"]: + guess = ch + break + + # Fallback if nothing sensible found + if guess is None: + candidate = first_line.split(":")[0].strip() + if candidate: + guess = candidate[0] + else: + guess = "Z" # definitely invalid, will be caught later total_qs += 1 correct_guess = guess == correct_answer @@ -165,11 +587,49 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0): ) # make this into a method which returns a dictionary of all the "stats" that lists the llm, correct/incorrect answers + # this can be used to plot comparison of variety of llms on general knowledge + # make this into a method which returns a dictionary of all the "stats" that lists the llm, correct/incorrect answers + # this can be used to plot comparison of variety of llms on general knowledge + # make this into a method which returns a dictionary of all the "stats" that lists the llm, correct/incorrect answers # this can be used to plot comparison of variety of llms on general knowledge else: num = 100 + use_v2_json = "--v2-json" in sys.argv + for a in sys.argv: if a.isnumeric(): num = int(a) - print(f"Using LLM {llm_ver} for saving quiz with {num} questions") - save_quiz(num, 4, llm_ver, 
quiz_scope=QuizScope.CElegans, temperature=0.2) + + # Decide which scope we're using + quiz_scope = QuizScope.CElegans + suffix_scope = "_celegans" + if "--general" in sys.argv: + quiz_scope = QuizScope.GeneralKnowledge + suffix_scope = "_general" + elif "--science" in sys.argv: + quiz_scope = QuizScope.Science + suffix_scope = "_science" + + if use_v2_json: + print( + f"Using LLM {llm_ver} for saving JSON-v2 quiz with {num} questions " + f"(scope={quiz_scope.name})" + ) + quiz = generate_quiz_json( + num, llm_ver, quiz_scope=quiz_scope, temperature=0.2 + ) + out_path = ( + "openworm_ai/quiz/samples/" + f"{llm_ver.replace(':','_')}_{num}questions{suffix_scope}_v2.json" + ) + quiz.to_json_file(out_path) + print(f"Saved JSON-v2 quiz to {out_path}") + else: + print( + f"Using LLM {llm_ver} for saving legacy quiz with {num} questions " + f"(scope={quiz_scope.name})" + ) + save_quiz( + num, 4, llm_ver, quiz_scope=quiz_scope, temperature=0.2 + ) + diff --git a/openworm_ai/quiz/QuizMasterCorpus.py b/openworm_ai/quiz/QuizMasterCorpus.py index 5035fa2..f090f4b 100644 --- a/openworm_ai/quiz/QuizMasterCorpus.py +++ b/openworm_ai/quiz/QuizMasterCorpus.py @@ -1,9 +1,18 @@ import os import json import random +from typing import List +import glob + from openworm_ai.quiz.QuizModel import MultipleChoiceQuiz, Question, Answer from openworm_ai.quiz.TemplatesCorpus import TEXT_ANSWER_EXAMPLE -from openworm_ai.utils.llms import ask_question_get_response, LLM_GPT4o +from openworm_ai.utils.llms import ask_question_get_response, LLM_GPT4o, LLM_OLLAMA_GEMMA2 + +from openworm_ai.quiz.QuizMaster import (_is_valid_mcq_item, score_question_with_critic, deduplicate_questions_with_index, get_default_critic_llm_ver, get_embed_model_for_llm) + +from llama_index.core import Document, VectorStoreIndex + + indexing = ["A", "B", "C", "D"] TOKEN_LIMIT = 30_000 # 🔹 Keeps request within OpenAI's limits @@ -25,139 +34,351 @@ 📌 **IMPORTANT:** If the text does not have enough content for questions, generate 
as many as possible. """ +def load_corpus_sections(papers_glob: str = "processed/json/papers/*.json") -> List[dict]: + """ + Load sections from all processed paper JSONs, skipping obvious non-body-text + sections like References, Bibliography, etc. + + Returns a list of dicts: + { + "text": "...section text...", + "source": "PaperFile.json: [Title, Section X](url)" + } + """ + json_inputs = glob.glob(papers_glob) + sections: List[dict] = [] + + if not json_inputs: + print(f"⚠ Warning: no JSON papers found under {papers_glob}") + return sections + + for json_file in json_inputs: + try: + with open(json_file, "r", encoding="utf-8") as f: + data = json.load(f) + except Exception as e: + print(f"⚠ Error reading {json_file}: {e}") + continue + + for title, doc_contents in data.items(): + src_page = doc_contents.get("source", json_file) + for section_name, details in doc_contents.get("sections", {}).items(): + sec_name_lower = section_name.lower() + + # 🔹 Skip obvious reference-like / non-content sections + if any( + key in sec_name_lower + for key in [ + "reference", + "bibliograph", + "supplementary", + "acknowledg", + "funding", + "author contributions", + "materials and methods", # optional, remove if you want methods Qs + ] + ): + continue + + paragraphs = details.get("paragraphs", []) + text = " ".join( + p.get("contents", "") for p in paragraphs + ).strip() + + # Skip ultra-short or weird sections (tables, axes, etc.) 
+ if len(text.split()) < 30: + continue + + # Skip sections that look like pure citation/DOI blobs + lower_text = text.lower() + if "doi.org" in lower_text or "doi:" in lower_text: + continue + + src_info = ( + f"{os.path.basename(json_file)}: " + f"[{title}, Section {section_name}]({src_page})" + ) + sections.append( + { + "text": text, + "source": src_info, + } + ) + + print(f" Loaded {len(sections)} sections from corpus papers (after filtering)") + return sections + + +def build_corpus_index_for_mcq( + llm_ver: str, + papers_glob: str = "processed/json/papers/*.json") -> tuple[VectorStoreIndex, List[Document]]: + """ + Build a VectorStoreIndex over the corpus sections, to use for RAG-style + context selection when generating MCQs. + + Returns: + (index, docs) + - index: VectorStoreIndex over all sections + - docs: list of Documents with .text and metadata["source"] + """ + # Reuse your existing loader (with filtering) + sections = load_corpus_sections(papers_glob=papers_glob) + if not sections: + raise ValueError("No sections found for corpus index.") + + docs: List[Document] = [] + for sid, sec in enumerate(sections): + docs.append( + Document( + text=sec["text"], + metadata={"source": sec["source"], "sid": sid}, + ) + ) + + embed_model = get_embed_model_for_llm(llm_ver) + print("[RAG] Building VectorStoreIndex for corpus MCQ generation...") + index = VectorStoreIndex.from_documents(docs, embed_model=embed_model) + print(f"[RAG] Built index over {len(docs)} documents") + return index, docs + + + + + -def load_limited_documents(file_path, max_tokens=TOKEN_LIMIT, num_chunks=5): - """Loads a JSON document in chunks to generate questions in batches.""" - if not os.path.exists(file_path): - print(f"⚠ Warning: {file_path} not found. 
Exiting...") - return [] - try: - with open(file_path, "r", encoding="utf-8") as f: - data = json.load(f) - extracted_texts = [] - all_text = "" - # Extract all sections - for title, doc_contents in data.items(): - all_text += f"\n📌 **{title}**\n" - for section, details in doc_contents.get("sections", {}).items(): - all_text += f"🔹 **{section}**:\n" - if "paragraphs" in details: - all_text += ( - " ".join([p["contents"] for p in details["paragraphs"]]) - + "\n\n" - ) - - # Split document into chunks - words = all_text.split() - chunk_size = len(words) // num_chunks - for i in range(num_chunks): - start = i * chunk_size - end = (i + 1) * chunk_size if i < num_chunks - 1 else len(words) - chunk_text = " ".join(words[start:end]) - extracted_texts.append(chunk_text) - - return extracted_texts - - except (json.JSONDecodeError, UnicodeDecodeError, PermissionError) as e: - print(f"⚠ Error reading {file_path}: {e}") - return [] - - -def save_quiz(num_questions=100, num_answers=4, llm_ver=LLM_GPT4o, temperature=0): - """Generates and saves a quiz using GPT-4o while ensuring all content is from documents in batches.""" - - num_batches = num_questions // 20 # Generate in batches of 20 questions - document_chunks = load_limited_documents( - "processed/json/papers/Corsi_et_al_2015.json", num_chunks=num_batches - ) - if not document_chunks: - print("⚠ Error: No valid document chunks found.") + + + + + + + + + + + +def save_quiz_v2( + num_questions: int = 100, + llm_ver: str = LLM_GPT4o, + temperature: float = 0.2, + questions_per_section: int = 3, +): + """ + Generate and save a corpus-based quiz where each MCQ is grounded + in the processed paper JSONs, using RAG-style context selection. 
+ + Pipeline: + - Build a VectorStoreIndex over processed/json/papers/*.json + - Repeatedly pick a random seed doc and retrieve top-k similar docs + - Use concatenated retrieved text as context for MCQ generation + - Validate items (_is_valid_mcq_item) + - Critic-score, rank, and deduplicate (using the same pipeline as QuizMaster) + - Build a MultipleChoiceQuiz and save as JSON + """ + # 🔹 Build RAG index over corpus + try: + index, docs = build_corpus_index_for_mcq(llm_ver) + except Exception as e: + print(f"⚠ Error building corpus index: {e}") return - # Initialize quiz - quiz = MultipleChoiceQuiz( - title=f"{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_batched", - source=f"Generated by {llm_ver}, temperature: {temperature}", - ) + if not docs: + print("⚠ Error: No documents in corpus index.") + return - question_count = 0 + retriever = index.as_retriever(similarity_top_k=3) - # Loop over document chunks and generate questions in batches - for i, chunk_text in enumerate(document_chunks): - print(f"📝 Generating batch {i + 1} (20 questions) from document chunk...") + # Over-generate so critic + dedup have room + raw_target = num_questions * 3 + all_items: List[dict] = [] - # Create strict prompt for the batch - question_prompt = ( - STRICT_GENERATE_Q.replace("", str(20)) - + TEXT_ANSWER_EXAMPLE - + "\n\n🔹 **Use ONLY the following document knowledge for questions:**\n\n" - + chunk_text - ) + # We'll cap the number of retrieval/generation attempts so we don't loop forever + max_attempts = raw_target * 2 + attempts = 0 - response = ask_question_get_response(question_prompt, llm_ver, temperature) + while len(all_items) < raw_target and attempts < max_attempts: + attempts += 1 - # Ensure GPT-4o generated questions - questions_generated = response.count("QUESTION:") - if questions_generated < 15: - print( - f"⚠ Warning: GPT-4o generated only {questions_generated} questions in batch {i + 1}. Skipping batch." 
- ) + # 🔹 Pick a random seed document and use it as a "query" + seed_doc = random.choice(docs) + seed_text = seed_doc.text + + try: + results = retriever.retrieve(seed_text) + except Exception as e: + print(f"⚠ RAG retrieval failed on attempt {attempts}: {e}") continue - # Parse and add questions to quiz - last_question = None - indexing = ["1", "2", "3", "4"] - - for line in response.split("\n"): - if question_count >= num_questions: - break # Stop at exactly 100 questions - - if len(line.strip()) > 0: - if "QUESTION" in line or line.strip().endswith("?"): - question_text = line.split(":", 1)[-1].strip() - print(f"Question: <{question_text}>") - last_question = Question(question=question_text) - quiz.questions.append(last_question) - question_count += 1 - elif "CORRECT ANSWER" in line: - correct_ans = line.split(":", 1)[-1].strip() - i = len(last_question.answers) - last_question.answers.append(Answer(indexing[i], correct_ans, True)) - elif "WRONG ANSWER" in line: - wrong_ans = line.split(":", 1)[-1].strip() - i = len(last_question.answers) - last_question.answers.append(Answer(indexing[i], wrong_ans, False)) - - if question_count >= num_questions: - break # Stop if we reached 100 questions - - # Ensure quiz has enough valid questions before saving - if len(quiz.questions) < num_questions * 0.8: + # Build context from top-k retrieved docs + ctx_texts = [] + sources = set() + for r in results: + try: + # NodeWithScore in LlamaIndex allows get_content() + ctx_texts.append(r.get_content()) + src = r.metadata.get("source", "") + except AttributeError: + # Fallback if API differs + node = getattr(r, "node", None) + if node is not None: + ctx_texts.append(getattr(node, "text", "")) + src = node.metadata.get("source", "") + else: + continue + if src: + sources.add(src) + + context = "\n\n".join(t for t in ctx_texts if t.strip()) + if not context.strip(): + continue + + source = "; ".join(sorted(sources)) + + prompt = f""" +You are generating multiple-choice questions 
based on scientific papers +about C. elegans. + +You are given the following reference material, composed of several +semantically related passages: + +\"\"\"{context}\"\"\" + +Use ONLY this material (no external knowledge) to generate +{questions_per_section} multiple-choice questions. + +Each question must: +- Be clearly answerable from the provided material. +- Have exactly one correct answer and 3 plausible incorrect answers. +- Be specific and technically accurate, suitable for advanced students or researchers. +- NOT reference the text explicitly (no "according to the text" phrasing). + +Return your output as a JSON array. Each element must have the form: +{{ + "question": "...", + "options": [ + {{"label": "A", "text": "..."}}, + {{"label": "B", "text": "..."}}, + {{"label": "C", "text": "..."}}, + {{"label": "D", "text": "..."}} + ], + "correct_label": "A" +}} + +Do not include any extra keys, commentary, or code fences. +""".strip() + + raw = ask_question_get_response(prompt, llm_ver, temperature) + + try: + from openworm_ai.quiz.QuizMaster import _extract_json_array # reuse helper + json_str = _extract_json_array(raw) + items = json.loads(json_str) + except Exception: + print("⚠ Failed to parse JSON from RAG-based corpus generation. 
Skipping this batch.") + continue + + valid_items = [it for it in items if _is_valid_mcq_item(it)] + if not valid_items: + continue + + #Attach source metadata + for it in valid_items: + it["_source"] = source + + all_items.extend(valid_items) + + if not all_items: + print("⚠ Error: No valid MCQs generated from RAG-based corpus passages.") + return + + print(f"📊 Corpus+RAG generation produced {len(all_items)} valid MCQs before critic/dedup") + + # Critic scoring (same as before) + critic_llm_ver = get_default_critic_llm_ver() + print(f"[Corpus+RAG] Using critic model {critic_llm_ver} to score {len(all_items)} questions") + + for idx, item in enumerate(all_items): + score, _ = score_question_with_critic(item, llm_ver_critic=critic_llm_ver) + item["_critic_score"] = score + print(f" [Corpus+RAG Critic] Q{idx}: score={score:.1f}") + + all_items.sort(key=lambda x: x.get("_critic_score", 0.0), reverse=True) + + #Dedup with same VectorStore-based logic + try: + selected_items = deduplicate_questions_with_index( + all_items, + llm_ver=llm_ver, + similarity_threshold=0.9, + max_items=num_questions, + ) print( - "⚠ Error: Not enough valid questions were generated. Quiz will not be saved." 
+ f"[Corpus+RAG Step 7] Selected {len(selected_items)} corpus-based questions " + f"after dedup (target={num_questions})" ) - return + except Exception as e: + print( + f"⚠ [Corpus+RAG Step 7] VectorStore-based dedup failed, " + f"falling back to top-{num_questions}: {e}" + ) + selected_items = all_items[:num_questions] + + # 🔹 Build MultipleChoiceQuiz + quiz = MultipleChoiceQuiz( + title=f"{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_corpus_rag_v2", + source=f"Corpus-based (RAG) quiz generated from processed papers by {llm_ver}, " + f"temperature: {temperature}", + ) + + for item in selected_items: + stem = item["question"].strip() + q_obj = Question(question=stem) + + for i, opt in enumerate(item["options"]): + text = opt["text"].strip() + is_correct = (opt["label"] == item["correct_label"]) + q_obj.answers.append(Answer(str(i + 1), text, is_correct)) + + quiz.questions.append(q_obj) - print("===============================\n Generated quiz:\n") + print("===============================\n Generated corpus+RAG quiz:\n") print(quiz.to_yaml()) - quiz.to_json_file( - f"openworm_ai/quiz/samples/{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_batched.json" + out_path = ( + f"openworm_ai/quiz/samples/" + f"{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_corpus_rag_v2.json" ) + quiz.to_json_file(out_path) + print(f"💾 Saved corpus+RAG JSON-v2 quiz to {out_path}") + + if __name__ == "__main__": import sys + import os + + if os.getenv("OPENAI_API_KEY"): + llm_ver = LLM_GPT4o + else: + llm_ver = LLM_OLLAMA_GEMMA2 - llm_ver = LLM_GPT4o # Always use GPT-4o print(f"Selected LLM: {llm_ver}") + if "-ask" in sys.argv: - quiz_json = f"openworm_ai/quiz/samples/{llm_ver.replace(':', '_')}_100_questions_celegans_corpus.json" + # Match the new v2 filename pattern + num_questions = 4 + quiz_json = ( + f"openworm_ai/quiz/samples/" + f"{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_corpus_v2.json" + ) + + print(f"Loading quiz 
from: {quiz_json}") quiz = MultipleChoiceQuiz.from_file(quiz_json) total_qs = 0 @@ -195,5 +416,13 @@ def save_quiz(num_questions=100, num_answers=4, llm_ver=LLM_GPT4o, temperature=0 f" >> {qi}) {q} → Guess: {resp}, Correct: {correct_answer} → {correct_guess}" ) + print(f"\nTotal correct: {total_correct} / {total_qs}") + else: - save_quiz(100, 4, llm_ver, temperature=0.2) + # Use the new v2 generator + save_quiz_v2( + num_questions=4, + llm_ver=llm_ver, + temperature=0.2, + questions_per_section=1, + ) diff --git a/openworm_ai/quiz/Templates.py b/openworm_ai/quiz/Templates.py index 1f29de1..7479f85 100644 --- a/openworm_ai/quiz/Templates.py +++ b/openworm_ai/quiz/Templates.py @@ -4,33 +4,83 @@ GENERATE_Q = """ Generate a list of multiple choice questions to test someone's general knowledge. -The questions should be answerable by a reasonably intelligent adult, and should be on a wide range of subjects. +The questions should be answerable by an intelligent adult and should cover a wide range of topics, +such as history, geography, science, culture, technology, medicine, society, and everyday facts. There should be possible answers, only one of which is unambiguously correct, and all of the answers should be kept brief. Each of the question/answer sets should be presented in the following format: """ TEXT_ANSWER_EXAMPLE = """ -QUESTION: What is the capital of France? -CORRECT ANSWER: Paris -WRONG ANSWER: Madrid -WRONG ANSWER: Rome -WRONG ANSWER: Dublin +QUESTION: What is the capital city of Japan? +CORRECT ANSWER: Tokyo +WRONG ANSWER: Osaka +WRONG ANSWER: Kyoto +WRONG ANSWER: Nagoya """ +# New JSON-based MCQ generation template (v2) +GENERATE_Q_JSON = """ +You are an expert academic question writer. + +Generate high-quality general-knowledge multiple-choice questions. +Cover a wide range of topics such as history, geography, science, culture, medicine, +society, world affairs, technology, and everyday factual knowledge. 
+ +Each question MUST: +- Be clearly and precisely worded. +- Be answerable by an intelligent adult without needing specialist knowledge. +- Have exactly ONE correct answer and three incorrect but plausible answers. +- Be unambiguous so that two well-informed people would independently choose the same correct option. + +STRICTLY AVOID AMBIGUITY: +- Do NOT use vague terms like "main", "best", "most important", or "most likely" + unless the question defines them clearly. +- Do NOT write questions where more than one answer could be argued correct. +- Avoid vague pronouns ("this", "it", "they") if unclear what they refer to. +- Avoid questions whose answer might depend on opinion or interpretation. + +For the incorrect options: +- They must be factually wrong. +- They must still sound plausible to someone with partial knowledge. +- Avoid joke answers or irrelevant answers. +- Do NOT use "All of the above" or "None of the above". + +Return ONLY valid JSON, with no extra commentary. The JSON must be an array: + +[ + { + "question": "string", + "options": [ + {"label": "A", "text": "string"}, + {"label": "B", "text": "string"}, + {"label": "C", "text": "string"}, + {"label": "D", "text": "string"} + ], + "correct_label": "A" + }, + ... +] + +Do not include fewer or more than objects in the array. +""" + + + + ASK_Q = """You are to select the correct answer for a multiple choice question. A number of answers will be presented and you should respond with only the letter corresponding to the correct answer. For example if the question is: -What is the capital of France? +What is the capital city of Japan? and the potential answers are: -E: Madrid -F: Paris -G: Rome -H: Dublin +E: Osaka +F: Tokyo +G: Kyoto +H: Nagoya you should only answer: @@ -44,14 +94,13 @@ -Remember: only respond with the letter of the correct answer! 
""" if __name__ == "__main__": import sys question = ( - GENERATE_Q.replace("", "100").replace("", "4") + GENERATE_Q.replace("", "5").replace("", "4") + TEXT_ANSWER_EXAMPLE ) diff --git a/openworm_ai/quiz/TemplatesCelegans.py b/openworm_ai/quiz/TemplatesCelegans.py index bd5f42d..5f832fa 100644 --- a/openworm_ai/quiz/TemplatesCelegans.py +++ b/openworm_ai/quiz/TemplatesCelegans.py @@ -19,6 +19,55 @@ """ +# New JSON-based MCQ generation template (v2) +GENERATE_Q_JSON = """ +You are an expert on *Caenorhabditis elegans* (C. elegans) biology and neuroscience. + +Generate high-quality multiple-choice questions about C. elegans. +Cover a range of topics (anatomy, nervous system, behaviour, genetics, development, physiology, lab techniques, and research significance). +Questions should be answerable by a scientifically literate, intelligent adult without needing to be a specialist in C. elegans. + +Each question MUST: +- Be specific to C. elegans (not generic animal biology). +- Be clearly and precisely worded. +- Have exactly ONE correct answer and three incorrect but plausible answers. +- Be answerable in a way that two well-informed experts on C. elegans would agree on the same option. + +STRICTLY AVOID AMBIGUITY: +- Do NOT use vague terms like "main", "best", "most important", or "most likely" + unless the question explicitly defines them clearly enough that only one option fits. +- Do NOT ask questions where more than one option could reasonably be argued correct. +- Avoid vague pronouns ("this", "it", "they") if it might be unclear what they refer to. +- If a question could be interpreted in multiple ways, REWRITE it until the meaning is unique. + +For the incorrect options: +- They must be factually wrong for C. elegans. +- They must still sound plausible to someone with partial understanding of C. elegans. +- Avoid obviously silly or irrelevant answers. +- Do NOT use "All of the above" or "None of the above". + +Return ONLY valid JSON, with no extra commentary. 
The JSON must be an array: + +[ + { + "question": "string", + "options": [ + {"label": "A", "text": "string"}, + {"label": "B", "text": "string"}, + {"label": "C", "text": "string"}, + {"label": "D", "text": "string"} + ], + "correct_label": "A" + }, + ... +] + +Do not include fewer or more than objects in the array. +""" + + + + ASK_Q = """You are to select the correct answer for a multiple choice question. A number of answers will be presented and you should respond with only the letter corresponding to the correct answer. For example if the question is: diff --git a/openworm_ai/quiz/TemplatesScience.py b/openworm_ai/quiz/TemplatesScience.py index db2a73e..364ea82 100644 --- a/openworm_ai/quiz/TemplatesScience.py +++ b/openworm_ai/quiz/TemplatesScience.py @@ -4,7 +4,7 @@ GENERATE_Q = """ Generate a list of multiple choice questions to test someone's scientific knowledge. -The questions should be answerable by an intelligent adult, and should be on a wide range of subjects in scinece: biology, chemistry, physics and all the relevant fields. +The questions should be answerable by an intelligent adult, and should be on a wide range of subjects in science: biology, chemistry, physics and all the relevant fields. There should be possible answers, only one of which is unambiguously correct, and all of the answers should be kept brief. Each of the question/answer sets should be presented in the following format: @@ -19,24 +19,84 @@ """ -ASK_Q = """You are to select the correct answer for a multiple choice question. -A number of answers will be presented and you should respond with only the letter corresponding to the correct answer. -For example if the question is: +# New JSON-based MCQ generation template (v2) +GENERATE_Q_JSON = """ +You are an expert on *Science* including biology, chemistry, physics and mathematics and all relevant fields. + +Generate high-quality multiple-choice questions on a wide range of scientific topics. 
+Cover a range of topics (biology, chemistry, physics, mathematics and related disciplines, and all the subtopics within these fields). +Questions should be answerable by a scientifically literate, intelligent adult without needing to be a specialist in the specific topic area. + +Each question MUST: +- Be specific to scientific knowledge. +- Be clearly and precisely worded. +- Have exactly ONE correct answer and three incorrect but plausible answers. +- Be answerable in a way that two well-informed experts on the scientific topic would agree on the same option. + +STRICTLY AVOID AMBIGUITY: +- Do NOT use vague terms like "main", "best", "most important", or "most likely" + unless the question explicitly defines them clearly enough that only one option fits. +- Do NOT ask questions where more than one option could reasonably be argued correct. +- Avoid vague pronouns ("this", "it", "they") if it might be unclear what they refer to. +- If a question could be interpreted in multiple ways, REWRITE it until the meaning is unique. + +For the incorrect options: +- They must be factually wrong. +- They must still sound plausible to someone with partial understanding of science. +- Avoid obviously silly or irrelevant answers. +- Do NOT use "All of the above" or "None of the above". + +Return ONLY valid JSON, with no extra commentary. The JSON must be an array: + +[ + { + "question": "string", + "options": [ + {"label": "A", "text": "string"}, + {"label": "B", "text": "string"}, + {"label": "C", "text": "string"}, + {"label": "D", "text": "string"} + ], + "correct_label": "A" + }, + ... +] + +Do not include fewer or more than objects in the array. +""" + + +ASK_Q = """ +You are to select the correct answer for a multiple choice question. + +A number of answers will be presented and you should respond with only the letter +corresponding to the correct answer. 
+ +Here is an example to show the format: + +Example question: What is the powerhouse of the cell responsible for cellular respiration? -and the potential answers are: +Example options: E: Nucleus F: Mitochondria G: Ribosome H: Golgi Apparatus -you should only answer: +In that example, the correct answer is option F, so you would answer with: F -This is your question: +IMPORTANT: In the REAL questions below, the options will ALWAYS be labelled with the letters A, B, C, and D. +For those questions you MUST answer with exactly ONE of these letters: A, B, C, or D. + +Do NOT answer with any other letter. +Do NOT repeat the question or options. +Do NOT add explanations, punctuation, or any extra text. + +Now answer this question: @@ -44,6 +104,7 @@ +Respond with exactly ONE character: A, B, C, or D. """ if __name__ == "__main__": diff --git a/openworm_ai/quiz/samples/Ollama_mistral_5questions_general_v2.json b/openworm_ai/quiz/samples/Ollama_mistral_5questions_general_v2.json new file mode 100644 index 0000000..de90c31 --- /dev/null +++ b/openworm_ai/quiz/samples/Ollama_mistral_5questions_general_v2.json @@ -0,0 +1,131 @@ +{ + "title": "Ollama_mistral_5questions_general_v2", + "source": "Generated by Ollama:mistral, temperature: 0.2, mode: JSON_v2_raw10", + "questions": [ + { + "question": "Who was the first President of the United States?", + "answers": [ + { + "ref": "1", + "ans": "George Washington", + "correct": true + }, + { + "ref": "2", + "ans": "Thomas Jefferson", + "correct": false + }, + { + "ref": "3", + "ans": "Benjamin Franklin", + "correct": false + }, + { + "ref": "4", + "ans": "Abraham Lincoln", + "correct": false + } + ] + }, + { + "question": "Which continent is the largest in terms of total area?", + "answers": [ + { + "ref": "1", + "ans": "Asia", + "correct": true + }, + { + "ref": "2", + "ans": "Africa", + "correct": false + }, + { + "ref": "3", + "ans": "Antarctica", + "correct": false + }, + { + "ref": "4", + "ans": "South America", + "correct": 
false + } + ] + }, + { + "question": "What is the chemical symbol for water?", + "answers": [ + { + "ref": "1", + "ans": "HW", + "correct": false + }, + { + "ref": "2", + "ans": "H2O", + "correct": true + }, + { + "ref": "3", + "ans": "HOH", + "correct": false + }, + { + "ref": "4", + "ans": "OH2", + "correct": false + } + ] + }, + { + "question": "Which of these countries is NOT a member of the European Union?", + "answers": [ + { + "ref": "1", + "ans": "France", + "correct": false + }, + { + "ref": "2", + "ans": "Spain", + "correct": false + }, + { + "ref": "3", + "ans": "Norway", + "correct": true + }, + { + "ref": "4", + "ans": "Poland", + "correct": false + } + ] + }, + { + "question": "Who wrote the novel 'To Kill a Mockingbird'?", + "answers": [ + { + "ref": "1", + "ans": "Harper Lee", + "correct": true + }, + { + "ref": "2", + "ans": "J.D. Salinger", + "correct": false + }, + { + "ref": "3", + "ans": "Ernest Hemingway", + "correct": false + }, + { + "ref": "4", + "ans": "Mark Twain", + "correct": false + } + ] + } + ] +} \ No newline at end of file diff --git a/openworm_ai/quiz/samples/gpt-4o_20questions_celegans.json b/openworm_ai/quiz/samples/gpt-4o_20questions_celegans.json new file mode 100644 index 0000000..0bef406 --- /dev/null +++ b/openworm_ai/quiz/samples/gpt-4o_20questions_celegans.json @@ -0,0 +1,4 @@ +{ + "title": "GPT4o_20questions_celegans", + "source": "Generated by gpt-4o, temperature: 0.2" +} \ No newline at end of file diff --git a/openworm_ai/quiz/samples/gpt-4o_5questions_celegans.json b/openworm_ai/quiz/samples/gpt-4o_5questions_celegans.json new file mode 100644 index 0000000..804df23 --- /dev/null +++ b/openworm_ai/quiz/samples/gpt-4o_5questions_celegans.json @@ -0,0 +1,4 @@ +{ + "title": "GPT4o_5questions_celegans", + "source": "Generated by gpt-4o, temperature: 0.2" +} \ No newline at end of file diff --git a/openworm_ai/utils/llms.py b/openworm_ai/utils/llms.py index 388f4dc..7b1be80 100644 --- a/openworm_ai/utils/llms.py +++ 
b/openworm_ai/utils/llms.py @@ -90,6 +90,33 @@ ] +def requires_openai_key(llm_ver): + return llm_ver in OPENAI_LLMS + + +def get_openai_api_key(): + """ + Returns the OpenAI API key from: + 1. Environment variables (preferred) + 2. A file '../oaik' (legacy OpenWorm option), IF it exists + """ + # 1. Try environment variable + key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_KEY") + if key: + return key.strip() + + # 2. Legacy fallback – only read file if it exists + oaik_path = "../oaik" + if os.path.exists(oaik_path): + with open(oaik_path, "r") as f: + return f.read().strip() + + # 3. Nothing found → fail clearly + raise RuntimeError( + "OpenAI API key not found.\n" + "Set environment variable OPENAI_API_KEY or place a key in '../oaik'." + ) + GENERAL_QUERY_PROMPT_TEMPLATE = """Answer the following question. Provide succinct, yet scientifically accurate answers. Question: {question} @@ -226,15 +253,36 @@ def generate_panel_response(input_text, llm_panelists, llm_panel_chair, temperat def get_llm_from_argv(argv): + # Default remains GPT-4o llm_ver = LLM_GPT4o - for arg in LLM_CMD_LINE_ARGS: + # Allow command-line flags to override + for arg, model_name in LLM_CMD_LINE_ARGS.items(): if arg in argv: - llm_ver = LLM_CMD_LINE_ARGS[arg] + return model_name + + # Allow explicit model names as positional args + for a in argv[1:]: + if a.startswith("Ollama:"): + return a + if a in PREF_ORDER_LLMS: + return a + if a.upper() in ("GPT4O", "GPT-4O"): + return LLM_GPT4o + + # --- FINAL FAILSAFE --- + # If default GPT-4o chosen but key missing → fallback to Ollama + try: + if requires_openai_key(llm_ver): + _ = get_openai_api_key() # Just try getting key + except Exception: + print("⚠ No OpenAI key found → using local Ollama model instead.") + return LLM_OLLAMA_LLAMA32 return llm_ver + def ask_question_get_response( question, llm_ver, temperature=0, only_celegans=False, print_question=True ):