diff --git a/openworm_ai/quiz/QuizMaster.py b/openworm_ai/quiz/QuizMaster.py
index c6291b7..c360e65 100644
--- a/openworm_ai/quiz/QuizMaster.py
+++ b/openworm_ai/quiz/QuizMaster.py
@@ -1,11 +1,29 @@
 from openworm_ai.quiz.QuizModel import MultipleChoiceQuiz, Question, Answer
 
-
 from openworm_ai.utils.llms import ask_question_get_response
 from openworm_ai.utils.llms import get_llm_from_argv
+from openworm_ai.utils.llms import get_llm
+from openworm_ai.utils.llms import LLM_CLAUDE37
+from openworm_ai.utils.llms import LLM_OLLAMA_GEMMA2
+from openworm_ai.utils.llms import LLM_OLLAMA_PHI4
+from openworm_ai.utils.llms import get_anthropic_key
+
+from llama_index.core import Document, VectorStoreIndex
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.embeddings.ollama import OllamaEmbedding
+from typing import List, Optional
+
 
 import random
 from enum import Enum
+import json
+
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+from langchain_openai import OpenAIEmbeddings
+import numpy as np
+
 
 indexing = ["A", "B", "C", "D"]
 
@@ -13,6 +31,21 @@
     "QuizScope", [("GeneralKnowledge", 1), ("Science", 2), ("CElegans", 3)]
 )
 
+def get_default_critic_llm_ver():
+    """
+    Choose the default critic model:
+    - If an Anthropic key is available → use Claude 3.7 Sonnet
+    - Otherwise → fall back to a local Ollama model (gemma2)
+    """
+    try:
+        key = get_anthropic_key()
+    except Exception:
+        key = None
+
+    if key:
+        return LLM_CLAUDE37
+    else:
+        return LLM_OLLAMA_GEMMA2
 
 def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
     suffix = None
@@ -74,6 +107,380 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
         % (llm_ver.replace(":", "_"), num_questions, suffix)
     )
 
+def _extract_json_array(text: str) -> str:
+    """
+    Extract the text from the first "[" to the last "]" in the LLM response.
+    This is defensive in case the model adds extra text or code fences.
+    """
+    start = text.find("[")
+    end = text.rfind("]")
+    if start == -1 or end == -1 or end <= start:
+        raise ValueError("Could not find a JSON array in the LLM response.")
+    return text[start : end + 1]
+
+def _is_valid_mcq_item(item: dict) -> bool:
+    """
+    Basic sanity-check for a generated MCQ item.
+
+    Expects schema like:
+      {
+        "question": "stem",
+        "options": [{"label": "A", "text": "..."}, ...],
+        "correct_label": "A",
+        ...
+      }
+
+    Returns True if it looks usable, False otherwise.
+    """
+    try:
+        # Question text
+        q = item.get("question", "")
+        if not isinstance(q, str) or not q.strip():
+            return False
+
+        # Options
+        options = item.get("options")
+        if not isinstance(options, list) or len(options) < 2:
+            return False
+
+        labels = set()
+        for opt in options:
+            if not isinstance(opt, dict):
+                return False
+            label = opt.get("label")
+            text = opt.get("text")
+            if not isinstance(label, str) or not label.strip():
+                return False
+            if not isinstance(text, str) or not text.strip():
+                return False
+            labels.add(label)
+
+        # Correct label
+        correct = item.get("correct_label")
+        if not isinstance(correct, str) or correct not in labels:
+            return False
+
+        return True
+    except Exception:
+        return False
+
+
+def score_question_with_critic(item, llm_ver_critic=None, temperature=0.0):
+    """
+    Use a separate LLM (critic) to score a single MCQ item.
+
+    item: {
+      "question": "string",
+      "options": [...],
+      "correct_label": "A"
+    }
+
+    Returns: (score: float, comment: None); score falls back to 50.0 on failure.
+    """
+    # 1) Decide which critic model to use
+    if llm_ver_critic is None:
+        llm_ver_critic = get_default_critic_llm_ver()
+
+    # 2) Represent the MCQ as JSON so the critic sees the exact structure
+    mcq_json_str = json.dumps(item, ensure_ascii=False, indent=2)
+
+    # 3) Build the critic prompt
+    # IMPORTANT: no literal { ... } JSON examples here, to avoid PromptTemplate treating
+    # them as variables. We just describe the format in words.
+    critic_prompt = """
+You are an expert evaluator of multiple-choice questions.
+
+You will be given ONE MCQ in JSON format:
+- "question": the question text
+- "options": array of answers (A–D)
+- "correct_label": the intended correct option.
+
+Evaluate the QUALITY of the question on:
+1. Clarity (is the wording precise?)
+2. Unambiguity (is there ONLY one correct answer?)
+3. Factual correctness (is the correct answer truly correct?)
+4. Distractor quality (are wrong answers plausible but incorrect?)
+5. Appropriateness (no trick wording or opinion-based content).
+
+Return a single integer score from 0 to 100:
+- 90–100: excellent
+- 70–89: good
+- 50–69: borderline
+- <50: poor.
+
+Your output MUST be a valid JSON object with a single field "score" whose value is an integer.
+Do not include any other keys or any extra text.
+For example, if you think the quality is 87, your output should be a JSON object with "score": 87.
+
+Here is the MCQ to evaluate:
+
+{mcq_json}
+""".strip()
+
+    # 4) Use LangChain to run the critic model
+    prompt = PromptTemplate(
+        template=critic_prompt,
+        input_variables=["mcq_json"],
+    )
+
+    try:
+        llm = get_llm(llm_ver_critic, temperature)
+        chain = prompt | llm | StrOutputParser()
+        resp = chain.invoke({"mcq_json": mcq_json_str}).strip()
+    except Exception as e:
+        print(f"⚠ Critic LLM call failed for model {llm_ver_critic}: {e}")
+        # Neutral default score if critic fails
+        return 50.0, None
+
+    # 5) Parse the critic response as JSON
+    try:
+        start = resp.find("{")
+        end = resp.rfind("}")
+        if start == -1 or end == -1 or end <= start:
+            raise ValueError("No JSON object found in critic response.")
+
+        json_str = resp[start : end + 1]
+        obj = json.loads(json_str)
+
+        score = float(obj.get("score", 50.0))
+        return score, None
+    except Exception as e:
+        print("⚠ Failed to parse critic response as JSON:")
+        print(resp)
+        print(e)
+        return 50.0, None
+    
+
+# VectorStore-based embedding dedup (RAG-style) 
+
+def question_to_text(item: dict) -> str:
+    """
+    Turn a question item into a single text blob, analogous to how the RAG
+    script builds `all_text` for each section.
+
+    item schema (from GENERATE_Q_JSON):
+      {
+        "question": "stem",
+        "options": [{"label": "A", "text": "..."}, ...],
+        "correct_label": "A",
+        ...
+      }
+    """
+    stem = item.get("question", "").strip()
+    options = item.get("options", [])
+
+    parts = [stem]
+    for opt in options:
+        label = opt.get("label", "")
+        text = opt.get("text", "")
+        parts.append(f"{label}. {text}")
+
+    return " ".join(parts).strip()
+
+
+def get_embed_model_for_llm(llm_ver: str):
+    """
+    Mirror the RAG script logic:
+
+    In RAG, for Ollama models they do:
+      OLLAMA_MODEL = model.replace("Ollama:", "") if model is not LLM_GPT4o else None
+      ollama_embedding = OllamaEmbedding(model_name=OLLAMA_MODEL) if OLLAMA_MODEL else None
+      VectorStoreIndex.from_documents(documents, embed_model=ollama_embedding)
+
+    Here:
+      - If llm_ver starts with 'Ollama:' → use OllamaEmbedding(model_name=...)
+      - Else → use OpenAIEmbedding (LlamaIndex's OpenAI stack).
+    """
+    if llm_ver.startswith("Ollama:"):
+        ollama_model = llm_ver.replace("Ollama:", "")
+        print(f"[Step 7] Using OllamaEmbedding for VectorStoreIndex: {ollama_model}")
+        return OllamaEmbedding(model_name=ollama_model)
+    else:
+        print("[Step 7] Using OpenAIEmbedding for VectorStoreIndex")
+        return OpenAIEmbedding()
+
+
+def build_question_index(questions: List[dict], llm_ver: str) -> VectorStoreIndex:
+    """
+    Build an in-memory VectorStoreIndex over the questions, analogous to how
+    the RAG script builds an index over WormAtlas sections.
+    """
+    docs: List[Document] = []
+    for idx, q in enumerate(questions):
+        text = question_to_text(q)
+        # Store the index of the question in metadata so we can map back
+        docs.append(Document(text=text, metadata={"qid": idx}))
+
+    embed_model = get_embed_model_for_llm(llm_ver)
+    index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
+    return index
+
+def deduplicate_questions_with_index(questions: List[dict], llm_ver: str, similarity_threshold: float = 0.9, max_items: Optional[int] = None) -> List[dict]:
+    """
+    Dedup using VectorStoreIndex
+
+    - Build a VectorStoreIndex over all questions.
+    - For each question (in the given order, which we make 'best first' via critic),
+      query the index with its own text.
+    - If any ALREADY-KEPT question appears among the top similar results
+      with score >= similarity_threshold, we treat this as a duplicate/overlap.
+    - Otherwise, we keep it; we stop once max_items (if given) questions are kept.
+    """
+    if not questions:
+        return []
+
+    index = build_question_index(questions, llm_ver)
+    retriever = index.as_retriever(similarity_top_k=5)
+
+    kept_indices: List[int] = []
+
+    for idx, q in enumerate(questions):
+        if max_items is not None and len(kept_indices) >= max_items:
+            break
+
+        text = question_to_text(q)
+        results = retriever.retrieve(text)
+
+        is_dup = False
+        # Check if this question is too similar to any ALREADY-KEPT question
+        for node_with_score in results:
+            # Node metadata should include our "qid"
+            meta = node_with_score.metadata or {}
+            other_id = meta.get("qid")
+            score = node_with_score.score
+
+            # Skip self-match
+            if other_id == idx:
+                continue
+
+            # If we already kept this other question and similarity is high → duplicate
+            if other_id in kept_indices and score is not None and score >= similarity_threshold:
+                is_dup = True
+                break
+
+        if not is_dup:
+            kept_indices.append(idx)
+
+    # Return questions in the original (already-sorted) order, but filtered
+    return [questions[i] for i in kept_indices]
+
+
+
+
+
+
+
+
+
+
+
+       
+def generate_quiz_json(num_questions, llm_ver, quiz_scope, temperature=0.2):
+    """
+    Generate a MultipleChoiceQuiz using JSON-based prompts instead of free-text parsing.
+
+    num_questions = desired final number of questions.
+    Internally we over-generate (3x) so that later we can filter/score/dedup.
+    Items are validated, critic-scored, deduplicated and at most num_questions kept.
+    """
+    if quiz_scope == QuizScope.CElegans:
+        from openworm_ai.quiz.TemplatesCelegans import GENERATE_Q_JSON as GENERATE_Q
+        suffix = "_celegans_v2"
+    elif quiz_scope == QuizScope.Science:
+        # Only if/when you add a science JSON template
+        from openworm_ai.quiz.TemplatesScience import GENERATE_Q_JSON as GENERATE_Q
+        suffix = "_science_v2"
+    elif quiz_scope == QuizScope.GeneralKnowledge:
+        # Only if/when you add a general JSON template
+        from openworm_ai.quiz.Templates import GENERATE_Q_JSON as GENERATE_Q
+        suffix = "_general_v2"
+    else:
+        raise ValueError(f"Unsupported quiz scope: {quiz_scope}")
+
+    # Over-generate so we can filter / dedup later
+    raw_n = num_questions * 3
+
+    prompt = GENERATE_Q.replace("<QUESTION_NUMBER>", str(raw_n))
+
+    # Ask LLM
+    raw = ask_question_get_response(prompt, llm_ver, temperature)
+
+    try:
+        json_str = _extract_json_array(raw)
+        data = json.loads(json_str)
+    except Exception:
+        print("⚠ Failed to parse JSON from LLM. Raw output (first 500 chars):")
+        print(raw[:500])
+        raise
+
+    # Filter out malformed items before critic/dedup
+    original_len = len(data)
+    data = [item for item in data if _is_valid_mcq_item(item)]
+    if len(data) < original_len:
+        print(
+            f"⚠ Filtered out {original_len - len(data)} invalid MCQ items "
+            f"({len(data)} remain)"
+        )
+
+    if not data:
+        raise ValueError("No valid MCQ items after validation; aborting.")
+
+
+    # In theory data length should be raw_n; in practice the LLM may return
+    # fewer (or some invalid) items, which the validation above guards against.
+    # Next: score each question with a critic, sort best-first, then
+    # deduplicate while keeping at most num_questions items.
+
+    critic_llm_ver = get_default_critic_llm_ver()
+    print(f"Using critic model {critic_llm_ver} to score {len(data)} questions")
+
+    scored_items = []
+    for idx, item in enumerate(data):
+        score, _ = score_question_with_critic(item, llm_ver_critic=critic_llm_ver)
+        item["_critic_score"] = score
+        print(f"  [Critic] Q{idx}: score={score:.1f}")
+        scored_items.append(item)
+
+    # Sort by critic score (highest first) and select the top num_questions
+    scored_items.sort(key=lambda x: x.get("_critic_score", 0.0), reverse=True)
+    # Step 7: deduplicate using a VectorStoreIndex, mimicking the RAG pattern
+    try:
+        selected_items = deduplicate_questions_with_index(
+            scored_items,
+            llm_ver=llm_ver,
+            similarity_threshold=0.9,  # tweak based on what you see
+            max_items=num_questions,
+        )
+        print(
+            f"[Step 7] Selected {len(selected_items)} questions after VectorStore "
+            f"dedup (target={num_questions})"
+        )
+    except Exception as e:
+        print(
+            f"⚠ [Step 7] VectorStore-based dedup failed, "
+            f"falling back to simple top-{num_questions} slice: {e}"
+        )
+        selected_items = scored_items[:num_questions]
+
+    quiz = MultipleChoiceQuiz(
+        title=f"{llm_ver.replace(':', '_')}_{num_questions}questions{suffix}",
+        source=f"Generated by {llm_ver}, temperature: {temperature}, mode: JSON_v2_raw{raw_n}",
+    )
+
+    # Convert each JSON object to Question/Answer objects
+    for item in selected_items:
+        stem = item["question"].strip()
+        q_obj = Question(question=stem)
+
+        # Our quiz model uses indices "1", "2", "3", "4"
+        for i, opt in enumerate(item["options"]):
+            text = opt["text"].strip()
+            is_correct = (opt["label"] == item["correct_label"])
+            q_obj.answers.append(Answer(str(i + 1), text, is_correct))
+
+        quiz.questions.append(q_obj)
+
+    return quiz
 
 if __name__ == "__main__":
     import sys
@@ -128,18 +535,33 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
             ).strip()
             resp = orig_resp
 
-            if "<think>" in resp:  # Give deepseek a fighting chance...
-                resp = (
-                    resp[0 : resp.index("<think>")] + resp[resp.index("</think>") + 8 :]
-                )
-                resp = resp.replace("\n", " ").strip()
-                guess = resp[-1]
-            else:
-                if "\n" in resp:
-                    resp = resp.split("\n")[0]
-                guess = resp.split(":")[0].strip()
-                if " " in guess:
-                    guess = guess[0]
+            # Handle models that include chain-of-thought with <think>...</think>
+            if "<think>" in resp:
+                try:
+                    before = resp[: resp.index("<think>")]
+                    after = resp[resp.index("</think>") + len("</think>") :]
+                    resp = (before + "\n" + after).strip()
+                except ValueError:
+                    # If tags are malformed, fall back to original
+                    resp = orig_resp
+
+            # Take the first non-empty line
+            first_line = resp.splitlines()[0].strip() if resp else ""
+
+            # Look for the first A/B/C/D in that line
+            guess = None
+            for ch in first_line:
+                if ch in ["A", "B", "C", "D"]:
+                    guess = ch
+                    break
+
+            # Fallback if nothing sensible found
+            if guess is None:
+                candidate = first_line.split(":")[0].strip()
+                if candidate:
+                    guess = candidate[0]
+                else:
+                    guess = "Z"  # definitely invalid, will be caught later
 
             total_qs += 1
             correct_guess = guess == correct_answer
@@ -165,11 +587,49 @@ def save_quiz(num_questions, num_answers, llm_ver, quiz_scope, temperature=0):
         )
 
     # make this into a method which returns a dictionary of all the "stats" that lists the llm, correct/incorrect answers
+    # this can be used to plot comparison of variety of llms on general knowledge
+    # make this into a method which returns a dictionary of all the "stats" that lists the llm, correct/incorrect answers
+    # this can be used to plot comparison of variety of llms on general knowledge
+    # make this into a method which returns a dictionary of all the "stats" that lists the llm, correct/incorrect answers
     # this can be used to plot comparison of variety of llms on general knowledge
     else:
         num = 100
+        use_v2_json = "--v2-json" in sys.argv
+
         for a in sys.argv:
             if a.isnumeric():
                 num = int(a)
-        print(f"Using LLM {llm_ver} for saving quiz with {num} questions")
-        save_quiz(num, 4, llm_ver, quiz_scope=QuizScope.CElegans, temperature=0.2)
+
+        # Decide which scope we're using
+        quiz_scope = QuizScope.CElegans
+        suffix_scope = "_celegans"
+        if "--general" in sys.argv:
+            quiz_scope = QuizScope.GeneralKnowledge
+            suffix_scope = "_general"
+        elif "--science" in sys.argv:
+            quiz_scope = QuizScope.Science
+            suffix_scope = "_science"
+
+        if use_v2_json:
+            print(
+                f"Using LLM {llm_ver} for saving JSON-v2 quiz with {num} questions "
+                f"(scope={quiz_scope.name})"
+            )
+            quiz = generate_quiz_json(
+                num, llm_ver, quiz_scope=quiz_scope, temperature=0.2
+            )
+            out_path = (
+                "openworm_ai/quiz/samples/"
+                f"{llm_ver.replace(':','_')}_{num}questions{suffix_scope}_v2.json"
+            )
+            quiz.to_json_file(out_path)
+            print(f"Saved JSON-v2 quiz to {out_path}")
+        else:
+            print(
+                f"Using LLM {llm_ver} for saving legacy quiz with {num} questions "
+                f"(scope={quiz_scope.name})"
+            )
+            save_quiz(
+                num, 4, llm_ver, quiz_scope=quiz_scope, temperature=0.2
+            )
+
diff --git a/openworm_ai/quiz/QuizMasterCorpus.py b/openworm_ai/quiz/QuizMasterCorpus.py
index 5035fa2..f090f4b 100644
--- a/openworm_ai/quiz/QuizMasterCorpus.py
+++ b/openworm_ai/quiz/QuizMasterCorpus.py
@@ -1,9 +1,18 @@
 import os
 import json
 import random
+from typing import List
+import glob
+
 from openworm_ai.quiz.QuizModel import MultipleChoiceQuiz, Question, Answer
 from openworm_ai.quiz.TemplatesCorpus import TEXT_ANSWER_EXAMPLE
-from openworm_ai.utils.llms import ask_question_get_response, LLM_GPT4o
+from openworm_ai.utils.llms import ask_question_get_response, LLM_GPT4o, LLM_OLLAMA_GEMMA2
+
+from openworm_ai.quiz.QuizMaster import (_is_valid_mcq_item, score_question_with_critic, deduplicate_questions_with_index, get_default_critic_llm_ver, get_embed_model_for_llm)
+
+from llama_index.core import Document, VectorStoreIndex 
+
+
 
 indexing = ["A", "B", "C", "D"]
 TOKEN_LIMIT = 30_000  # 🔹 Keeps request within OpenAI's limits
@@ -25,139 +34,351 @@
 📌 **IMPORTANT:** If the text does not have enough content for <QUESTION_NUMBER> questions, generate as many as possible.  
 """
 
+def load_corpus_sections(papers_glob: str = "processed/json/papers/*.json") -> List[dict]:
+    """
+    Load sections from all processed paper JSONs, skipping obvious non-body-text
+    sections like References, Bibliography, etc.
+
+    Returns a list of dicts:
+      {
+        "text": "...section text...",
+        "source": "PaperFile.json: [Title, Section X](url)"
+      }
+    """
+    json_inputs = glob.glob(papers_glob)
+    sections: List[dict] = []
+
+    if not json_inputs:
+        print(f"⚠ Warning: no JSON papers found under {papers_glob}")
+        return sections
+
+    for json_file in json_inputs:
+        try:
+            with open(json_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+        except Exception as e:
+            print(f"⚠ Error reading {json_file}: {e}")
+            continue
+
+        for title, doc_contents in data.items():
+            src_page = doc_contents.get("source", json_file)
+            for section_name, details in doc_contents.get("sections", {}).items():
+                sec_name_lower = section_name.lower()
+
+                # 🔹 Skip obvious reference-like / non-content sections
+                if any(
+                    key in sec_name_lower
+                    for key in [
+                        "reference",
+                        "bibliograph",
+                        "supplementary",
+                        "acknowledg",
+                        "funding",
+                        "author contributions",
+                        "materials and methods",  # optional, remove if you want methods Qs
+                    ]
+                ):
+                    continue
+
+                paragraphs = details.get("paragraphs", [])
+                text = " ".join(
+                    p.get("contents", "") for p in paragraphs
+                ).strip()
+
+                # Skip ultra-short sections (< 30 words), e.g. tables, axes, etc.
+                if len(text.split()) < 30:
+                    continue
+
+                # Skip any section that mentions a DOI (heuristic for citation blobs)
+                lower_text = text.lower()
+                if "doi.org" in lower_text or "doi:" in lower_text:
+                    continue
+
+                src_info = (
+                    f"{os.path.basename(json_file)}: "
+                    f"[{title}, Section {section_name}]({src_page})"
+                )
+                sections.append(
+                    {
+                        "text": text,
+                        "source": src_info,
+                    }
+                )
+
+    print(f" Loaded {len(sections)} sections from corpus papers (after filtering)")
+    return sections
+
+
+def build_corpus_index_for_mcq(
+    llm_ver: str,
+    papers_glob: str = "processed/json/papers/*.json") -> tuple[VectorStoreIndex, List[Document]]:
+    """
+    Build a VectorStoreIndex over the corpus sections, to use for RAG-style
+    context selection when generating MCQs.
+
+    Returns:
+      (index, docs)
+      - index: VectorStoreIndex over all sections
+      - docs: list of Documents with .text and metadata["source"]
+    """
+    # Reuse your existing loader (with filtering)
+    sections = load_corpus_sections(papers_glob=papers_glob)
+    if not sections:
+        raise ValueError("No sections found for corpus index.")
+
+    docs: List[Document] = []
+    for sid, sec in enumerate(sections):
+        docs.append(
+            Document(
+                text=sec["text"],
+                metadata={"source": sec["source"], "sid": sid},
+            )
+        )
+
+    embed_model = get_embed_model_for_llm(llm_ver)
+    print("[RAG] Building VectorStoreIndex for corpus MCQ generation...")
+    index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
+    print(f"[RAG] Built index over {len(docs)} documents")
+    return index, docs
+
+
+
+
+
 
-def load_limited_documents(file_path, max_tokens=TOKEN_LIMIT, num_chunks=5):
-    """Loads a JSON document in chunks to generate questions in batches."""
-    if not os.path.exists(file_path):
-        print(f"⚠ Warning: {file_path} not found. Exiting...")
-        return []
 
-    try:
-        with open(file_path, "r", encoding="utf-8") as f:
-            data = json.load(f)
 
-        extracted_texts = []
-        all_text = ""
 
-        # Extract all sections
-        for title, doc_contents in data.items():
-            all_text += f"\n📌 **{title}**\n"
-            for section, details in doc_contents.get("sections", {}).items():
-                all_text += f"🔹 **{section}**:\n"
-                if "paragraphs" in details:
-                    all_text += (
-                        " ".join([p["contents"] for p in details["paragraphs"]])
-                        + "\n\n"
-                    )
-
-        # Split document into chunks
-        words = all_text.split()
-        chunk_size = len(words) // num_chunks
-        for i in range(num_chunks):
-            start = i * chunk_size
-            end = (i + 1) * chunk_size if i < num_chunks - 1 else len(words)
-            chunk_text = " ".join(words[start:end])
-            extracted_texts.append(chunk_text)
-
-        return extracted_texts
-
-    except (json.JSONDecodeError, UnicodeDecodeError, PermissionError) as e:
-        print(f"⚠ Error reading {file_path}: {e}")
-        return []
-
-
-def save_quiz(num_questions=100, num_answers=4, llm_ver=LLM_GPT4o, temperature=0):
-    """Generates and saves a quiz using GPT-4o while ensuring all content is from documents in batches."""
-
-    num_batches = num_questions // 20  # Generate in batches of 20 questions
-    document_chunks = load_limited_documents(
-        "processed/json/papers/Corsi_et_al_2015.json", num_chunks=num_batches
-    )
 
-    if not document_chunks:
-        print("⚠ Error: No valid document chunks found.")
+
+
+
+
+
+
+
+
+
+
+
+def save_quiz_v2(
+    num_questions: int = 100,
+    llm_ver: str = LLM_GPT4o,
+    temperature: float = 0.2,
+    questions_per_section: int = 3,
+):
+    """
+    Generate and save a corpus-based quiz where each MCQ is grounded
+    in the processed paper JSONs, using RAG-style context selection.
+
+    Pipeline:
+      - Build a VectorStoreIndex over processed/json/papers/*.json
+      - Repeatedly pick a random seed doc and retrieve top-k similar docs
+      - Use concatenated retrieved text as context for MCQ generation
+      - Validate items (_is_valid_mcq_item)
+      - Critic-score, rank, and deduplicate (using the same pipeline as QuizMaster)
+      - Build a MultipleChoiceQuiz and save as JSON
+    """
+    # 🔹 Build RAG index over corpus
+    try:
+        index, docs = build_corpus_index_for_mcq(llm_ver)
+    except Exception as e:
+        print(f"⚠ Error building corpus index: {e}")
         return
 
-    # Initialize quiz
-    quiz = MultipleChoiceQuiz(
-        title=f"{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_batched",
-        source=f"Generated by {llm_ver}, temperature: {temperature}",
-    )
+    if not docs:
+        print("⚠ Error: No documents in corpus index.")
+        return
 
-    question_count = 0
+    retriever = index.as_retriever(similarity_top_k=3)
 
-    # Loop over document chunks and generate questions in batches
-    for i, chunk_text in enumerate(document_chunks):
-        print(f"📝 Generating batch {i + 1} (20 questions) from document chunk...")
+    # Over-generate so critic + dedup have room
+    raw_target = num_questions * 3
+    all_items: List[dict] = []
 
-        # Create strict prompt for the batch
-        question_prompt = (
-            STRICT_GENERATE_Q.replace("<QUESTION_NUMBER>", str(20))
-            + TEXT_ANSWER_EXAMPLE
-            + "\n\n🔹 **Use ONLY the following document knowledge for questions:**\n\n"
-            + chunk_text
-        )
+    # We'll cap the number of retrieval/generation attempts so we don't loop forever
+    max_attempts = raw_target * 2
+    attempts = 0
 
-        response = ask_question_get_response(question_prompt, llm_ver, temperature)
+    while len(all_items) < raw_target and attempts < max_attempts:
+        attempts += 1
 
-        # Ensure GPT-4o generated questions
-        questions_generated = response.count("QUESTION:")
-        if questions_generated < 15:
-            print(
-                f"⚠ Warning: GPT-4o generated only {questions_generated} questions in batch {i + 1}. Skipping batch."
-            )
+        # 🔹 Pick a random seed document and use it as a "query"
+        seed_doc = random.choice(docs)
+        seed_text = seed_doc.text
+
+        try:
+            results = retriever.retrieve(seed_text)
+        except Exception as e:
+            print(f"⚠ RAG retrieval failed on attempt {attempts}: {e}")
             continue
 
-        # Parse and add questions to quiz
-        last_question = None
-        indexing = ["1", "2", "3", "4"]
-
-        for line in response.split("\n"):
-            if question_count >= num_questions:
-                break  # Stop at exactly 100 questions
-
-            if len(line.strip()) > 0:
-                if "QUESTION" in line or line.strip().endswith("?"):
-                    question_text = line.split(":", 1)[-1].strip()
-                    print(f"Question: <{question_text}>")
-                    last_question = Question(question=question_text)
-                    quiz.questions.append(last_question)
-                    question_count += 1
-                elif "CORRECT ANSWER" in line:
-                    correct_ans = line.split(":", 1)[-1].strip()
-                    i = len(last_question.answers)
-                    last_question.answers.append(Answer(indexing[i], correct_ans, True))
-                elif "WRONG ANSWER" in line:
-                    wrong_ans = line.split(":", 1)[-1].strip()
-                    i = len(last_question.answers)
-                    last_question.answers.append(Answer(indexing[i], wrong_ans, False))
-
-        if question_count >= num_questions:
-            break  # Stop if we reached 100 questions
-
-    # Ensure quiz has enough valid questions before saving
-    if len(quiz.questions) < num_questions * 0.8:
+        # Build context from top-k retrieved docs
+        ctx_texts = []
+        sources = set()
+        for r in results:
+            try:
+                # NodeWithScore in LlamaIndex allows get_content()
+                ctx_texts.append(r.get_content())
+                src = r.metadata.get("source", "")
+            except AttributeError:
+                # Fallback if API differs
+                node = getattr(r, "node", None)
+                if node is not None:
+                    ctx_texts.append(getattr(node, "text", ""))
+                    src = node.metadata.get("source", "")
+                else:
+                    continue
+            if src:
+                sources.add(src)
+
+        context = "\n\n".join(t for t in ctx_texts if t.strip())
+        if not context.strip():
+            continue
+
+        source = "; ".join(sorted(sources))
+
+        prompt = f"""
+You are generating multiple-choice questions based on scientific papers
+about C. elegans.
+
+You are given the following reference material, composed of several
+semantically related passages:
+
+\"\"\"{context}\"\"\"
+
+Use ONLY this material (no external knowledge) to generate
+{questions_per_section} multiple-choice questions.
+
+Each question must:
+- Be clearly answerable from the provided material.
+- Have exactly one correct answer and 3 plausible incorrect answers.
+- Be specific and technically accurate, suitable for advanced students or researchers.
+- NOT reference the text explicitly (no "according to the text" phrasing).
+
+Return your output as a JSON array. Each element must have the form:
+{{
+  "question": "...",
+  "options": [
+    {{"label": "A", "text": "..."}},
+    {{"label": "B", "text": "..."}},
+    {{"label": "C", "text": "..."}},
+    {{"label": "D", "text": "..."}}
+  ],
+  "correct_label": "A"
+}}
+
+Do not include any extra keys, commentary, or code fences.
+""".strip()
+
+        raw = ask_question_get_response(prompt, llm_ver, temperature)
+
+        try:
+            from openworm_ai.quiz.QuizMaster import _extract_json_array  # reuse helper
+            json_str = _extract_json_array(raw)
+            items = json.loads(json_str)
+        except Exception:
+            print("⚠ Failed to parse JSON from RAG-based corpus generation. Skipping this batch.")
+            continue
+
+        valid_items = [it for it in items if _is_valid_mcq_item(it)]
+        if not valid_items:
+            continue
+
+        # Attach the joined source string to each validated item
+        for it in valid_items:
+            it["_source"] = source
+
+        all_items.extend(valid_items)
+
+    if not all_items:
+        print("⚠ Error: No valid MCQs generated from RAG-based corpus passages.")
+        return
+
+    print(f"📊 Corpus+RAG generation produced {len(all_items)} valid MCQs before critic/dedup")
+
+    # Score every candidate with the critic model so the best-rated questions sort first
+    critic_llm_ver = get_default_critic_llm_ver()
+    print(f"[Corpus+RAG] Using critic model {critic_llm_ver} to score {len(all_items)} questions")
+
+    for idx, item in enumerate(all_items):
+        score, _ = score_question_with_critic(item, llm_ver_critic=critic_llm_ver)
+        item["_critic_score"] = score
+        print(f"  [Corpus+RAG Critic] Q{idx}: score={score:.1f}")
+
+    all_items.sort(key=lambda x: x.get("_critic_score", 0.0), reverse=True)
+
+    # Deduplicate using the same VectorStore-based logic
+    try:
+        selected_items = deduplicate_questions_with_index(
+            all_items,
+            llm_ver=llm_ver,
+            similarity_threshold=0.9,
+            max_items=num_questions,
+        )
         print(
-            "⚠ Error: Not enough valid questions were generated. Quiz will not be saved."
+            f"[Corpus+RAG Step 7] Selected {len(selected_items)} corpus-based questions "
+            f"after dedup (target={num_questions})"
         )
-        return
+    except Exception as e:
+        print(
+            f"⚠ [Corpus+RAG Step 7] VectorStore-based dedup failed, "
+            f"falling back to top-{num_questions}: {e}"
+        )
+        selected_items = all_items[:num_questions]
+
+    # 🔹 Build MultipleChoiceQuiz
+    quiz = MultipleChoiceQuiz(
+        title=f"{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_corpus_rag_v2",
+        source=f"Corpus-based (RAG) quiz generated from processed papers by {llm_ver}, "
+               f"temperature: {temperature}",
+    )
+
+    for item in selected_items:
+        stem = item["question"].strip()
+        q_obj = Question(question=stem)
+
+        for i, opt in enumerate(item["options"]):
+            text = opt["text"].strip()
+            is_correct = (opt["label"] == item["correct_label"])
+            q_obj.answers.append(Answer(str(i + 1), text, is_correct))
+
+        quiz.questions.append(q_obj)
 
-    print("===============================\n  Generated quiz:\n")
+    print("===============================\n  Generated corpus+RAG quiz:\n")
     print(quiz.to_yaml())
 
-    quiz.to_json_file(
-        f"openworm_ai/quiz/samples/{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_batched.json"
+    out_path = (
+        f"openworm_ai/quiz/samples/"
+        f"{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_corpus_rag_v2.json"
     )
+    quiz.to_json_file(out_path)
+    print(f"💾 Saved corpus+RAG JSON-v2 quiz to {out_path}")
+
+
 
 
 if __name__ == "__main__":
     import sys
+    import os
+
+    if os.getenv("OPENAI_API_KEY"):
+        llm_ver = LLM_GPT4o
+    else:
+        llm_ver = LLM_OLLAMA_GEMMA2
 
-    llm_ver = LLM_GPT4o  # Always use GPT-4o
     print(f"Selected LLM: {llm_ver}")
 
+
     if "-ask" in sys.argv:
-        quiz_json = f"openworm_ai/quiz/samples/{llm_ver.replace(':', '_')}_100_questions_celegans_corpus.json"
+        # Match the new v2 filename pattern
+        num_questions = 4
+        quiz_json = (
+            f"openworm_ai/quiz/samples/"
+            f"{llm_ver.replace(':', '_')}_{num_questions}questions_celegans_corpus_v2.json"
+        )
+
+        print(f"Loading quiz from: {quiz_json}")
         quiz = MultipleChoiceQuiz.from_file(quiz_json)
 
         total_qs = 0
@@ -195,5 +416,13 @@ def save_quiz(num_questions=100, num_answers=4, llm_ver=LLM_GPT4o, temperature=0
                 f" >> {qi}) {q} → Guess: {resp}, Correct: {correct_answer} → {correct_guess}"
             )
 
+        print(f"\nTotal correct: {total_correct} / {total_qs}")
+
     else:
-        save_quiz(100, 4, llm_ver, temperature=0.2)
+        # Use the new v2 generator
+        save_quiz_v2(
+            num_questions=4,
+            llm_ver=llm_ver,
+            temperature=0.2,
+            questions_per_section=1,
+        )
diff --git a/openworm_ai/quiz/Templates.py b/openworm_ai/quiz/Templates.py
index 1f29de1..7479f85 100644
--- a/openworm_ai/quiz/Templates.py
+++ b/openworm_ai/quiz/Templates.py
@@ -4,33 +4,83 @@
 
 GENERATE_Q = """
 Generate a list of <QUESTION_NUMBER> multiple choice questions to test someone's general knowledge.
-The questions should be answerable by a reasonably intelligent adult, and should be on a wide range of subjects.
+The questions should be answerable by an intelligent adult and should cover a wide range of topics,
+such as history, geography, science, culture, technology, medicine, society, and everyday facts.
 There should be <ANSWER_NUMBER> possible answers, only one of which is unambiguously correct, and all of the answers should be kept brief.
 Each of the <QUESTION_NUMBER> question/answer sets should be presented in the following format:
 
 """
 
 TEXT_ANSWER_EXAMPLE = """
-QUESTION: What is the capital of France?
-CORRECT ANSWER: Paris
-WRONG ANSWER: Madrid
-WRONG ANSWER: Rome
-WRONG ANSWER: Dublin
+QUESTION: What is the capital city of Japan?
+CORRECT ANSWER: Tokyo
+WRONG ANSWER: Osaka
+WRONG ANSWER: Kyoto
+WRONG ANSWER: Nagoya
 
 """
 
+# New JSON-based MCQ generation template (v2)
+GENERATE_Q_JSON = """
+You are an expert academic question writer.
+
+Generate <QUESTION_NUMBER> high-quality general-knowledge multiple-choice questions.
+Cover a wide range of topics such as history, geography, science, culture, medicine,
+society, world affairs, technology, and everyday factual knowledge.
+
+Each question MUST:
+- Be clearly and precisely worded.
+- Be answerable by an intelligent adult without needing specialist knowledge.
+- Have exactly ONE correct answer and three incorrect but plausible answers.
+- Be unambiguous so that two well-informed people would independently choose the same correct option.
+
+STRICTLY AVOID AMBIGUITY:
+- Do NOT use vague terms like "main", "best", "most important", or "most likely"
+  unless the question defines them clearly.
+- Do NOT write questions where more than one answer could be argued correct.
+- Avoid vague pronouns ("this", "it", "they") if unclear what they refer to.
+- Avoid questions whose answer might depend on opinion or interpretation.
+
+For the incorrect options:
+- They must be factually wrong.
+- They must still sound plausible to someone with partial knowledge.
+- Avoid joke answers or irrelevant answers.
+- Do NOT use "All of the above" or "None of the above".
+
+Return ONLY valid JSON, with no extra commentary. The JSON must be an array:
+
+[
+  {
+    "question": "string",
+    "options": [
+      {"label": "A", "text": "string"},
+      {"label": "B", "text": "string"},
+      {"label": "C", "text": "string"},
+      {"label": "D", "text": "string"}
+    ],
+    "correct_label": "A"
+  },
+  ...
+]
+
+Do not include fewer or more than <QUESTION_NUMBER> objects in the array.
+"""
+
+
+
+
 ASK_Q = """You are to select the correct answer for a multiple choice question. 
 A number of answers will be presented and you should respond with only the letter corresponding to the correct answer.
 For example if the question is: 
 
-What is the capital of France?
+What is the capital city of Japan?
 
 and the potential answers are:
 
-E: Madrid
-F: Paris
-G: Rome
-H: Dublin
+E: Osaka
+F: Tokyo
+G: Kyoto
+H: Nagoya
 
 you should only answer: 
 
@@ -44,14 +94,13 @@
 
 <ANSWERS>
 
-Remember: only respond with the letter of the correct answer!
 """
 
 if __name__ == "__main__":
     import sys
 
     question = (
-        GENERATE_Q.replace("<QUESTION_NUMBER>", "100").replace("<ANSWER_NUMBER>", "4")
+        GENERATE_Q.replace("<QUESTION_NUMBER>", "5").replace("<ANSWER_NUMBER>", "4")
         + TEXT_ANSWER_EXAMPLE
     )
 
diff --git a/openworm_ai/quiz/TemplatesCelegans.py b/openworm_ai/quiz/TemplatesCelegans.py
index bd5f42d..5f832fa 100644
--- a/openworm_ai/quiz/TemplatesCelegans.py
+++ b/openworm_ai/quiz/TemplatesCelegans.py
@@ -19,6 +19,55 @@
 
 """
 
+# New JSON-based MCQ generation template (v2)
+GENERATE_Q_JSON = """
+You are an expert on *Caenorhabditis elegans* (C. elegans) biology and neuroscience.
+
+Generate <QUESTION_NUMBER> high-quality multiple-choice questions about C. elegans.
+Cover a range of topics (anatomy, nervous system, behaviour, genetics, development, physiology, lab techniques, and research significance).
+Questions should be answerable by a scientifically literate, intelligent adult without needing to be a specialist in C. elegans.
+
+Each question MUST:
+- Be specific to C. elegans (not generic animal biology).
+- Be clearly and precisely worded.
+- Have exactly ONE correct answer and three incorrect but plausible answers.
+- Be answerable in a way that two well-informed experts on C. elegans would agree on the same option.
+
+STRICTLY AVOID AMBIGUITY:
+- Do NOT use vague terms like "main", "best", "most important", or "most likely"
+  unless the question explicitly defines them clearly enough that only one option fits.
+- Do NOT ask questions where more than one option could reasonably be argued correct.
+- Avoid vague pronouns ("this", "it", "they") if it might be unclear what they refer to.
+- If a question could be interpreted in multiple ways, REWRITE it until the meaning is unique.
+
+For the incorrect options:
+- They must be factually wrong for C. elegans.
+- They must still sound plausible to someone with partial understanding of C. elegans.
+- Avoid obviously silly or irrelevant answers.
+- Do NOT use "All of the above" or "None of the above".
+
+Return ONLY valid JSON, with no extra commentary. The JSON must be an array:
+
+[
+  {
+    "question": "string",
+    "options": [
+      {"label": "A", "text": "string"},
+      {"label": "B", "text": "string"},
+      {"label": "C", "text": "string"},
+      {"label": "D", "text": "string"}
+    ],
+    "correct_label": "A"
+  },
+  ...
+]
+
+Do not include fewer or more than <QUESTION_NUMBER> objects in the array.
+"""
+
+
+
+
 ASK_Q = """You are to select the correct answer for a multiple choice question. 
 A number of answers will be presented and you should respond with only the letter corresponding to the correct answer.
 For example if the question is: 
diff --git a/openworm_ai/quiz/TemplatesScience.py b/openworm_ai/quiz/TemplatesScience.py
index db2a73e..364ea82 100644
--- a/openworm_ai/quiz/TemplatesScience.py
+++ b/openworm_ai/quiz/TemplatesScience.py
@@ -4,7 +4,7 @@
 
 GENERATE_Q = """
 Generate a list of <QUESTION_NUMBER> multiple choice questions to test someone's scientific knowledge.
-The questions should be answerable by an intelligent adult, and should be on a wide range of subjects in scinece: biology, chemistry, physics and all the relevant fields.
+The questions should be answerable by an intelligent adult, and should be on a wide range of subjects in science: biology, chemistry, physics and all the relevant fields.
 There should be <ANSWER_NUMBER> possible answers, only one of which is unambiguously correct, and all of the answers should be kept brief.
 Each of the <QUESTION_NUMBER> question/answer sets should be presented in the following format:
 
@@ -19,24 +19,84 @@
 
 """
 
-ASK_Q = """You are to select the correct answer for a multiple choice question. 
-A number of answers will be presented and you should respond with only the letter corresponding to the correct answer.
-For example if the question is: 
+# New JSON-based MCQ generation template (v2)
+GENERATE_Q_JSON = """
+You are an expert in science, including biology, chemistry, physics, mathematics and all related fields.
+
+Generate <QUESTION_NUMBER> high-quality multiple-choice questions on a wide range of scientific topics.
+Cover a range of topics (biology, chemistry, physics, mathematics and related disciplines, and all the subtopics within these fields).
+Questions should be answerable by a scientifically literate, intelligent adult without needing to be a specialist in the specific topic area.
+
+Each question MUST:
+- Be specific to scientific knowledge.
+- Be clearly and precisely worded.
+- Have exactly ONE correct answer and three incorrect but plausible answers.
+- Be answerable in a way that two well-informed experts on the scientific topic would agree on the same option.
+
+STRICTLY AVOID AMBIGUITY:
+- Do NOT use vague terms like "main", "best", "most important", or "most likely"
+  unless the question explicitly defines them clearly enough that only one option fits.
+- Do NOT ask questions where more than one option could reasonably be argued correct.
+- Avoid vague pronouns ("this", "it", "they") if it might be unclear what they refer to.
+- If a question could be interpreted in multiple ways, REWRITE it until the meaning is unique.
+
+For the incorrect options:
+- They must be factually wrong.
+- They must still sound plausible to someone with partial understanding of science.
+- Avoid obviously silly or irrelevant answers.
+- Do NOT use "All of the above" or "None of the above".
+
+Return ONLY valid JSON, with no extra commentary. The JSON must be an array:
+
+[
+  {
+    "question": "string",
+    "options": [
+      {"label": "A", "text": "string"},
+      {"label": "B", "text": "string"},
+      {"label": "C", "text": "string"},
+      {"label": "D", "text": "string"}
+    ],
+    "correct_label": "A"
+  },
+  ...
+]
+
+Do not include fewer or more than <QUESTION_NUMBER> objects in the array.
+"""
+
+
+ASK_Q = """
+You are to select the correct answer for a multiple choice question.
+
+A number of answers will be presented and you should respond with only the letter
+corresponding to the correct answer.
+
+Here is an example to show the format:
+
+Example question:
 
 What is the powerhouse of the cell responsible for cellular respiration?
 
-and the potential answers are:
+Example options:
 
 E: Nucleus
 F: Mitochondria
 G: Ribosome
 H: Golgi Apparatus
 
-you should only answer: 
+In that example, the correct answer is option F, so you would answer with:
 
 F
 
-This is your question:
+IMPORTANT: In the REAL questions below, the options will ALWAYS be labelled with the letters A, B, C, and D.
+For those questions you MUST answer with exactly ONE of these letters: A, B, C, or D.
+
+Do NOT answer with any other letter.
+Do NOT repeat the question or options.
+Do NOT add explanations, punctuation, or any extra text.
+
+Now answer this question:
 
 <QUESTION>
 
@@ -44,6 +104,7 @@
 
 <ANSWERS>
 
+Respond with exactly ONE character: A, B, C, or D.
 """
 
 if __name__ == "__main__":
diff --git a/openworm_ai/quiz/samples/Ollama_mistral_5questions_general_v2.json b/openworm_ai/quiz/samples/Ollama_mistral_5questions_general_v2.json
new file mode 100644
index 0000000..de90c31
--- /dev/null
+++ b/openworm_ai/quiz/samples/Ollama_mistral_5questions_general_v2.json
@@ -0,0 +1,131 @@
+{
+    "title": "Ollama_mistral_5questions_general_v2",
+    "source": "Generated by Ollama:mistral, temperature: 0.2, mode: JSON_v2_raw10",
+    "questions": [
+        {
+            "question": "Who was the first President of the United States?",
+            "answers": [
+                {
+                    "ref": "1",
+                    "ans": "George Washington",
+                    "correct": true
+                },
+                {
+                    "ref": "2",
+                    "ans": "Thomas Jefferson",
+                    "correct": false
+                },
+                {
+                    "ref": "3",
+                    "ans": "Benjamin Franklin",
+                    "correct": false
+                },
+                {
+                    "ref": "4",
+                    "ans": "Abraham Lincoln",
+                    "correct": false
+                }
+            ]
+        },
+        {
+            "question": "Which continent is the largest in terms of total area?",
+            "answers": [
+                {
+                    "ref": "1",
+                    "ans": "Asia",
+                    "correct": true
+                },
+                {
+                    "ref": "2",
+                    "ans": "Africa",
+                    "correct": false
+                },
+                {
+                    "ref": "3",
+                    "ans": "Antarctica",
+                    "correct": false
+                },
+                {
+                    "ref": "4",
+                    "ans": "South America",
+                    "correct": false
+                }
+            ]
+        },
+        {
+            "question": "What is the chemical symbol for water?",
+            "answers": [
+                {
+                    "ref": "1",
+                    "ans": "HW",
+                    "correct": false
+                },
+                {
+                    "ref": "2",
+                    "ans": "H2O",
+                    "correct": true
+                },
+                {
+                    "ref": "3",
+                    "ans": "HOH",
+                    "correct": false
+                },
+                {
+                    "ref": "4",
+                    "ans": "OH2",
+                    "correct": false
+                }
+            ]
+        },
+        {
+            "question": "Which of these countries is NOT a member of the European Union?",
+            "answers": [
+                {
+                    "ref": "1",
+                    "ans": "France",
+                    "correct": false
+                },
+                {
+                    "ref": "2",
+                    "ans": "Spain",
+                    "correct": false
+                },
+                {
+                    "ref": "3",
+                    "ans": "Norway",
+                    "correct": true
+                },
+                {
+                    "ref": "4",
+                    "ans": "Poland",
+                    "correct": false
+                }
+            ]
+        },
+        {
+            "question": "Who wrote the novel 'To Kill a Mockingbird'?",
+            "answers": [
+                {
+                    "ref": "1",
+                    "ans": "Harper Lee",
+                    "correct": true
+                },
+                {
+                    "ref": "2",
+                    "ans": "J.D. Salinger",
+                    "correct": false
+                },
+                {
+                    "ref": "3",
+                    "ans": "Ernest Hemingway",
+                    "correct": false
+                },
+                {
+                    "ref": "4",
+                    "ans": "Mark Twain",
+                    "correct": false
+                }
+            ]
+        }
+    ]
+}
\ No newline at end of file
diff --git a/openworm_ai/quiz/samples/gpt-4o_20questions_celegans.json b/openworm_ai/quiz/samples/gpt-4o_20questions_celegans.json
new file mode 100644
index 0000000..0bef406
--- /dev/null
+++ b/openworm_ai/quiz/samples/gpt-4o_20questions_celegans.json
@@ -0,0 +1,4 @@
+{
+    "title": "GPT4o_20questions_celegans",
+    "source": "Generated by gpt-4o, temperature: 0.2"
+}
\ No newline at end of file
diff --git a/openworm_ai/quiz/samples/gpt-4o_5questions_celegans.json b/openworm_ai/quiz/samples/gpt-4o_5questions_celegans.json
new file mode 100644
index 0000000..804df23
--- /dev/null
+++ b/openworm_ai/quiz/samples/gpt-4o_5questions_celegans.json
@@ -0,0 +1,4 @@
+{
+    "title": "GPT4o_5questions_celegans",
+    "source": "Generated by gpt-4o, temperature: 0.2"
+}
\ No newline at end of file
diff --git a/openworm_ai/utils/llms.py b/openworm_ai/utils/llms.py
index 388f4dc..7b1be80 100644
--- a/openworm_ai/utils/llms.py
+++ b/openworm_ai/utils/llms.py
@@ -90,6 +90,33 @@
 ]
 
 
+def requires_openai_key(llm_ver):
+    return llm_ver in OPENAI_LLMS
+
+
+def get_openai_api_key():
+    """
+    Returns the OpenAI API key from:
+    1. Environment variables (preferred)
+    2. A file '../oaik' (legacy OpenWorm option), IF it exists
+    """
+    # 1. Try environment variable
+    key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_KEY")
+    if key:
+        return key.strip()
+
+    # 2. Legacy fallback – only read file if it exists
+    oaik_path = "../oaik"
+    if os.path.exists(oaik_path):
+        with open(oaik_path, "r") as f:
+            return f.read().strip()
+
+    # 3. Nothing found → fail clearly
+    raise RuntimeError(
+        "OpenAI API key not found.\n"
+        "Set environment variable OPENAI_API_KEY or place a key in '../oaik'."
+    )
+
 GENERAL_QUERY_PROMPT_TEMPLATE = """Answer the following question. Provide succinct, yet scientifically accurate
     answers. Question: {question}
 
@@ -226,15 +253,36 @@ def generate_panel_response(input_text, llm_panelists, llm_panel_chair, temperat
 
 
 def get_llm_from_argv(argv):
+    # Default remains GPT-4o
     llm_ver = LLM_GPT4o
 
-    for arg in LLM_CMD_LINE_ARGS:
+    # Allow command-line flags to override
+    for arg, model_name in LLM_CMD_LINE_ARGS.items():
         if arg in argv:
-            llm_ver = LLM_CMD_LINE_ARGS[arg]
+            return model_name
+
+    # Allow explicit model names as positional args
+    for a in argv[1:]:
+        if a.startswith("Ollama:"):
+            return a
+        if a in PREF_ORDER_LLMS:
+            return a
+        if a.upper() in ("GPT4O", "GPT-4O"):
+            return LLM_GPT4o
+
+    # --- FINAL FAILSAFE ---
+    # If default GPT-4o chosen but key missing → fallback to Ollama
+    try:
+        if requires_openai_key(llm_ver):
+            _ = get_openai_api_key()  # Just try getting key
+    except Exception:
+        print("⚠ No OpenAI key found → using local Ollama model instead.")
+        return LLM_OLLAMA_LLAMA32
 
     return llm_ver
 
 
+
 def ask_question_get_response(
     question, llm_ver, temperature=0, only_celegans=False, print_question=True
 ):