master_thesis_adaptive_rag_system/calculate_metrisc_at_k.py at main · ZabinskiMichal/master_thesis_adaptive_rag_system · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
import pathlib
import logging
from beir import LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from sentence_transformers import SentenceTransformer
import chromadb
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from llm_client import together_client
from modul_decyzujny.decompose_query import check_if_query_is_complex, decompose_query
from modul_decyzujny.first_router import query_router, ToolChoice
from modul_decyzujny.knowledge_summarizer import (
    summarize_knowledge_with_most_occurring_words,
)
from query_analizer.hyde_generator import HyDEGenerator

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

import re
import json

from tools.vector_store_search_fallback import vector_store_search

# Use an LLM as a judge to check whether a document is relevant to the question.
# Either the LLM judge OR the reranker may be used, not both at once.

# ============== parameters to adjust ===================
# Specify the dataset HERE.
# fiqa - has ordinary, natural-language questions
# trec-covid - works fine as well

# dataset = "trec-covid-v2"
dataset = "fiqa"
# Relevance-scoring mode: LLM-as-a-judge or cross-encoder reranker (mutually exclusive).
use_ll_aaj = False
use_reranker = False
# NOTE(review): the flags below are not consumed in this part of the file;
# presumably they toggle HyDE query expansion, document splitting, question
# routing and adaptive top-k selection — confirm against the code that reads them.
use_hyde = False
split_documents = False
route_questions = False
use_adaptive_k = True
# Idea: maybe run Mistral Small 3.1 here instead.
judge_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# cross_encoder_model = "cross-encoder/ms-marco-MiniLM-L-6-v2"
cross_encoder_model = "cross-encoder/quora-distilroberta-base"
# ========================================================


# Fail fast when both relevance-scoring modes are switched on at once.
# An explicit raise is used instead of `assert` so the check is not stripped
# when Python runs with -O. The message is Polish for
# "Set only one evaluation mode: LLM or reranker!".
if use_ll_aaj and use_reranker:
    raise ValueError("Ustaw tylko jeden tryb oceny: LLM albo reranker!")


# Load the cross-encoder tokenizer and model only when reranking is enabled.
if use_reranker:
    cross_encoder_name = cross_encoder_model
    ce_tokenizer = AutoTokenizer.from_pretrained(cross_encoder_name)
    # Module.eval() returns the module itself, so it can be chained onto the load.
    ce_model = AutoModelForSequenceClassification.from_pretrained(
        cross_encoder_name
    ).eval()


def grade_document_with_mistral(query: str, document: str) -> str:
    """Ask the judge LLM whether *document* is relevant to *query*.

    The prompt instructs the model to answer with a JSON object of the form
    ``{"response": "yes"}`` or ``{"response": "no"}``. The first ``{...}``
    span in the reply is parsed and its ``"response"`` value is returned.

    Args:
        query: The question the document is graded against.
        document: The document text (or a part of it) to grade.

    Returns:
        The stripped, lower-cased ``"response"`` value from the model's JSON
        reply (expected to be ``"yes"`` or ``"no"``; the value is not
        otherwise validated). Falls back to ``"no"`` when no JSON object is
        found, the JSON is malformed, or the value is not a string.

    Raises:
        Any exception raised by the ``together_client`` request itself is
        propagated unchanged.
    """
    grading_prompt = f"""
    You are a grader assessing relevance of a document to a question.

    You are given:
    1. A question
    2. Part of document, that might be helpfully to answer this question.

    If this document might be helpful in answering question that you will see in a second, or it has  semantic meaning
    related to the question, it should be considered as relevant otherwise it is not relevant.
    Your task is to provide answer that will inform if document is highly relevant to the question or not.

    Now, you will question and part of this document.

    ---
    Question:
    {query}

    Document:
    {document}

    Please respond ONLY with a JSON object like this, and make aure your judge is relevant to previous instructions:
    {{"response": "yes"}} or {{"response": "no"}}

    """

    response = together_client.chat.completions.create(
        model=judge_model,
        messages=[{"role": "user", "content": grading_prompt}],
        temperature=0,  # deterministic grading
        max_tokens=50,  # the expected answer is a tiny JSON object
    )

    # Guard against an empty completion (content of None) so that it takes the
    # regular "no" fallback below instead of crashing on .strip().
    response_text = (response.choices[0].message.content or "").strip()

    try:
        # re.DOTALL lets the match span newlines, so a pretty-printed reply
        # such as '{\n  "response": "yes"\n}' is recognised as well.
        match = re.search(r"\{.*?\}", response_text, re.DOTALL)
        if not match:
            raise ValueError("No JSON object found in response.")
        # json.JSONDecodeError is a subclass of ValueError.
        parsed = json.loads(match.group(0))
        verdict = parsed.get("response", "")
        if not isinstance(verdict, str):
            raise ValueError(f"Unexpected 'response' value: {verdict!r}")
    except ValueError as e:
        print(f"❌ JSON parsing failed: {e}\nResponse was: {response_text}")
        return "no"  # default fallback

    print("✅ Successfully parsed json")
    return verdict.strip().lower()


class GradeDocuments(BaseModel):
    # Structured binary relevance verdict for a retrieved document.
    # Documented with a comment rather than a docstring so the schema that
    # pydantic generates for this model stays exactly as before.
    binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")


# System-style instructions for the relevance grader (three lines joined by "\n").
preamble = (
    "You are a grader assessing relevance of a retrieved document to a user question.\n"
    "If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.\n"
    "Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."
)

# Single human-turn prompt with the placeholders {document} and {question}.
grade_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "human",
            "Retrieved document: \n\n{document}\n\nUser question: {question}",
        ),
    ]
)

# === Logging ===
# Emit timestamped records at INFO level and above through BEIR's LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(message)s",
)

# === Parameters ===

embedding_model_name = "all-MiniLM-L6-v2"
collection_name = f"beir-{dataset}-corpus"
top_k = 32

# === 1. Load the data ===
# The dataset is read from <directory of this script>/datasets/<dataset>.
data_path = str(pathlib.Path(__file__).parent.absolute() / "datasets" / dataset)
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

# Sentence-embedding model named by embedding_model_name.
model = SentenceTransformer(embedding_model_name)