-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluation.py
More file actions
168 lines (141 loc) · 5.82 KB
/
evaluation.py
File metadata and controls
168 lines (141 loc) · 5.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import json
import streamlit as st
import requests
from langchain_ollama import OllamaLLM
import string
import numpy as np
from parse import (
search_api,
extract_body_content,
clean_body_content,
split_content,
parse,
scoring_system
)
# from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# Initialize Ollama model and sentence transformer for similarity scoring
# NOTE(review): `embedder` is never used in this file — presumably consumed by
# helpers in `parse`, or leftover; confirm before removing.
model = OllamaLLM(model="llama3.1")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Streamlit UI
st.title("Evaluating...")
# Load the ground_truth file (questions and answers)
# assumes a JSON list of {"question": ..., "answer": ...} dicts, as read by
# the evaluation loop below — TODO confirm against the file's producer
with open('ground_truth.json', 'r') as f:
    ground_truth = json.load(f)
# Prepare a dictionary to store model outputs and similarity scores
evaluation_results = {}
total_perplexity = 0  # running sum of per-question perplexity values
i = 1                 # 1-based counter of questions attempted
right_answer = 0      # number of answers judged correct
n = 5                 # evaluate only the first n ground-truth questions
def calculate_perplexity(text):
    """Return a proxy perplexity score for `text`.

    A uniform probability of 1/len(words) is assumed for every token, so
    exp(-mean(log p)) reduces to the word count; this is a placeholder
    metric, not a true language-model perplexity.

    Args:
        text: the generated answer to score.

    Returns:
        float: the proxy perplexity; 0.0 for empty/whitespace-only text
        (the original raised ZeroDivisionError on that input).
    """
    words = text.split()
    if not words:
        # Guard: 1 / len(words) below would divide by zero.
        return 0.0
    probabilities = np.full(len(words), 1 / len(words))  # Assume uniform distribution
    log_prob = np.log(probabilities)
    perplexity = np.exp(-np.mean(log_prob))
    return perplexity
def normalize_text(text):
    """Split `text` on whitespace, strip surrounding punctuation from each
    token, lowercase it, and return the resulting set of words."""
    tokens = text.split()
    return {token.strip(string.punctuation).lower() for token in tokens}
def jaccard_similarity(set1, set2):
    """
    Return the Jaccard index |A ∩ B| / |A ∪ B| of two sets.

    Two empty sets yield 0 rather than raising on division by zero.
    """
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0
    overlap = len(set1 & set2)
    return overlap / union_size
def ask_binary_question(question, response):
    """
    Ask the model whether the response to a question is 'Yes' or 'No'.

    Builds a prompt embedding `question` and `response`, sends it to the
    module-level Ollama `model`, and returns the model's reply stripped of
    surrounding whitespace. The prompt instructs the model to answer with
    only 'Yes' or 'No', but the raw reply is returned unvalidated —
    callers should not assume it is exactly one of those two tokens.
    """
    evaluation_prompt = f"""
    Question: {question}
    Response: {response}
    Based on the response, is the answer to the question 'Yes' or 'No'? Answer with only 'Yes' or 'No'.
    """
    return model.invoke(evaluation_prompt).strip()
# Loop through the first `n` questions in the ground_truth file: retrieve web
# context for each question, score candidate answers, compare the best answer
# against the reference answer, and accumulate accuracy/perplexity metrics.
for entry in ground_truth:
    if i > n:
        break
    # Bind a fallback key up front so the except-branch can always report
    # which entry failed, even if entry["question"] itself raises.
    # (The original referenced `question` in the handler before it was
    # necessarily assigned.)
    question = entry.get("question", "<missing question>")
    try:
        input_question = entry["question"]
        correct_answer = entry["answer"]
        print("This is the question:", input_question)
        # '+'-joined query string: used for the search API and as the
        # results-dictionary key.
        question = input_question.replace(" ", "+")
        search_results = search_api(question)

        # Collect up to 3 links whose pages fetch successfully and yield
        # non-empty cleaned text.
        valid_links = []
        st.session_state.contents = []
        for url in search_results:
            if len(valid_links) >= 3:
                break  # Stop once we have 3 valid links
            response = requests.get(url)
            raw_content = response.text
            body_content = extract_body_content(raw_content)
            cleaned_content = clean_body_content(body_content)
            if response.status_code == 200 and cleaned_content:
                valid_links.append(url)
                st.session_state.contents.append(cleaned_content)

        # Retrieval-free baseline answer from the LLM; kept for reference,
        # not used in the scoring below.
        reference_answer = model.invoke(input_question)

        # Re-fetch each valid link, chunk its content, and parse it against
        # the question. NOTE(review): pages are downloaded twice (here and
        # in the validation pass above); the cleaned contents saved in
        # st.session_state.contents could be reused instead.
        parsed_results = []
        highest_relevances = []
        for url in valid_links:
            response = requests.get(url)
            raw_content = response.text
            body_content = extract_body_content(raw_content)
            cleaned_content = clean_body_content(body_content)
            chunks = split_content(cleaned_content)
            if len(chunks) > 2:
                # Debug peek at the third chunk; guarded so short pages
                # no longer raise IndexError.
                print(chunks[2])
            parsed_result, highest_relevance = parse(chunks, input_question)
            parsed_results.append(parsed_result)
            highest_relevances.append(highest_relevance)

        # Save parsed results
        st.session_state.parsed_results = parsed_results
        parsed_results, scores, sorted_indices, sorted_rep_indices, similarities, best_response = scoring_system(parsed_results, highest_relevances, input_question)

        correct_words = normalize_text(correct_answer)  # Normalize the correct answer
        # Use the model to determine the binary (Yes/No) verdict.
        # Fix: pass the human-readable question, not the '+'-joined URL query.
        generated_binary = ask_binary_question(input_question, best_response)
        generated_response = generated_binary + " " + best_response
        generated_sentence = normalize_text(generated_response)

        # Count as correct when one normalized word set contains the other.
        if correct_words.issubset(generated_sentence) or generated_sentence.issubset(correct_words):
            right_answer += 1
            score = 1
        else:
            score = 0

        # Save the question, generated response, correct answer, and scores.
        # (Single assignment: the original wrote evaluation_results[question]
        # three times, discarding the first two.)
        perplexity = calculate_perplexity(best_response)
        evaluation_results[question] = {
            "generated_answer": generated_response,
            "correct_answer": correct_answer,
            "Match Score": score,
            "Perplexity": perplexity
        }

        print(f"Generated Answer: {generated_response}")
        print(f"Correct Answer: {correct_answer}")
        print(f"Match Score: {score}")
        print(f"Perplexity: {perplexity}")
        print("=" * 50)
        total_perplexity = total_perplexity + perplexity
    except Exception as e:
        print(f"Error processing question: {question}. Error: {e}")
    i += 1
    print(i)
# Final aggregate metrics. Exactly `n` questions are attempted (the counter
# `i` runs 1..n before the loop breaks), so accuracy divides by `n`.
# Fix: the original divided by n + 1, which understates accuracy — a perfect
# 5/5 run was reported as ~83.3% instead of 100%.
total_accuracy = right_answer * 100 / n
evaluation_results["Total Accuracy"] = total_accuracy
# NOTE(review): "Total Perplexity" is the raw sum over questions, not an
# average — confirm that is the intended metric.
evaluation_results["Total Perplexity"] = total_perplexity
with open('evaluation_results_20.json', 'w') as f:
    json.dump(evaluation_results, f, indent=2)
print("Evaluation complete! Results saved to `evaluation_results_20.json`.")
print("Total Accuracy:", total_accuracy)