# Agent-Oriented-Planning/evaluate.py (main branch) - lalaliat/Agent-Oriented-Planning on GitHub
import re
import datasets
from prompt import evaluate_prompt
from planner import agent

# Text file to grade: evaluate() reads it and extracts the ***query*** and
# ~~~answer~~~ spans it contains.
path = '/mnt/liao/planner/documents/baseline1_choose_one_agent.txt'

def evaluate(path, data_path='/mnt/liao/planner/datasets/huskyqa',
             label_path='baseline1_label.txt', max_retries=10):
    """Grade the predictions stored in ``path`` against the dataset answers.

    The file at ``path`` is scanned for queries wrapped in ``***...***`` and
    predictions wrapped in ``~~~...~~~``. The i-th query is paired with the
    i-th prediction and with the i-th entry of the ``answer`` column of the
    dataset's ``test`` split; the three are substituted into
    ``evaluate_prompt`` and sent to ``agent``. A reply counts as valid when
    ``res['data']`` is a dict; its message content is appended to the result
    list, printed, and appended as one line to ``label_path``.

    Args:
        path: Text file containing the ``***query***`` / ``~~~answer~~~``
            spans to grade.
        data_path: Location passed to ``datasets.load_dataset``.
        label_path: File the verdicts are appended to, one per line.
        max_retries: Maximum number of ``agent`` calls per query.

    Returns:
        The list of verdict strings, in query order. A query for which no
        attempt produced a valid reply contributes no entry (a warning is
        written to stderr), so the list may be shorter than the number of
        queries.

    Raises:
        IndexError: If the file has fewer predictions than queries, or the
            dataset has fewer answers than the file has queries.
    """
    import sys  # local import: only needed for the skipped-query warning

    with open(path, 'r', encoding='utf-8') as file:
        content = file.read()

    pattern_query = re.compile(r'\*\*\*(.*?)\*\*\*', re.DOTALL)
    matches_query = pattern_query.findall(content)
    pattern_answer = re.compile(r'\~\~\~(.*?)\~\~\~', re.DOTALL)
    matches_answer = pattern_answer.findall(content)

    raw_datasets = datasets.load_dataset(data_path)
    # The column lookup is loop-invariant, so do it once instead of per query.
    ground_truths = raw_datasets['test']['answer']

    label = []
    for num, query in enumerate(matches_query):
        prompt = evaluate_prompt % (query, ground_truths[num], matches_answer[num])
        for _ in range(max_retries):
            res = agent(prompt)
            if not isinstance(res['data'], dict):
                # Not a usable reply; try again.
                continue
            verdict = res['data']['response']['choices'][0]['message']['content']
            label.append(verdict)
            print(verdict)
            # Append right away so finished verdicts survive a later crash.
            with open(label_path, 'a', encoding='utf-8') as out:
                out.write(verdict + '\n')
            break
        else:
            # Every attempt failed: keep going, but say so, because the
            # returned labels no longer line up one-to-one with the queries.
            print('warning: no valid response for query %d after %d attempts; skipped'
                  % (num, max_retries), file=sys.stderr)
    return label

# Grade every query/answer pair found in the file at `path`.
evaluate(path)

# test
# Manual spot check: re-grade a single query with a prompt that also asks the
# model to explain its verdict.

with open(path, 'r', encoding='utf-8') as file:
    content = file.read()

pattern_query = re.compile(r'\*\*\*(.*?)\*\*\*', re.DOTALL)
matches_query = pattern_query.findall(content)
pattern_answer = re.compile(r'\~\~\~(.*?)\~\~\~', re.DOTALL)
matches_answer = pattern_answer.findall(content)


data_path = '/mnt/liao/planner/datasets/huskyqa'
raw_datasets = datasets.load_dataset(data_path)

# Not used by the spot check below; kept so the module-level name still exists.
label = []

# NOTE: this rebinds the `evaluate_prompt` imported from `prompt`. The
# evaluate(path) call above has already run, but any later call to evaluate()
# in this module would pick up this version instead.
evaluate_prompt = '''
You are CompareGPT, a machine to verify the correctness of predictions. Answer with only yes/no.\n\n You are given a question, the corresponding ground-truth answer and a prediction from a model. Compare the \"Ground-truth answer\" and the \"Prediction\" to determine whether the prediction correctly answers the question. The prediction may contain extra information, but a correct prediction includes the ground-truth answer. You can answer \"yes\" if the prediction includes the ground-truth answer. You must answer \"no\" if there are any specific details in the ground-truth answer that are not mentioned in the prediction. Note that the error within three decimal places is negligible. By the way, give the reason for the evaluation. Think step by step.
---
Question: %s
Ground-truth answer: %s
Prediction: %s
'''
# Index of the single query/answer pair to inspect.
num = 119
prompt = evaluate_prompt % (matches_query[num], raw_datasets['test']['answer'][num], matches_answer[num])
res = agent(prompt, model = 'gpt-3.5-turbo')
# Same validity check evaluate() applies: only a dict payload carries a reply.
if isinstance(res['data'], dict):
    print(res['data']['response']['choices'][0]['message']['content'])
else:
    print('No valid response for query %d: %r' % (num, res['data']))