-
Notifications
You must be signed in to change notification settings - Fork 33
Expand file tree
/
Copy pathautojudge.py
More file actions
60 lines (51 loc) · 2.06 KB
/
autojudge.py
File metadata and controls
60 lines (51 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# pip install "judges[auto]"
from judges.classifiers.auto import AutoJudge
def _example(prompt, response, label, feedback):
    """Return one labeled record with the keys used by every ``dataset`` entry."""
    return {
        "input": prompt,
        "output": response,
        "label": label,
        "feedback": feedback,
    }


# Labeled examples for the judge. Each record holds a user prompt ("input"),
# a response to it ("output"), an integer "label", and free-text "feedback".
# In these examples, label 1 accompanies approving feedback and label 0
# accompanies feedback that points out a factual problem.
dataset = [
    _example(
        prompt="Can I ride a dragon in Scotland?",
        response="Yes, dragons are commonly seen in the highlands and can be ridden with proper training.",
        label=0,
        feedback="Dragons are mythical creatures; the information is fictional.",
    ),
    _example(
        prompt="Can you recommend a good hotel in Tokyo?",
        response="Certainly! Hotel Sunroute Plaza Shinjuku is highly rated for its location and amenities. It offers comfortable rooms and excellent service.",
        label=1,
        feedback="Offers a specific and helpful recommendation.",
    ),
    _example(
        prompt="Can I drink tap water in London?",
        response="Yes, tap water in London is safe to drink and meets high quality standards.",
        label=1,
        feedback="Gives clear and reassuring information.",
    ),
    _example(
        prompt="What's the boiling point of water on the moon?",
        response="The boiling point of water on the moon is 100°C, the same as on Earth.",
        label=0,
        feedback="Boiling point varies with pressure; the moon's vacuum affects it.",
    ),
]

# Plain-language description of what the judge should evaluate.
task = "Evaluate responses for accuracy, clarity, and helpfulness."
# Identifier of the model passed to the judge.
# NOTE(review): the "openai/" prefix suggests provider credentials must be
# configured before running this script — confirm against the library docs.
model_name = "openai/gpt-4-turbo-2024-04-09"

# Build the judge from the labeled examples and the task description.
autojudge = AutoJudge.from_dataset(
    dataset=dataset,
    task=task,
    model=model_name,
    # Optional keyword arguments, left disabled here:
    # max_workers=2,        # increase workers for speed
    # save_to_disk=False,   # generated prompts are automatically saved to disk
)
# Judge new data: the input/output pair to evaluate.
input_ = "What are the top attractions in New York City?"
output = "Some top attractions in NYC include the Statue of Liberty and Central Park."

# Ask the judge for its judgment of that pair.
judgment = autojudge.judge(input=input_, output=output)

# Print the judge's reasoning, then its score. Example output:
#   The response accurately lists popular attractions like the Statue of Liberty and Central Park, which are well-known and relevant to the user's query.
#   True (correct)
for field in ("reasoning", "score"):
    print(getattr(judgment, field))