judges/examples/autojudge.py at 9cc7a2a9f7cd9f8c62ebf97c801d4caba0531999 · quotient-ai/judges · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# pip install "judges[auto]"

from judges.classifiers.auto import AutoJudge

dataset = [
    {
        "input": "Can I ride a dragon in Scotland?",
        "output": "Yes, dragons are commonly seen in the highlands and can be ridden with proper training.",
        "label": 0,
        "feedback": "Dragons are mythical creatures; the information is fictional.",
    },
    {
        "input": "Can you recommend a good hotel in Tokyo?",
        "output": "Certainly! Hotel Sunroute Plaza Shinjuku is highly rated for its location and amenities. It offers comfortable rooms and excellent service.",
        "label": 1,
        "feedback": "Offers a specific and helpful recommendation.",
    },
    {
        "input": "Can I drink tap water in London?",
        "output": "Yes, tap water in London is safe to drink and meets high quality standards.",
        "label": 1,
        "feedback": "Gives clear and reassuring information.",
    },
    {
        "input": "What's the boiling point of water on the moon?",
        "output": "The boiling point of water on the moon is 100°C, the same as on Earth.",
        "label": 0,
        "feedback": "Boiling point varies with pressure; the moon's vacuum affects it.",
    }
]


# Task description
task = "Evaluate responses for accuracy, clarity, and helpfulness."

# Initialize autojudge
autojudge = AutoJudge.from_dataset(
    dataset=dataset,
    task=task,
    model="openai/gpt-4-turbo-2024-04-09",
    # increase workers for speed ⚡
    # max_workers=2,
    # generated prompts are automatically saved to disk
    # save_to_disk=False,
)

## Now judge new data

# Input-output pair to evaluate
input_ = "What are the top attractions in New York City?"
output = "Some top attractions in NYC include the Statue of Liberty and Central Park."

# Get the judgment
judgment = autojudge.judge(input=input_, output=output)

# Print the judgment
print(judgment.reasoning)
# The response accurately lists popular attractions like the Statue of Liberty and Central Park, which are well-known and relevant to the user's query.
print(judgment.score)
# True (correct)