Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 6 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,7 @@ The library also provides an interface to combine multiple judges through the `J
## Usage

### Pick a model

- **OpenAI:**
- By default, `judges` uses the OpenAI client and models due to its widespread use. To get started, you'll need an OpenAI API key set as an environment variable `OPENAI_API_KEY`
- **LiteLLM:**
- If you would like to use models on other inference providers, `judges` also integrates with `litellm` as an extra dependency. Run `pip install "judges[litellm]"`, and set the appropriate API keys based on the [LiteLLM Docs](https://docs.litellm.ai/docs/#basic-usage).

> [!TIP]
> If you choose to use `litellm` to use 3rd-party inference providers, and the model you want is not available via the function below, check the docs of the inference provider directly since `litellm` docs may not always be up to date.
By default, `judges` uses [`instructor`](https://python.useinstructor.com/) for structured outputs and model access, due to its widespread use. To get started, set `OPENAI_API_KEY`, or the API key for whichever model provider you want to use. Refer to the [instructor docs](https://python.useinstructor.com/) for the other supported providers.


### Send data to an LLM
Expand Down Expand Up @@ -126,7 +119,7 @@ from judges.classifiers.correctness import PollMultihopCorrectness

# use the correctness classifier to determine if the first model
# answered correctly
correctness = PollMultihopCorrectness(model='gpt-4o-mini')
correctness = PollMultihopCorrectness(model='openai/gpt-4o-mini')

judgment = correctness.judge(
input=input,
Expand All @@ -148,8 +141,8 @@ A jury of LLMs can enable more diverse results and enable you to combine the jud
from judges import Jury
from judges.classifiers.correctness import PollMultihopCorrectness, RAFTCorrectness

poll = PollMultihopCorrectness(model='gpt-4o')
raft = RAFTCorrectness(model='gpt-4o-mini')
poll = PollMultihopCorrectness(model='openai/gpt-4o')
raft = RAFTCorrectness(model='openai/gpt-4o-mini')

jury = Jury(judges=[poll, raft], voting_method="average")

Expand Down Expand Up @@ -227,7 +220,7 @@ task = "Evaluate responses for accuracy, clarity, and helpfulness."
autojudge = AutoJudge.from_dataset(
dataset=dataset,
task=task,
model="gpt-4-turbo-2024-04-09",
model="openai/gpt-4-turbo-2024-04-09",
# increase workers for speed ⚡
# max_workers=2,
# generated prompts are automatically saved to disk
Expand Down Expand Up @@ -284,7 +277,7 @@ judges PollMultihopCorrectness -m gpt-4 -i test_cases.json -o results.json

The CLI accepts the following parameters:
- `judge`: The type of judge to use (see [Classifiers](#classifiers-1))
- `--model` or `-m`: The name of the model to use (e.g., "gpt-4", "<litellm_provider>/<model_name>")
- `--model` or `-m`: The name of the model to use (e.g., "gpt-4", "<provider>/<model_name>")
- `--input` or `-i`: Either a JSON string or path to a JSON file containing test cases
- `--output` or `-o` (optional): Path to save the results (if not provided, prints to stdout)

Expand Down
2 changes: 1 addition & 1 deletion examples/autojudge.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
autojudge = AutoJudge.from_dataset(
dataset=dataset,
task=task,
model="gpt-4-turbo-2024-04-09",
model="openai/gpt-4-turbo-2024-04-09",
# increase workers for speed ⚡
# max_workers=2,
# generated prompts are automatically saved to disk
Expand Down
4 changes: 3 additions & 1 deletion examples/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
From that day on, Fig had a new friend. Every afternoon, the two of them would meet in the same spot, enjoying the quiet companionship of an unlikely friendship. Fig's adventurous heart had found a little peace in the simple joy of being with his new friend.
"""

print("Getting input, expected, and output...")
# set up the input prompt
input = f'{story}\n\nQuestion:{question}'

Expand All @@ -35,9 +36,10 @@



print("Judging...")
# use the correctness classifier to determine if the first model
# answered correctly
correctness = PollMultihopCorrectness(model='gpt-4o-mini')
correctness = PollMultihopCorrectness(model='anthropic/claude-sonnet-4-20250514')

judgment = correctness.judge(
input=input,
Expand Down
2 changes: 1 addition & 1 deletion examples/evaluating-ai-search-engines.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
},
"outputs": [],
"source": [
"!pip install judges[litellm] datasets google-generativeai exa_py seaborn matplotlib --quiet"
"!pip install judges datasets google-generativeai exa_py seaborn matplotlib --quiet"
]
},
{
Expand Down
56 changes: 56 additions & 0 deletions examples/judge_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Example script: ask a HaystackBulletPointCoverageCorrectness judge to compare
# a list of bullet-point findings (`output`) against a single reference
# insight (`expected`) and print the resulting judgment.
# NOTE(review): presumably this needs the API key for the selected provider
# in the environment and makes a network call — confirm before running.
from judges.graders.information_coverage import HaystackBulletPointCoverageCorrectness

# The model is given as a "<provider>/<model_name>" string.
haystack = HaystackBulletPointCoverageCorrectness(model="anthropic/claude-sonnet-4-20250514")

# No input prompt is supplied in this example; only `output` and `expected`
# carry content.
input = None

# bullet points
# The bracketed numbers (e.g. [12][14]) look like citation indices attached to
# each bullet — presumably references to source documents; verify.
output = """
# Patient-Doctor Consultation Patterns

1. The patient often mentions that they are worried about medication side effects[12][14]
2. The doctor and patient spend time going over symptoms[3][8], particularly the initial symptoms and the progression in the last few months[5]
3. The doctor and patient discuss medical history within the patient's family[1][5], with the patient often unaware that some of the conditions are hereditary[2]
4. Patients frequently downplay their pain levels[7][11], describing severe discomfort as "just a little uncomfortable"[9]
5. The doctor asks about lifestyle factors[4][13], including diet, exercise, and sleep patterns, which patients sometimes find irrelevant to their current complaint[6]
6. Patients often interrupt the doctor's explanation[2][10] to ask about return-to-work timelines or activity restrictions[15]
7. The doctor spends considerable time explaining diagnostic procedures[8][12], while patients focus mainly on whether the tests will be painful[3]
8. Patients frequently bring up symptoms they researched online[5][14], leading to discussions about reliable health information sources[11]
9. The doctor and patient negotiate treatment plans[1][9], with patients expressing preferences for non-pharmaceutical approaches first[7]
10. Patients often ask for referrals to specialists[6][13] before trying the primary care physician's initial treatment recommendations[4]
11. The doctor addresses medication compliance issues[10][15], discovering that patients have been skipping doses due to cost concerns[8]
12. Patients mention symptoms they've been experiencing for months[2][12] but only decided to address when they became severe[5]
13. The doctor discusses preventive care measures[3][11], which patients sometimes view as unnecessary when they feel healthy[9]
14. Patients frequently ask about alternative or complementary treatments[7][14], requiring the doctor to address efficacy and safety concerns[1]
15. The doctor spends time clarifying medical terminology[4][10], as patients often misunderstand previous diagnoses or test results[6]
16. Patients express anxiety about upcoming procedures[8][13], leading to detailed discussions about what to expect[12]
17. The doctor addresses lifestyle modifications[11][15], while patients focus on quick fixes or medications instead[3]
18. Patients often mention that friends or family members have suggested various remedies[5][9], requiring medical clarification[7]
19. The doctor discusses follow-up care[2][14], but patients sometimes assume they only need to return if symptoms worsen[10]
20. Patients frequently ask about prognosis and timeline for recovery[6][11], wanting specific dates that doctors cannot always provide[4]
21. The doctor addresses insurance coverage concerns[1][13], as patients worry about the cost of recommended treatments[8]
22. Patients often compare their symptoms to previous episodes[9][15], assuming the current issue is identical[12]
23. The doctor discusses medication interactions[3][7], discovering that patients haven't disclosed all supplements they're taking[5]
24. Patients frequently ask about driving restrictions[10][14] and other daily activity limitations after procedures[2]
25. The doctor spends time addressing health maintenance[6][11], while patients are primarily focused on their acute complaint[9]
26. Patients often express frustration with previous healthcare experiences[4][13], affecting their trust in current recommendations[1]
27. The doctor discusses test results interpretation[8][15], as patients may have received conflicting information from other sources[7]
28. Patients frequently ask about genetic testing[12][5], particularly when family history becomes relevant to their condition[3]
29. The doctor addresses work accommodation needs[11][14], helping patients understand how to communicate with employers about medical restrictions[6]
30. Patients often seek reassurance about symptoms[2][10], needing multiple confirmations that serious conditions have been ruled out[4]
31. The doctor discusses mental health screening[9][13], as patients may not initially connect physical symptoms with psychological stress[8]
32. Patients frequently ask about second opinions[1][15], particularly for complex diagnoses or recommended surgeries[11]
33. The doctor addresses medication timing and administration[5][7], discovering that patients have been taking medications incorrectly[12]
"""

# reference insight
# The single statement the bullet list above is judged against.
expected = "The patient is worried about getting extra care from the doctor if conditions show"

print("Judging...")
# Same keyword trio (input / output / expected) that the other examples pass
# to `judge`.
judgment = haystack.judge(
input=input,
output=output,
expected=expected,
)

# Prints whatever object `judge` returned, using its default string form.
print("Judgment:", judgment)
20 changes: 16 additions & 4 deletions examples/jury.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
From that day on, Fig had a new friend. Every afternoon, the two of them would meet in the same spot, enjoying the quiet companionship of an unlikely friendship. Fig's adventurous heart had found a little peace in the simple joy of being with his new friend.
"""

print("Getting input, expected, and output...")
# set up the input prompt
input = f'{story}\n\nQuestion:{question}'

Expand All @@ -34,17 +35,28 @@


from judges import Jury
from judges.classifiers.correctness import PollMultihopCorrectness, RAFTCorrectness
from judges.classifiers.correctness import PollMultihopCorrectness, RAFTCorrectness, PollZeroShotCorrectness

poll = PollMultihopCorrectness(model='gpt-4o')
raft = RAFTCorrectness(model='gpt-4o-mini')
poll = PollMultihopCorrectness(model='anthropic/claude-sonnet-4-20250514')
raft = RAFTCorrectness(model='openai/gpt-4o-mini')
poll_zeroshot = PollZeroShotCorrectness('openai/gpt-4.1')

jury = Jury(judges=[poll, raft], voting_method="average")
jury = Jury(judges=[poll, raft, poll_zeroshot], voting_method="average")

print("Getting jury's verdict...")
verdict = jury.vote(
input=input,
output=output,
expected=expected,
)
print("Verdict:")
print(verdict.score)
print("--------------------------------")
print("Individual Judgments:")
for i, judgment in enumerate(verdict.judgments):
print(f"Judgment {i+1}:")
print(judgment.reasoning)
print(judgment.score)
print("--------------------------------")
print("--------------------------------")

87 changes: 0 additions & 87 deletions judges/_client.py

This file was deleted.

Loading