quotient-ai · freddiev4 · Jun 4, 2025 · May 20, 2025 · May 20, 2025 · May 20, 2025
diff --git a/README.md b/README.md
@@ -70,14 +70,7 @@ The library also provides an interface to combine multiple judges through the `J
 ## Usage
 
 ### Pick a model
-
-- **OpenAI:** 
-  - By default, `judges` uses the OpenAI client and models due to its widespread use. To get started, you'll need an OpenAI API key set as an environment variable `OPENAI_API_KEY`
-- **LiteLLM:** 
-  - If you would like to use models on other inference providers, `judges` also integrates with `litellm` as an extra dependency. Run `pip install "judges[litellm]"`, and set the appropriate API keys based on the [LiteLLM Docs](https://docs.litellm.ai/docs/#basic-usage).
-
-> [!TIP]  
-> If you choose to use `litellm` to use 3rd-party inference providers, and the model you want is not available via the function below, check the docs of the inference provider directly since `litellm` docs may not always be up to date.
+By default, `judges` uses [`instructor`](https://python.useinstructor.com/) to get structured outputs from models, due to its widespread use. To get started, set your `OPENAI_API_KEY`, or the API key for whichever model provider you want to use. Refer to the [instructor docs](https://python.useinstructor.com/) for more providers.
 
 
 ### Send data to an LLM
@@ -126,7 +119,7 @@ from judges.classifiers.correctness import PollMultihopCorrectness
 
 # use the correctness classifier to determine if the first model
 # answered correctly
-correctness = PollMultihopCorrectness(model='gpt-4o-mini')
+correctness = PollMultihopCorrectness(model='openai/gpt-4o-mini')
 
 judgment = correctness.judge(
     input=input,
@@ -148,8 +141,8 @@ A jury of LLMs can enable more diverse results and enable you to combine the jud
 from judges import Jury
 from judges.classifiers.correctness import PollMultihopCorrectness, RAFTCorrectness
 
-poll = PollMultihopCorrectness(model='gpt-4o')
-raft = RAFTCorrectness(model='gpt-4o-mini')
+poll = PollMultihopCorrectness(model='openai/gpt-4o')
+raft = RAFTCorrectness(model='openai/gpt-4o-mini')
 
 jury = Jury(judges=[poll, raft], voting_method="average")
 
@@ -227,7 +220,7 @@ task = "Evaluate responses for accuracy, clarity, and helpfulness."
 autojudge = AutoJudge.from_dataset(
     dataset=dataset,
     task=task,
-    model="gpt-4-turbo-2024-04-09",
+    model="openai/gpt-4-turbo-2024-04-09",
     # increase workers for speed ⚡
     # max_workers=2,
     # generated prompts are automatically saved to disk
@@ -284,7 +277,7 @@ judges PollMultihopCorrectness -m gpt-4 -i test_cases.json -o results.json
 
 The CLI accepts the following parameters:
 - `judge`: The type of judge to use (see [Classifiers](#classifiers-1))
-- `--model` or `-m`: The name of the model to use (e.g., "gpt-4", "<litellm_provider>/<model_name>")
+- `--model` or `-m`: The name of the model to use (e.g., "gpt-4", "<provider>/<model_name>")
 - `--input` or `-i`: Either a JSON string or path to a JSON file containing test cases
 - `--output` or `-o` (optional): Path to save the results (if not provided, prints to stdout)
 

diff --git a/examples/autojudge.py b/examples/autojudge.py
@@ -37,7 +37,7 @@
 autojudge = AutoJudge.from_dataset(
     dataset=dataset,
     task=task,
-    model="gpt-4-turbo-2024-04-09",
+    model="openai/gpt-4-turbo-2024-04-09",
     # increase workers for speed ⚡
     # max_workers=2,
     # generated prompts are automatically saved to disk

diff --git a/examples/basic.py b/examples/basic.py
@@ -15,6 +15,7 @@
 From that day on, Fig had a new friend. Every afternoon, the two of them would meet in the same spot, enjoying the quiet companionship of an unlikely friendship. Fig's adventurous heart had found a little peace in the simple joy of being with his new friend.
 """
 
+print("Getting input, expected, and output...")
 # set up the input prompt
 input = f'{story}\n\nQuestion:{question}'
 
@@ -35,9 +36,10 @@
 
 
 
+print("Judging...")
 # use the correctness classifier to determine if the first model
 # answered correctly
-correctness = PollMultihopCorrectness(model='gpt-4o-mini')
+correctness = PollMultihopCorrectness(model='anthropic/claude-sonnet-4-20250514')
 
 judgment = correctness.judge(
     input=input,

diff --git a/examples/evaluating-ai-search-engines.ipynb b/examples/evaluating-ai-search-engines.ipynb
@@ -62,7 +62,7 @@
    },
    "outputs": [],
    "source": [
-    "!pip install judges[litellm] datasets google-generativeai exa_py seaborn matplotlib --quiet"
+    "!pip install judges datasets google-generativeai exa_py seaborn matplotlib --quiet"
    ]
   },
   {

diff --git a/examples/judge_coverage.py b/examples/judge_coverage.py
@@ -0,0 +1,56 @@
+from judges.graders.information_coverage import HaystackBulletPointCoverageCorrectness
+
+haystack = HaystackBulletPointCoverageCorrectness(model="anthropic/claude-sonnet-4-20250514")
+
+input = None
+
+# bullet points to be judged (passed to the judge as `output`)
+output = """
+# Patient-Doctor Consultation Patterns
+
+1. The patient often mentions that they are worried about medication side effects[12][14]
+2. The doctor and patient spend time going over symptoms[3][8], particularly the initial symptoms and the progression in the last few months[5]
+3. The doctor and patient discuss medical history within the patient's family[1][5], with the patient often unaware that some of the conditions are hereditary[2]
+4. Patients frequently downplay their pain levels[7][11], describing severe discomfort as "just a little uncomfortable"[9]
+5. The doctor asks about lifestyle factors[4][13], including diet, exercise, and sleep patterns, which patients sometimes find irrelevant to their current complaint[6]
+6. Patients often interrupt the doctor's explanation[2][10] to ask about return-to-work timelines or activity restrictions[15]
+7. The doctor spends considerable time explaining diagnostic procedures[8][12], while patients focus mainly on whether the tests will be painful[3]
+8. Patients frequently bring up symptoms they researched online[5][14], leading to discussions about reliable health information sources[11]
+9. The doctor and patient negotiate treatment plans[1][9], with patients expressing preferences for non-pharmaceutical approaches first[7]
+10. Patients often ask for referrals to specialists[6][13] before trying the primary care physician's initial treatment recommendations[4]
+11. The doctor addresses medication compliance issues[10][15], discovering that patients have been skipping doses due to cost concerns[8]
+12. Patients mention symptoms they've been experiencing for months[2][12] but only decided to address when they became severe[5]
+13. The doctor discusses preventive care measures[3][11], which patients sometimes view as unnecessary when they feel healthy[9]
+14. Patients frequently ask about alternative or complementary treatments[7][14], requiring the doctor to address efficacy and safety concerns[1]
+15. The doctor spends time clarifying medical terminology[4][10], as patients often misunderstand previous diagnoses or test results[6]
+16. Patients express anxiety about upcoming procedures[8][13], leading to detailed discussions about what to expect[12]
+17. The doctor addresses lifestyle modifications[11][15], while patients focus on quick fixes or medications instead[3]
+18. Patients often mention that friends or family members have suggested various remedies[5][9], requiring medical clarification[7]
+19. The doctor discusses follow-up care[2][14], but patients sometimes assume they only need to return if symptoms worsen[10]
+20. Patients frequently ask about prognosis and timeline for recovery[6][11], wanting specific dates that doctors cannot always provide[4]
+21. The doctor addresses insurance coverage concerns[1][13], as patients worry about the cost of recommended treatments[8]
+22. Patients often compare their symptoms to previous episodes[9][15], assuming the current issue is identical[12]
+23. The doctor discusses medication interactions[3][7], discovering that patients haven't disclosed all supplements they're taking[5]
+24. Patients frequently ask about driving restrictions[10][14] and other daily activity limitations after procedures[2]
+25. The doctor spends time addressing health maintenance[6][11], while patients are primarily focused on their acute complaint[9]
+26. Patients often express frustration with previous healthcare experiences[4][13], affecting their trust in current recommendations[1]
+27. The doctor discusses test results interpretation[8][15], as patients may have received conflicting information from other sources[7]
+28. Patients frequently ask about genetic testing[12][5], particularly when family history becomes relevant to their condition[3]
+29. The doctor addresses work accommodation needs[11][14], helping patients understand how to communicate with employers about medical restrictions[6]
+30. Patients often seek reassurance about symptoms[2][10], needing multiple confirmations that serious conditions have been ruled out[4]
+31. The doctor discusses mental health screening[9][13], as patients may not initially connect physical symptoms with psychological stress[8]
+32. Patients frequently ask about second opinions[1][15], particularly for complex diagnoses or recommended surgeries[11]
+33. The doctor addresses medication timing and administration[5][7], discovering that patients have been taking medications incorrectly[12]
+"""
+
+# reference insight the bullet points are judged against (passed as `expected`)
+expected = "The patient is worried about getting extra care from the doctor if conditions show"
+
+print("Judging...")
+judgment = haystack.judge(
+    input=input,
+    output=output,
+    expected=expected,
+)
+
+print("Judgment:", judgment)
diff --git a/examples/jury.py b/examples/jury.py
@@ -14,6 +14,7 @@
 From that day on, Fig had a new friend. Every afternoon, the two of them would meet in the same spot, enjoying the quiet companionship of an unlikely friendship. Fig's adventurous heart had found a little peace in the simple joy of being with his new friend.
 """
 
+print("Getting input, expected, and output...")
 # set up the input prompt
 input = f'{story}\n\nQuestion:{question}'
 
@@ -34,17 +35,28 @@
 
 
 from judges import Jury
-from judges.classifiers.correctness import PollMultihopCorrectness, RAFTCorrectness
+from judges.classifiers.correctness import PollMultihopCorrectness, RAFTCorrectness, PollZeroShotCorrectness
 
-poll = PollMultihopCorrectness(model='gpt-4o')
-raft = RAFTCorrectness(model='gpt-4o-mini')
+poll = PollMultihopCorrectness(model='anthropic/claude-sonnet-4-20250514')
+raft = RAFTCorrectness(model='openai/gpt-4o-mini')
+poll_zeroshot = PollZeroShotCorrectness('openai/gpt-4.1')
 
-jury = Jury(judges=[poll, raft], voting_method="average")
+jury = Jury(judges=[poll, raft, poll_zeroshot], voting_method="average")
 
+print("Getting jury's verdict...")
 verdict = jury.vote(
     input=input,
     output=output,
     expected=expected,
 )
+print("Verdict:")
 print(verdict.score)
+print("--------------------------------")
+print("Individual Judgments:")
+for i, judgment in enumerate(verdict.judgments):
+    print(f"Judgment {i+1}:")
+    print(judgment.reasoning)
+    print(judgment.score)
+    print("--------------------------------")
+print("--------------------------------")
 
diff --git a/judges/_client.py b/judges/_client.py