Merged
Changes from 6 commits
49 changes: 47 additions & 2 deletions README.md
@@ -10,13 +10,14 @@
   - [Classifier Judges](#classifier-judges)
   - [Combining Judges](#combining-judges)
   - [Jury Object](#jury-object)
-4. [Usage](#usage)
+4. [Quickstart with CLI](#cli)
+5. [Usage](#usage)
   - [Pick a model](#pick-a-model)
   - [Send data to an LLM](#send-data-to-an-llm)
   - [Use a `judges` classifier LLM as an evaluator model](#use-a-judges-classifier-llm-as-an-evaluator-model)
   - [Use a `Jury` for averaging and diversification](#use-a-jury-for-averaging-and-diversification)
   - [Use `AutoJudge` to create a custom LLM judge](#use-autojudge-to-create-a-custom-llm-judge)
-5. [Appendix of Judges](#appendix)
+6. [Appendix of Judges](#appendix)
   - [Classifiers](#classifiers)
   - [Grader](#graders)

@@ -66,6 +67,50 @@ The library also provides an interface to combine multiple judges through the `J

- `.vote()`: Combines the judgments of multiple judges and produces a `Verdict`.

## CLI

We provide a command-line interface for evaluating model outputs using various judges. The CLI supports both single and batch evaluations.
**Suggested change (reviewer):**

```suggestion
`judges` also provides a command-line interface (CLI) for evaluating model outputs using various judges. The CLI supports both single and batch evaluations.
```


```bash
# Basic usage
python cli.py <judge_type> <model_name> <json_input> [--out output_file]

# Example with output file
python cli.py PollMultihopCorrectness gpt-4 test_cases.json --out results.json
```

The CLI accepts the following parameters:
- `judge`: The type of judge to use (see [Classifiers](#classifiers-1))
- `model_name`: The name of the model to use (e.g., "gpt-4", "<litellm_provider>/<model_name>")
- `json_input`: Either a JSON string or path to a JSON file containing test cases
**Reviewer comment:** Rather than having these be arguments, I think it'd be easier to know what you're running if they are parameters. Then

`judges PollMultihopCorrectness gpt-4 test_cases.json --out results.json`

becomes:

`judges PollMultihopCorrectness --model gpt-4 --input test_cases.json --output results.json`
- `--out` (optional): Path to save the results (if not provided, prints to stdout)

Each test case in the JSON input must have:
- `input`: The input provided to the model
- `output`: The output generated by the model
- `expected`: The expected output for comparison

Example JSON input:

```json
[
{
"input": "What is the capital of France?",
"output": "The capital of France is Madrid.",
"expected": "The capital of France is Paris."
},
{
"input": "What is the capital of Germany?",
"output": "The capital of Germany is Paris.",
"expected": "The capital of Germany is Berlin."
}
]
```

The CLI will return a JSON object containing the original input, output, expected values, judgment score, and reasoning for each test case. It will
either be saved in the output file or printed to std if no output file is specified.
**Suggested change (reviewer):**

```suggestion
The CLI will return a JSON object containing the original input, output, expected values, judgment score, and reasoning for each test case. It will be saved to the output file or printed to `stdout` if no output file is specified.
```
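For illustration, the result object for one test case can be assembled like this. This is a sketch: the stubbed score and reasoning stand in for whatever the chosen judge actually returns.

```python
import json

# One test case from the example input above.
entry = {
    "input": "What is the capital of France?",
    "output": "The capital of France is Madrid.",
    "expected": "The capital of France is Paris.",
}

# Stub judgment; in the CLI the score and reasoning come from judge.judge(...).
score, reasoning = False, "The answer names Madrid, but Paris is expected."

result = {**entry, "judgement": score, "reasoning": reasoning}
print(json.dumps([result], indent=4))
```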



## Usage

### Pick a model
146 changes: 146 additions & 0 deletions cli.py
**Reviewer comment:** Using this file would require someone to clone the repo. I think it'd be friendlier to add:

    [tool.poetry.scripts]
    judges = "judges.cli.entrypoint:app"

to the pyproject.toml, then users can run:

`judges PollMultihopCorrectness --model gpt-4 --input test_cases.json --output results.json`

**Author reply:** I included it under `[project.scripts]`. As of Python 3.11+, this `[project.scripts]` section is the standard.
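The `[project.scripts]` form the author refers to would look roughly like this (a sketch; the `judges.cli:app` module path is an assumption based on this PR's file layout):

```toml
[project.scripts]
judges = "judges.cli:app"
```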

@@ -0,0 +1,146 @@

```python
import typer
import json
import os
from typing import Dict, List

from judges import choices_judges, get_judge_by_name

app = typer.Typer()


def parse_json_dict(json_dict: str) -> List[Dict[str, str]]:
```
**Reviewer comment:** What do you think about just swapping this all out with a `pydantic.BaseModel`? It's already a dependency and will be used more after #25 is wrapped up. Then we can just do:

```python
from typing import Optional

class Sample(BaseModel):
    input: str
    output: str
    expected: Optional[str]
```

And validate each row with that, or with:

```python
class Dataset(BaseModel):
    samples: List[Sample]
```

**Author reply:** I didn't know pydantic before. That's really handy! 👍

"""
Parse a JSON dictionary or path to a JSON file into a list of dictionaries.
Each dictionary must have 'input', 'output', and 'expected' keys.

Args:
json_dict: Either a JSON string or a path to a JSON file

Returns:
List of dictionaries with 'input', 'output', and 'expected' keys

Raises:
ValueError: If the JSON is invalid or missing required keys
"""
# Try to parse as JSON string first
try:
data = json.loads(json_dict)
except json.JSONDecodeError:
# If not a valid JSON string, try to read as file
if not os.path.exists(json_dict):
raise ValueError(f"Invalid JSON string and file not found: {json_dict}")
try:
with open(json_dict, "r") as f:
data = json.load(f)
except json.JSONDecodeError:
raise ValueError(f"Invalid JSON in file: {json_dict}")

# Convert single dictionary to list
if isinstance(data, dict):
data = [data]

# Validate format
required_keys = {"input", "output", "expected"}
for i, entry in enumerate(data):
if not isinstance(entry, dict):
raise ValueError(f"Entry {i} is not a dictionary")

# Check for missing keys
missing_keys = required_keys - set(entry.keys())
if missing_keys:
raise ValueError(f"Entry {i} is missing required keys: {missing_keys}")

# Check for empty strings
for key in required_keys:
if entry[key] == "":
print(f"Warning: Empty string found for key '{key}' in entry {i}")

return data
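As the reviewer suggests, this manual validation could be replaced with pydantic models. A minimal sketch (pydantic is assumed to be installed; `validate_entries` is an illustrative name, not part of the PR):

```python
from typing import List, Optional

from pydantic import BaseModel, ValidationError


class Sample(BaseModel):
    input: str
    output: str
    expected: Optional[str] = None


class Dataset(BaseModel):
    samples: List[Sample]


def validate_entries(entries: list) -> List[Sample]:
    # Re-raise as a readable ValueError, matching the CLI's error handling.
    try:
        return Dataset(samples=entries).samples
    except ValidationError as e:
        raise ValueError(f"Invalid test cases: {e}") from e


samples = validate_entries(
    [
        {
            "input": "What is the capital of France?",
            "output": "The capital of France is Madrid.",
            "expected": "The capital of France is Paris.",
        }
    ]
)
print(samples[0].expected)
```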


```python
@app.command()
def main(judge: choices_judges, model_name: str, json_dict: str, out: str = None):
```
**Suggested change (reviewer):** rename the `out` parameter to `output`:

`def main(judge: choices_judges, model_name: str, json_dict: str, output: str = None):`

It'd also be great to have the commonly used shortcut of `-o` mapping to `output`.

**Author reply:** Integrated shortcuts for model, output and input.

"""
Evaluate model outputs using specified judges and models.

This function takes a judge type, model name, and JSON input (either as a string or file path)
to evaluate model outputs against expected answers. The JSON input should contain one or more
entries, each with 'input', 'output', and 'expected' keys.

Args:
judge (choices_judges): The type of judge to use for evaluation (e.g., CorrectnessPollKiltHotpot,
EmotionQueenImplicitEmotionRecognition, etc.)
model_name (str): The name of the model to use for the judge (e.g., "gpt-4", "claude-3-opus", etc.)
json_dict (str): Either a JSON string or path to a JSON file containing the test cases.
Each test case must have 'input', 'output', and 'expected' keys.
Example JSON format:
{
"input": "What is the capital of Germany?",
"output": "The capital of Germany is Paris.",
"expected": "The capital of Germany is Berlin."
}
Or for multiple test cases:
[
{
"input": "What is the capital of France?",
"output": "The capital of France is Madrid.",
"expected": "The capital of France is Paris."
},
{
"input": "What is the capital of Germany?",
"output": "The capital of Germany is Paris.",
"expected": "The capital of Germany is Berlin."
}
]
out (str): The path to the output file to save the results.
If not provided, the results will be printed to stdout.

Returns:
None: Prints the judgement and reasoning for each test case to stdout.

Raises:
ValueError: If the JSON input is invalid or missing required keys.
Exception: If there's an error processing any individual test case.
"""
```python
    judge_constructor = get_judge_by_name(judge)
    judge = judge_constructor(model_name)
    results = []

    # Parse the JSON input
    try:
        entries = parse_json_dict(json_dict)
    except ValueError as e:
        print(f"Error parsing JSON: {e}")
        return

    # Process each entry
    for i, entry in enumerate(entries):
        try:
            judgement = judge.judge(
                input=entry["input"],
                output=entry["output"],
                expected=entry["expected"],
            )
            results.append(
                {
                    "input": entry["input"],
                    "output": entry["output"],
                    "expected": entry["expected"],
                    "judgement": judgement.score,
                    "reasoning": judgement.reasoning,
                }
            )
        except Exception as e:
            print(f"Error processing entry {i}: {e}")

    if out:
        with open(out, "w") as f:
            json.dump(results, f, indent=4)
        print(f"Results saved to {out}")
    else:
        print(json.dumps(results, indent=4))


if __name__ == "__main__":
    app()
```

**Suggested change (reviewer):** use the spelling `judgment` throughout:

`judgement = judge.judge(` → `judgment = judge.judge(`

`"judgement": judgement.score,` → `"judgment": judgment.score,`

**Author reply:** You never stop learning ... 😄
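The string-or-file fallback used by `parse_json_dict` can be exercised in isolation; `load_json_arg` below is an illustrative stand-in for just that step, not part of the PR:

```python
import json
import os
import tempfile


def load_json_arg(arg: str):
    """Same string-then-file fallback strategy as parse_json_dict in cli.py."""
    try:
        return json.loads(arg)
    except json.JSONDecodeError:
        if not os.path.exists(arg):
            raise ValueError(f"Invalid JSON string and file not found: {arg}")
        with open(arg) as f:
            return json.load(f)


# A JSON string parses directly.
case = load_json_arg('{"input": "q", "output": "a", "expected": "a"}')

# A file path falls through to json.load.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
    json.dump([case], tmp)
cases = load_json_arg(tmp.name)
os.unlink(tmp.name)
print(cases)
```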
110 changes: 109 additions & 1 deletion judges/__init__.py
@@ -1,6 +1,114 @@
Removed: `__all__ = ["Jury"]` (superseded by the expanded `__all__` at the end of the file).

```python
from judges.base import Jury
from judges.classifiers import *
from judges.graders import *

from judges.classifiers.correctness import (
    PollKiltHotpotCorrectness,
    PollKiltNQCorrectness,
    PollMultihopCorrectness,
    PollZeroShotCorrectness,
    RAFTCorrectness,
)
from judges.classifiers.hallucination import (
    HaluEvalAnswerNonFactual,
    HaluEvalDialogueResponseNonFactual,
    HaluEvalDocumentSummaryNonFactual,
)
from judges.classifiers.harmfulness import TrustworthyLLMHarmfulness
from judges.classifiers.query_quality import FactAlignQueryQuality
from judges.classifiers.refusal import TrustworthyLLMRefusal

from judges.graders.correctness import PrometheusAbsoluteCoarseCorrectness
from judges.graders.empathy import (
    EmotionQueenImplicitEmotionRecognition,
    EmotionQueenIntentionRecognition,
    EmotionQueenKeyEventRecognition,
    EmotionQueenMixedEventRecognition,
)
from judges.graders.information_coverage import HaystackBulletPointCoverageCorrectness
from judges.graders.moderator import ORBenchUserInputModeration, ORBenchUserOutputModeration
from judges.graders.query_quality import MagpieQueryQuality
from judges.graders.relevance import ReliableCIRelevance
from judges.graders.response_quality import MTBenchChatBotResponseQuality
from judges.graders.refusal_detection import ORBenchRefusalDetection

import enum


class choices_judges(enum.Enum):
    # Factual Correctness
    CorrectnessPollKiltHotpot = "PollKiltHotpotCorrectness"
    CorrectnessPollKiltNQ = "PollKiltNQCorrectness"
    CorrectnessPollMultihop = "PollMultihopCorrectness"
    CorrectnessPollZeroShot = "PollZeroShotCorrectness"
    CorrectnessRAFT = "RAFTCorrectness"

    # Hallucination
    HallucinationHaluEvalAnswerNonFactual = "HaluEvalAnswerNonFactual"
    HallucinationHaluEvalDialogueResponseNonFactual = "HaluEvalDialogueResponseNonFactual"
    HallucinationHaluEvalDocumentSummaryNonFactual = "HaluEvalDocumentSummaryNonFactual"

    # Harmfulness
    HarmfulnessTrustworthyLLMHarmfulness = "TrustworthyLLMHarmfulness"

    # Query Quality Evaluation
    FactAlignQueryQuality = "FactAlignQueryQuality"

    # Refusal
    RefusalTrustworthyLLMRefusal = "TrustworthyLLMRefusal"

    # Correctness graders
    PrometheusAbsoluteCoarseCorrectness = "PrometheusAbsoluteCoarseCorrectness"

    # Empathy graders
    EmotionQueenImplicitEmotionRecognition = "EmotionQueenImplicitEmotionRecognition"
    EmotionQueenIntentionRecognition = "EmotionQueenIntentionRecognition"
    EmotionQueenKeyEventRecognition = "EmotionQueenKeyEventRecognition"
    EmotionQueenMixedEventRecognition = "EmotionQueenMixedEventRecognition"

    # Information coverage graders
    HaystackBulletPointCoverage = "HaystackBulletPointCoverageCorrectness"

    # Moderation graders
    ORBenchUserInputModeration = "ORBenchUserInputModeration"
    ORBenchUserOutputModeration = "ORBenchUserOutputModeration"

    # Query quality graders
    MagpieQueryQuality = "MagpieQueryQuality"

    # Relevance graders
    ReliableCIRelevance = "ReliableCIRelevance"

    # Response quality graders
    MTBenchChatBotResponseQuality = "MTBenchChatBotResponseQuality"

    # Refusal detection graders
    ORBenchRefusalDetection = "ORBenchRefusalDetection"


JUDGE_MAPPING = {
    choices_judges.CorrectnessPollKiltHotpot: PollKiltHotpotCorrectness,
    choices_judges.CorrectnessPollKiltNQ: PollKiltNQCorrectness,
    choices_judges.CorrectnessPollMultihop: PollMultihopCorrectness,
    choices_judges.CorrectnessPollZeroShot: PollZeroShotCorrectness,
    choices_judges.CorrectnessRAFT: RAFTCorrectness,
    choices_judges.HallucinationHaluEvalAnswerNonFactual: HaluEvalAnswerNonFactual,
    choices_judges.HallucinationHaluEvalDialogueResponseNonFactual: HaluEvalDialogueResponseNonFactual,
    choices_judges.HallucinationHaluEvalDocumentSummaryNonFactual: HaluEvalDocumentSummaryNonFactual,
    choices_judges.HarmfulnessTrustworthyLLMHarmfulness: TrustworthyLLMHarmfulness,
    choices_judges.FactAlignQueryQuality: FactAlignQueryQuality,
    choices_judges.RefusalTrustworthyLLMRefusal: TrustworthyLLMRefusal,
    choices_judges.PrometheusAbsoluteCoarseCorrectness: PrometheusAbsoluteCoarseCorrectness,
    choices_judges.EmotionQueenImplicitEmotionRecognition: EmotionQueenImplicitEmotionRecognition,
    choices_judges.EmotionQueenIntentionRecognition: EmotionQueenIntentionRecognition,
    choices_judges.EmotionQueenKeyEventRecognition: EmotionQueenKeyEventRecognition,
    choices_judges.EmotionQueenMixedEventRecognition: EmotionQueenMixedEventRecognition,
    choices_judges.HaystackBulletPointCoverage: HaystackBulletPointCoverageCorrectness,
    choices_judges.ORBenchUserInputModeration: ORBenchUserInputModeration,
    choices_judges.ORBenchUserOutputModeration: ORBenchUserOutputModeration,
    choices_judges.MagpieQueryQuality: MagpieQueryQuality,
    choices_judges.ReliableCIRelevance: ReliableCIRelevance,
    choices_judges.MTBenchChatBotResponseQuality: MTBenchChatBotResponseQuality,
    choices_judges.ORBenchRefusalDetection: ORBenchRefusalDetection,
}


def get_judge_by_name(name: choices_judges):
    return JUDGE_MAPPING[name]


__all__ = ["Jury", "choices_judges", "get_judge_by_name"]
```
