Change the output interface of evaluate #8003

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 11 commits into main
11 changes: 4 additions & 7 deletions docs/docs/tutorials/agents/index.ipynb
@@ -500,23 +500,20 @@
" metric=top5_recall,\n",
" num_threads=16,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate(cot)\n",
" result = evaluate(cot)\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
" mlflow.log_metric(\"top5_recall\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Claim\": [example.claim for example in eval_set],\n",
" \"Expected Titles\": [example.titles for example in eval_set],\n",
" \"Predicted Titles\": outputs,\n",
" \"Top 5 Recall\": all_scores,\n",
" \"Predicted Titles\": [output[1] for output in result.results],\n",
" \"Top 5 Recall\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
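For reference, each entry in result.results is an (example, prediction, score) triple (see the updated docstring in dspy/evaluate/evaluate.py below), so the indexing in the cell above can also be written by unpacking the triples. A minimal sketch, not part of this PR; predictions and scores are illustrative names, and result and eval_set come from the surrounding cell:

import mlflow

# Unpack the (example, prediction, score) triples instead of indexing output[1] / output[2].
predictions = [pred for _, pred, _ in result.results]
scores = [score for _, _, score in result.results]

mlflow.log_table(
    {
        "Claim": [example.claim for example in eval_set],
        "Expected Titles": [example.titles for example in eval_set],
        "Predicted Titles": predictions,
        "Top 5 Recall": scores,
    },
    artifact_file="eval_results.json",
)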
11 changes: 4 additions & 7 deletions docs/docs/tutorials/classification_finetuning/index.ipynb
@@ -568,23 +568,20 @@
" metric=extraction_correctness_metric,\n",
" num_threads=16,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
" result = evaluate_correctness(people_extractor)\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
" mlflow.log_metric(\"exact_match\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Text\": [example.text for example in devset],\n",
" \"Expected\": [example.example_label for example in devset],\n",
" \"Predicted\": outputs,\n",
" \"Exact match\": all_scores,\n",
" \"Predicted\": [output[1] for output in result.results],\n",
" \"Exact match\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
11 changes: 4 additions & 7 deletions docs/docs/tutorials/entity_extraction/index.ipynb
@@ -514,23 +514,20 @@
" metric=extraction_correctness_metric,\n",
" num_threads=24,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
" result = evaluate_correctness(people_extractor)\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
" mlflow.log_metric(\"exact_match\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Tokens\": [example.tokens for example in test_set],\n",
" \"Expected\": [example.expected_extracted_people for example in test_set],\n",
" \"Predicted\": outputs,\n",
" \"Exact match\": all_scores,\n",
" \"Predicted\": [output[1] for output in result.results],\n",
" \"Exact match\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
10 changes: 5 additions & 5 deletions docs/docs/tutorials/math/index.ipynb
@@ -369,21 +369,21 @@
"\n",
"# Start an MLflow Run to record the evaluation\n",
"with mlflow.start_run(run_name=\"math_evaluation\"):\n",
" kwargs = dict(num_threads=THREADS, display_progress=True, return_all_scores=True, return_outputs=True)\n",
" kwargs = dict(num_threads=THREADS, display_progress=True)\n",
" evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate(module)\n",
" result = evaluate(module)\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"correctness\", aggregated_score)\n",
" mlflow.log_metric(\"correctness\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Question\": [example.question for example in dataset.dev],\n",
" \"Gold Answer\": [example.answer for example in dataset.dev],\n",
" \"Predicted Answer\": outputs,\n",
" \"Correctness\": all_scores,\n",
" \"Predicted Answer\": [output[1] for output in result.results],\n",
" \"Correctness\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
11 changes: 4 additions & 7 deletions docs/docs/tutorials/multihop_search/index.ipynb
@@ -534,23 +534,20 @@
" metric=top5_recall,\n",
" num_threads=16,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate(Hop())\n",
" result = evaluate(Hop())\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
" mlflow.log_metric(\"top5_recall\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Claim\": [example.claim for example in eval_set],\n",
" \"Expected Titles\": [example.titles for example in eval_set],\n",
" \"Predicted Titles\": outputs,\n",
" \"Top 5 Recall\": all_scores,\n",
" \"Predicted Titles\": [output[1] for output in result.results],\n",
" \"Top 5 Recall\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
16 changes: 9 additions & 7 deletions docs/docs/tutorials/rag/index.ipynb
@@ -731,24 +731,21 @@
" metric=metric,\n",
" num_threads=24,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate(cot)\n",
" result = evaluate(cot)\n",
"\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"semantic_f1_score\", aggregated_score)\n",
" mlflow.log_metric(\"semantic_f1_score\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Question\": [example.question for example in eval_set],\n",
" \"Gold Response\": [example.response for example in eval_set],\n",
" \"Predicted Response\": outputs,\n",
" \"Semantic F1 Score\": all_scores,\n",
" \"Predicted Response\": [output[1] for output in result.results],\n",
" \"Semantic F1 Score\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
@@ -1471,6 +1468,11 @@
"\n",
"The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
48 changes: 10 additions & 38 deletions dspy/evaluate/evaluate.py
@@ -57,8 +57,6 @@ def __init__(
display_progress: bool = False,
display_table: Union[bool, int] = False,
max_errors: int = 5,
return_all_scores: bool = False,
return_outputs: bool = False,
provide_traceback: Optional[bool] = None,
failure_score: float = 0.0,
**kwargs,
@@ -72,8 +70,6 @@
display_table (Union[bool, int]): Whether to display the evaluation results in a table.
If a number is passed, the evaluation results will be truncated to that number before displayed.
max_errors (int): The maximum number of errors to allow before stopping evaluation.
return_all_scores (bool): Whether to return scores for every data record in `devset`.
return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`.
provide_traceback (Optional[bool]): Whether to provide traceback information during evaluation.
failure_score (float): The default score to use if evaluation fails due to an exception.
"""
@@ -83,8 +79,6 @@
self.display_progress = display_progress
self.display_table = display_table
self.max_errors = max_errors
self.return_all_scores = return_all_scores
self.return_outputs = return_outputs
self.provide_traceback = provide_traceback
self.failure_score = failure_score

@@ -97,8 +91,6 @@ def __call__(
num_threads: Optional[int] = None,
display_progress: Optional[bool] = None,
display_table: Optional[Union[bool, int]] = None,
return_all_scores: Optional[bool] = None,
return_outputs: Optional[bool] = None,
callback_metadata: Optional[dict[str, Any]] = None,
):
"""
@@ -112,36 +104,20 @@
`self.display_progress`.
display_table (Union[bool, int]): Whether to display the evaluation results in a table. if not provided, use
`self.display_table`. If a number is passed, the evaluation results will be truncated to that number before displayed.
return_all_scores (bool): Whether to return scores for every data record in `devset`. if not provided,
use `self.return_all_scores`.
return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. if not
provided, use `self.return_outputs`.
callback_metadata (dict): Metadata to be used for evaluate callback handlers.

Returns:
The evaluation results are returned in different formats based on the flags:

- Base return: A float percentage score (e.g., 67.30) representing overall performance

- With `return_all_scores=True`:
Returns (overall_score, individual_scores) where individual_scores is a list of
float scores for each example in devset

- With `return_outputs=True`:
Returns (overall_score, result_triples) where result_triples is a list of
(example, prediction, score) tuples for each example in devset

- With both flags=True:
Returns (overall_score, result_triples, individual_scores)

The evaluation results are returned as a dspy.Prediction object containing the following attributes:

- score: A float percentage score (e.g., 67.30) representing overall performance

- results: a list of (example, prediction, score) tuples for each example in devset
"""
metric = metric if metric is not None else self.metric
devset = devset if devset is not None else self.devset
num_threads = num_threads if num_threads is not None else self.num_threads
display_progress = display_progress if display_progress is not None else self.display_progress
display_table = display_table if display_table is not None else self.display_table
return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
return_outputs = return_outputs if return_outputs is not None else self.return_outputs

if callback_metadata:
logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}")
@@ -184,15 +160,11 @@ def process_item(example):
result_df = self._construct_result_table(results, metric_name)

self._display_result_table(result_df, display_table, metric_name)

if return_all_scores and return_outputs:
return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results]
if return_all_scores:
return round(100 * ncorrect / ntotal, 2), [score for *_, score in results]
if return_outputs:
return round(100 * ncorrect / ntotal, 2), results

return round(100 * ncorrect / ntotal, 2)

return dspy.Prediction(
score=round(100 * ncorrect / ntotal, 2),
results=results,
)

def _construct_result_table(
self, results: list[Tuple[dspy.Example, dspy.Example, Any]], metric_name: str
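Taken together, a minimal migration sketch for callers of Evaluate, assuming only the interface shown in this diff (devset, metric, and program are placeholders for the caller's own objects):

import dspy

evaluate = dspy.Evaluate(devset=devset, metric=metric, num_threads=16, display_progress=True)

# Before this change, the return shape depended on return_outputs / return_all_scores.
# Now a single dspy.Prediction is always returned.
result = evaluate(program)

print(result.score)  # aggregated percentage score, e.g. 67.30
for example, prediction, score in result.results:
    ...  # per-example inputs, program outputs, and metric scores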
9 changes: 9 additions & 0 deletions dspy/primitives/prediction.py
@@ -97,6 +97,15 @@ def __ge__(self, other):
elif isinstance(other, Prediction):
return self.__float__() >= float(other)
raise TypeError(f"Unsupported type for comparison: {type(other)}")

def __eq__(self, other):
if isinstance(other, (float, int)):
return self.__float__() == other
elif isinstance(other, Prediction):
return self.__float__() == float(other)
Review comment (Collaborator): nit: shall we do float(self) == float(other) for consistency?

Reply (TomeHirata, author, Mar 25, 2025): I guess this should be consistent with how __ge__ or __le__ are implemented?

else:
# we should return False when Prediction is compared with other types
return False

@property
def completions(self):
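A small sketch of what the new __eq__ enables, assuming (as the operators above suggest) that Prediction.__float__ resolves to the score field; the literal values are illustrative:

import dspy

result = dspy.Prediction(score=67.3, results=[])

print(result == 67.3)    # True: compared via float(result)
print(result >= 50)      # True: __ge__ compares the score as well
print(result == "67.3")  # False: comparisons against other types return False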
5 changes: 2 additions & 3 deletions dspy/teleprompt/bootstrap_finetune.py
@@ -231,7 +231,6 @@ def bootstrap_trace_data(
devset=dataset,
num_threads=num_threads,
display_progress=True,
return_outputs=True,
provide_traceback=False, # TODO(check with team)
max_errors=len(dataset) * 10, # TODO(check with team)
failure_score=failure_score,
@@ -290,10 +289,10 @@ def wrapped_program(**kwargs):

return failed_pred, trace

_, outputs = evaluator(wrapped_program, metric=wrapped_metric)
results = evaluator(wrapped_program, metric=wrapped_metric).results

data = []
for example_ind, (example, prediction, score) in enumerate(outputs):
for example_ind, (example, prediction, score) in enumerate(results):
try:
prediction, trace = prediction
except ValueError as ve:
2 changes: 1 addition & 1 deletion dspy/teleprompt/copro_optimizer.py
@@ -225,7 +225,7 @@ def compile(self, student, *, trainset, eval_kwargs):
f"At Depth {d+1}/{self.depth}, Evaluating Prompt Candidate #{c_i+1}/{len(candidates_)} for "
f"Predictor {p_i+1} of {len(module.predictors())}.",
)
score = evaluate(module_clone, devset=trainset, **eval_kwargs)
score = evaluate(module_clone, devset=trainset, **eval_kwargs).score
if self.prompt_model:
logger.debug(f"prompt_model.inspect_history(n=1) {self.prompt_model.inspect_history(n=1)}")
total_calls += 1
3 changes: 1 addition & 2 deletions dspy/teleprompt/infer_rules.py
@@ -116,9 +116,8 @@ def evaluate_program(self, program, dataset):
max_errors=self.max_errors,
display_table=False,
display_progress=True,
return_all_scores=True,
)
score, _ = evaluate(program, metric=self.metric)
score = evaluate(program, metric=self.metric).score
return score


8 changes: 3 additions & 5 deletions dspy/teleprompt/mipro_optimizer_v2.py
@@ -511,9 +511,7 @@ def _optimize_prompt_parameters(
adjusted_num_trials = int((num_trials + num_trials // minibatch_full_eval_steps + 1 + run_additional_full_eval_at_end) if minibatch else num_trials)
logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==")

default_score, _ = eval_candidate_program(
len(valset), valset, program, evaluate, self.rng, return_all_scores=True
)
default_score = eval_candidate_program(len(valset), valset, program, evaluate, self.rng).score
logger.info(f"Default program score: {default_score}\n")

trial_logs = {}
@@ -563,7 +561,7 @@ def objective(trial):

# Evaluate the candidate program (on minibatch if minibatch=True)
batch_size = minibatch_size if minibatch else len(valset)
score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng)
score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng).score
total_eval_calls += batch_size

# Update best score and program
@@ -796,7 +794,7 @@ def _perform_full_evaluation(
param_score_dict, fully_evaled_param_combos
)
logger.info(f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials...")
full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng)
full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng).score
score_data.append({"score": full_eval_score, "program": highest_mean_program, "full_eval": True})

# Log full eval as a trial so that optuna can learn from the new results
4 changes: 3 additions & 1 deletion dspy/teleprompt/random_search.py
@@ -116,7 +116,9 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
display_progress=True,
)

score, subscores = evaluate(program, return_all_scores=True)
result = evaluate(program)

score, subscores = result.score, [output[2] for output in result.results]

all_subscores.append(subscores)

4 changes: 2 additions & 2 deletions dspy/teleprompt/teleprompt_optuna.py
@@ -48,9 +48,9 @@ def objective(self, trial):
display_table=False,
display_progress=True,
)
score = evaluate(program2, return_all_scores=False)
result = evaluate(program2)
trial.set_user_attr("program", program2)
return score
return result.score

def compile(self, student, *, teacher=None, max_demos, trainset, valset=None):
import optuna