diff --git a/docs/docs/tutorials/agents/index.ipynb b/docs/docs/tutorials/agents/index.ipynb index 8b5521fb66..ef7cc56c45 100644 --- a/docs/docs/tutorials/agents/index.ipynb +++ b/docs/docs/tutorials/agents/index.ipynb @@ -500,23 +500,20 @@ " metric=top5_recall,\n", " num_threads=16,\n", " display_progress=True,\n", - " # To record the outputs and detailed scores to MLflow\n", - " return_all_scores=True,\n", - " return_outputs=True,\n", " )\n", "\n", " # Evaluate the program as usual\n", - " aggregated_score, outputs, all_scores = evaluate(cot)\n", + " result = evaluate(cot)\n", "\n", " # Log the aggregated score\n", - " mlflow.log_metric(\"top5_recall\", aggregated_score)\n", + " mlflow.log_metric(\"top5_recall\", result.score)\n", " # Log the detailed evaluation results as a table\n", " mlflow.log_table(\n", " {\n", " \"Claim\": [example.claim for example in eval_set],\n", " \"Expected Titles\": [example.titles for example in eval_set],\n", - " \"Predicted Titles\": outputs,\n", - " \"Top 5 Recall\": all_scores,\n", + " \"Predicted Titles\": [output[1] for output in result.results],\n", + " \"Top 5 Recall\": [output[2] for output in result.results],\n", " },\n", " artifact_file=\"eval_results.json\",\n", " )\n", diff --git a/docs/docs/tutorials/classification_finetuning/index.ipynb b/docs/docs/tutorials/classification_finetuning/index.ipynb index 4864183b1c..4baac6df63 100644 --- a/docs/docs/tutorials/classification_finetuning/index.ipynb +++ b/docs/docs/tutorials/classification_finetuning/index.ipynb @@ -568,23 +568,20 @@ " metric=extraction_correctness_metric,\n", " num_threads=16,\n", " display_progress=True,\n", - " # To record the outputs and detailed scores to MLflow\n", - " return_all_scores=True,\n", - " return_outputs=True,\n", " )\n", "\n", " # Evaluate the program as usual\n", - " aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n", + " result = evaluate_correctness(people_extractor)\n", "\n", " # Log the aggregated score\n", - " mlflow.log_metric(\"exact_match\", aggregated_score)\n", + " mlflow.log_metric(\"exact_match\", result.score)\n", " # Log the detailed evaluation results as a table\n", " mlflow.log_table(\n", " {\n", " \"Text\": [example.text for example in devset],\n", " \"Expected\": [example.example_label for example in devset],\n", - " \"Predicted\": outputs,\n", - " \"Exact match\": all_scores,\n", + " \"Predicted\": [output[1] for output in result.results],\n", + " \"Exact match\": [output[2] for output in result.results],\n", " },\n", " artifact_file=\"eval_results.json\",\n", " )\n", diff --git a/docs/docs/tutorials/entity_extraction/index.ipynb b/docs/docs/tutorials/entity_extraction/index.ipynb index 10a96a5d59..95798912ee 100644 --- a/docs/docs/tutorials/entity_extraction/index.ipynb +++ b/docs/docs/tutorials/entity_extraction/index.ipynb @@ -514,23 +514,20 @@ " metric=extraction_correctness_metric,\n", " num_threads=24,\n", " display_progress=True,\n", - " # To record the outputs and detailed scores to MLflow\n", - " return_all_scores=True,\n", - " return_outputs=True,\n", " )\n", "\n", " # Evaluate the program as usual\n", - " aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n", + " result = evaluate_correctness(people_extractor)\n", "\n", " # Log the aggregated score\n", - " mlflow.log_metric(\"exact_match\", aggregated_score)\n", + " mlflow.log_metric(\"exact_match\", result.score)\n", " # Log the detailed evaluation results as a table\n", " mlflow.log_table(\n", " {\n", " \"Tokens\": 
[example.tokens for example in test_set],\n", " \"Expected\": [example.expected_extracted_people for example in test_set],\n", - " \"Predicted\": outputs,\n", - " \"Exact match\": all_scores,\n", + " \"Predicted\": [output[1] for output in result.results],\n", + " \"Exact match\": [output[2] for output in result.results],\n", " },\n", " artifact_file=\"eval_results.json\",\n", " )\n", diff --git a/docs/docs/tutorials/math/index.ipynb b/docs/docs/tutorials/math/index.ipynb index eb64396302..651511ef67 100644 --- a/docs/docs/tutorials/math/index.ipynb +++ b/docs/docs/tutorials/math/index.ipynb @@ -369,21 +369,21 @@ "\n", "# Start an MLflow Run to record the evaluation\n", "with mlflow.start_run(run_name=\"math_evaluation\"):\n", - " kwargs = dict(num_threads=THREADS, display_progress=True, return_all_scores=True, return_outputs=True)\n", + " kwargs = dict(num_threads=THREADS, display_progress=True)\n", " evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)\n", "\n", " # Evaluate the program as usual\n", - " aggregated_score, outputs, all_scores = evaluate(module)\n", + " result = evaluate(module)\n", "\n", " # Log the aggregated score\n", - " mlflow.log_metric(\"correctness\", aggregated_score)\n", + " mlflow.log_metric(\"correctness\", result.score)\n", " # Log the detailed evaluation results as a table\n", " mlflow.log_table(\n", " {\n", " \"Question\": [example.question for example in dataset.dev],\n", " \"Gold Answer\": [example.answer for example in dataset.dev],\n", - " \"Predicted Answer\": outputs,\n", - " \"Correctness\": all_scores,\n", + " \"Predicted Answer\": [output[1] for output in result.results],\n", + " \"Correctness\": [output[2] for output in result.results],\n", " },\n", " artifact_file=\"eval_results.json\",\n", " )\n", diff --git a/docs/docs/tutorials/multihop_search/index.ipynb b/docs/docs/tutorials/multihop_search/index.ipynb index 1c5998b424..f83f3faf25 100644 --- a/docs/docs/tutorials/multihop_search/index.ipynb +++ b/docs/docs/tutorials/multihop_search/index.ipynb @@ -534,23 +534,20 @@ " metric=top5_recall,\n", " num_threads=16,\n", " display_progress=True,\n", - " # To record the outputs and detailed scores to MLflow\n", - " return_all_scores=True,\n", - " return_outputs=True,\n", " )\n", "\n", " # Evaluate the program as usual\n", - " aggregated_score, outputs, all_scores = evaluate(Hop())\n", + " result = evaluate(Hop())\n", "\n", " # Log the aggregated score\n", - " mlflow.log_metric(\"top5_recall\", aggregated_score)\n", + " mlflow.log_metric(\"top5_recall\", result.score)\n", " # Log the detailed evaluation results as a table\n", " mlflow.log_table(\n", " {\n", " \"Claim\": [example.claim for example in eval_set],\n", " \"Expected Titles\": [example.titles for example in eval_set],\n", - " \"Predicted Titles\": outputs,\n", - " \"Top 5 Recall\": all_scores,\n", + " \"Predicted Titles\": [output[1] for output in result.results],\n", + " \"Top 5 Recall\": [output[2] for output in result.results],\n", " },\n", " artifact_file=\"eval_results.json\",\n", " )\n", diff --git a/docs/docs/tutorials/rag/index.ipynb b/docs/docs/tutorials/rag/index.ipynb index 79c9374f87..a650707c4f 100644 --- a/docs/docs/tutorials/rag/index.ipynb +++ b/docs/docs/tutorials/rag/index.ipynb @@ -731,24 +731,21 @@ " metric=metric,\n", " num_threads=24,\n", " display_progress=True,\n", - " # To record the outputs and detailed scores to MLflow\n", - " return_all_scores=True,\n", - " return_outputs=True,\n", " )\n", "\n", " # Evaluate the program as usual\n", - " 
aggregated_score, outputs, all_scores = evaluate(cot)\n", + " result = evaluate(cot)\n", "\n", "\n", " # Log the aggregated score\n", - " mlflow.log_metric(\"semantic_f1_score\", aggregated_score)\n", + " mlflow.log_metric(\"semantic_f1_score\", result.score)\n", " # Log the detailed evaluation results as a table\n", " mlflow.log_table(\n", " {\n", " \"Question\": [example.question for example in eval_set],\n", " \"Gold Response\": [example.response for example in eval_set],\n", - " \"Predicted Response\": outputs,\n", - " \"Semantic F1 Score\": all_scores,\n", + " \"Predicted Response\": [output[1] for output in result.results],\n", + " \"Semantic F1 Score\": [output[2] for output in result.results],\n", " },\n", " artifact_file=\"eval_results.json\",\n", " )\n", @@ -1471,6 +1468,11 @@ "\n", "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users." ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 5740ba8a19..c76583134b 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -57,8 +57,6 @@ def __init__( display_progress: bool = False, display_table: Union[bool, int] = False, max_errors: int = 5, - return_all_scores: bool = False, - return_outputs: bool = False, provide_traceback: Optional[bool] = None, failure_score: float = 0.0, **kwargs, @@ -72,8 +70,6 @@ def __init__( display_table (Union[bool, int]): Whether to display the evaluation results in a table. If a number is passed, the evaluation results will be truncated to that number before displayed. max_errors (int): The maximum number of errors to allow before stopping evaluation. - return_all_scores (bool): Whether to return scores for every data record in `devset`. - return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. provide_traceback (Optional[bool]): Whether to provide traceback information during evaluation. failure_score (float): The default score to use if evaluation fails due to an exception. """ @@ -83,8 +79,6 @@ def __init__( self.display_progress = display_progress self.display_table = display_table self.max_errors = max_errors - self.return_all_scores = return_all_scores - self.return_outputs = return_outputs self.provide_traceback = provide_traceback self.failure_score = failure_score @@ -97,8 +91,6 @@ def __call__( num_threads: Optional[int] = None, display_progress: Optional[bool] = None, display_table: Optional[Union[bool, int]] = None, - return_all_scores: Optional[bool] = None, - return_outputs: Optional[bool] = None, callback_metadata: Optional[dict[str, Any]] = None, ): """ @@ -112,36 +104,20 @@ def __call__( `self.display_progress`. display_table (Union[bool, int]): Whether to display the evaluation results in a table. if not provided, use `self.display_table`. If a number is passed, the evaluation results will be truncated to that number before displayed. - return_all_scores (bool): Whether to return scores for every data record in `devset`. if not provided, - use `self.return_all_scores`. - return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. if not - provided, use `self.return_outputs`. 
callback_metadata (dict): Metadata to be used for evaluate callback handlers. Returns: - The evaluation results are returned in different formats based on the flags: - - - Base return: A float percentage score (e.g., 67.30) representing overall performance - - - With `return_all_scores=True`: - Returns (overall_score, individual_scores) where individual_scores is a list of - float scores for each example in devset - - - With `return_outputs=True`: - Returns (overall_score, result_triples) where result_triples is a list of - (example, prediction, score) tuples for each example in devset - - - With both flags=True: - Returns (overall_score, result_triples, individual_scores) - + The evaluation results are returned as a dspy.Prediction object containing the following attributes: + + - score: A float percentage score (e.g., 67.30) representing overall performance + + - results: a list of (example, prediction, score) tuples for each example in devset """ metric = metric if metric is not None else self.metric devset = devset if devset is not None else self.devset num_threads = num_threads if num_threads is not None else self.num_threads display_progress = display_progress if display_progress is not None else self.display_progress display_table = display_table if display_table is not None else self.display_table - return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores - return_outputs = return_outputs if return_outputs is not None else self.return_outputs if callback_metadata: logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}") @@ -184,15 +160,11 @@ def process_item(example): result_df = self._construct_result_table(results, metric_name) self._display_result_table(result_df, display_table, metric_name) - - if return_all_scores and return_outputs: - return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results] - if return_all_scores: - return round(100 * ncorrect / ntotal, 2), [score for *_, score in results] - if return_outputs: - return round(100 * ncorrect / ntotal, 2), results - - return round(100 * ncorrect / ntotal, 2) + + return dspy.Prediction( + score=round(100 * ncorrect / ntotal, 2), + results=results, + ) def _construct_result_table( self, results: list[Tuple[dspy.Example, dspy.Example, Any]], metric_name: str diff --git a/dspy/primitives/prediction.py b/dspy/primitives/prediction.py index 670b816b28..c632e4dfa4 100644 --- a/dspy/primitives/prediction.py +++ b/dspy/primitives/prediction.py @@ -97,6 +97,15 @@ def __ge__(self, other): elif isinstance(other, Prediction): return self.__float__() >= float(other) raise TypeError(f"Unsupported type for comparison: {type(other)}") + + def __eq__(self, other): + if isinstance(other, (float, int)): + return self.__float__() == other + elif isinstance(other, Prediction): + return self.__float__() == float(other) + else: + # we should return False when Prediction is compared with other types + return False @property def completions(self): diff --git a/dspy/teleprompt/bootstrap_finetune.py b/dspy/teleprompt/bootstrap_finetune.py index 83b3808a67..a2c6dd4f97 100644 --- a/dspy/teleprompt/bootstrap_finetune.py +++ b/dspy/teleprompt/bootstrap_finetune.py @@ -231,7 +231,6 @@ def bootstrap_trace_data( devset=dataset, num_threads=num_threads, display_progress=True, - return_outputs=True, provide_traceback=False, # TODO(check with team) max_errors=len(dataset) * 10, # TODO(check with team) failure_score=failure_score, @@ -290,10 +289,10 @@ def 
wrapped_program(**kwargs): return failed_pred, trace - _, outputs = evaluator(wrapped_program, metric=wrapped_metric) + results = evaluator(wrapped_program, metric=wrapped_metric).results data = [] - for example_ind, (example, prediction, score) in enumerate(outputs): + for example_ind, (example, prediction, score) in enumerate(results): try: prediction, trace = prediction except ValueError as ve: diff --git a/dspy/teleprompt/copro_optimizer.py b/dspy/teleprompt/copro_optimizer.py index 6a3760d8db..65d09dc6f3 100644 --- a/dspy/teleprompt/copro_optimizer.py +++ b/dspy/teleprompt/copro_optimizer.py @@ -225,7 +225,7 @@ def compile(self, student, *, trainset, eval_kwargs): f"At Depth {d+1}/{self.depth}, Evaluating Prompt Candidate #{c_i+1}/{len(candidates_)} for " f"Predictor {p_i+1} of {len(module.predictors())}.", ) - score = evaluate(module_clone, devset=trainset, **eval_kwargs) + score = evaluate(module_clone, devset=trainset, **eval_kwargs).score if self.prompt_model: logger.debug(f"prompt_model.inspect_history(n=1) {self.prompt_model.inspect_history(n=1)}") total_calls += 1 diff --git a/dspy/teleprompt/infer_rules.py b/dspy/teleprompt/infer_rules.py index 7d4d7e620f..e3f548b499 100644 --- a/dspy/teleprompt/infer_rules.py +++ b/dspy/teleprompt/infer_rules.py @@ -116,9 +116,8 @@ def evaluate_program(self, program, dataset): max_errors=self.max_errors, display_table=False, display_progress=True, - return_all_scores=True, ) - score, _ = evaluate(program, metric=self.metric) + score = evaluate(program, metric=self.metric).score return score diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index 05dd4eec1a..d527b93c4c 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -511,9 +511,7 @@ def _optimize_prompt_parameters( adjusted_num_trials = int((num_trials + num_trials // minibatch_full_eval_steps + 1 + run_additional_full_eval_at_end) if minibatch else num_trials) logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==") - default_score, _ = eval_candidate_program( - len(valset), valset, program, evaluate, self.rng, return_all_scores=True - ) + default_score = eval_candidate_program(len(valset), valset, program, evaluate, self.rng).score logger.info(f"Default program score: {default_score}\n") trial_logs = {} @@ -563,7 +561,7 @@ def objective(trial): # Evaluate the candidate program (on minibatch if minibatch=True) batch_size = minibatch_size if minibatch else len(valset) - score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng) + score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng).score total_eval_calls += batch_size # Update best score and program @@ -796,7 +794,7 @@ def _perform_full_evaluation( param_score_dict, fully_evaled_param_combos ) logger.info(f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials...") - full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng) + full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng).score score_data.append({"score": full_eval_score, "program": highest_mean_program, "full_eval": True}) # Log full eval as a trial so that optuna can learn from the new results diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py index 89fcd5eae6..78a1389e89 100644 --- a/dspy/teleprompt/random_search.py +++ 
b/dspy/teleprompt/random_search.py @@ -116,7 +116,9 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None display_progress=True, ) - score, subscores = evaluate(program, return_all_scores=True) + result = evaluate(program) + + score, subscores = result.score, [output[2] for output in result.results] all_subscores.append(subscores) diff --git a/dspy/teleprompt/teleprompt_optuna.py b/dspy/teleprompt/teleprompt_optuna.py index 46cc12361e..31724de496 100644 --- a/dspy/teleprompt/teleprompt_optuna.py +++ b/dspy/teleprompt/teleprompt_optuna.py @@ -48,9 +48,9 @@ def objective(self, trial): display_table=False, display_progress=True, ) - score = evaluate(program2, return_all_scores=False) + result = evaluate(program2) trial.set_user_attr("program", program2) - return score + return result.score def compile(self, student, *, teacher=None, max_demos, trainset, valset=None): import optuna diff --git a/dspy/teleprompt/utils.py b/dspy/teleprompt/utils.py index b6578303fd..3f79b360f4 100644 --- a/dspy/teleprompt/utils.py +++ b/dspy/teleprompt/utils.py @@ -43,26 +43,24 @@ def create_minibatch(trainset, batch_size=50, rng=None): return minibatch -def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rng=None, return_all_scores=False): +def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rng=None): """Evaluate a candidate program on the trainset, using the specified batch size.""" try: # Evaluate on the full trainset if batch_size >= len(trainset): - return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"}) + return evaluate(candidate_program, devset=trainset, callback_metadata={"metric_key": "eval_full"}) # Or evaluate on a minibatch else: return evaluate( candidate_program, devset=create_minibatch(trainset, batch_size, rng), - return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_minibatch"} ) except Exception: logger.error("An exception occurred during evaluation", exc_info=True) - if return_all_scores: - return 0.0, [0.0] * len(trainset) - return 0.0 # TODO: Handle this better, as -ve scores are possible + # TODO: Handle this better, as -ve scores are possible + return dspy.Prediction(score=0.0, results=[]) def eval_candidate_program_with_pruning( trial, trial_logs, trainset, candidate_program, evaluate, trial_num, batch_size=100, diff --git a/tests/teleprompt/test_utils.py b/tests/teleprompt/test_utils.py index 5b3a59098c..965c0dca73 100644 --- a/tests/teleprompt/test_utils.py +++ b/tests/teleprompt/test_utils.py @@ -1,7 +1,5 @@ from unittest.mock import Mock -import pytest - import dspy from dspy.teleprompt.utils import eval_candidate_program @@ -44,18 +42,12 @@ def test_eval_candidate_program_minibatch(): assert result == 0 -@pytest.mark.parametrize("return_all_scores", [True, False]) -def test_eval_candidate_program_failure(return_all_scores): +def test_eval_candidate_program_failure(): trainset = [1, 2, 3, 4, 5] candidate_program = DummyModule() evaluate = Mock(side_effect=ValueError("Error")) batch_size = 3 - result = eval_candidate_program( - batch_size, trainset, candidate_program, evaluate, return_all_scores=return_all_scores - ) + result = eval_candidate_program(batch_size, trainset, candidate_program, evaluate) - if return_all_scores: - assert result == (0.0, [0.0] * len(trainset)) - else: - assert result == 0.0 + assert result == 0.0 \ No newline at end of file
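
For anyone trying out the change, here is a minimal sketch of the calling convention that replaces `return_all_scores` / `return_outputs`. The toy signature, program, dataset, metric, and LM configuration below are illustrative assumptions and not part of this patch; only the `result.score` / `result.results` access pattern comes from the diff itself.

```python
import dspy

# Illustrative setup only -- model name and API key configuration are assumptions.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

# A toy program and dev set, stand-ins for the tutorials' modules and datasets.
qa = dspy.Predict("question -> answer")
devset = [dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question")]

def exact_match(example, prediction, trace=None):
    # Simple placeholder metric with the standard (example, prediction, trace) signature.
    return example.answer == prediction.answer

evaluate = dspy.Evaluate(devset=devset, metric=exact_match, display_progress=True)

# Evaluate now returns a dspy.Prediction instead of a float or tuple.
result = evaluate(qa)

print(result.score)  # aggregate percentage, e.g. 100.0
for example, prediction, score in result.results:
    # Each entry is an (example, prediction, score) tuple for one devset item.
    print(example.question, prediction.answer, score)
```

Unpacking each entry of `result.results` as `(example, prediction, score)` mirrors what the updated tutorials and `bootstrap_finetune.py` do with `output[1]` and `output[2]`.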