Change the output interface of evaluate #8003

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 11 commits into main
11 changes: 4 additions & 7 deletions docs/docs/tutorials/agents/index.ipynb
@@ -500,23 +500,20 @@
" metric=top5_recall,\n",
" num_threads=16,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate(cot)\n",
" result = evaluate(cot)\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
" mlflow.log_metric(\"top5_recall\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Claim\": [example.claim for example in eval_set],\n",
" \"Expected Titles\": [example.titles for example in eval_set],\n",
" \"Predicted Titles\": outputs,\n",
" \"Top 5 Recall\": all_scores,\n",
" \"Predicted Titles\": [output[1] for output in result.results],\n",
" \"Top 5 Recall\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
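For reference, each entry in result.results is an (example, prediction, score) triple (see the updated docstring in dspy/evaluate/evaluate.py below), so the indexing in the cell above can also be written by unpacking the triples. A minimal sketch, not part of this PR; predictions and scores are illustrative names, and result and eval_set come from the surrounding cell:

import mlflow

# Unpack the (example, prediction, score) triples instead of indexing output[1] / output[2].
predictions = [pred for _, pred, _ in result.results]
scores = [score for _, _, score in result.results]

mlflow.log_table(
    {
        "Claim": [example.claim for example in eval_set],
        "Expected Titles": [example.titles for example in eval_set],
        "Predicted Titles": predictions,
        "Top 5 Recall": scores,
    },
    artifact_file="eval_results.json",
)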
11 changes: 4 additions & 7 deletions docs/docs/tutorials/classification_finetuning/index.ipynb
@@ -568,23 +568,20 @@
" metric=extraction_correctness_metric,\n",
" num_threads=16,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
" result = evaluate_correctness(people_extractor)\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
" mlflow.log_metric(\"exact_match\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Text\": [example.text for example in devset],\n",
" \"Expected\": [example.example_label for example in devset],\n",
" \"Predicted\": outputs,\n",
" \"Exact match\": all_scores,\n",
" \"Predicted\": [output[1] for output in result.results],\n",
" \"Exact match\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
11 changes: 4 additions & 7 deletions docs/docs/tutorials/entity_extraction/index.ipynb
@@ -514,23 +514,20 @@
" metric=extraction_correctness_metric,\n",
" num_threads=24,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
" result = evaluate_correctness(people_extractor)\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
" mlflow.log_metric(\"exact_match\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Tokens\": [example.tokens for example in test_set],\n",
" \"Expected\": [example.expected_extracted_people for example in test_set],\n",
" \"Predicted\": outputs,\n",
" \"Exact match\": all_scores,\n",
" \"Predicted\": [output[1] for output in result.results],\n",
" \"Exact match\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
10 changes: 5 additions & 5 deletions docs/docs/tutorials/math/index.ipynb
@@ -369,21 +369,21 @@
"\n",
"# Start an MLflow Run to record the evaluation\n",
"with mlflow.start_run(run_name=\"math_evaluation\"):\n",
" kwargs = dict(num_threads=THREADS, display_progress=True, return_all_scores=True, return_outputs=True)\n",
" kwargs = dict(num_threads=THREADS, display_progress=True)\n",
" evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate(module)\n",
" result = evaluate(module)\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"correctness\", aggregated_score)\n",
" mlflow.log_metric(\"correctness\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Question\": [example.question for example in dataset.dev],\n",
" \"Gold Answer\": [example.answer for example in dataset.dev],\n",
" \"Predicted Answer\": outputs,\n",
" \"Correctness\": all_scores,\n",
" \"Predicted Answer\": [output[1] for output in result.results],\n",
" \"Correctness\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
11 changes: 4 additions & 7 deletions docs/docs/tutorials/multihop_search/index.ipynb
@@ -534,23 +534,20 @@
" metric=top5_recall,\n",
" num_threads=16,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate(Hop())\n",
" result = evaluate(Hop())\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
" mlflow.log_metric(\"top5_recall\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Claim\": [example.claim for example in eval_set],\n",
" \"Expected Titles\": [example.titles for example in eval_set],\n",
" \"Predicted Titles\": outputs,\n",
" \"Top 5 Recall\": all_scores,\n",
" \"Predicted Titles\": [output[1] for output in result.results],\n",
" \"Top 5 Recall\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
16 changes: 9 additions & 7 deletions docs/docs/tutorials/rag/index.ipynb
@@ -731,24 +731,21 @@
" metric=metric,\n",
" num_threads=24,\n",
" display_progress=True,\n",
" # To record the outputs and detailed scores to MLflow\n",
" return_all_scores=True,\n",
" return_outputs=True,\n",
" )\n",
"\n",
" # Evaluate the program as usual\n",
" aggregated_score, outputs, all_scores = evaluate(cot)\n",
" result = evaluate(cot)\n",
"\n",
"\n",
" # Log the aggregated score\n",
" mlflow.log_metric(\"semantic_f1_score\", aggregated_score)\n",
" mlflow.log_metric(\"semantic_f1_score\", result.score)\n",
" # Log the detailed evaluation results as a table\n",
" mlflow.log_table(\n",
" {\n",
" \"Question\": [example.question for example in eval_set],\n",
" \"Gold Response\": [example.response for example in eval_set],\n",
" \"Predicted Response\": outputs,\n",
" \"Semantic F1 Score\": all_scores,\n",
" \"Predicted Response\": [output[1] for output in result.results],\n",
" \"Semantic F1 Score\": [output[2] for output in result.results],\n",
" },\n",
" artifact_file=\"eval_results.json\",\n",
" )\n",
@@ -1471,6 +1468,11 @@
"\n",
"The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
48 changes: 10 additions & 38 deletions dspy/evaluate/evaluate.py
@@ -57,8 +57,6 @@ def __init__(
display_progress: bool = False,
display_table: Union[bool, int] = False,
max_errors: int = 5,
return_all_scores: bool = False,
return_outputs: bool = False,
provide_traceback: Optional[bool] = None,
failure_score: float = 0.0,
**kwargs,
@@ -72,8 +70,6 @@
display_table (Union[bool, int]): Whether to display the evaluation results in a table.
If a number is passed, the evaluation results will be truncated to that number before displayed.
max_errors (int): The maximum number of errors to allow before stopping evaluation.
return_all_scores (bool): Whether to return scores for every data record in `devset`.
return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`.
provide_traceback (Optional[bool]): Whether to provide traceback information during evaluation.
failure_score (float): The default score to use if evaluation fails due to an exception.
"""
@@ -83,8 +79,6 @@
self.display_progress = display_progress
self.display_table = display_table
self.max_errors = max_errors
self.return_all_scores = return_all_scores
self.return_outputs = return_outputs
self.provide_traceback = provide_traceback
self.failure_score = failure_score

@@ -97,8 +91,6 @@ def __call__(
num_threads: Optional[int] = None,
display_progress: Optional[bool] = None,
display_table: Optional[Union[bool, int]] = None,
return_all_scores: Optional[bool] = None,
return_outputs: Optional[bool] = None,
callback_metadata: Optional[dict[str, Any]] = None,
):
"""
@@ -112,36 +104,20 @@
`self.display_progress`.
display_table (Union[bool, int]): Whether to display the evaluation results in a table. if not provided, use
`self.display_table`. If a number is passed, the evaluation results will be truncated to that number before displayed.
return_all_scores (bool): Whether to return scores for every data record in `devset`. if not provided,
use `self.return_all_scores`.
return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. if not
provided, use `self.return_outputs`.
callback_metadata (dict): Metadata to be used for evaluate callback handlers.

Returns:
The evaluation results are returned in different formats based on the flags:

- Base return: A float percentage score (e.g., 67.30) representing overall performance

- With `return_all_scores=True`:
Returns (overall_score, individual_scores) where individual_scores is a list of
float scores for each example in devset

- With `return_outputs=True`:
Returns (overall_score, result_triples) where result_triples is a list of
(example, prediction, score) tuples for each example in devset

- With both flags=True:
Returns (overall_score, result_triples, individual_scores)

The evaluation results are returned as a dspy.Prediction object containing the following attributes:

- score: A float percentage score (e.g., 67.30) representing overall performance

- results: a list of (example, prediction, score) tuples for each example in devset
"""
metric = metric if metric is not None else self.metric
devset = devset if devset is not None else self.devset
num_threads = num_threads if num_threads is not None else self.num_threads
display_progress = display_progress if display_progress is not None else self.display_progress
display_table = display_table if display_table is not None else self.display_table
return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
return_outputs = return_outputs if return_outputs is not None else self.return_outputs

if callback_metadata:
logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}")
@@ -184,15 +160,11 @@ def process_item(example):
result_df = self._construct_result_table(results, metric_name)

self._display_result_table(result_df, display_table, metric_name)

if return_all_scores and return_outputs:
return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results]
if return_all_scores:
return round(100 * ncorrect / ntotal, 2), [score for *_, score in results]
if return_outputs:
return round(100 * ncorrect / ntotal, 2), results

return round(100 * ncorrect / ntotal, 2)

return dspy.Prediction(
score=round(100 * ncorrect / ntotal, 2),
results=results,
)

def _construct_result_table(
self, results: list[Tuple[dspy.Example, dspy.Example, Any]], metric_name: str
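Taken together, a minimal migration sketch for callers of Evaluate, assuming only the interface shown in this diff (devset, metric, and program are placeholders for the caller's own objects):

import dspy

evaluate = dspy.Evaluate(devset=devset, metric=metric, num_threads=16, display_progress=True)

# Before this change, the return shape depended on return_outputs / return_all_scores.
# Now a single dspy.Prediction is always returned.
result = evaluate(program)

print(result.score)  # aggregated percentage score, e.g. 67.30
for example, prediction, score in result.results:
    ...  # per-example inputs, program outputs, and metric scores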
9 changes: 9 additions & 0 deletions dspy/primitives/prediction.py
@@ -97,6 +97,15 @@ def __ge__(self, other):
elif isinstance(other, Prediction):
return self.__float__() >= float(other)
raise TypeError(f"Unsupported type for comparison: {type(other)}")

def __eq__(self, other):
if isinstance(other, (float, int)):
return self.__float__() == other
elif isinstance(other, Prediction):
return self.__float__() == float(other)
Review comment (Collaborator): nit: shall we do float(self) == float(other) for consistency?

Reply (TomeHirata, author, Mar 25, 2025): I guess this should be consistent with how __ge__ or __le__ are implemented?

else:
# we should return False when Prediction is compared with other types
return False

@property
def completions(self):
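A small sketch of what the new __eq__ enables, assuming (as the operators above suggest) that Prediction.__float__ resolves to the score field; the literal values are illustrative:

import dspy

result = dspy.Prediction(score=67.3, results=[])

print(result == 67.3)    # True: compared via float(result)
print(result >= 50)      # True: __ge__ compares the score as well
print(result == "67.3")  # False: comparisons against other types return False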
5 changes: 2 additions & 3 deletions dspy/teleprompt/bootstrap_finetune.py
@@ -231,7 +231,6 @@ def bootstrap_trace_data(
devset=dataset,
num_threads=num_threads,
display_progress=True,
return_outputs=True,
provide_traceback=False, # TODO(check with team)
max_errors=len(dataset) * 10, # TODO(check with team)
failure_score=failure_score,
@@ -290,10 +289,10 @@ def wrapped_program(**kwargs):

return failed_pred, trace

_, outputs = evaluator(wrapped_program, metric=wrapped_metric)
results = evaluator(wrapped_program, metric=wrapped_metric).results

data = []
for example_ind, (example, prediction, score) in enumerate(outputs):
for example_ind, (example, prediction, score) in enumerate(results):
try:
prediction, trace = prediction
except ValueError as ve:
2 changes: 1 addition & 1 deletion dspy/teleprompt/copro_optimizer.py
@@ -225,7 +225,7 @@ def compile(self, student, *, trainset, eval_kwargs):
f"At Depth {d+1}/{self.depth}, Evaluating Prompt Candidate #{c_i+1}/{len(candidates_)} for "
f"Predictor {p_i+1} of {len(module.predictors())}.",
)
score = evaluate(module_clone, devset=trainset, **eval_kwargs)
score = evaluate(module_clone, devset=trainset, **eval_kwargs).score
if self.prompt_model:
logger.debug(f"prompt_model.inspect_history(n=1) {self.prompt_model.inspect_history(n=1)}")
total_calls += 1
3 changes: 1 addition & 2 deletions dspy/teleprompt/infer_rules.py
@@ -116,9 +116,8 @@ def evaluate_program(self, program, dataset):
max_errors=self.max_errors,
display_table=False,
display_progress=True,
return_all_scores=True,
)
score, _ = evaluate(program, metric=self.metric)
score = evaluate(program, metric=self.metric).score
return score


8 changes: 3 additions & 5 deletions dspy/teleprompt/mipro_optimizer_v2.py
@@ -511,9 +511,7 @@ def _optimize_prompt_parameters(
adjusted_num_trials = int((num_trials + num_trials // minibatch_full_eval_steps + 1 + run_additional_full_eval_at_end) if minibatch else num_trials)
logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==")

default_score, _ = eval_candidate_program(
len(valset), valset, program, evaluate, self.rng, return_all_scores=True
)
default_score = eval_candidate_program(len(valset), valset, program, evaluate, self.rng).score
logger.info(f"Default program score: {default_score}\n")

trial_logs = {}
@@ -563,7 +561,7 @@ def objective(trial):

# Evaluate the candidate program (on minibatch if minibatch=True)
batch_size = minibatch_size if minibatch else len(valset)
score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng)
score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng).score
total_eval_calls += batch_size

# Update best score and program
@@ -796,7 +794,7 @@ def _perform_full_evaluation(
param_score_dict, fully_evaled_param_combos
)
logger.info(f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials...")
full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng)
full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng).score
score_data.append({"score": full_eval_score, "program": highest_mean_program, "full_eval": True})

# Log full eval as a trial so that optuna can learn from the new results
4 changes: 3 additions & 1 deletion dspy/teleprompt/random_search.py
@@ -116,7 +116,9 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
display_progress=True,
)

score, subscores = evaluate(program, return_all_scores=True)
result = evaluate(program)

score, subscores = result.score, [output[2] for output in result.results]

all_subscores.append(subscores)

4 changes: 2 additions & 2 deletions dspy/teleprompt/teleprompt_optuna.py
@@ -48,9 +48,9 @@ def objective(self, trial):
display_table=False,
display_progress=True,
)
score = evaluate(program2, return_all_scores=False)
result = evaluate(program2)
trial.set_user_attr("program", program2)
return score
return result.score

def compile(self, student, *, teacher=None, max_demos, trainset, valset=None):
import optuna