diff --git a/deepeval/optimizer/algorithms/gepa/gepa.py b/deepeval/optimizer/algorithms/gepa/gepa.py
index e42dc459d0..b6d4492995 100644
--- a/deepeval/optimizer/algorithms/gepa/gepa.py
+++ b/deepeval/optimizer/algorithms/gepa/gepa.py
@@ -2,7 +2,8 @@
 import uuid
 import random
 import time
-
+from rich.table import Table
+from rich import box
 from typing import (
     Awaitable,
     Callable,
@@ -17,6 +18,7 @@
 from deepeval.models.base_model import DeepEvalBaseLLM
 from deepeval.errors import DeepEvalError
+from deepeval.optimizer.scorer.schema import ScorerDiagnosisResult
 from deepeval.optimizer.utils import Aggregator, mean_of_all
 from deepeval.optimizer.types import (
     AcceptedIterationDict,
@@ -37,8 +39,7 @@
 from deepeval.optimizer.policies import (
     pick_best_with_ties,
     select_prompt_configuration_pareto,
-    frequency_weights,
-    pareto_frontier,
+    _is_dominated,
 )
 from deepeval.prompt.api import PromptType
 from deepeval.prompt.prompt import Prompt
@@ -71,6 +72,12 @@ class GEPA(BaseAlgorithm):
         Number of examples drawn from D_feedback per iteration. Default is 8.
     pareto_size : int
         Size of the Pareto validation subset D_pareto. Default is 3.
+    patience : int
+        Stop the optimization early once `patience` consecutive child candidates are rejected. Default is 3.
+    reflection_model : str or DeepEvalBaseLLM, optional
+        Model the scorer uses to produce feedback diagnoses. Default is "gpt-4o-mini".
+    mutation_model : str or DeepEvalBaseLLM, optional
+        Model the rewriter uses to mutate prompts. Default is "gpt-4o".
     random_seed : int, optional
         RNG seed for reproducibility. If None, derived from time.time_ns().
     tie_breaker : TieBreaker
@@ -87,8 +90,11 @@ def __init__(
         minibatch_size: int = 8,
         pareto_size: int = 3,
         random_seed: Optional[int] = None,
+        patience: int = 3,
         tie_breaker: TieBreaker = TieBreaker.PREFER_CHILD,
         aggregate_instances: Aggregator = mean_of_all,
+        reflection_model: Optional[Union[str, DeepEvalBaseLLM]] = "gpt-4o-mini",
+        mutation_model: Optional[Union[str, DeepEvalBaseLLM]] = "gpt-4o",
         scorer: Optional[BaseScorer] = None,
     ) -> None:
         # Validate parameters
@@ -102,6 +108,7 @@ def __init__(
         self.iterations = iterations
         self.minibatch_size = minibatch_size
         self.pareto_size = pareto_size
+        self.patience = patience
         self.tie_breaker = tie_breaker
         self.aggregate_instances = aggregate_instances
         self.scorer = scorer
@@ -118,10 +125,10 @@ def __init__(
         # Status callback set by PromptOptimizer:
         # (kind, step_index, total_steps, detail) -> None
         self.status_callback: Optional[RunnerStatusCallback] = None
+        self.step_callback: Optional[Callable[[str], None]] = None

-        # Optimizer model used by the rewriter for prompt mutation.
-        # Set by PromptOptimizer.
-        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+        self.reflection_model: Optional[Union[str, "DeepEvalBaseLLM"]] = reflection_model
+        self.mutation_model: Optional[Union[str, "DeepEvalBaseLLM"]] = mutation_model

         # lazy loaded
         self._rewriter: Optional[Rewriter] = None
@@ -144,6 +151,11 @@ def execute(
                 "run the optimizer."
             )

         self._ensure_scorer()
+
+        if self.reflection_model is not None:
+            self.scorer.optimizer_model = self.reflection_model
+        if self.mutation_model is not None:
+            self._rewriter.optimizer_model = self.mutation_model
+
         self.reset_state()
@@ -158,13 +170,17 @@ def execute(
         self._add_prompt_configuration(root_prompt_configuration)

         accepted_iterations: List[Dict] = []
+        consecutive_rejections = 0

         def _one_iteration() -> bool:
             nonlocal accepted_iterations
+            nonlocal consecutive_rejections

             if not d_feedback:
                 return False

+            iter_start = time.perf_counter()
+
             # Seed Pareto scores lazily on first iteration
             if not self.pareto_score_table:
                 self.pareto_score_table[root_prompt_configuration.id] = (
@@ -183,13 +199,19 @@ def _one_iteration() -> bool:
             minibatch = self._draw_minibatch(d_feedback)

             # 4.
Feedback - feedback_text = self.scorer.get_minibatch_feedback( + feedback_diagnosis = self.scorer.get_minibatch_feedback( parent_prompt_configuration, selected_module_id, minibatch ) + parent_minibatch_score = self.scorer.score_minibatch( + parent_prompt_configuration, minibatch + ) + # 5. Rewrite child_prompt = self._generate_child_prompt( - selected_module_id, parent_prompt_configuration, feedback_text + selected_module_id, + parent_prompt_configuration, + feedback_diagnosis, ) if child_prompt is None: # Child prompt matched parent; skip this iteration. @@ -200,31 +222,93 @@ def _one_iteration() -> bool: selected_module_id, parent_prompt_configuration, child_prompt ) - # 7. Evaluate parent/child on minibatch - parent_score = self.scorer.score_minibatch( - parent_prompt_configuration, minibatch - ) - child_score = self.scorer.score_minibatch( + child_minibatch_score = self.scorer.score_minibatch( child_prompt_configuration, minibatch ) - # 8. Acceptance test - accepted = self._should_accept_child(parent_score, child_score) + if child_minibatch_score <= parent_minibatch_score: + parent_agg = self.aggregate_instances( + self.pareto_score_table[parent_prompt_configuration.id] + ) + self._iteration_log.append( + { + "iteration": self._current_iteration, + "outcome": "skipped", + "reason": f"Skipped (minibatch score did not improve)", + "before": parent_agg, + "after": child_minibatch_score, + "elapsed": time.perf_counter() - iter_start, + } + ) + return True + + # 7. Evaluate child on the GLOBAL validation set (d_pareto) + child_pareto_scores = self.scorer.score_pareto( + child_prompt_configuration, d_pareto + ) + parent_pareto_scores = self.pareto_score_table[ + parent_prompt_configuration.id + ] + + # 8. Acceptance test (Pareto non-domination vs parent) + accepted = self._should_accept_child( + child_pareto_scores, parent_pareto_scores + ) + if accepted: + consecutive_rejections = 0 + parent_agg = self.aggregate_instances(parent_pareto_scores) + child_agg = self.aggregate_instances(child_pareto_scores) accepted_iterations.append( self._accept_child( selected_module_id, parent_prompt_configuration, child_prompt_configuration, - d_pareto, - parent_score, - child_score, + child_pareto_scores, + parent_agg, + child_agg, ) ) + self._iteration_log.append( + { + "iteration": self._current_iteration, + "outcome": "accepted", + "reason": "Accepted by Pareto non-domination", + "before": parent_agg, + "after": child_agg, + "elapsed": time.perf_counter() - iter_start, + } + ) + else: + consecutive_rejections += 1 + self._iteration_log.append( + { + "iteration": self._current_iteration, + "outcome": "rejected", + "reason": f"Rejected (consecutive rejections: {consecutive_rejections}/{self.patience})", + "before": self.aggregate_instances( + parent_pareto_scores + ), + "after": self.aggregate_instances(child_pareto_scores), + "elapsed": time.perf_counter() - iter_start, + } + ) + + if consecutive_rejections >= self.patience: + self._iteration_log[-1][ + "reason" + ] = f"early stop (patience={self.patience})" + return False return True self._run_loop_iteration(_one_iteration) + if not self.pareto_score_table: + raise DeepEvalError( + "GEPA finished without any Pareto scores (empty score table). " + "Common causes: empty feedback split, or the loop exited before " + "the first scoring step ran." + ) best = self._best_by_aggregate() prompt_config_snapshots = build_prompt_config_snapshots( self.prompt_configurations_by_id @@ -253,6 +337,11 @@ async def a_execute( "run the optimizer." 
            )

         self._ensure_scorer()
+
+        if self.reflection_model is not None:
+            self.scorer.optimizer_model = self.reflection_model
+        if self.mutation_model is not None:
+            self._rewriter.optimizer_model = self.mutation_model
+
         self.reset_state()
@@ -267,26 +356,28 @@ async def a_execute(
         self._add_prompt_configuration(root_prompt_configuration)

         accepted_iterations: List[Dict] = []
+        consecutive_rejections = 0

         async def _one_iteration() -> bool:
-            nonlocal accepted_iterations
+            nonlocal accepted_iterations, consecutive_rejections

             if not d_feedback:
                 return False

             iter_start = time.perf_counter()
+            cur = self._current_iteration

             # Seed Pareto scores lazily on first iteration
             if not self.pareto_score_table:
-                t0 = time.perf_counter()
+                self._update_step(
+                    cur,
+                    f"Scoring seed prompt on {len(d_pareto)} pareto goldens...",
+                )
                 self.pareto_score_table[root_prompt_configuration.id] = (
                     await self.scorer.a_score_pareto(
                         root_prompt_configuration, d_pareto
                     )
                 )
-                print(
-                    f"[DEBUG] Initial pareto scoring ({len(d_pareto)} goldens): {time.perf_counter() - t0:.2f}s"
-                )

             # 1. Pick prompt_configuration via Pareto
             parent_prompt_configuration = self._pick_prompt_configuration()
@@ -296,23 +387,38 @@ async def _one_iteration() -> bool:

             # 3. Draw minibatch
             minibatch = self._draw_minibatch(d_feedback)
-            print(f"[DEBUG] Minibatch size: {len(minibatch)}")

             # 4. Feedback
-            t0 = time.perf_counter()
-            feedback_text = await self.scorer.a_get_minibatch_feedback(
+            self._update_step(
+                cur, f"Gathering feedback on {len(minibatch)} goldens..."
+            )
+            feedback_diagnosis = await self.scorer.a_get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
-            print(f"[DEBUG] Get feedback: {time.perf_counter() - t0:.2f}s")
+
+            parent_minibatch_score = await self.scorer.a_score_minibatch(
+                parent_prompt_configuration, minibatch
+            )

             # 5. Rewrite
-            t0 = time.perf_counter()
+            self._update_step(cur, "Rewriting prompt from feedback...")
             child_prompt = await self._a_generate_child_prompt(
-                selected_module_id, parent_prompt_configuration, feedback_text
+                selected_module_id,
+                parent_prompt_configuration,
+                feedback_diagnosis,
            )
-            print(f"[DEBUG] Rewrite prompt: {time.perf_counter() - t0:.2f}s")
+
             if child_prompt is None:
-                print(f"[DEBUG] Child prompt same as parent, skipping")
+                self._iteration_log.append(
+                    {
+                        "iteration": cur,
+                        "outcome": "skipped",
+                        "reason": "Skipped (child prompt matched parent)",
+                        "before": None,
+                        "after": None,
+                        "elapsed": time.perf_counter() - iter_start,
+                    }
+                )
                 return True

             # 6. Child prompt_configuration
@@ -320,50 +426,97 @@ async def _one_iteration() -> bool:
                 selected_module_id, parent_prompt_configuration, child_prompt
             )

-            # 7.
Evaluate parent/child on minibatch - t0 = time.perf_counter() - parent_score = await self.scorer.a_score_minibatch( - parent_prompt_configuration, minibatch - ) - print( - f"[DEBUG] Score parent on minibatch: {time.perf_counter() - t0:.2f}s (score={parent_score:.4f})" + child_minibatch_score = await self.scorer.a_score_minibatch( + child_prompt_configuration, minibatch ) - t0 = time.perf_counter() - child_score = await self.scorer.a_score_minibatch( - child_prompt_configuration, minibatch + if child_minibatch_score <= parent_minibatch_score: + parent_agg = self.aggregate_instances( + self.pareto_score_table[parent_prompt_configuration.id] + ) + self._iteration_log.append( + { + "iteration": cur, + "outcome": "skipped", + "reason": f"Skipped (minibatch score did not improve)", + "before": parent_agg, + "after": child_minibatch_score, + "elapsed": time.perf_counter() - iter_start, + } + ) + return True + + # 7. Evaluate child on the GLOBAL validation set (d_pareto) + self._update_step( + cur, + f"Evaluating child on pareto set ({len(d_pareto)} goldens)...", ) - print( - f"[DEBUG] Score child on minibatch: {time.perf_counter() - t0:.2f}s (score={child_score:.4f})" + child_pareto_scores = await self.scorer.a_score_pareto( + child_prompt_configuration, d_pareto ) + parent_pareto_scores = self.pareto_score_table[ + parent_prompt_configuration.id + ] - # 8. Acceptance test - accepted = self._should_accept_child(parent_score, child_score) - print( - f"[DEBUG] Acceptance: {'ACCEPTED' if accepted else 'REJECTED'}" + # 8. Acceptance test (Pareto non-domination vs parent) + accepted = self._should_accept_child( + child_pareto_scores, parent_pareto_scores ) + if accepted: - t0 = time.perf_counter() + consecutive_rejections = 0 + parent_agg = self.aggregate_instances(parent_pareto_scores) + child_agg = self.aggregate_instances(child_pareto_scores) accepted_iterations.append( await self._a_accept_child( selected_module_id, parent_prompt_configuration, child_prompt_configuration, - d_pareto, - parent_score, - child_score, + child_pareto_scores, + parent_agg, + child_agg, ) ) - print( - f"[DEBUG] Accept child (pareto scoring): {time.perf_counter() - t0:.2f}s" + self._iteration_log.append( + { + "iteration": cur, + "outcome": "accepted", + "reason": "Accepted by Pareto non-domination", + "before": parent_agg, + "after": child_agg, + "elapsed": time.perf_counter() - iter_start, + } + ) + else: + consecutive_rejections += 1 + self._iteration_log.append( + { + "iteration": cur, + "outcome": "rejected", + "reason": f"Rejected (consecutive rejections: {consecutive_rejections}/{self.patience})", + "before": self.aggregate_instances( + parent_pareto_scores + ), + "after": self.aggregate_instances(child_pareto_scores), + "elapsed": time.perf_counter() - iter_start, + } ) - print( - f"[DEBUG] Total iteration time: {time.perf_counter() - iter_start:.2f}s\n" - ) + if consecutive_rejections >= self.patience: + self._iteration_log[-1][ + "reason" + ] = f"early stop (patience={self.patience})" + return False + return True await self._a_run_loop_iteration(_one_iteration) + if not self.pareto_score_table: + raise DeepEvalError( + "GEPA finished without any Pareto scores (empty score table). " + "Common causes: empty feedback split, or the loop exited before " + "the first scoring step ran." 
+ ) best = self._best_by_aggregate() prompt_config_snapshots = build_prompt_config_snapshots( self.prompt_configurations_by_id @@ -391,6 +544,9 @@ def reset_state(self) -> None: PromptConfigurationId, Optional[PromptConfigurationId] ] = {} self.pareto_score_table: ScoreTable = {} + # Accumulates one dict per iteration for the final summary table + self._iteration_log: List[Dict] = [] + self._current_iteration: int = 0 def _ensure_scorer(self) -> None: if self.scorer is None: @@ -483,43 +639,9 @@ def _best_by_aggregate(self) -> PromptConfiguration: return self.prompt_configurations_by_id[chosen] def _pick_prompt_configuration(self) -> PromptConfiguration: - # Log Pareto selection details - all_candidates = list(self.pareto_score_table.keys()) - print(f"[DEBUG] Pareto Selection:") - print(f" - Total candidates in pool: {len(all_candidates)}") - - # Show score table - print(f" - Score table (per-instance scores):") - for cid, scores in self.pareto_score_table.items(): - is_root = self.parents_by_id.get(cid) is None - label = ( - "(root)" - if is_root - else f"(child of {self.parents_by_id.get(cid)[:8]}...)" - ) - mean_score = sum(scores) / len(scores) if scores else 0 - print( - f" {cid[:8]}... {label}: {[round(s, 3) for s in scores]} (mean={mean_score:.3f})" - ) - - # Show Pareto frontier - frontier = pareto_frontier(all_candidates, self.pareto_score_table) - print(f" - Pareto frontier ({len(frontier)} non-dominated):") - for cid in frontier: - print(f" {cid[:8]}...") - - # Show frequency weights - freq = frequency_weights(self.pareto_score_table) - print(f" - Frequency weights (how often each wins an instance):") - for cid, weight in freq.items(): - print(f" {cid[:8]}...: {weight}") - - # Do the selection selected_prompt_configuration_id = select_prompt_configuration_pareto( self.pareto_score_table, random_state=self.random_state ) - print(f" - Selected: {selected_prompt_configuration_id[:8]}...\n") - return self.prompt_configurations_by_id[ selected_prompt_configuration_id ] @@ -544,16 +666,15 @@ async def _a_generate_child_prompt( self, selected_module_id: ModuleId, parent_prompt_configuration: PromptConfiguration, - feedback_text: str, + feedback_diagnosis: ScorerDiagnosisResult, ) -> Optional[Prompt]: old_prompt = parent_prompt_configuration.prompts.get( selected_module_id, Prompt(text_template="") ) new_prompt = await self._rewriter.a_rewrite( - module_id=selected_module_id, old_prompt=old_prompt, - feedback_text=feedback_text, + feedback_diagnosis=feedback_diagnosis, ) if old_prompt.type != new_prompt.type or self._prompts_equivalent( @@ -568,16 +689,15 @@ def _generate_child_prompt( self, selected_module_id: ModuleId, parent_prompt_configuration: PromptConfiguration, - feedback_text: str, + feedback_diagnosis: ScorerDiagnosisResult, ) -> Optional[Prompt]: old_prompt = parent_prompt_configuration.prompts.get( selected_module_id, Prompt(text_template="") ) new_prompt = self._rewriter.rewrite( - module_id=selected_module_id, old_prompt=old_prompt, - feedback_text=feedback_text, + feedback_diagnosis=feedback_diagnosis, ) if old_prompt.type != new_prompt.type or self._prompts_equivalent( @@ -602,31 +722,61 @@ def _make_child( return child_prompt_configuration def _should_accept_child( - self, parent_score: float, child_score: float + self, child_scores: List[float], parent_scores: List[float] ) -> bool: - jitter = 1e-6 - return child_score >= parent_score + max(GEPA_MIN_DELTA, jitter) + if _is_dominated( + candidate_scores=child_scores, + other_scores=parent_scores, + 
min_delta=GEPA_MIN_DELTA, + ): + return False + + current_archive_scores = list(self.pareto_score_table.values()) + + for existing_scores in current_archive_scores: + if _is_dominated( + candidate_scores=child_scores, + other_scores=existing_scores, + min_delta=GEPA_MIN_DELTA, + ): + return False + + return True def _accept_child( self, selected_module_id: ModuleId, parent_prompt_configuration: PromptConfiguration, child_prompt_configuration: PromptConfiguration, - d_pareto: Union[List["Golden"], List["ConversationalGolden"]], - parent_score: float, - child_score: float, + child_pareto_scores: List[float], + parent_agg_score: float, + child_agg_score: float, ) -> AcceptedIterationDict: self._add_prompt_configuration(child_prompt_configuration) self.pareto_score_table[child_prompt_configuration.id] = ( - self.scorer.score_pareto(child_prompt_configuration, d_pareto) + child_pareto_scores ) + ids_to_remove = [] + for config_id, scores in self.pareto_score_table.items(): + if config_id == child_prompt_configuration.id: + continue + if _is_dominated( + candidate_scores=scores, + other_scores=child_pareto_scores, + min_delta=GEPA_MIN_DELTA, + ): + ids_to_remove.append(config_id) + + for rid in ids_to_remove: + del self.pareto_score_table[rid] + return AcceptedIterationDict( parent=parent_prompt_configuration.id, child=child_prompt_configuration.id, module=selected_module_id, - before=parent_score, - after=child_score, + before=parent_agg_score, + after=child_agg_score, ) async def _a_accept_child( @@ -634,25 +784,42 @@ async def _a_accept_child( selected_module_id: ModuleId, parent_prompt_configuration: PromptConfiguration, child_prompt_configuration: PromptConfiguration, - d_pareto: Union[List["Golden"], List["ConversationalGolden"]], - parent_score: float, - child_score: float, + child_pareto_scores: List[float], + parent_agg_score: float, + child_agg_score: float, ) -> AcceptedIterationDict: self._add_prompt_configuration(child_prompt_configuration) self.pareto_score_table[child_prompt_configuration.id] = ( - await self.scorer.a_score_pareto( - child_prompt_configuration, d_pareto - ) + child_pareto_scores ) + ids_to_remove = [] + for config_id, scores in self.pareto_score_table.items(): + if config_id == child_prompt_configuration.id: + continue + if _is_dominated( + candidate_scores=scores, + other_scores=child_pareto_scores, + min_delta=GEPA_MIN_DELTA, + ): + ids_to_remove.append(config_id) + + for rid in ids_to_remove: + del self.pareto_score_table[rid] + return AcceptedIterationDict( parent=parent_prompt_configuration.id, child=child_prompt_configuration.id, module=selected_module_id, - before=parent_score, - after=child_score, + before=parent_agg_score, + after=child_agg_score, ) + def _update_step(self, iteration: int, label: str) -> None: + """Update the sub-step row on the outer progress bar.""" + if self.step_callback is not None: + self.step_callback(label) + def _update_progress( self, total_iterations: int, @@ -699,12 +866,12 @@ def _run_loop_iteration( self._update_progress(total_iterations, iteration, remaining_iterations) while remaining_iterations > 0: iteration += 1 + self._current_iteration = iteration try: ok = gepa_iteration() except Exception as exc: - # Report a user facing error event and halt optimization. 
self._update_error(total_iterations, iteration, exc) - break + raise if not ok: break remaining_iterations -= 1 @@ -722,15 +889,128 @@ async def _a_run_loop_iteration( self._update_progress(total_iterations, iteration, remaining_iterations) while remaining_iterations > 0: iteration += 1 + self._current_iteration = iteration try: ok = await a_gepa_iteration() except Exception as exc: - # Report a user facing error event and halt optimization. self._update_error(total_iterations, iteration, exc) - break + raise if not ok: break remaining_iterations -= 1 self._update_progress( total_iterations, iteration, remaining_iterations ) + + def generate_summary_table(self, report: OptimizationReport) -> List[Table]: + """Generates GEPA-specific evolutionary iteration and Pareto tables.""" + _PURPLE = "rgb(106,0,255)" + _GREEN = "rgb(25,227,160)" + _RED = "rgb(255,85,85)" + _DIM = "rgb(55,65,81)" + + tables = [] + iteration_log = getattr(self, "_iteration_log", []) + + # 1. Iteration Table + iter_table = Table( + title=f"✨ [{_PURPLE}]{self.name}[/] Evolutionary Mutations", + box=box.ROUNDED, + border_style=_PURPLE, + header_style=f"bold {_PURPLE}", + show_lines=True, + expand=True, + ) + iter_table.add_column( + "#", style="bold white", justify="right", no_wrap=True + ) + iter_table.add_column("Outcome", justify="center", no_wrap=True) + iter_table.add_column("Before", justify="right", no_wrap=True) + iter_table.add_column("After", justify="right", no_wrap=True) + iter_table.add_column("Δ Score", justify="right", no_wrap=True) + iter_table.add_column("Note", style=f"{_DIM}", no_wrap=False) + iter_table.add_column("Time", justify="right", no_wrap=True) + + for entry in iteration_log: + i = str(entry["iteration"]) + outcome = entry["outcome"] + before = entry.get("before") + after = entry.get("after") + reason = entry.get("reason", "") + elapsed = entry.get("elapsed", 0.0) + + if outcome == "accepted": + outcome_cell = f"[{_GREEN}]✔ accepted[/]" + elif outcome == "rejected": + outcome_cell = f"[{_RED}]✘ rejected[/]" + else: + outcome_cell = f"[{_DIM}]↷ skipped[/]" + + before_cell = f"{before:.4f}" if before is not None else "—" + after_cell = f"{after:.4f}" if after is not None else "—" + + if before is not None and after is not None: + delta = after - before + sign = "+" if delta >= 0 else "" + color = _GREEN if delta > 0 else (_RED if delta < 0 else _DIM) + delta_cell = f"[{color}]{sign}{delta:.4f}[/]" + else: + delta_cell = "—" + + time_cell = f"[{_DIM}]{elapsed:.2f}s[/]" + iter_table.add_row( + i, + outcome_cell, + before_cell, + after_cell, + delta_cell, + reason, + time_cell, + ) + + tables.append(iter_table) + + # 2. 
Pareto Table + if report and report.pareto_scores: + pareto_table = Table( + title=f"[{_PURPLE}]Final Pareto Archive[/]", + box=box.HORIZONTALS, + border_style=_PURPLE, + header_style=f"bold {_PURPLE}", + show_lines=True, + expand=True, + ) + pareto_table.add_column("Config ID", style="white", no_wrap=True) + pareto_table.add_column("Role", justify="center", no_wrap=True) + pareto_table.add_column("Scores", no_wrap=False) + pareto_table.add_column("Aggregate", justify="right", no_wrap=True) + + best_id = report.best_id + for cid, scores in report.pareto_scores.items(): + is_root = report.parents.get(cid) is None + role = f"[{_PURPLE}]root[/]" if is_root else f"[{_DIM}]child[/]" + is_best = cid == best_id + + short_id = cid[:8] + "…" + if is_best: + short_id = f"[bold {_GREEN}]{short_id} ★[/]" + + if len(scores) > 6: + score_strs = ( + [f"{s:.3f}" for s in scores[:3]] + + ["..."] + + [f"{s:.3f}" for s in scores[-3:]] + ) + else: + score_strs = [f"{s:.3f}" for s in scores] + scores_cell = f"[{_DIM}][{', '.join(score_strs)}][/]" + + agg = sum(scores) / len(scores) if scores else 0.0 + agg_color = _GREEN if is_best else "white" + agg_cell = f"[{agg_color}]{agg:.4f}[/]" + + pareto_table.add_row(short_id, role, scores_cell, agg_cell) + + tables.append(pareto_table) + + return tables diff --git a/deepeval/optimizer/algorithms/miprov2/__init__.py b/deepeval/optimizer/algorithms/miprov2/__init__.py index 21066fa10e..5d88cb23fd 100644 --- a/deepeval/optimizer/algorithms/miprov2/__init__.py +++ b/deepeval/optimizer/algorithms/miprov2/__init__.py @@ -1,17 +1 @@ from .miprov2 import MIPROV2 -from .proposer import InstructionProposer -from .bootstrapper import ( - Demo, - DemoSet, - DemoBootstrapper, - render_prompt_with_demos, -) - -__all__ = [ - "MIPROV2", - "InstructionProposer", - "Demo", - "DemoSet", - "DemoBootstrapper", - "render_prompt_with_demos", -] diff --git a/deepeval/optimizer/algorithms/miprov2/bootstrapper.py b/deepeval/optimizer/algorithms/miprov2/bootstrapper.py index 481c63270e..39a01c795f 100644 --- a/deepeval/optimizer/algorithms/miprov2/bootstrapper.py +++ b/deepeval/optimizer/algorithms/miprov2/bootstrapper.py @@ -1,31 +1,24 @@ -# Demo Bootstrapper for MIPROv2 -# -# This module implements few-shot demonstration bootstrapping following -# the original MIPROv2 paper. It runs the prompt on training examples -# and collects successful outputs as demonstrations. - from __future__ import annotations import asyncio import random +import uuid from dataclasses import dataclass, field -from typing import List, Optional, Union, TYPE_CHECKING, Callable, Tuple +from typing import List, Optional, Union, TYPE_CHECKING, Tuple +from deepeval.errors import DeepEvalError +from deepeval.dataset.golden import Golden, ConversationalGolden from deepeval.prompt.prompt import Prompt - -if TYPE_CHECKING: - from deepeval.dataset.golden import Golden, ConversationalGolden +from deepeval.metrics.utils import copy_metrics +from deepeval.optimizer.scorer import Scorer +from deepeval.optimizer.scorer.utils import ( + _measure_no_indicator, + _a_measure_no_indicator, +) @dataclass -class Demo: - """ - A single demonstration example for few-shot prompting. 
- - Attributes: - input_text: The input/question from the golden - output_text: The successful output from the model - golden_index: Index of the source golden (for tracking) - """ +class Demonstration: + """A single, mathematically verified few-shot example.""" input_text: str output_text: str @@ -33,27 +26,24 @@ class Demo: @dataclass -class DemoSet: - """ - A set of demonstrations to be included in a prompt. - - Attributes: - demos: List of Demo objects - id: Unique identifier for this demo set - """ +class DemonstrationSet: + """A set of demonstrations to be dynamically injected into a prompt.""" - demos: List[Demo] = field(default_factory=list) + demonstrations: List[Demonstration] = field(default_factory=list) id: str = "" def __post_init__(self): if not self.id: - import uuid - self.id = str(uuid.uuid4()) - def to_text(self, max_demos: Optional[int] = None) -> str: - """Render demos as text for inclusion in prompts.""" - demos_to_use = self.demos[:max_demos] if max_demos else self.demos + def to_text(self, max_demonstrations: Optional[int] = None) -> str: + """Render demonstrations as text for inclusion in prompts.""" + demos_to_use = ( + self.demonstrations[:max_demonstrations] + if max_demonstrations + else self.demonstrations + ) + if not demos_to_use: return "" @@ -61,47 +51,30 @@ def to_text(self, max_demos: Optional[int] = None) -> str: for i, demo in enumerate(demos_to_use, 1): lines.append(f"Example {i}:") lines.append(f"Input: {demo.input_text}") - lines.append(f"Output: {demo.output_text}") - lines.append("") + lines.append(f"Output: {demo.output_text}\n\n") lines.append("Now, please respond to the following:") return "\n".join(lines) -class DemoBootstrapper: +class DemonstrationBootstrapper: """ - Bootstraps few-shot demonstrations by running the prompt on - training examples and keeping successful outputs. - - Following MIPROv2, this: - 1. Samples examples from the training set - 2. Runs them through the model with the current prompt - 3. Evaluates outputs using a simple success check - 4. Keeps successful outputs as demonstration candidates - 5. Creates multiple demo sets for variety - - Parameters - ---------- - max_bootstrapped_demos : int - Maximum demos per set from bootstrapping. Default is 4. - max_labeled_demos : int - Maximum demos per set from labeled data (golden expected_output). Default is 4. - num_demo_sets : int - Number of different demo sets to create. Default is 5. - random_state : random.Random, optional - Random state for reproducibility. + Bootstraps few-shot demonstrations by running the prompt on training + examples and keeping strictly successful outputs based on metric success. 
""" def __init__( self, - max_bootstrapped_demos: int = 4, - max_labeled_demos: int = 4, - num_demo_sets: int = 5, + scorer: Scorer, + max_bootstrapped_demonstrations: int = 4, + max_labeled_demonstrations: int = 4, + num_demonstration_sets: int = 5, random_state: Optional[Union[int, random.Random]] = None, ): - self.max_bootstrapped_demos = max_bootstrapped_demos - self.max_labeled_demos = max_labeled_demos - self.num_demo_sets = num_demo_sets + self.scorer = scorer + self.max_bootstrapped_demonstrations = max_bootstrapped_demonstrations + self.max_labeled_demonstrations = max_labeled_demonstrations + self.num_demonstration_sets = num_demonstration_sets if isinstance(random_state, int): self.random_state = random.Random(random_state) @@ -109,211 +82,206 @@ def __init__( self.random_state = random_state or random.Random() def _extract_input( - self, - golden: Union["Golden", "ConversationalGolden"], + self, golden: Union[Golden, ConversationalGolden] ) -> str: - """Extract input text from a golden.""" - if hasattr(golden, "input") and golden.input: - return str(golden.input) - if hasattr(golden, "messages") and golden.messages: - # For conversational, use the last user message - for msg in reversed(golden.messages): - if hasattr(msg, "role") and msg.role == "user": - return ( - str(msg.content) - if hasattr(msg, "content") - else str(msg) - ) - return str(golden.messages[-1]) - return "" + """Strictly extract the input text, throwing errors on invalid state.""" + if isinstance(golden, Golden): + if not golden.input: + raise DeepEvalError( + "Golden must have a valid 'input' for MIPROv2 bootstrapping." + ) + return golden.input + + else: + user_turns = [ + t.content for t in (golden.turns or []) if t.role == "user" + ] + if not user_turns: + raise DeepEvalError( + "ConversationalGolden must have at least one 'user' turn for MIPROv2 bootstrapping." + ) + return "\n".join(user_turns) def _extract_expected_output( - self, - golden: Union["Golden", "ConversationalGolden"], + self, golden: Union[Golden, ConversationalGolden] ) -> Optional[str]: - """Extract expected output from a golden if available.""" - if hasattr(golden, "expected_output") and golden.expected_output: + """Strictly extract the expected output/outcome if it exists.""" + if isinstance(golden, Golden): + if not golden.expected_output: + raise DeepEvalError( + "Golden must have a valid 'expected_output' for MIPROv2 bootstrapping." + ) return str(golden.expected_output) - return None - - def _is_successful( - self, - actual_output: str, - expected_output: Optional[str], - ) -> bool: - """ - Simple success check for bootstrapping. - - For now, we consider an output successful if: - - It's non-empty - - If expected_output exists, actual has some overlap - - This is a simplified heuristic. In full MIPROv2, you'd use - the actual metric to validate. - """ - if not actual_output or not actual_output.strip(): - return False - - if expected_output: - # Simple overlap check - could be more sophisticated - actual_words = set(actual_output.lower().split()) - expected_words = set(expected_output.lower().split()) - if actual_words and expected_words: - overlap = len(actual_words & expected_words) / len( - expected_words + else: + if not golden.expected_outcome: + raise DeepEvalError( + "ConversationalGolden must have a valid 'expected_outcome' for MIPROv2 bootstrapping." 
) - return overlap > 0.3 # At least 30% word overlap - - # If no expected output, just check it's non-empty - return len(actual_output.strip()) > 10 + return golden.expected_outcome def bootstrap( self, prompt: Prompt, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - generate_fn: Callable[ - [Prompt, Union["Golden", "ConversationalGolden"]], str - ], - ) -> List[DemoSet]: - """ - Bootstrap demonstration sets synchronously. - - Args: - prompt: The prompt to use for generation - goldens: Training examples to bootstrap from - generate_fn: Function that takes (prompt, golden) and returns output - - Returns: - List of DemoSet objects, each containing a different set of demos - """ - # Collect all successful demos - all_demos: List[Demo] = [] - labeled_demos: List[Demo] = [] - - # Shuffle goldens for variety + goldens: Union[List[Golden], List[ConversationalGolden]], + ) -> List[DemonstrationSet]: + """Synchronously builds DemonstrationSets utilizing the Scorer to verify metric success.""" + all_demonstrations: List[Demonstration] = [] + labeled_demonstrations: List[Demonstration] = [] + shuffled_indices = list(range(len(goldens))) self.random_state.shuffle(shuffled_indices) - # Try to bootstrap demos - attempts = 0 - max_attempts = min(len(goldens), self.max_bootstrapped_demos * 3) + max_attempts = min( + len(goldens), self.max_bootstrapped_demonstrations * 3 + ) + prompt_dict = {"__module__": prompt} for idx in shuffled_indices[:max_attempts]: golden = goldens[idx] input_text = self._extract_input(golden) expected = self._extract_expected_output(golden) - if not input_text: - continue - - # If we have expected output, use it as a labeled demo if ( expected - and len(labeled_demos) - < self.max_labeled_demos * self.num_demo_sets + and len(labeled_demonstrations) + < self.max_labeled_demonstrations * self.num_demonstration_sets ): - labeled_demos.append( - Demo( + labeled_demonstrations.append( + Demonstration( input_text=input_text, output_text=expected, golden_index=idx, ) ) - # Try to bootstrap if ( - len(all_demos) - < self.max_bootstrapped_demos * self.num_demo_sets + len(all_demonstrations) + < self.max_bootstrapped_demonstrations + * self.num_demonstration_sets ): try: - output = generate_fn(prompt, golden) - if self._is_successful(output, expected): - all_demos.append( - Demo( + # 1. Generate actual output + actual_output = self.scorer.generate(prompt_dict, golden) + + # 2. Build the test case safely + test_case = self.scorer._golden_to_test_case( + golden, actual_output + ) + + # 3. Evaluate against all metrics + metrics = copy_metrics(self.scorer.metrics) + is_successful = True + for metric in metrics: + _measure_no_indicator(metric, test_case) + if not metric.is_successful(): + is_successful = False + break + + # 4. 
Save if all metrics passed + if is_successful: + all_demonstrations.append( + Demonstration( input_text=input_text, - output_text=output, + output_text=actual_output, golden_index=idx, ) ) except Exception: continue - attempts += 1 if ( - len(all_demos) - >= self.max_bootstrapped_demos * self.num_demo_sets - and len(labeled_demos) - >= self.max_labeled_demos * self.num_demo_sets + len(all_demonstrations) + >= self.max_bootstrapped_demonstrations + * self.num_demonstration_sets + and len(labeled_demonstrations) + >= self.max_labeled_demonstrations * self.num_demonstration_sets ): break - # Create diverse demo sets - return self._create_demo_sets(all_demos, labeled_demos) + demo_sets = self._create_demonstration_sets( + all_demonstrations, labeled_demonstrations + ) + + if not demo_sets or all(not ds.demonstrations for ds in demo_sets): + raise DeepEvalError( + "Bootstrapper failed to generate any demonstrations. " + "Please ensure your goldens contain an 'expected_output' for labeled demonstrations." + ) + + return demo_sets async def a_bootstrap( self, prompt: Prompt, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - a_generate_fn: Callable, - ) -> List[DemoSet]: - """ - Bootstrap demonstration sets asynchronously (concurrently). - """ - labeled_demos: List[Demo] = [] - + goldens: Union[List[Golden], List[ConversationalGolden]], + ) -> List[DemonstrationSet]: + """Asynchronously builds DemonstrationSets utilizing the Scorer to verify metric success.""" + labeled_demonstrations: List[Demonstration] = [] shuffled_indices = list(range(len(goldens))) self.random_state.shuffle(shuffled_indices) - max_attempts = min(len(goldens), self.max_bootstrapped_demos * 3) + max_attempts = min( + len(goldens), self.max_bootstrapped_demonstrations * 3 + ) selected_indices = shuffled_indices[:max_attempts] - # First pass: collect labeled demos (no async needed) and prepare bootstrap tasks - tasks_info: List[Tuple[int, str, Optional[str]]] = ( - [] - ) # (idx, input_text, expected) + tasks_info: List[Tuple[int, str, Optional[str]]] = [] + prompt_dict = {"__module__": prompt} for idx in selected_indices: golden = goldens[idx] input_text = self._extract_input(golden) expected = self._extract_expected_output(golden) - if not input_text: - continue - - # Collect labeled demos if ( expected - and len(labeled_demos) - < self.max_labeled_demos * self.num_demo_sets + and len(labeled_demonstrations) + < self.max_labeled_demonstrations * self.num_demonstration_sets ): - labeled_demos.append( - Demo( + labeled_demonstrations.append( + Demonstration( input_text=input_text, output_text=expected, golden_index=idx, ) ) - # Queue for bootstrapping tasks_info.append((idx, input_text, expected)) - # Limit how many we need to bootstrap - max_bootstrapped = self.max_bootstrapped_demos * self.num_demo_sets + max_bootstrapped = ( + self.max_bootstrapped_demonstrations * self.num_demonstration_sets + ) tasks_info = tasks_info[:max_bootstrapped] - # Run all bootstrap generations concurrently - async def generate_one( - idx: int, - input_text: str, - expected: Optional[str], - ) -> Optional[Demo]: + async def evaluate_one( + idx: int, input_text: str, expected: Optional[str] + ) -> Optional[Demonstration]: golden = goldens[idx] try: - output = await a_generate_fn(prompt, golden) - if self._is_successful(output, expected): - return Demo( + # 1. Generate actual output + actual_output = await self.scorer.a_generate( + prompt_dict, golden + ) + + # 2. 
Build the test case safely + test_case = self.scorer._golden_to_test_case( + golden, actual_output + ) + + # 3. Evaluate against all metrics + metrics = copy_metrics(self.scorer.metrics) + is_successful = True + for metric in metrics: + await _a_measure_no_indicator(metric, test_case) + if not metric.is_successful(): + is_successful = False + break + + # 4. Save if all metrics passed + if is_successful: + return Demonstration( input_text=input_text, - output_text=output, + output_text=actual_output, golden_index=idx, ) except Exception: @@ -321,51 +289,53 @@ async def generate_one( return None results = await asyncio.gather( - *[generate_one(idx, inp, exp) for idx, inp, exp in tasks_info] + *[evaluate_one(idx, inp, exp) for idx, inp, exp in tasks_info] ) + all_demonstrations = [demo for demo in results if demo is not None] - # Collect successful demos - all_demos = [demo for demo in results if demo is not None] + demo_sets = self._create_demonstration_sets( + all_demonstrations, labeled_demonstrations + ) - return self._create_demo_sets(all_demos, labeled_demos) + if not demo_sets or all(not ds.demonstrations for ds in demo_sets): + raise DeepEvalError( + "Bootstrapper failed to generate any demonstrations. " + "Please ensure your goldens contain an 'expected_output' for labeled demonstrations." + ) - def _create_demo_sets( + return demo_sets + + def _create_demonstration_sets( self, - bootstrapped_demos: List[Demo], - labeled_demos: List[Demo], - ) -> List[DemoSet]: - """ - Create multiple demo sets from bootstrapped and labeled demos. - - Each set contains a mix of bootstrapped and labeled demos, - selected randomly for diversity. - """ - demo_sets: List[DemoSet] = [] - - # Always include an empty demo set (0-shot option) - demo_sets.append(DemoSet(demos=[], id="0-shot")) - - # Create varied demo sets - for i in range(self.num_demo_sets): - demos: List[Demo] = [] - - # Sample from bootstrapped demos - if bootstrapped_demos: + bootstrapped_demonstrations: List[Demonstration], + labeled_demonstrations: List[Demonstration], + ) -> List[DemonstrationSet]: + + demo_sets: List[DemonstrationSet] = [ + DemonstrationSet(demonstrations=[], id="0-shot") + ] + + for _ in range(self.num_demonstration_sets): + demos: List[Demonstration] = [] + + if bootstrapped_demonstrations: n_boot = min( - self.max_bootstrapped_demos, len(bootstrapped_demos) + self.max_bootstrapped_demonstrations, + len(bootstrapped_demonstrations), ) - boot_sample = self.random_state.sample( - bootstrapped_demos, n_boot + demos.extend( + self.random_state.sample( + bootstrapped_demonstrations, n_boot + ) ) - demos.extend(boot_sample) - # Sample from labeled demos - if labeled_demos: - n_labeled = min(self.max_labeled_demos, len(labeled_demos)) + if labeled_demonstrations: + n_labeled = min( + self.max_labeled_demonstrations, len(labeled_demonstrations) + ) labeled_sample = self.random_state.sample( - labeled_demos, n_labeled + labeled_demonstrations, n_labeled ) - # Avoid duplicates existing_indices = {d.golden_index for d in demos} for demo in labeled_sample: if demo.golden_index not in existing_indices: @@ -374,62 +344,42 @@ def _create_demo_sets( if demos: self.random_state.shuffle(demos) - demo_sets.append(DemoSet(demos=demos)) + demo_sets.append(DemonstrationSet(demonstrations=demos)) return demo_sets -def render_prompt_with_demos( +def render_prompt_with_demonstrations( prompt: Prompt, - demo_set: Optional[DemoSet], - max_demos: int = 8, + demonstration_set: Optional[DemonstrationSet], + max_demonstrations: int = 8, ) -> 
Prompt: - """ - Create a new Prompt that includes demonstrations. - - This prepends the demo text to the prompt's content. - - Args: - prompt: The base prompt - demo_set: The demonstration set to include - max_demos: Maximum number of demos to include - - Returns: - A new Prompt with demos included - """ from deepeval.prompt.api import PromptType, PromptMessage - if not demo_set or not demo_set.demos: + if not demonstration_set or not demonstration_set.demonstrations: return prompt - demo_text = demo_set.to_text(max_demos=max_demos) + demo_text = demonstration_set.to_text(max_demonstrations=max_demonstrations) if prompt.type == PromptType.LIST: - # For LIST prompts, prepend demos to the system message or first message new_messages = [] demo_added = False - for msg in prompt.messages_template: if not demo_added and msg.role == "system": - # Add demos to system message - new_content = f"{msg.content}\n\n{demo_text}" new_messages.append( - PromptMessage(role=msg.role, content=new_content) + PromptMessage( + role=msg.role, content=f"{msg.content}\n\n{demo_text}" + ) ) demo_added = True else: new_messages.append(msg) if not demo_added and new_messages: - # No system message, add demos to first message first = new_messages[0] - new_content = f"{demo_text}\n\n{first.content}" new_messages[0] = PromptMessage( - role=first.role, content=new_content + role=first.role, content=f"{demo_text}\n\n{first.content}" ) - return Prompt(messages_template=new_messages) else: - # For TEXT prompts, prepend demos - new_text = f"{demo_text}\n\n{prompt.text_template}" - return Prompt(text_template=new_text) + return Prompt(text_template=f"{demo_text}\n\n{prompt.text_template}") diff --git a/deepeval/optimizer/algorithms/miprov2/miprov2.py b/deepeval/optimizer/algorithms/miprov2/miprov2.py index 3e13544d4f..a81c1feb64 100644 --- a/deepeval/optimizer/algorithms/miprov2/miprov2.py +++ b/deepeval/optimizer/algorithms/miprov2/miprov2.py @@ -1,42 +1,20 @@ # MIPROv2 - Multiprompt Instruction PRoposal Optimizer Version 2 # -# This implementation follows the original MIPROv2 paper and DSPy implementation: +# This implementation follows the original MIPROv2 paper: # https://arxiv.org/pdf/2406.11695 -# https://dspy.ai/api/optimizers/MIPROv2/ # -# The algorithm works in two phases: -# -# 1. PROPOSAL PHASE: -# a) Generate N diverse instruction candidates upfront -# b) Bootstrap few-shot demonstration sets from training data -# -# 2. OPTIMIZATION PHASE: Use Bayesian Optimization (Optuna TPE) to search -# over the joint space of (instruction_candidate, demo_set). Each trial: -# - Samples an instruction candidate index -# - Samples a demo set index -# - Renders the prompt with demos -# - Evaluates on a minibatch of examples -# - Uses the score to guide the Bayesian surrogate model -# -# Periodic full evaluation is performed every `minibatch_full_eval_steps` -# to get accurate scores on the complete validation set. - +# Phase 1: Propose N diverse instructions and bootstrap M demo sets. +# Phase 2: Use Bayesian Optimization (Optuna TPE) to search the joint +# categorical space of (Instruction, Demonstration Set) using +# stochastic minibatch evaluation and periodic full evaluations. 
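[Reviewer note, not part of the patch: the header above describes the Phase 2 search. As a reading aid, here is a minimal sketch of the ask/tell loop with Optuna's TPE sampler over the joint categorical space, matching the calls used in `execute()` below. `INSTRUCTIONS`, `DEMO_SETS`, and `toy_score` are hypothetical stand-ins for the proposer output, the bootstrapped demonstration sets, and the minibatch scorer.]

```python
import optuna
from optuna.samplers import TPESampler

INSTRUCTIONS = ["Be concise.", "Think step by step.", "Cite your sources."]
DEMO_SETS = ["0-shot", "2-shot", "4-shot"]

def toy_score(instr_idx: int, demo_idx: int) -> float:
    # Stand-in for scorer.score_minibatch(config, minibatch).
    return 0.5 + 0.1 * instr_idx - 0.05 * abs(demo_idx - 1)

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))

for _ in range(20):
    trial = study.ask()  # ask/tell keeps the evaluation loop in our hands
    instr_idx = trial.suggest_categorical("instr_idx", list(range(len(INSTRUCTIONS))))
    demo_idx = trial.suggest_categorical("demo_idx", list(range(len(DEMO_SETS))))
    study.tell(trial, toy_score(instr_idx, demo_idx))  # TPE updates its surrogate

print(study.best_trial.params)  # e.g. {'instr_idx': 2, 'demo_idx': 1}
```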
from __future__ import annotations -import asyncio -import uuid import random import time -import logging -from typing import ( - Dict, - List, - Tuple, - TYPE_CHECKING, - Union, - Optional, - Callable, -) +from typing import Dict, List, Tuple, Union, Optional +from rich.table import Table +from rich import box +import re try: import optuna @@ -48,83 +26,32 @@ optuna = None TPESampler = None -from deepeval.models.base_model import DeepEvalBaseLLM from deepeval.errors import DeepEvalError -from deepeval.optimizer.utils import Aggregator, mean_of_all +from deepeval.prompt.prompt import Prompt from deepeval.optimizer.types import ( + AcceptedIterationDict, PromptConfiguration, - PromptConfigurationId, ModuleId, ScoreTable, OptimizationReport, RunnerStatusType, - RunnerStatusCallback, ) -from deepeval.optimizer.scorer.base import BaseScorer from deepeval.optimizer.algorithms.base import BaseAlgorithm from deepeval.optimizer.utils import build_prompt_config_snapshots -from deepeval.prompt.prompt import Prompt -from deepeval.optimizer.algorithms.miprov2.proposer import InstructionProposer -from deepeval.optimizer.algorithms.miprov2.bootstrapper import ( - DemoBootstrapper, - DemoSet, - render_prompt_with_demos, +from deepeval.optimizer.algorithms.miprov2.proposer.proposer import ( + InstructionProposer, ) -from deepeval.optimizer.algorithms.configs import ( - MIPROV2_DEFAULT_NUM_CANDIDATES, - MIPROV2_DEFAULT_NUM_TRIALS, - MIPROV2_DEFAULT_MINIBATCH_SIZE, - MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS, - MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS, - MIPROV2_DEFAULT_MAX_LABELED_DEMOS, - MIPROV2_DEFAULT_NUM_DEMO_SETS, +from deepeval.optimizer.algorithms.miprov2.bootstrapper import ( + DemonstrationBootstrapper, + render_prompt_with_demonstrations, ) - -if TYPE_CHECKING: - from deepeval.dataset.golden import Golden, ConversationalGolden - - -# Suppress Optuna's verbose logging -logging.getLogger("optuna").setLevel(logging.WARNING) +from deepeval.dataset.golden import Golden, ConversationalGolden class MIPROV2(BaseAlgorithm): """ - MIPROv2 (Multiprompt Instruction PRoposal Optimizer Version 2) - - A prompt optimizer that uses Bayesian Optimization to find the best - combination of instruction and few-shot demonstrations. Follows the - original MIPROv2 paper approach. - - The optimization process: - 1. Generate N diverse instruction candidates upfront - 2. Bootstrap M demo sets from training examples - 3. Use Optuna's TPE sampler for Bayesian Optimization over (instruction, demos) - 4. Each trial evaluates a combination on a minibatch - 5. Periodically evaluate the best combination on the full dataset - - Parameters - ---------- - num_candidates : int - Number of instruction candidates to propose. Default is 10. - num_trials : int - Number of Bayesian Optimization trials. Default is 20. - minibatch_size : int - Number of examples per minibatch evaluation. Default is 25. - minibatch_full_eval_steps : int - Evaluate best on full dataset every N trials. Default is 10. - max_bootstrapped_demos : int - Maximum bootstrapped demos per demo set. Default is 4. - max_labeled_demos : int - Maximum labeled demos (from expected_output) per set. Default is 4. - num_demo_sets : int - Number of demo sets to create. Default is 5. - random_seed : int, optional - RNG seed for reproducibility. If None, derived from time.time_ns(). - aggregate_instances : Aggregator - Function to aggregate per-instance scores. Default is mean_of_all. - scorer : BaseScorer, optional - Scorer for evaluating prompts. Set by PromptOptimizer. 
+ MIPROv2 Optimizer (Lite Version - Single Module). + Uses Bayesian optimization over generated instructions and bootstrapped demos. """ name = "MIPROv2" @@ -132,621 +59,493 @@ class MIPROV2(BaseAlgorithm): def __init__( self, - num_candidates: int = MIPROV2_DEFAULT_NUM_CANDIDATES, - num_trials: int = MIPROV2_DEFAULT_NUM_TRIALS, - minibatch_size: int = MIPROV2_DEFAULT_MINIBATCH_SIZE, - minibatch_full_eval_steps: int = MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS, - max_bootstrapped_demos: int = MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS, - max_labeled_demos: int = MIPROV2_DEFAULT_MAX_LABELED_DEMOS, - num_demo_sets: int = MIPROV2_DEFAULT_NUM_DEMO_SETS, - random_seed: Optional[int] = None, - aggregate_instances: Aggregator = mean_of_all, - scorer: Optional[BaseScorer] = None, - ) -> None: + num_trials: int = 30, + num_candidates: int = 10, + max_bootstrapped_demonstrations: int = 4, + max_labeled_demonstrations: int = 4, + num_demonstration_sets: int = 5, + minibatch_size: int = 25, + minibatch_full_eval_steps: int = 10, + random_state: Optional[Union[int, random.Random]] = None, + ): + super().__init__() if not OPTUNA_AVAILABLE: raise DeepEvalError( - "MIPROv2 requires the 'optuna' package for Bayesian Optimization. " - "Install it with: pip install optuna" + "MIPROv2 requires optuna. Please run `pip install optuna`." ) - # Validate parameters - if num_candidates < 1: - raise ValueError("num_candidates must be >= 1") - if num_trials < 1: - raise ValueError("num_trials must be >= 1") - if minibatch_size < 1: - raise ValueError("minibatch_size must be >= 1") - if minibatch_full_eval_steps < 1: - raise ValueError("minibatch_full_eval_steps must be >= 1") - if max_bootstrapped_demos < 0: - raise ValueError("max_bootstrapped_demos must be >= 0") - if max_labeled_demos < 0: - raise ValueError("max_labeled_demos must be >= 0") - if num_demo_sets < 1: - raise ValueError("num_demo_sets must be >= 1") - - self.num_candidates = num_candidates self.num_trials = num_trials + self.num_candidates = num_candidates + self.max_bootstrapped_demonstrations = max_bootstrapped_demonstrations + self.max_labeled_demonstrations = max_labeled_demonstrations + self.num_demonstration_sets = num_demonstration_sets self.minibatch_size = minibatch_size self.minibatch_full_eval_steps = minibatch_full_eval_steps - self.max_bootstrapped_demos = max_bootstrapped_demos - self.max_labeled_demos = max_labeled_demos - self.num_demo_sets = num_demo_sets - self.aggregate_instances = aggregate_instances - self.scorer = scorer - # Random seed handling - if random_seed is None: - random_seed = time.time_ns() % (2**31) - self.random_seed = random_seed - self.random_state = random.Random(random_seed) - - # Runtime state - self.reset_state() + # Internal State Tracking + self.pareto_score_table: ScoreTable = {} + self.parents_by_id: Dict[str, str] = {} + self._config_cache: Dict[Tuple[int, int], PromptConfiguration] = {} + self.prompt_configurations_by_id: Dict[str, PromptConfiguration] = {} + + self.candidates: List[Prompt] = [] + self.demo_sets = [] + + if isinstance(random_state, int): + self.seed = random_state + self.random_state = random.Random(random_state) + else: + self.seed = random.randint(0, 999999) + self.random_state = random_state or random.Random(self.seed) + + def _init_components(self) -> None: + """Initialize the Proposer and Bootstrapper using the injected models.""" + self.proposer = InstructionProposer( + optimizer_model=self.optimizer_model, random_state=self.random_state + ) + self.bootstrapper = 
DemonstrationBootstrapper( + scorer=self.scorer, + max_bootstrapped_demonstrations=self.max_bootstrapped_demonstrations, + max_labeled_demonstrations=self.max_labeled_demonstrations, + num_demonstration_sets=self.num_demonstration_sets, + random_state=self.random_state, + ) - # Callbacks and models (set by PromptOptimizer) - self.status_callback: Optional[RunnerStatusCallback] = None - self.optimizer_model: Optional["DeepEvalBaseLLM"] = None + def _sample_minibatch(self, goldens: List) -> List: + """Sample a stochastic minibatch for Optuna evaluation.""" + if len(goldens) <= self.minibatch_size: + return goldens + return self.random_state.sample(goldens, self.minibatch_size) - # Lazy-loaded components - self._proposer: Optional[InstructionProposer] = None - self._bootstrapper: Optional[DemoBootstrapper] = None + def _build_config( + self, instr_idx: int, demo_idx: int + ) -> PromptConfiguration: + """Stitch an instruction and demo set into a unified prompt configuration, using a cache to prevent ID leaks.""" + cache_key = (instr_idx, demo_idx) + if hasattr(self, "_config_cache") and cache_key in self._config_cache: + return self._config_cache[cache_key] - ############## - # Public API # - ############## + base_prompt = self.candidates[instr_idx] + demo_set = self.demo_sets[demo_idx] - def execute( - self, - prompt: Prompt, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - ) -> Tuple[Prompt, OptimizationReport]: - """ - Synchronous MIPROv2 optimization. - - Phase 1: Propose instruction candidates + Bootstrap demo sets - Phase 2: Use Bayesian Optimization to find the best combination - """ - self._validate_inputs(goldens) - self._ensure_scorer() - self._ensure_proposer() - self._ensure_bootstrapper() - self.reset_state() - - # Phase 1a: Propose instruction candidates - self._update_status("Phase 1: Proposing instruction candidates...", 0) - instruction_candidates = self._proposer.propose( - prompt=prompt, - goldens=goldens, - num_candidates=self.num_candidates, + unified_prompt = render_prompt_with_demonstrations( + base_prompt, demo_set ) - self._register_instruction_candidates(instruction_candidates) - # Phase 1b: Bootstrap demo sets - self._update_status( - "Phase 1: Bootstrapping few-shot demonstrations...", 0 + config = PromptConfiguration.new( + prompts={self.SINGLE_MODULE_ID: unified_prompt} ) - self._demo_sets = self._bootstrapper.bootstrap( - prompt=prompt, - goldens=goldens, - generate_fn=self._create_generate_fn(), - ) - self._update_status(f"Bootstrapped {len(self._demo_sets)} demo sets", 0) + self.prompt_configurations_by_id[config.id] = config - # Phase 2: Bayesian Optimization over (instruction, demos) - self._update_status("Phase 2: Starting Bayesian Optimization...", 0) - best_instr_idx, best_demo_idx = self._run_bayesian_optimization(goldens) + if hasattr(self, "_config_cache"): + self._config_cache[cache_key] = config - # Final full evaluation if not already done - config_key = (best_instr_idx, best_demo_idx) - if config_key not in self._full_eval_cache: - best_config = self._get_config_by_index(best_instr_idx) - best_demo_set = self._demo_sets[best_demo_idx] - self._full_evaluate(best_config, best_demo_set, goldens) + return config - # Build report - best = self._best_by_aggregate() - return self._build_result(best) + def _update_step(self, message: str) -> None: + """Updates the bottom text row (e.g., '⤷ Bootstrapping...')""" + if getattr(self, "step_callback", None) is not None: + self.step_callback(message) - async def a_execute( + def 
_update_trial_progress(self, step: int, total: int) -> None: + """Advances the main top progress bar.""" + if getattr(self, "status_callback", None) is not None: + self.status_callback( + RunnerStatusType.PROGRESS, + detail="", + step_index=step, + total_steps=total, + ) + + ################################################## + # Synchronous Execution + ################################################## + + def execute( self, prompt: Prompt, - goldens: Union[List["Golden"], List["ConversationalGolden"]], + goldens: Union[List[Golden], List[ConversationalGolden]], ) -> Tuple[Prompt, OptimizationReport]: - """ - Asynchronous MIPROv2 optimization. - """ - self._validate_inputs(goldens) - self._ensure_scorer() - self._ensure_proposer() - self._ensure_bootstrapper() - self.reset_state() - - # Phase 1: Run proposal and bootstrapping concurrently - self._update_status( - "Phase 1: Proposing candidates & bootstrapping demos...", 0 - ) + import uuid - instruction_candidates, demo_sets = await asyncio.gather( - self._proposer.a_propose( - prompt=prompt, - goldens=goldens, - num_candidates=self.num_candidates, - ), - self._bootstrapper.a_bootstrap( - prompt=prompt, - goldens=goldens, - a_generate_fn=self._create_async_generate_fn(), - ), + self._init_components() + self._iteration_log = [] + + # Phase 1: Propose & Bootstrap + self._update_step( + f"Generating {self.num_candidates} diverse instructions..." + ) + self.candidates = self.proposer.propose( + prompt, goldens, self.num_candidates ) - self._register_instruction_candidates(instruction_candidates) - self._demo_sets = demo_sets - self._update_status( - f"Generated {len(instruction_candidates)} candidates, {len(self._demo_sets)} demo sets", - 0, + self._update_step( + f"Bootstrapping {self.num_demonstration_sets} verified demonstration sets..." ) + self.demo_sets = self.bootstrapper.bootstrap(prompt, goldens) # Phase 2: Bayesian Optimization - self._update_status("Phase 2: Starting Bayesian Optimization...", 0) - best_instr_idx, best_demo_idx = await self._a_run_bayesian_optimization( - goldens + self._update_step( + "Initializing Tree-structured Parzen Estimator (TPE)..." 
+ ) + optuna.logging.set_verbosity(optuna.logging.WARNING) + study = optuna.create_study( + direction="maximize", sampler=TPESampler(seed=self.seed) ) - # Final full evaluation if not already done - config_key = (best_instr_idx, best_demo_idx) - if config_key not in self._full_eval_cache: - best_config = self._get_config_by_index(best_instr_idx) - best_demo_set = self._demo_sets[best_demo_idx] - await self._a_full_evaluate(best_config, best_demo_set, goldens) - - # Build report - best = self._best_by_aggregate() - return self._build_result(best) - - ################### - # State & Helpers # - ################### - - def reset_state(self) -> None: - """Reset optimization state for a new run.""" - self.optimization_id = str(uuid.uuid4()) - self.prompt_configurations_by_id: Dict[ - PromptConfigurationId, PromptConfiguration - ] = {} - self.parents_by_id: Dict[ - PromptConfigurationId, Optional[PromptConfigurationId] - ] = {} - self.pareto_score_table: ScoreTable = {} - - # Candidate tracking - self._instruction_candidates: List[PromptConfiguration] = [] - self._demo_sets: List[DemoSet] = [] - - # Score tracking: (instr_idx, demo_idx) -> list of minibatch scores - self._combination_scores: Dict[Tuple[int, int], List[float]] = {} + best_score = float("-inf") + best_config_id = None + accepted_iterations: List[AcceptedIterationDict] = [] + + for trial_idx in range(self.num_trials): + trial_start = time.time() + self._update_trial_progress(trial_idx + 1, self.num_trials) + self._update_step( + f"Running Bayesian Trial {trial_idx + 1}/{self.num_trials}..." + ) - # Full eval cache: (instr_idx, demo_idx) -> config_id - self._full_eval_cache: Dict[Tuple[int, int], PromptConfigurationId] = {} + trial = study.ask() + instr_idx = trial.suggest_categorical( + "instr_idx", list(range(len(self.candidates))) + ) + demo_idx = trial.suggest_categorical( + "demo_idx", list(range(len(self.demo_sets))) + ) - # Trial tracking - self._trial_history: List[Dict] = [] - self._best_trial_key: Tuple[int, int] = (0, 0) - self._best_trial_score: float = float("-inf") + config = self._build_config(instr_idx, demo_idx) + minibatch = self._sample_minibatch(goldens) - def _validate_inputs( - self, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - ) -> None: - """Validate input parameters.""" - if len(goldens) < 1: - raise DeepEvalError( - "MIPROv2 prompt optimization requires at least 1 golden, but " - f"received {len(goldens)}. Provide at least one golden to run " - "the optimizer." - ) + score = self.scorer.score_minibatch(config, minibatch) + study.tell(trial, score) - def _ensure_scorer(self) -> None: - """Ensure scorer is configured.""" - if self.scorer is None: - raise DeepEvalError( - "MIPROv2 requires a `scorer`. " - "Construct one in PromptOptimizer and assign it to `runner.scorer`." + self._iteration_log.append( + { + "iteration": trial_idx + 1, + "outcome": "accepted" if score > best_score else "rejected", + "before": ( + best_score if best_score != float("-inf") else 0.0 + ), + "after": score, + "reason": f"TPE Sample -> Instruction: {instr_idx}, DemoSet: {demo_idx}", + "elapsed": time.time() - trial_start, + } ) - def _ensure_proposer(self) -> None: - """Lazily initialize the instruction proposer.""" - if self._proposer is None: - if self.optimizer_model is None: - raise DeepEvalError( - "MIPROv2 requires an `optimizer_model` for instruction proposal. " - "Set it via PromptOptimizer." 
+ # Periodic Full Pareto Evaluation + if ( + (trial_idx + 1) % self.minibatch_full_eval_steps == 0 + or trial_idx == self.num_trials - 1 + ): + self._update_step( + f"Running full validation on current best configuration..." + ) + best_trial = study.best_trial + best_eval_config = self._build_config( + best_trial.params["instr_idx"], + best_trial.params["demo_idx"], ) - self._proposer = InstructionProposer( - optimizer_model=self.optimizer_model, - random_state=self.random_state, - ) - def _ensure_bootstrapper(self) -> None: - """Lazily initialize the demo bootstrapper.""" - if self._bootstrapper is None: - self._bootstrapper = DemoBootstrapper( - max_bootstrapped_demos=self.max_bootstrapped_demos, - max_labeled_demos=self.max_labeled_demos, - num_demo_sets=self.num_demo_sets, - random_state=self.random_state, - ) + full_scores = self.scorer.score_pareto( + best_eval_config, goldens + ) + avg_full_score = sum(full_scores) / len(full_scores) + + self.pareto_score_table[best_eval_config.id] = full_scores + + if avg_full_score > best_score: + if best_config_id is not None: + accepted_iterations.append( + AcceptedIterationDict( + parent=best_config_id, + child=best_eval_config.id, + module=self.SINGLE_MODULE_ID, + before=best_score, + after=avg_full_score, + ) + ) + best_score = avg_full_score + best_config_id = best_eval_config.id + + true_best_id = None + true_best_score = float("-inf") + for cid, scores in self.pareto_score_table.items(): + avg_score = sum(scores) / len(scores) if scores else 0.0 + if avg_score > true_best_score: + true_best_score = avg_score + true_best_id = cid + + final_id = true_best_id if true_best_id else best_config_id + best_config = self.prompt_configurations_by_id[final_id] - def _create_generate_fn( - self, - ) -> Callable[[Prompt, Union["Golden", "ConversationalGolden"]], str]: - """Create a sync generate function for bootstrapping.""" - - def generate_fn( - prompt: Prompt, - golden: Union["Golden", "ConversationalGolden"], - ) -> str: - # Create a temporary config for generation - temp_config = PromptConfiguration.new( - prompts={self.SINGLE_MODULE_ID: prompt} - ) - return self.scorer.generate(temp_config.prompts, golden) + report = OptimizationReport( + optimization_id=getattr(self, "optimization_id", str(uuid.uuid4())), + best_id=best_config.id, + accepted_iterations=accepted_iterations, + pareto_scores=self.pareto_score_table, + parents=self.parents_by_id, + prompt_configurations=build_prompt_config_snapshots( + self.prompt_configurations_by_id + ), + ) - return generate_fn + return best_config.prompts[self.SINGLE_MODULE_ID], report - def _create_async_generate_fn(self) -> Callable: - """Create an async generate function for bootstrapping.""" + ################################################## + # Asynchronous Execution + ################################################## - async def a_generate_fn( - prompt: Prompt, - golden: Union["Golden", "ConversationalGolden"], - ) -> str: - temp_config = PromptConfiguration.new( - prompts={self.SINGLE_MODULE_ID: prompt} - ) - return await self.scorer.a_generate(temp_config.prompts, golden) - - return a_generate_fn - - def _register_instruction_candidates( - self, candidates: List[Prompt] - ) -> None: - """Register all instruction candidates as configurations.""" - for i, prompt in enumerate(candidates): - config = PromptConfiguration.new( - prompts={self.SINGLE_MODULE_ID: prompt}, - parent=None if i == 0 else self._instruction_candidates[0].id, - ) - self._instruction_candidates.append(config) - 
self.prompt_configurations_by_id[config.id] = config - self.parents_by_id[config.id] = config.parent + async def a_execute( + self, + prompt: Prompt, + goldens: Union[List[Golden], List[ConversationalGolden]], + ) -> Tuple[Prompt, OptimizationReport]: + import uuid - def _get_config_by_index(self, idx: int) -> PromptConfiguration: - """Get configuration by instruction candidate index.""" - return self._instruction_candidates[idx] + self._init_components() + self._iteration_log = [] - def _draw_minibatch( - self, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - ) -> Union[List["Golden"], List["ConversationalGolden"]]: - """Sample a minibatch from goldens.""" - n = len(goldens) - if n <= 0: - return [] - size = min(self.minibatch_size, n) - return [goldens[self.random_state.randrange(0, n)] for _ in range(size)] - - def _render_config_with_demos( - self, - config: PromptConfiguration, - demo_set: DemoSet, - ) -> PromptConfiguration: - """Create a new config with demos rendered into the prompt.""" - base_prompt = config.prompts[self.SINGLE_MODULE_ID] - rendered_prompt = render_prompt_with_demos( - prompt=base_prompt, - demo_set=demo_set, - max_demos=self.max_bootstrapped_demos + self.max_labeled_demos, + self._update_step( + f"Generating {self.num_candidates} diverse instructions..." ) - - # Create a new config with the rendered prompt - rendered_config = PromptConfiguration.new( - prompts={self.SINGLE_MODULE_ID: rendered_prompt}, - parent=config.id, + self.candidates = await self.proposer.a_propose( + prompt, goldens, self.num_candidates ) - return rendered_config - ############################ - # Bayesian Optimization # - ############################ + self._update_step( + f"Bootstrapping {self.num_demonstration_sets} verified demonstration sets..." + ) + self.demo_sets = await self.bootstrapper.a_bootstrap(prompt, goldens) - def _run_bayesian_optimization( - self, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - ) -> Tuple[int, int]: - """ - Run Bayesian Optimization using Optuna's TPE sampler. - Returns the (instruction_idx, demo_set_idx) of the best combination. - """ - num_instructions = len(self._instruction_candidates) - num_demo_sets = len(self._demo_sets) - - # Create Optuna study with TPE sampler - sampler = TPESampler(seed=self.random_seed) + self._update_step( + "Initializing Tree-structured Parzen Estimator (TPE)..." + ) + optuna.logging.set_verbosity(optuna.logging.WARNING) study = optuna.create_study( - direction="maximize", - sampler=sampler, + direction="maximize", sampler=TPESampler(seed=self.seed) ) - def objective(trial: "optuna.Trial") -> float: - # Sample instruction and demo set indices - instr_idx = trial.suggest_int("instr_idx", 0, num_instructions - 1) - demo_idx = trial.suggest_int("demo_idx", 0, num_demo_sets - 1) - - # Get the configuration and demo set - config = self._get_config_by_index(instr_idx) - demo_set = self._demo_sets[demo_idx] - - # Render prompt with demos - rendered_config = self._render_config_with_demos(config, demo_set) + best_score = float("-inf") + best_config_id = None + accepted_iterations: List[AcceptedIterationDict] = [] + + for trial_idx in range(self.num_trials): + trial_start = time.time() + self._update_trial_progress(trial_idx + 1, self.num_trials) + self._update_step( + f"Running Bayesian Trial {trial_idx + 1}/{self.num_trials}..." 
+ ) - # Draw minibatch and score - minibatch = self._draw_minibatch(goldens) - score = self.scorer.score_minibatch(rendered_config, minibatch) + trial = study.ask() + instr_idx = trial.suggest_categorical( + "instr_idx", list(range(len(self.candidates))) + ) + demo_idx = trial.suggest_categorical( + "demo_idx", list(range(len(self.demo_sets))) + ) - # Track scores for this combination - combo_key = (instr_idx, demo_idx) - if combo_key not in self._combination_scores: - self._combination_scores[combo_key] = [] - self._combination_scores[combo_key].append(score) + config = self._build_config(instr_idx, demo_idx) + minibatch = self._sample_minibatch(goldens) - # Update best tracking - if score > self._best_trial_score: - self._best_trial_score = score - self._best_trial_key = combo_key + score = await self.scorer.a_score_minibatch(config, minibatch) + study.tell(trial, score) - # Record trial - trial_num = len(self._trial_history) + 1 - self._trial_history.append( + self._iteration_log.append( { - "trial": trial_num, - "instr_idx": instr_idx, - "demo_idx": demo_idx, - "score": score, + "iteration": trial_idx + 1, + "outcome": "accepted" if score > best_score else "rejected", + "before": ( + best_score if best_score != float("-inf") else 0.0 + ), + "after": score, + "reason": f"TPE Sample -> Instruction: {instr_idx}, DemoSet: {demo_idx}", + "elapsed": time.time() - trial_start, } ) - # Progress update - demo_info = ( - f"{len(demo_set.demos)} demos" if demo_set.demos else "0-shot" - ) - self._update_status( - f"Trial {trial_num}/{self.num_trials} - " - f"Instr {instr_idx}, {demo_info} - Score: {score:.4f}", - trial_num, - ) + if ( + (trial_idx + 1) % self.minibatch_full_eval_steps == 0 + or trial_idx == self.num_trials - 1 + ): + self._update_step( + f"Running full validation on current best configuration..." 
+ ) + best_trial = study.best_trial + best_eval_config = self._build_config( + best_trial.params["instr_idx"], + best_trial.params["demo_idx"], + ) - # Periodic full evaluation - if trial_num % self.minibatch_full_eval_steps == 0: - best_instr, best_demo = self._best_trial_key - if (best_instr, best_demo) not in self._full_eval_cache: - best_config = self._get_config_by_index(best_instr) - best_demo_set = self._demo_sets[best_demo] - self._full_evaluate(best_config, best_demo_set, goldens) - - return score - - # Run optimization - study.optimize( - objective, - n_trials=self.num_trials, - show_progress_bar=False, - ) + full_scores = await self.scorer.a_score_pareto( + best_eval_config, goldens + ) + avg_full_score = sum(full_scores) / len(full_scores) + + self.pareto_score_table[best_eval_config.id] = full_scores + + if avg_full_score > best_score: + if best_config_id is not None: + accepted_iterations.append( + AcceptedIterationDict( + parent=best_config_id, + child=best_eval_config.id, + module=self.SINGLE_MODULE_ID, + before=best_score, + after=avg_full_score, + ) + ) + best_score = avg_full_score + best_config_id = best_eval_config.id + + true_best_id = None + true_best_score = float("-inf") + for cid, scores in self.pareto_score_table.items(): + avg_score = sum(scores) / len(scores) if scores else 0.0 + if avg_score > true_best_score: + true_best_score = avg_score + true_best_id = cid + + final_id = true_best_id if true_best_id else best_config_id + best_config = self.prompt_configurations_by_id[final_id] - # Return the best combination - return ( - study.best_params["instr_idx"], - study.best_params["demo_idx"], + report = OptimizationReport( + optimization_id=getattr(self, "optimization_id", str(uuid.uuid4())), + best_id=best_config.id, + accepted_iterations=accepted_iterations, + pareto_scores=self.pareto_score_table, + parents=self.parents_by_id, + prompt_configurations=build_prompt_config_snapshots( + self.prompt_configurations_by_id + ), ) - async def _a_run_bayesian_optimization( - self, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - ) -> Tuple[int, int]: - """ - Async version of Bayesian Optimization. - """ - num_instructions = len(self._instruction_candidates) - num_demo_sets = len(self._demo_sets) - - sampler = TPESampler(seed=self.random_seed) - study = optuna.create_study( - direction="maximize", - sampler=sampler, - ) + return best_config.prompts[self.SINGLE_MODULE_ID], report - for trial_num in range(1, self.num_trials + 1): - trial = study.ask() + def generate_summary_table(self, report: OptimizationReport) -> List[Table]: + """Generates MIPROv2-specific Bayesian Search logs and Validation tables.""" + from rich.table import Table + from rich import box - # Sample indices - instr_idx = trial.suggest_int("instr_idx", 0, num_instructions - 1) - demo_idx = trial.suggest_int("demo_idx", 0, num_demo_sets - 1) + _PURPLE = "rgb(106,0,255)" + _GREEN = "rgb(25,227,160)" + _DIM = "rgb(55,65,81)" - # Get config and demos - config = self._get_config_by_index(instr_idx) - demo_set = self._demo_sets[demo_idx] - rendered_config = self._render_config_with_demos(config, demo_set) + tables = [] + iteration_log = getattr(self, "_iteration_log", []) - # Score on minibatch - minibatch = self._draw_minibatch(goldens) - score = await self.scorer.a_score_minibatch( - rendered_config, minibatch + # 1. 
Bayesian TPE Trial Table + iter_table = Table( + title=f"🔬 [{_PURPLE}]{self.name}[/] Bayesian Search (Stochastic Minibatches)", + box=box.ROUNDED, + border_style=_PURPLE, + header_style=f"bold {_PURPLE}", + show_lines=True, + expand=True, + ) + iter_table.add_column( + "#", style="bold white", justify="right", no_wrap=True + ) + iter_table.add_column("Status", justify="center", no_wrap=True) + iter_table.add_column("Best Prior", justify="right", no_wrap=True) + iter_table.add_column("Trial Score", justify="right", no_wrap=True) + iter_table.add_column("Δ to Best", justify="right", no_wrap=True) + iter_table.add_column("Note", style=f"{_DIM}", no_wrap=False) + iter_table.add_column("Time", justify="right", no_wrap=True) + + running_max = float("-inf") + + for entry in iteration_log: + i = str(entry["iteration"]) + score = entry.get("after", 0.0) + reason = entry.get("reason", "") + elapsed = entry.get("elapsed", 0.0) + + # Define the "Before" state as the highest score seen up to this point + best_prior = running_max if running_max != float("-inf") else 0.0 + delta = score - best_prior + + # If it's a new high score, update the running max and mark it + if score > running_max: + status_cell = f"[{_GREEN}]🏆 New Best[/]" + color = "white" + sign = "+" if delta >= 0 else "" + running_max = score + else: + status_cell = f"[{_DIM}]📊 Sampled[/]" + color = _DIM + sign = "+" if delta >= 0 else "" + + best_prior_cell = f"{best_prior:.4f}" + score_cell = ( + f"[bold {color}]{score:.4f}[/]" + if score >= running_max + else f"[{color}]{score:.4f}[/]" ) - - # Track scores - combo_key = (instr_idx, demo_idx) - if combo_key not in self._combination_scores: - self._combination_scores[combo_key] = [] - self._combination_scores[combo_key].append(score) - - # Update best - if score > self._best_trial_score: - self._best_trial_score = score - self._best_trial_key = combo_key - - # Record trial - self._trial_history.append( - { - "trial": trial_num, - "instr_idx": instr_idx, - "demo_idx": demo_idx, - "score": score, - } + delta_cell = f"[{color}]{sign}{delta:.4f}[/]" + time_cell = f"[{_DIM}]{elapsed:.2f}s[/]" + + iter_table.add_row( + i, + status_cell, + best_prior_cell, + score_cell, + delta_cell, + reason, + time_cell, ) - # Tell Optuna the result - study.tell(trial, score) - - # Progress update - demo_info = ( - f"{len(demo_set.demos)} demos" if demo_set.demos else "0-shot" + tables.append(iter_table) + + # 2. 
Final Pareto archive table + if report and report.pareto_scores: + pareto_table = Table( + title=f"[{_PURPLE}]True Validation Archive (Full Dataset)[/]", + box=box.HORIZONTALS, + border_style=_PURPLE, + header_style=f"bold {_PURPLE}", + show_lines=True, + expand=True, ) - self._update_status( - f"Trial {trial_num}/{self.num_trials} - " - f"Instr {instr_idx}, {demo_info} - Score: {score:.4f}", - trial_num, + pareto_table.add_column( + "Config ID", style="white", justify="center", no_wrap=True + ) + pareto_table.add_column("Role", justify="center", no_wrap=True) + pareto_table.add_column( + "Scores Array", justify="center", no_wrap=False + ) + pareto_table.add_column( + "True Avg Score", justify="right", no_wrap=True ) - # Periodic full evaluation - if trial_num % self.minibatch_full_eval_steps == 0: - best_instr, best_demo = self._best_trial_key - if (best_instr, best_demo) not in self._full_eval_cache: - best_config = self._get_config_by_index(best_instr) - best_demo_set = self._demo_sets[best_demo] - await self._a_full_evaluate( - best_config, best_demo_set, goldens - ) - - return ( - study.best_params["instr_idx"], - study.best_params["demo_idx"], - ) - - ############################ - # Full Evaluation # - ############################ - - def _full_evaluate( - self, - config: PromptConfiguration, - demo_set: DemoSet, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - ) -> None: - """Perform full evaluation on all goldens.""" - # Find the indices for this combination - instr_idx = self._instruction_candidates.index(config) - demo_idx = self._demo_sets.index(demo_set) - combo_key = (instr_idx, demo_idx) - - if combo_key in self._full_eval_cache: - return - - # Render with demos - rendered_config = self._render_config_with_demos(config, demo_set) - - # Register the rendered config - self.prompt_configurations_by_id[rendered_config.id] = rendered_config - self.parents_by_id[rendered_config.id] = config.id - - # Score on full set - scores = self.scorer.score_pareto(rendered_config, goldens) - self.pareto_score_table[rendered_config.id] = scores - - # Cache the result - self._full_eval_cache[combo_key] = rendered_config.id - - async def _a_full_evaluate( - self, - config: PromptConfiguration, - demo_set: DemoSet, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - ) -> None: - """Async full evaluation.""" - instr_idx = self._instruction_candidates.index(config) - demo_idx = self._demo_sets.index(demo_set) - combo_key = (instr_idx, demo_idx) - - if combo_key in self._full_eval_cache: - return - - rendered_config = self._render_config_with_demos(config, demo_set) - self.prompt_configurations_by_id[rendered_config.id] = rendered_config - self.parents_by_id[rendered_config.id] = config.id - - scores = await self.scorer.a_score_pareto(rendered_config, goldens) - self.pareto_score_table[rendered_config.id] = scores - self._full_eval_cache[combo_key] = rendered_config.id - - ############################ - # Result Building # - ############################ - - def _best_by_aggregate(self) -> PromptConfiguration: - """Return the best candidate based on full evaluation scores.""" - if not self.pareto_score_table: - # Fall back to best by trial scores - best_instr, best_demo = self._best_trial_key - config = self._get_config_by_index(best_instr) - demo_set = self._demo_sets[best_demo] - return self._render_config_with_demos(config, demo_set) - - best_id: Optional[PromptConfigurationId] = None - best_score = float("-inf") - - for config_id, scores in 
self.pareto_score_table.items(): - agg_score = self.aggregate_instances(scores) - if agg_score > best_score: - best_score = agg_score - best_id = config_id - - if best_id is None: - best_instr, best_demo = self._best_trial_key - config = self._get_config_by_index(best_instr) - demo_set = self._demo_sets[best_demo] - return self._render_config_with_demos(config, demo_set) + best_id = report.best_id - return self.prompt_configurations_by_id[best_id] + for cid, scores in report.pareto_scores.items(): + is_best = cid == best_id + role = f"[{_DIM}]candidate[/]" - def _build_result( - self, - best: PromptConfiguration, - ) -> Tuple[Prompt, OptimizationReport]: - """Build the optimization result.""" - prompt_config_snapshots = build_prompt_config_snapshots( - self.prompt_configurations_by_id - ) + short_id = cid[:8] + "…" + if is_best: + short_id = f"[bold white]{short_id} ★[/]" - report = OptimizationReport( - optimization_id=self.optimization_id, - best_id=best.id, - accepted_iterations=self._trial_history, - pareto_scores=self.pareto_score_table, - parents=self.parents_by_id, - prompt_configurations=prompt_config_snapshots, - ) + if len(scores) > 6: + score_strs = ( + [f"{s:.3f}" for s in scores[:3]] + + ["..."] + + [f"{s:.3f}" for s in scores[-3:]] + ) + else: + score_strs = [f"{s:.3f}" for s in scores] + scores_cell = f"[{_DIM}][{', '.join(score_strs)}][/]" + + agg = sum(scores) / len(scores) if scores else 0.0 + agg_color = "white" if is_best else _DIM + agg_cell = ( + f"[bold {agg_color}]{agg:.4f}[/]" + if is_best + else f"[{agg_color}]{agg:.4f}[/]" + ) - return best.prompts[self.SINGLE_MODULE_ID], report + pareto_table.add_row(short_id, role, scores_cell, agg_cell) - ############################ - # Status Updates # - ############################ + tables.append(pareto_table) - def _update_status(self, message: str, step: int) -> None: - """Send status update via callback.""" - if self.status_callback is not None: - self.status_callback( - RunnerStatusType.PROGRESS, - step_index=step, - total_steps=self.num_trials, - detail=message, - ) + return tables diff --git a/deepeval/optimizer/algorithms/miprov2/proposer.py b/deepeval/optimizer/algorithms/miprov2/proposer.py deleted file mode 100644 index 6730ab8f1e..0000000000 --- a/deepeval/optimizer/algorithms/miprov2/proposer.py +++ /dev/null @@ -1,301 +0,0 @@ -# Instruction Proposer for MIPROv2 -# -# This module generates N diverse instruction candidates upfront, -# following the original MIPROv2 paper approach. Each candidate is -# generated with different "tips" (e.g., "be creative", "be concise") -# to encourage diversity in the instruction space. 
- -from __future__ import annotations -import asyncio -import random -from typing import List, Optional, Union, TYPE_CHECKING - -from deepeval.models.base_model import DeepEvalBaseLLM -from deepeval.prompt.prompt import Prompt -from deepeval.prompt.api import PromptType - -if TYPE_CHECKING: - from deepeval.dataset.golden import Golden, ConversationalGolden - - -# Tips for encouraging diverse instruction generation (from DSPy MIPROv2) -INSTRUCTION_TIPS = [ - "Be creative and think outside the box.", - "Be concise and direct.", - "Use step-by-step reasoning.", - "Focus on clarity and precision.", - "Include specific examples where helpful.", - "Emphasize the most important aspects.", - "Consider edge cases and exceptions.", - "Use structured formatting when appropriate.", - "Be thorough but avoid unnecessary details.", - "Prioritize accuracy over creativity.", - "Make the instruction self-contained.", - "Use natural, conversational language.", - "Be explicit about expected output format.", - "Include context about common mistakes to avoid.", - "Focus on the user's intent and goals.", -] - - -class InstructionProposer: - """ - Generates N diverse instruction candidates for a given prompt. - - Following the MIPROv2 paper, this proposer: - 1. Analyzes the current prompt and task - 2. Optionally uses example inputs/outputs from goldens - 3. Applies different "tips" to encourage diversity - 4. Generates N candidate instructions - """ - - def __init__( - self, - optimizer_model: DeepEvalBaseLLM, - random_state: Optional[Union[int, random.Random]] = None, - ): - self.optimizer_model = optimizer_model - - if isinstance(random_state, int): - self.random_state = random.Random(random_state) - else: - self.random_state = random_state or random.Random() - - def _format_prompt(self, prompt: Prompt) -> str: - """Format the prompt for the proposer context.""" - if prompt.type == PromptType.LIST: - parts = [] - for msg in prompt.messages_template: - role = msg.role or "unknown" - content = msg.content or "" - parts.append(f"[{role}]: {content}") - return "\n".join(parts) - else: - return prompt.text_template or "" - - def _format_examples( - self, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - max_examples: int = 3, - ) -> str: - """Format example inputs/outputs from goldens.""" - if not goldens: - return "No examples available." - - examples = [] - sample = self.random_state.sample( - goldens, min(max_examples, len(goldens)) - ) - - for i, golden in enumerate(sample, 1): - # Handle both Golden and ConversationalGolden - if hasattr(golden, "input"): - inp = str(golden.input) - out = str(golden.expected_output or "") - examples.append( - f"Example {i}:\n Input: {inp}\n Expected: {out}" - ) - elif hasattr(golden, "messages"): - # ConversationalGolden - msgs = golden.messages[:2] if golden.messages else [] - msg_str = " | ".join(str(m) for m in msgs) - examples.append(f"Example {i}: {msg_str}") - - return "\n".join(examples) if examples else "No examples available." - - def _compose_proposer_prompt( - self, - current_prompt: Prompt, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - tip: str, - candidate_index: int, - ) -> str: - """Compose the prompt for generating an instruction candidate.""" - prompt_text = self._format_prompt(current_prompt) - examples_text = self._format_examples(goldens) - - return f"""You are an expert prompt engineer. Your task is to propose an improved instruction/prompt for an LLM task. 
- -[CURRENT PROMPT] -{prompt_text} - -[EXAMPLE INPUTS/OUTPUTS FROM THE TASK] -{examples_text} - -[GENERATION TIP] -{tip} - -[INSTRUCTIONS] -Based on the current prompt, the example task inputs/outputs, and the generation tip above, propose an improved version of the prompt. - -This is candidate #{candidate_index + 1}. Make it meaningfully different from trivial variations. -Focus on improving clarity, effectiveness, and alignment with the task requirements. - -Return ONLY the new prompt text, with no explanations or meta-commentary.""" - - def propose( - self, - prompt: Prompt, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - num_candidates: int, - ) -> List[Prompt]: - """ - Generate N instruction candidates synchronously. - - Args: - prompt: The original prompt to improve - goldens: Example inputs/outputs for context - num_candidates: Number of candidates to generate - - Returns: - List of Prompt candidates (including the original) - """ - candidates: List[Prompt] = [prompt] # Always include original - - # Select tips for diversity - tips = self._select_tips(num_candidates - 1) - - for i, tip in enumerate(tips): - proposer_prompt = self._compose_proposer_prompt( - current_prompt=prompt, - goldens=goldens, - tip=tip, - candidate_index=i, - ) - - try: - output = self.optimizer_model.generate(proposer_prompt) - new_text = self._normalize_output(output) - - if new_text and new_text.strip(): - new_prompt = self._create_prompt_from_text(prompt, new_text) - if not self._is_duplicate(new_prompt, candidates): - candidates.append(new_prompt) - except Exception: - # Skip failed generations - continue - - return candidates - - async def a_propose( - self, - prompt: Prompt, - goldens: Union[List["Golden"], List["ConversationalGolden"]], - num_candidates: int, - ) -> List[Prompt]: - """ - Generate N instruction candidates asynchronously (concurrently). 
- """ - candidates: List[Prompt] = [prompt] # Always include original - - tips = self._select_tips(num_candidates - 1) - - # Build all proposer prompts upfront - proposer_prompts = [ - self._compose_proposer_prompt( - current_prompt=prompt, - goldens=goldens, - tip=tip, - candidate_index=i, - ) - for i, tip in enumerate(tips) - ] - - # Generate all candidates concurrently - async def generate_one(proposer_prompt: str) -> Optional[str]: - try: - output = await self.optimizer_model.a_generate(proposer_prompt) - return self._normalize_output(output) - except Exception: - return None - - results = await asyncio.gather( - *[generate_one(p) for p in proposer_prompts] - ) - - # Collect successful, non-duplicate candidates - for new_text in results: - if new_text and new_text.strip(): - new_prompt = self._create_prompt_from_text(prompt, new_text) - if not self._is_duplicate(new_prompt, candidates): - candidates.append(new_prompt) - - return candidates - - def _select_tips(self, count: int) -> List[str]: - """Select diverse tips for candidate generation.""" - if count <= 0: - return [] - - if count >= len(INSTRUCTION_TIPS): - # Use all tips, possibly repeating - tips = list(INSTRUCTION_TIPS) - while len(tips) < count: - tips.append(self.random_state.choice(INSTRUCTION_TIPS)) - return tips[:count] - - return self.random_state.sample(INSTRUCTION_TIPS, count) - - def _normalize_output(self, output) -> str: - """Normalize LLM output to string.""" - if isinstance(output, str): - return output.strip() - if isinstance(output, tuple): - return str(output[0]).strip() if output else "" - if isinstance(output, list): - return str(output[0]).strip() if output else "" - return str(output).strip() - - def _create_prompt_from_text( - self, original: Prompt, new_text: str - ) -> Prompt: - """Create a new Prompt from generated text, preserving structure.""" - if original.type == PromptType.LIST: - # For LIST prompts, update the system or first assistant message - new_messages = [] - updated = False - - for msg in original.messages_template: - if not updated and msg.role in ("system", "assistant"): - new_msg = type(msg)(role=msg.role, content=new_text) - new_messages.append(new_msg) - updated = True - else: - new_messages.append(msg) - - if not updated and new_messages: - # Update the first message if no system/assistant found - first = new_messages[0] - new_messages[0] = type(first)(role=first.role, content=new_text) - - return Prompt(messages_template=new_messages) - else: - return Prompt(text_template=new_text) - - def _is_duplicate(self, new_prompt: Prompt, existing: List[Prompt]) -> bool: - """Check if a prompt is a duplicate of existing candidates.""" - new_text = self._get_prompt_text(new_prompt).strip().lower() - - for p in existing: - existing_text = self._get_prompt_text(p).strip().lower() - # Consider duplicates if >90% similar - if new_text == existing_text: - return True - # Simple similarity check - if len(new_text) > 0 and len(existing_text) > 0: - shorter = min(len(new_text), len(existing_text)) - longer = max(len(new_text), len(existing_text)) - if shorter / longer > 0.9: - # Check prefix similarity - if new_text[:shorter] == existing_text[:shorter]: - return True - return False - - def _get_prompt_text(self, prompt: Prompt) -> str: - """Extract text from a prompt for comparison.""" - if prompt.type == PromptType.LIST: - parts = [] - for msg in prompt.messages_template: - parts.append(msg.content or "") - return " ".join(parts) - return prompt.text_template or "" diff --git 
a/deepeval/optimizer/algorithms/miprov2/proposer/__init__.py b/deepeval/optimizer/algorithms/miprov2/proposer/__init__.py new file mode 100644 index 0000000000..905434b819 --- /dev/null +++ b/deepeval/optimizer/algorithms/miprov2/proposer/__init__.py @@ -0,0 +1 @@ +from .proposer import InstructionProposer diff --git a/deepeval/optimizer/algorithms/miprov2/proposer/proposer.py b/deepeval/optimizer/algorithms/miprov2/proposer/proposer.py new file mode 100644 index 0000000000..e8235aad21 --- /dev/null +++ b/deepeval/optimizer/algorithms/miprov2/proposer/proposer.py @@ -0,0 +1,312 @@ +from __future__ import annotations +import asyncio +import random +import json +import difflib +from typing import List, Optional, Union + +from deepeval.models.base_model import DeepEvalBaseLLM +from deepeval.prompt.prompt import Prompt +from deepeval.prompt.api import PromptType +from deepeval.metrics.utils import ( + initialize_model, + generate_with_schema_and_extract, + a_generate_with_schema_and_extract, +) +from deepeval.optimizer.utils import _parse_prompt, _create_prompt +from .schema import DatasetSummarySchema, InstructionProposalSchema +from .template import ProposerTemplate + +from deepeval.dataset.golden import Golden, ConversationalGolden + +INSTRUCTION_TIPS = [ + "Be creative and think outside the box.", + "Be concise and direct.", + "Use step-by-step reasoning.", + "Focus on clarity and precision.", + "Include specific examples where helpful.", + "Emphasize the most important aspects.", + "Consider edge cases and exceptions.", + "Use structured formatting when appropriate.", + "Be thorough but avoid unnecessary details.", + "Prioritize accuracy over creativity.", + "Make the instruction self-contained.", + "Use natural, conversational language.", + "Be explicit about expected output format.", + "Include context about common mistakes to avoid.", + "Focus on the user's intent and goals.", +] + + +class InstructionProposer: + """ + Generates N diverse instruction candidates for a given prompt using + Program-and-Data-Aware grounding and Bayesian tip diversity. + """ + + def __init__( + self, + optimizer_model: DeepEvalBaseLLM, + random_state: Optional[Union[int, random.Random]] = None, + ): + self.model, self.using_native_model = initialize_model(optimizer_model) + + if isinstance(random_state, int): + self.random_state = random.Random(random_state) + else: + self.random_state = random_state or random.Random() + + def _accrue_cost(self, cost: float) -> None: + pass + + def _format_examples( + self, + goldens: Union[List["Golden"], List["ConversationalGolden"]], + max_examples: int = 3, + ) -> str: + if not goldens: + return "No examples available." + + examples = [] + sample = self.random_state.sample( + goldens, min(max_examples, len(goldens)) + ) + + for i, golden in enumerate(sample, 1): + if isinstance(golden, Golden): + inp = str(golden.input) + out = str(golden.expected_output or "") + examples.append( + f"Example {i}:\n Input: {inp}\n Expected: {out}" + ) + else: + msgs = golden.turns if golden.turns else [] + msg_str = " | ".join(str(m) for m in msgs) + examples.append(f"Example {i}: {msg_str}") + + return "\n".join(examples) if examples else "No examples available." 
+ + ############################# + # Synchronous Generation # + ############################# + + def _generate_dataset_summary(self, examples_text: str) -> str: + prompt = ProposerTemplate.generate_dataset_summary(examples_text) + + return generate_with_schema_and_extract( + metric=self, + prompt=prompt, + schema_cls=DatasetSummarySchema, + extract_schema=lambda s: s.summary, + extract_json=lambda data: data["summary"], + ) + + def _generate_candidate_instruction( + self, + current_prompt: str, + dataset_summary: str, + examples_text: str, + tip: str, + candidate_index: int, + is_list_format: bool = False, + ) -> Union[str, List[dict]]: + prompt = ProposerTemplate.generate_instruction_proposal( + current_prompt=current_prompt, + dataset_summary=dataset_summary, + examples_text=examples_text, + tip=tip, + candidate_index=candidate_index, + is_list_format=is_list_format, + ) + + return generate_with_schema_and_extract( + metric=self, + prompt=prompt, + schema_cls=InstructionProposalSchema, + extract_schema=lambda s: s.revised_instruction, + extract_json=lambda data: data["revised_instruction"], + ) + + def propose( + self, + prompt: Prompt, + goldens: Union[List["Golden"], List["ConversationalGolden"]], + num_candidates: int, + ) -> List[Prompt]: + candidates: List[Prompt] = [prompt] + + # 1. Format inputs using the global utility + is_list = ( + prompt.type.value == "list" + if hasattr(prompt.type, "value") + else prompt.type == "list" + ) + prompt_text = _parse_prompt(prompt) + examples_text = self._format_examples(goldens, max_examples=5) + + # 2. Generate Data-Aware Summary + try: + dataset_summary = self._generate_dataset_summary(examples_text) + except Exception: + dataset_summary = ( + "A standard text processing task based on the provided inputs." + ) + + # 3. 
Generate Candidates + tips = self._select_tips(num_candidates - 1) + + for i, tip in enumerate(tips): + try: + new_text = self._generate_candidate_instruction( + current_prompt=prompt_text, + dataset_summary=dataset_summary, + examples_text=examples_text, + tip=tip, + candidate_index=i, + is_list_format=is_list, + ) + + if new_text: + if isinstance(new_text, list): + new_text = json.dumps(new_text) + + if new_text.strip(): + new_prompt = _create_prompt(prompt, new_text) + if not self._is_duplicate(new_prompt, candidates): + candidates.append(new_prompt) + except Exception: + continue + + return candidates + + ############################# + # Asynchronous Generation # + ############################# + + async def _a_generate_dataset_summary(self, examples_text: str) -> str: + prompt = ProposerTemplate.generate_dataset_summary(examples_text) + + return await a_generate_with_schema_and_extract( + metric=self, + prompt=prompt, + schema_cls=DatasetSummarySchema, + extract_schema=lambda s: s.summary, + extract_json=lambda data: data["summary"], + ) + + async def _a_generate_candidate_instruction( + self, + current_prompt: str, + dataset_summary: str, + examples_text: str, + tip: str, + candidate_index: int, + is_list_format: bool = False, + ) -> Optional[Union[str, List[dict]]]: + prompt = ProposerTemplate.generate_instruction_proposal( + current_prompt=current_prompt, + dataset_summary=dataset_summary, + examples_text=examples_text, + tip=tip, + candidate_index=candidate_index, + is_list_format=is_list_format, + ) + + try: + return await a_generate_with_schema_and_extract( + metric=self, + prompt=prompt, + schema_cls=InstructionProposalSchema, + extract_schema=lambda s: s.revised_instruction, + extract_json=lambda data: data["revised_instruction"], + ) + except Exception: + return None + + async def a_propose( + self, + prompt: Prompt, + goldens: Union[List["Golden"], List["ConversationalGolden"]], + num_candidates: int, + ) -> List[Prompt]: + candidates: List[Prompt] = [prompt] + + is_list = ( + prompt.type.value == "list" + if hasattr(prompt.type, "value") + else prompt.type == "list" + ) + prompt_text = _parse_prompt(prompt) + examples_text = self._format_examples(goldens, max_examples=5) + + try: + dataset_summary = await self._a_generate_dataset_summary( + examples_text + ) + except Exception: + dataset_summary = ( + "A standard text processing task based on the provided inputs." 
+ ) + + tips = self._select_tips(num_candidates - 1) + + # Run all N candidate generations concurrently + tasks = [ + self._a_generate_candidate_instruction( + current_prompt=prompt_text, + dataset_summary=dataset_summary, + examples_text=examples_text, + tip=tip, + candidate_index=i, + is_list_format=is_list, + ) + for i, tip in enumerate(tips) + ] + + results = await asyncio.gather(*tasks) + + for new_text in results: + if new_text: + if isinstance(new_text, list): + new_text = json.dumps(new_text) + + if new_text.strip(): + new_prompt = _create_prompt(prompt, new_text) + if not self._is_duplicate(new_prompt, candidates): + candidates.append(new_prompt) + + return candidates + + ############################# + # Internal Utility Methods # + ############################# + + def _select_tips(self, count: int) -> List[str]: + if count <= 0: + return [] + if count >= len(INSTRUCTION_TIPS): + tips = list(INSTRUCTION_TIPS) + while len(tips) < count: + tips.append(self.random_state.choice(INSTRUCTION_TIPS)) + return tips[:count] + return self.random_state.sample(INSTRUCTION_TIPS, count) + + def _is_duplicate(self, new_prompt: Prompt, existing: List[Prompt]) -> bool: + new_text = _parse_prompt(new_prompt).strip().lower() + + for p in existing: + existing_text = _parse_prompt(p).strip().lower() + + # Exact match + if new_text == existing_text: + return True + + # Mathematical similarity match (>90% similar) + if len(new_text) > 0 and len(existing_text) > 0: + similarity = difflib.SequenceMatcher( + None, new_text, existing_text + ).ratio() + if similarity > 0.90: + return True + + return False diff --git a/deepeval/optimizer/algorithms/miprov2/proposer/schema.py b/deepeval/optimizer/algorithms/miprov2/proposer/schema.py new file mode 100644 index 0000000000..7c42812c2b --- /dev/null +++ b/deepeval/optimizer/algorithms/miprov2/proposer/schema.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel +from typing import Union, List, Dict + + +class DatasetSummarySchema(BaseModel): + summary: str + + +class InstructionProposalSchema(BaseModel): + thought_process: str + revised_instruction: Union[str, List[Dict[str, str]]] diff --git a/deepeval/optimizer/algorithms/miprov2/proposer/template.py b/deepeval/optimizer/algorithms/miprov2/proposer/template.py new file mode 100644 index 0000000000..df1c9bbfb8 --- /dev/null +++ b/deepeval/optimizer/algorithms/miprov2/proposer/template.py @@ -0,0 +1,91 @@ +class ProposerTemplate: + + @staticmethod + def generate_dataset_summary(examples_text: str) -> str: + return f"""You are an expert AI data analyst. Your task is to analyze a sample of inputs and expected outputs from a specific task and summarize the core objective. + +[EXAMPLE DATA] +{examples_text} + +[INSTRUCTIONS] +Based on the examples above, write a concise 2-3 sentence summary of the dataset. +You MUST identify: +1. The overarching objective (What is the task trying to achieve?) +2. The expected format (How should the outputs be structured?) +3. Potential edge cases (Are there trick questions, specific constraints, or exceptions?) + +** +IMPORTANT: You must only return in JSON format matching the schema. +Example JSON: +{{ + "summary": "The objective is to classify sentiment. The format is always a single word ('Positive' or 'Negative'). Edge cases include sarcastic inputs which should be classified based on literal text." 
+}} +** + +JSON: +""" + + @staticmethod + def generate_instruction_proposal( + current_prompt: str, + dataset_summary: str, + examples_text: str, + tip: str, + candidate_index: int, + is_list_format: bool = False, + ) -> str: + + if is_list_format: + format_instruction = ( + "A STRICT JSON array of message objects representing the revised conversational prompt " + '(e.g., [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]).' + ) + example_instruction = '[{"role": "system", "content": "Determine the sentiment of the text. Pay special attention to sarcasm..."},{"role": "user", "content": "{{input}}"}]' + else: + format_instruction = "The final string representing the optimized revised instruction." + example_instruction = "\"Determine the sentiment of the text. Respond with only 'Positive' or 'Negative'. Pay special attention to sarcasm...\"" + + return f"""You are an expert prompt engineer. Your task is to propose an improved instruction for an LLM task. + +[CURRENT PROMPT] +{current_prompt} + +[DATASET SUMMARY] +{dataset_summary} + +[EXAMPLE INPUTS/OUTPUTS] +{examples_text} + +[GENERATION TIP] +{tip} + +[INSTRUCTIONS] +Based on the [CURRENT PROMPT], the global [DATASET SUMMARY], the specific examples, and the [GENERATION TIP], propose an improved version of the prompt. +This is candidate #{candidate_index + 1}. You must critically apply the [GENERATION TIP] to make this candidate meaningfully different from trivial variations. + +--- RULES --- +1. Focus on improving clarity, effectiveness, and alignment with the task requirements. +2. DO NOT hardcode the specific examples from [EXAMPLE INPUTS/OUTPUTS] directly into your new instruction. The instruction must generalize to all data. +3. Keep the instruction self-contained and actionable. +4. DO NOT wrap your revised_instruction in markdown blocks (like ```). +5. Always keep the interpolation type of the prompt the same as the current prompt. We use regex to interpolate the prompt so keep the same format. +6. If revised_instruction is LIST format, it MUST be a valid JSON array starting with `[` and ending with `]`. +7. For LIST format, every element must be an object with exactly: "role" and "content" keys. +8. Do NOT output multiple top-level JSON objects separated by commas. Output one JSON array only. + +** +IMPORTANT: You must only return in JSON format matching the schema. +You must provide your 'thought_process' first, explaining how you will apply the tip and summary, followed by the 'revised_instruction'. + + +"revised_instruction": format {format_instruction} + +Example JSON: +{{ + "thought_process": "The dataset summary indicates we need to handle sarcastic edge cases. The tip says 'Be concise'. 
I will update the prompt to explicitly mention sarcasm while removing the wordy introductory sentences.", + "revised_instruction": {example_instruction} +}} +** + +JSON: +""" diff --git a/deepeval/optimizer/configs.py b/deepeval/optimizer/configs.py index 0d1ad4c5c0..37d9abaf88 100644 --- a/deepeval/optimizer/configs.py +++ b/deepeval/optimizer/configs.py @@ -10,22 +10,3 @@ class DisplayConfig(BaseModel): announce_ties: bool = Field( False, description="Print a one-line note when a tie is detected" ) - - -class MutationTargetType(Enum): - RANDOM = "random" - FIXED_INDEX = "fixed_index" - - -# default all messages -class MutationConfig(BaseModel): - target_type: MutationTargetType = MutationTargetType.RANDOM - # should be list - target_role: Optional[str] = Field( - default=None, - description="If set, restricts candidates to messages with this role (case insensitive).", - ) - target_index: conint(ge=0) = Field( - default=0, - description="0-based index used when target_type == FIXED_INDEX.", - ) diff --git a/deepeval/optimizer/policies.py b/deepeval/optimizer/policies.py index 8770fa06c6..388341fc53 100644 --- a/deepeval/optimizer/policies.py +++ b/deepeval/optimizer/policies.py @@ -8,18 +8,20 @@ def _is_dominated( - candidate_scores: List[float], other_scores: List[float] + candidate_scores: List[float], + other_scores: List[float], + min_delta: float = 0.01, ) -> bool: """ Return True if `candidate_scores` is dominated by `other_scores`: (other >= candidate on all dimensions) AND (other > candidate on at least one). """ other_ge_everywhere = all( - other_score >= candidate_score + (other_score + 1e-9) >= candidate_score for candidate_score, other_score in zip(candidate_scores, other_scores) ) other_gt_somewhere = any( - other_score > candidate_score + other_score > (candidate_score + min_delta) for candidate_score, other_score in zip(candidate_scores, other_scores) ) return other_ge_everywhere and other_gt_somewhere diff --git a/deepeval/optimizer/prompt_optimizer.py b/deepeval/optimizer/prompt_optimizer.py index 93ebbf5cca..e5cac3e5bb 100644 --- a/deepeval/optimizer/prompt_optimizer.py +++ b/deepeval/optimizer/prompt_optimizer.py @@ -1,13 +1,12 @@ +import sys from contextlib import contextmanager from typing import ( - Callable, - Dict, List, Optional, Tuple, Union, ) - +from rich.console import Console from rich.progress import ( Progress, SpinnerColumn, @@ -33,7 +32,6 @@ ) from deepeval.optimizer.configs import ( DisplayConfig, - MutationConfig, AsyncConfig, ) from deepeval.prompt.prompt import Prompt @@ -59,7 +57,6 @@ def __init__( algorithm: Union[GEPA, MIPROV2, COPRO, SIMBA] = GEPA(), async_config: Optional[AsyncConfig] = AsyncConfig(), display_config: Optional[DisplayConfig] = DisplayConfig(), - mutation_config: Optional[MutationConfig] = MutationConfig(), ): self.optimizer_model, self.using_native_model = initialize_model( optimizer_model @@ -74,14 +71,13 @@ def __init__( self.async_config = async_config self.display_config = display_config - self.mutation_config = mutation_config self.algorithm = algorithm self.optimization_report = None self._configure_algorithm() # Internal state used only when a progress indicator is active. # Tuple is (Progress instance, task_id). 
- self._progress_state: Optional[Tuple[Progress, int]] = None + self._progress_state: Optional[Tuple[Progress, int, int]] = None ############## # Public API # @@ -98,13 +94,13 @@ def optimize( self.a_optimize(prompt=prompt, goldens=goldens) ) - try: - with self._progress_context(): - best_prompt, self.optimization_report = self.algorithm.execute( - prompt=prompt, goldens=goldens - ) - except Exception as exc: - self._handle_optimization_error(exc) + with self._progress_context(): + best_prompt, self.optimization_report = self.algorithm.execute( + prompt=prompt, goldens=goldens + ) + + if self.display_config.show_indicator: + self._print_summary_table() return best_prompt @@ -113,15 +109,13 @@ async def a_optimize( prompt: Prompt, goldens: Union[List[Golden], List[ConversationalGolden]], ) -> Prompt: - try: - with self._progress_context(): - best_prompt, self.optimization_report = ( - await self.algorithm.a_execute( - prompt=prompt, goldens=goldens - ) - ) - except Exception as exc: - self._handle_optimization_error(exc) + with self._progress_context(): + best_prompt, self.optimization_report = ( + await self.algorithm.a_execute(prompt=prompt, goldens=goldens) + ) + + if self.display_config.show_indicator: + self._print_summary_table() return best_prompt @@ -135,6 +129,7 @@ def _configure_algorithm(self) -> None: model_callback=self.model_callback, metrics=self.metrics, max_concurrent=self.async_config.max_concurrent, + optimizer_model=self.optimizer_model, throttle_seconds=float(self.async_config.throttle_value), ) @@ -143,16 +138,34 @@ def _configure_algorithm(self) -> None: if isinstance(self.algorithm, GEPA): max_chars = GEPA_REWRITE_INSTRUCTION_MAX_CHARS else: + self.algorithm.optimizer_model = self.optimizer_model max_chars = MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS self.algorithm._rewriter = Rewriter( optimizer_model=self.optimizer_model, max_chars=max_chars, - list_mutation_config=self.mutation_config, random_state=self.algorithm.random_state, ) # Set status callback self.algorithm.status_callback = self._on_status + # Set sub-step callback (updates the bottom progress row) + self.algorithm.step_callback = self._on_step + + def _print_summary_table(self) -> None: + console = Console(file=sys.stderr) + + if hasattr(self.algorithm, "generate_summary_table"): + renderables = self.algorithm.generate_summary_table( + self.optimization_report + ) + console.print() + for renderable in renderables: + console.print(renderable) + console.print() + else: + console.print( + f"[dim]Optimization complete. (No summary table provided by {self.algorithm.name})[/]" + ) @contextmanager def _progress_context(self): @@ -164,23 +177,21 @@ def _progress_context(self): with Progress( SpinnerColumn(style="rgb(106,0,255)"), TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=40), + BarColumn(bar_width=60), TimeElapsedColumn(), transient=True, ) as progress: - task = progress.add_task( - f"Optimizing prompt with {self.algorithm.name}..." + iter_task = progress.add_task( + f"[bold white]Optimizing prompt with {self.algorithm.name}[/]" ) - self._progress_state = (progress, task) + step_task = progress.add_task("[rgb(55,65,81)]waiting...[/]") + self._progress_state = (progress, iter_task, step_task) try: yield finally: self._progress_state = None def _handle_optimization_error(self, exc: Exception) -> None: - """ - Handle optimization errors by formatting and raising a user-friendly message. 
- """ total_steps: Optional[int] = None iterations: Optional[int] = getattr(self.algorithm, "iterations", None) if iterations is not None: @@ -220,11 +231,18 @@ def _on_status( if kind is RunnerStatusType.ERROR: if self._progress_state is not None: - progress, task = self._progress_state + progress, iter_task, step_task = self._progress_state if total_steps is not None: - progress.update(task, total=total_steps) - description = self._format_progress_description(detail) - progress.update(task, description=description) + progress.update(iter_task, total=total_steps) + progress.update( + iter_task, + description=self._format_iter_description( + step_index, total_steps + ), + ) + progress.update( + step_task, description=f"[rgb(255,85,85)]✕ {detail}[/]" + ) print(f"[{algo}] {detail}") return @@ -240,24 +258,40 @@ def _on_status( if self._progress_state is None: return - progress, task = self._progress_state + progress, iter_task, step_task = self._progress_state if total_steps is not None: - progress.update(task, total=total_steps) + progress.update(iter_task, total=total_steps) if step_index is not None and step_index > 0: - progress.advance(task, 1) + progress.advance(iter_task, 1) - description = self._format_progress_description(detail) - progress.update(task, description=description) + progress.update( + iter_task, + description=self._format_iter_description(step_index, total_steps), + ) - def _format_progress_description(self, detail: str) -> str: - """ - Compose a human readable progress line using an algorithm agnostic - prefix and an algorithm specific detail string provided by the algorithm. - """ + def _on_step(self, label: str) -> None: + if self._progress_state is None: + return + progress, _, step_task = self._progress_state + progress.update( + step_task, description=self._format_step_description(label) + ) + + def _format_iter_description( + self, + step_index: Optional[int], + total_steps: Optional[int], + ) -> str: algo = self.algorithm.name - base = f"Optimizing prompt with {algo}" - if detail: - return f"{base} [rgb(25,227,160)]{detail}[/]" + base = f"[bold white]Optimizing prompt with {algo}[/]" + if step_index is not None and total_steps is not None: + pct = int(100 * step_index / total_steps) if total_steps else 0 + return f"{base} [rgb(55,65,81)]iteration {step_index}/{total_steps} ({pct}%)[/]" return base + + def _format_step_description(self, label: str) -> str: + if label: + return f"[rgb(25,227,160)]⤷ {label}[/]" + return "" diff --git a/deepeval/optimizer/rewriter/rewriter.py b/deepeval/optimizer/rewriter/rewriter.py index 0e1f724c8c..fe1535e215 100644 --- a/deepeval/optimizer/rewriter/rewriter.py +++ b/deepeval/optimizer/rewriter/rewriter.py @@ -1,21 +1,23 @@ from __future__ import annotations import random -from typing import Optional, Tuple, Union +import json +from typing import Optional, Union from deepeval.models.base_model import DeepEvalBaseLLM +from deepeval.optimizer.scorer.schema import ScorerDiagnosisResult from deepeval.optimizer.types import ( ModuleId, ) -from deepeval.optimizer.configs import ( - MutationConfig, -) from deepeval.prompt.prompt import Prompt -from deepeval.optimizer.rewriter.utils import ( - _summarize_prompt_for_rewrite, - _compose_prompt_messages, - _normalize_llm_output_to_text, - _apply_rewritten_prompt, +from deepeval.optimizer.utils import _parse_prompt, _create_prompt +from deepeval.prompt.api import PromptType +from deepeval.metrics.utils import ( + a_generate_with_schema_and_extract, + generate_with_schema_and_extract, + 
initialize_model, ) +from .schema import RewriterSchema +from .template import RewriterTemplate class Rewriter: @@ -31,12 +33,10 @@ def __init__( self, optimizer_model: DeepEvalBaseLLM, max_chars: int = 4000, - list_mutation_config: Optional[MutationConfig] = None, random_state: Optional[Union[int, random.Random]] = None, ): - self.optimizer_model = optimizer_model + self.model, self.using_native_model = initialize_model(optimizer_model) self.max_chars = max_chars - self.list_mutation_config = list_mutation_config or MutationConfig() # Accept either an int seed or a Random instance. if isinstance(random_state, int): @@ -46,79 +46,77 @@ def __init__( else: self.random_state = random_state or random.Random() - def _compose_messages( - self, *, module_id: ModuleId, old_prompt: Prompt, feedback_text: str - ) -> Tuple[str, str]: - current_prompt_block = _summarize_prompt_for_rewrite( - old_prompt, self.max_chars - ) - system_message = ( - "You are refining a prompt used in a multi-step LLM pipeline. " - "Given the current prompt and concise feedback, produce a revised prompt " - "that addresses the issues while preserving intent and style. " - "Return only the new prompt text, no explanations." - ) - user_message = f"""[Current Prompt] -{current_prompt_block} - -[Feedback] -{feedback_text[:self.max_chars]} - -[Instruction] -Rewrite the prompt. Keep it concise and actionable. Do not include extraneous text. -""" - return system_message, user_message - def rewrite( self, - module_id: ModuleId, old_prompt: Prompt, - feedback_text: str, + feedback_diagnosis: ScorerDiagnosisResult, ) -> Prompt: - if not feedback_text.strip(): + if not feedback_diagnosis or not feedback_diagnosis.analysis: return old_prompt - system_message, user_message = self._compose_messages( - module_id=module_id, - old_prompt=old_prompt, - feedback_text=feedback_text, - ) - merged_prompt_text = _compose_prompt_messages( - system_message, user_message + current_prompt_block = _parse_prompt(old_prompt) + + failures_block = feedback_diagnosis.failures + successes_block = feedback_diagnosis.successes + results_block = "\n\n---\n\n".join(feedback_diagnosis.results) + + mutation_prompt = RewriterTemplate.generate_mutation( + original_prompt=current_prompt_block, + failures=failures_block, + successes=successes_block, + results=results_block, + analysis=feedback_diagnosis.analysis, + is_list_format=old_prompt.type == PromptType.LIST, ) - out = self.optimizer_model.generate(merged_prompt_text) - new_text = _normalize_llm_output_to_text(out) - return _apply_rewritten_prompt( - old_prompt, - new_text, - self.random_state, - self.list_mutation_config, + revised_prompt_text = generate_with_schema_and_extract( + metric=self, + prompt=mutation_prompt, + schema_cls=RewriterSchema, + extract_schema=lambda s: s.revised_prompt, + extract_json=lambda data: data["revised_prompt"], ) + if isinstance(revised_prompt_text, list): + revised_prompt_text = json.dumps(revised_prompt_text) + + return _create_prompt(old_prompt, revised_prompt_text) + async def a_rewrite( self, - module_id: ModuleId, old_prompt: Prompt, - feedback_text: str, + feedback_diagnosis: ScorerDiagnosisResult, ) -> Prompt: - if not feedback_text.strip(): + if not feedback_diagnosis or not feedback_diagnosis.analysis: return old_prompt - system_message, user_message = self._compose_messages( - module_id=module_id, - old_prompt=old_prompt, - feedback_text=feedback_text, - ) - merged_prompt_text = _compose_prompt_messages( - system_message, user_message + current_prompt_block = 
_parse_prompt(old_prompt) + + failures_block = feedback_diagnosis.failures + successes_block = feedback_diagnosis.successes + results_block = "\n\n---\n\n".join(feedback_diagnosis.results) + + mutation_prompt = RewriterTemplate.generate_mutation( + original_prompt=current_prompt_block, + failures=failures_block, + successes=successes_block, + results=results_block, + analysis=feedback_diagnosis.analysis, + is_list_format=old_prompt.type == PromptType.LIST, ) - out = await self.optimizer_model.a_generate(merged_prompt_text) - new_text = _normalize_llm_output_to_text(out) - return _apply_rewritten_prompt( - old_prompt, - new_text, - self.random_state, - self.list_mutation_config, + revised_prompt_text = await a_generate_with_schema_and_extract( + metric=self, + prompt=mutation_prompt, + schema_cls=RewriterSchema, + extract_schema=lambda s: s.revised_prompt, + extract_json=lambda data: data["revised_prompt"], ) + + if isinstance(revised_prompt_text, list): + revised_prompt_text = json.dumps(revised_prompt_text) + + return _create_prompt(old_prompt, revised_prompt_text) + + def _accrue_cost(self, cost: float) -> None: + pass diff --git a/deepeval/optimizer/rewriter/schema.py b/deepeval/optimizer/rewriter/schema.py new file mode 100644 index 0000000000..c279a5b357 --- /dev/null +++ b/deepeval/optimizer/rewriter/schema.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel +from typing import Union, List, Dict + + +class RewriterSchema(BaseModel): + thought_process: str + revised_prompt: Union[str, List[Dict[str, str]]] diff --git a/deepeval/optimizer/rewriter/template.py b/deepeval/optimizer/rewriter/template.py new file mode 100644 index 0000000000..1e77d2ebdc --- /dev/null +++ b/deepeval/optimizer/rewriter/template.py @@ -0,0 +1,65 @@ +class RewriterTemplate: + @staticmethod + def generate_mutation( + original_prompt: str, + failures: str, + successes: str, + results: str, + analysis: str, + is_list_format: bool = False, + ) -> str: + + if is_list_format: + format_instruction = ( + "A JSON array of message objects representing the revised conversational prompt " + "(e.g., [{'role': 'system', 'content': '...'}, {'role': 'user', 'content': '...'}])." + ) + example_prompt = '[{"role": "system", "content": "You are a helpful assistant..."},{"role": "user", "content": "{{input}}"}]' + else: + format_instruction = ( + "The final string representing the optimized revised prompt." + ) + example_prompt = '""' + + return f"""You are an expert AI Prompt Engineer. Your goal is to perform a 'Prompt Mutation' to move the prompt closer to the Pareto Frontier. + +# Context +- **Original Prompt:** The current best-performing candidate. +- **Diagnostic Report:** A 'gradient' signal identifying high-loss areas (low scores) and anchors (high scores). +- **Failure Cases:** The failure cases from the diagnostic report. +- **Success Cases:** The success cases from the diagnostic report. +- **Actual Results:** The actual results from the previous generation. +- **Overall Analysis:** The overall analysis of the diagnostic report. + +# Original Prompt +{original_prompt} + +# Diagnostic Report +Failures: {failures} +Successes: {successes} + +Actual results from the previous generation: {results} + +Overall analysis of the diagnostic report: {analysis} + +# Mutation Instructions +1. **Targeted Fixes:** Use the Diagnostic Report to apply 'surgical' edits. Focus heavily on the examples that received low numerical scores. +2. 
**Constraint Satisfaction:** Do NOT degrade performance on the 'Anchor' examples (those with 1.0 scores). Your mutation must be a 'non-dominated' improvement. +3. **Preserve Placeholders:** Maintain all runtime tokens like `{{input}}` or `{{context}}`. +4. **Iterative Refinement:** If the report mentions a lack of clarity, add explicit 'Rules' or 'Negative Constraints' (what NOT to do). +5. Always keep the interpolation type of the prompt the same as the original prompt. We use regex to interpolate the prompt so keep the same format. + +**Output Format** +Return a JSON object: +- "thought_process": Explain how you are addressing the low-score failures while preserving high-score successes. +- "revised_prompt": {format_instruction} + +Example JSON: +{{ + "thought_process": "", + "revised_prompt": {example_prompt} +}} + + +JSON: +""" diff --git a/deepeval/optimizer/rewriter/utils.py b/deepeval/optimizer/rewriter/utils.py deleted file mode 100644 index a6a40d929a..0000000000 --- a/deepeval/optimizer/rewriter/utils.py +++ /dev/null @@ -1,214 +0,0 @@ -from __future__ import annotations -import json -import random -from typing import List, Optional, Tuple, Union - -from deepeval.errors import DeepEvalError -from deepeval.optimizer.utils import ( - validate_int_in_range, - validate_instance, -) -from deepeval.optimizer.configs import ( - MutationConfig, - MutationTargetType, -) -from deepeval.prompt.api import PromptType, PromptMessage -from deepeval.prompt.prompt import Prompt - - -################## -# Common Helpers # -################## -def _summarize_prompt_for_rewrite(old_prompt: Prompt, max_chars: int) -> str: - """ - Produce a human-readable summary of the current prompt for the - rewriter instruction block. - - - For TEXT prompts, this is just `text_template`. - - For LIST prompts, this is a numbered list of (role, content) lines. - """ - - # LIST prompts: show each message with its role. - if old_prompt.type is PromptType.LIST and old_prompt.messages_template: - lines: List[str] = [] - for message_index, message in enumerate(old_prompt.messages_template): - role = message.role or "" - content = message.content or "" - lines.append(f"[{message_index+1}] ({role}) {content}") - combined = "\n".join(lines) - return combined[:max_chars] - - # Since it is not a LIST prompt, just use text_template. - text = old_prompt.text_template or "" - return text[:max_chars] - - -def _select_list_target_index( - messages: List[PromptMessage], - config: MutationConfig, - random_state: random.Random, -) -> int: - """ - Select which list message index to rewrite, based on PromptListMutationConfig. - - Rules: - - Start with all indices in scope. - - If target_role is set, restrict candidates to messages with that role - (case insensitive). If no messages match, fall back to all indices. - - target_type: - * FIRST: pick the first candidate index. - * RANDOM: pick a candidate via random_state.choice(candidates). - * FIXED_INDEX: use target_index when valid (and consistent with role - filter), otherwise fall back to the first candidate. - """ - if not messages: - raise DeepEvalError( - "Rewriter._select_list_target_index expected at least one " - "message, but received an empty message list." 
- ) - - validate_instance( - component="Rewriter._select_list_target_index", - param_name="target_type", - value=config.target_type, - expected_types=MutationTargetType, - ) - - messages_length = len(messages) - candidate_indices = list(range(messages_length)) - - # Optional case insensitive role restriction - if config.target_role: - target_role_lower = config.target_role.lower() - filtered = [ - index - for index, message in enumerate(messages) - if (message.role or "").lower() == target_role_lower - ] - if filtered: - candidate_indices = filtered - - target_type = config.target_type - - if target_type is MutationTargetType.RANDOM: - return random_state.choice(candidate_indices) - - if target_type is MutationTargetType.FIXED_INDEX: - index = validate_int_in_range( - component="Rewriter._select_list_target_index", - param_name="target_index", - value=int(config.target_index), - min_inclusive=0, - max_exclusive=len(candidate_indices), - ) - return candidate_indices[index] - - # if you got this error it means that a new PromptListMutationTargetType was added, - # but not handled above - raise DeepEvalError( - "Rewriter._select_list_target_index received unsupported " - f"target_type={target_type!r}. Expected RANDOM or FIXED_INDEX." - ) - - -def _apply_rewritten_prompt( - old_prompt: Prompt, - new_text: str, - random_state: random.Random, - list_mutation_config: Optional[MutationConfig] = None, -) -> Prompt: - """ - Apply the rewritten text to a Prompt, preserving representation: - - - For TEXT prompts, update `text_template`. - - For LIST prompts, rewrite the content of a single message while - keeping the number of messages the same. - - Preserve additonal Prompt meta such as `label` and `interpolation_type` - """ - if not new_text: - return old_prompt - - if old_prompt.type is PromptType.LIST and old_prompt.messages_template: - messages = old_prompt.messages_template - config = list_mutation_config or MutationConfig() - - target_index = _select_list_target_index( - messages=messages, - config=config, - random_state=random_state, - ) - - new_messages: List[PromptMessage] = [] - for message_index, message in enumerate(messages): - if message_index == target_index: - # Preserve the original role; do not inject a new one. - new_messages.append( - PromptMessage( - role=message.role, - content=new_text, - ) - ) - else: - new_messages.append(message) - - new_prompt = Prompt( - alias=old_prompt.alias, - text_template=None, - messages_template=new_messages, - model_settings=old_prompt.model_settings, - output_type=old_prompt.output_type, - output_schema=old_prompt.output_schema, - ) - - else: - # Since it is not LIST, it must be TEXT type - new_prompt = Prompt( - alias=old_prompt.alias, - text_template=new_text, - model_settings=old_prompt.model_settings, - output_type=old_prompt.output_type, - output_schema=old_prompt.output_schema, - ) - - new_prompt.label = old_prompt.label - new_prompt.interpolation_type = old_prompt.interpolation_type - return new_prompt - - -def _compose_prompt_messages(system_message: str, user_message: str) -> str: - """ - Join system and user messages into a single prompt string. - Strips surrounding whitespace from each part; if the system message is - empty or absent, returns just the user message. 
- """ - system_text = (system_message or "").strip() - user_text = (user_message or "").strip() - return f"{system_text}\n\n{user_text}" if system_text else user_text - - -def _normalize_llm_output_to_text( - result: Union[str, Tuple[Union[str, dict], float], dict], -) -> str: - """ - Convert a DeepEval LLM generate() / a_generate() result to a clean string. - - Accepted inputs: - - str -> returned as trimmed - - (str|dict, float_cost) -> first element extracted and normalized - - dict (e.g. JSON mode) -> JSON serialized with ensure_ascii=False - - Fallback: if serialization fails, str(value).strip() is used. - """ - output_value: Union[str, dict] - if isinstance(result, tuple): - output_value = result[0] - else: - output_value = result - - if isinstance(output_value, str): - return output_value.strip() - - try: - return json.dumps(output_value, ensure_ascii=False) - except Exception: - return str(output_value).strip() diff --git a/deepeval/optimizer/scorer/base.py b/deepeval/optimizer/scorer/base.py index 14cd824c38..180e473c29 100644 --- a/deepeval/optimizer/scorer/base.py +++ b/deepeval/optimizer/scorer/base.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from typing import Union, List +from deepeval.optimizer.scorer.schema import ScorerDiagnosisResult from deepeval.optimizer.types import PromptConfiguration, ScoreVector from deepeval.dataset.golden import Golden, ConversationalGolden @@ -42,17 +43,10 @@ def get_minibatch_feedback( prompt_configuration: PromptConfiguration, module: ModuleId, minibatch: Union[List[Golden], List[ConversationalGolden]], - ) -> str: + ) -> ScorerDiagnosisResult: """Return μ_f text for the module (metric.reason + traces, etc.).""" raise NotImplementedError - @abstractmethod - def select_module( - self, prompt_configuration: PromptConfiguration - ) -> ModuleId: - """Pick a module to mutate.""" - raise NotImplementedError - # Async @abstractmethod async def a_score_pareto( @@ -76,11 +70,8 @@ async def a_get_minibatch_feedback( prompt_configuration: PromptConfiguration, module: ModuleId, minibatch: Union[List[Golden], List[ConversationalGolden]], - ) -> str: + ) -> ScorerDiagnosisResult: raise NotImplementedError - @abstractmethod - async def a_select_module( - self, prompt_configuration: PromptConfiguration - ) -> ModuleId: - raise NotImplementedError + def _accrue_cost(self, cost: float) -> None: + pass diff --git a/deepeval/optimizer/scorer/schema.py b/deepeval/optimizer/scorer/schema.py new file mode 100644 index 0000000000..197ac0e15c --- /dev/null +++ b/deepeval/optimizer/scorer/schema.py @@ -0,0 +1,15 @@ +from typing import List +from pydantic import BaseModel + + +class ScorerDiagnosisSchema(BaseModel): + failures: str + successes: str + analysis: str + + +class ScorerDiagnosisResult(BaseModel): + failures: str + successes: str + analysis: str + results: List[str] diff --git a/deepeval/optimizer/scorer/scorer.py b/deepeval/optimizer/scorer/scorer.py index 84b5c330ce..437a827d00 100644 --- a/deepeval/optimizer/scorer/scorer.py +++ b/deepeval/optimizer/scorer/scorer.py @@ -6,6 +6,7 @@ Dict, List, Optional, + Tuple, Union, ) @@ -14,24 +15,29 @@ convert_goldens_to_test_cases, convert_convo_goldens_to_convo_test_cases, ) +from deepeval.models import DeepEvalBaseLLM from deepeval.errors import DeepEvalError from deepeval.metrics import ( BaseMetric, BaseConversationalMetric, ) from deepeval.metrics.utils import copy_metrics +from deepeval.prompt.api import PromptType from deepeval.test_case import ( LLMTestCase, ConversationalTestCase, Turn, ) from 
deepeval.prompt.prompt import Prompt +from deepeval.metrics.utils import ( + a_generate_with_schema_and_extract, + generate_with_schema_and_extract, + initialize_model, +) from deepeval.optimizer.types import ( ModelCallback, PromptConfiguration, - Objective, - MeanObjective, ModuleId, ) from deepeval.optimizer.scorer.base import BaseScorer @@ -45,6 +51,8 @@ _measure_no_indicator, _a_measure_no_indicator, ) +from .template import ScorerTemplate +from .schema import ScorerDiagnosisResult, ScorerDiagnosisSchema class Scorer(BaseScorer): @@ -61,14 +69,14 @@ def __init__( metrics: Union[List[BaseMetric], List[BaseConversationalMetric]], max_concurrent: int, throttle_seconds: float, - objective_scalar: Objective = MeanObjective(), + optimizer_model: DeepEvalBaseLLM, ): self.model_callback = validate_callback( component="Scorer", model_callback=model_callback, ) self.metrics = validate_metrics(component="Scorer", metrics=metrics) - self.objective_scalar = objective_scalar + self.model, self.using_native_model = initialize_model(optimizer_model) self._semaphore = asyncio.Semaphore(max_concurrent) self._throttle = float(throttle_seconds) @@ -136,27 +144,57 @@ def get_minibatch_feedback( prompt_configuration: PromptConfiguration, module: ModuleId, minibatch: Union[List[Golden], List[ConversationalGolden]], - ) -> str: - # default metric feedback (μ_f): concat metric.reason across minibatch and cap length - reasons: List[str] = [] + ) -> ScorerDiagnosisResult: + results: List[str] = [] for golden in minibatch: actual = self.generate(prompt_configuration.prompts, golden) test_case = self._golden_to_test_case(golden, actual) - for metric in copy_metrics(self.metrics): + + metrics = copy_metrics(self.metrics) + for metric in metrics: _measure_no_indicator(metric=metric, test_case=test_case) - if metric.reason: - reasons.append(str(metric.reason)) - if not reasons: - return "" - unique: List[str] = [] - seen = set() - for reason in reasons: - if reason not in seen: - unique.append(reason) - seen.add(reason) - return "\n---\n".join( - unique[:8] - ) # TODO: Make how much feedback configurable + + evaluation_results_block = self._build_evaluation_results_block( + golden, actual, metrics + ) + if evaluation_results_block: + results.append(evaluation_results_block) + + if not results: + return ScorerDiagnosisResult( + failures=[], + successes=[], + analysis="", + results=[], + ) + + evaluation_results = "\n\n---\n\n".join(results) + + prompt = prompt_configuration.prompts[module] + original_prompt = ( + prompt.text_template + if prompt.type == PromptType.TEXT + else prompt.messages_template + ) + + diagnosis_prompt = ScorerTemplate.generate_diagnosis( + original_prompt=original_prompt, + evaluation_results=evaluation_results, + ) + + diagnosis = generate_with_schema_and_extract( + metric=self, + prompt=diagnosis_prompt, + schema_cls=ScorerDiagnosisSchema, + extract_schema=lambda s: s, + extract_json=lambda data: data, + ) + return ScorerDiagnosisResult( + failures=diagnosis.failures, + successes=diagnosis.successes, + analysis=diagnosis.analysis, + results=results, + ) async def a_score_pareto( self, @@ -186,32 +224,61 @@ async def a_get_minibatch_feedback( prompt_configuration: PromptConfiguration, module: ModuleId, minibatch: Union[List[Golden], List[ConversationalGolden]], - ) -> str: - async def reasons_one(golden) -> List[str]: - # Clone per task to avoid shared state - metrics = copy_metrics(self.metrics) - # metrics = self.metrics + ) -> ScorerDiagnosisResult: + async def process_one_trace(golden) -> 
Optional[str]: actual = await self.a_generate(prompt_configuration.prompts, golden) test_case = self._golden_to_test_case(golden, actual) - out: List[str] = [] + + metrics = copy_metrics(self.metrics) for metric in metrics: - await _a_measure_no_indicator(metric, test_case) - if metric.reason: - out.append(str(metric.reason)) - return out - - tasks = [self._bounded(reasons_one(golden)) for golden in minibatch] - nested = await asyncio.gather(*tasks) - reasons: List[str] = [reason for sub in nested for reason in sub] - if not reasons: - return "" - unique: List[str] = [] - seen = set() - for reason in reasons: - if reason not in seen: - unique.append(reason) - seen.add(reason) - return "\n---\n".join(unique[:8]) + await _a_measure_no_indicator( + metric=metric, test_case=test_case + ) + + return self._build_evaluation_results_block(golden, actual, metrics) + + tasks = [ + self._bounded(process_one_trace(golden)) for golden in minibatch + ] + raw_results = await asyncio.gather(*tasks) + + results = [r for r in raw_results if r] + + if not results: + return ScorerDiagnosisResult( + failures=[], + successes=[], + analysis="", + results=[], + ) + + evaluation_results = "\n\n---\n\n".join(results) + + prompt = prompt_configuration.prompts[module] + original_prompt = ( + prompt.text_template + if prompt.type == PromptType.TEXT + else prompt.messages_template + ) + + diagnosis_prompt = ScorerTemplate.generate_diagnosis( + original_prompt=original_prompt, + evaluation_results=evaluation_results, + ) + + diagnosis = await a_generate_with_schema_and_extract( + metric=self, + prompt=diagnosis_prompt, + schema_cls=ScorerDiagnosisSchema, + extract_schema=lambda s: s, + extract_json=lambda data: data, + ) + return ScorerDiagnosisResult( + failures=diagnosis.failures, + successes=diagnosis.successes, + analysis=diagnosis.analysis, + results=results, + ) ################### # scoring helpers # @@ -265,7 +332,10 @@ async def _a_score_one( for metric in metrics: score = await _a_measure_no_indicator(metric, test_case) per_metric[metric.__class__.__name__] = float(score) - return self.objective_scalar.scalarize(per_metric) + score = ( + sum(per_metric.values()) / len(per_metric) if per_metric else 0.0 + ) + return score def _score_one( self, @@ -280,19 +350,14 @@ def _score_one( for metric in metrics: score = _measure_no_indicator(metric, test_case) per_metric[metric.__class__.__name__] = float(score) - return self.objective_scalar.scalarize(per_metric) + score = ( + sum(per_metric.values()) / len(per_metric) if per_metric else 0.0 + ) + return score def _select_module_id_from_prompts( self, prompts_by_module: Dict[ModuleId, Prompt] ) -> ModuleId: - """ - Default module selection strategy: - - - Prefer the synthetic '__module__' key when present - - Otherwise fall back to the first key in prompts_by_module. - - Assumes `prompts_by_module` is non-empty; callers should validate that. - """ if self.DEFAULT_MODULE_ID in prompts_by_module: return self.DEFAULT_MODULE_ID @@ -305,12 +370,32 @@ def _select_module_id_from_prompts( "received an empty `prompts_by_module`. At least one Prompt is required." 
) - def select_module( - self, prompt_configuration: PromptConfiguration - ) -> ModuleId: - return self._select_module_id_from_prompts(prompt_configuration.prompts) + def _build_evaluation_results_block( + self, + golden: Union[Golden, ConversationalGolden], + actual: str, + metrics: List[BaseMetric], + ) -> str: + if isinstance(golden, Golden): + input_str = golden.input + expected_str = golden.expected_output or "None provided" + else: + input_str = "\n".join( + [t.content for t in golden.turns if t.role == "user"] + ) + expected_str = golden.expected_outcome or "None provided" - async def a_select_module( - self, prompt_configuration: PromptConfiguration - ) -> ModuleId: - return self.select_module(prompt_configuration) + reasons = [] + for metric in metrics: + score = metric.score + reason = metric.reason + reasons.append( + f"- {metric.__class__.__name__} (Score: {score}): {reason}" + ) + + return ( + f"[Input]: {input_str}\n" + f"[Expected]: {expected_str}\n" + f"[Actual Model Output]: {actual}\n" + f"[Evaluation Reasons]:\n" + "\n".join(reasons) + ) diff --git a/deepeval/optimizer/scorer/template.py b/deepeval/optimizer/scorer/template.py new file mode 100644 index 0000000000..97fd11e46b --- /dev/null +++ b/deepeval/optimizer/scorer/template.py @@ -0,0 +1,40 @@ +class ScorerTemplate: + @staticmethod + def generate_diagnosis( + original_prompt: str, + evaluation_results: str, + ) -> str: + return f"""You are an expert Prompt Engineer and AI Diagnoser. Your task is to perform a 'Prompt Gradient Analysis'. + +You are provided with: +1. The Original Prompt. +2. Evaluation Results: A batch of execution traces including Inputs, Expected Outputs, Actual Outputs, and Numerical Scores (0.0 to 1.0). + +# Original Prompt: +'{original_prompt}' + +# Evaluation Results +{evaluation_results} + +# Instructions +Perform a precise diagnosis to guide the next mutation: +1. **Identify the High-Loss Examples:** Look for instances with the lowest numerical scores. Analyze exactly what caused the model to deviate from the expected output in these specific cases. +2. **Identify the Anchors:** Look for instances with scores of 1.0. Determine which parts of the prompt are working correctly so they aren't accidentally removed. +3. **Correlate Scores to Instructions:** Explicitly state: "Instruction X led to a score of 0.0 on Input Y because [reason]." +4. **Synthesize the 'Gradient'**: Provide a clear signal on what needs to be 'intensified' (added) or 'dampened' (removed/changed). + +**Output Format** +Return a JSON object: +- "failures": List of failures. +- "successes": List of successes. +- "analysis": A synthesized diagnostic signal. You MUST include the numerical scores in your citations (e.g., "On example A (Score: 0.2), the model failed to...") to provide a magnitude of failure. + +Example JSON: +{{ + "failures": ["The model consistently fails logic tests (Score 0.0) while passing formatting tests (Score 1.0)..."], + "successes": ["The JSON formatting instruction is perfect (Score 1.0)."], + "analysis": "CRITICAL FAILURE: The prompt lacks a 'step-by-step' requirement, leading to a 0.0 score on logic examples like [Quote]. SUCCESS: The JSON formatting instruction is perfect (Score 1.0)." 
+}} + +JSON: +""" diff --git a/deepeval/optimizer/types.py b/deepeval/optimizer/types.py index 65085ed347..3dac77b441 100644 --- a/deepeval/optimizer/types.py +++ b/deepeval/optimizer/types.py @@ -1,6 +1,5 @@ from __future__ import annotations import uuid -from abc import ABC, abstractmethod from dataclasses import dataclass from typing import ( @@ -9,23 +8,17 @@ List, Optional, TypedDict, - TYPE_CHECKING, Union, ) from enum import Enum from pydantic import BaseModel, ConfigDict - from deepeval.prompt.prompt import Prompt - -if TYPE_CHECKING: - from deepeval.dataset.golden import Golden, ConversationalGolden +from deepeval.dataset.golden import Golden, ConversationalGolden PromptConfigurationId = str ModuleId = str -ScoreVector = List[float] # scores per instance on D_pareto, aligned order +ScoreVector = List[float] ScoreTable = Dict[PromptConfigurationId, ScoreVector] - -# Type alias for model callback function ModelCallback = Callable[[Prompt, Union["Golden", "ConversationalGolden"]], str] @@ -53,67 +46,9 @@ class RunnerStatusType(str, Enum): ERROR = "error" -# Type alias for status callback function RunnerStatusCallback = Callable[..., None] -class Objective(ABC): - """Strategy for reducing scores per-metric to a single scalar value. - - Implementations receive a mapping from metric name to score - (for example, {"AnswerRelevancyMetric": 0.82}) and return a - single float used for comparisons inside the optimizer. - """ - - @abstractmethod - def scalarize(self, scores_by_metric: Dict[str, float]) -> float: - raise NotImplementedError - - -class MeanObjective(Objective): - """Default scalarizer: unweighted arithmetic mean. - - - If `scores_by_metric` is non-empty, returns the arithmetic - mean of all metric scores. - - If `scores_by_metric` is empty, returns 0.0. - """ - - def scalarize(self, scores_by_metric: Dict[str, float]) -> float: - if not scores_by_metric: - return 0.0 - return sum(scores_by_metric.values()) / len(scores_by_metric) - - -class WeightedObjective(Objective): - """ - Objective that scales each metric's score by a user-provided weight and sums them. - - - `weights_by_metric` keys should match the names of the metrics passed to the - metric class names passed to the PromptOptimizer. - - Metrics not present in `weights_by_metric` receive `default_weight`. - This makes it easy to emphasize a subset of metrics while keeping - everything else at a baseline weight of 1.0, e.g.: - - WeightedObjective({"AnswerRelevancyMetric": 2.0}) - - which treats AnswerRelevancy as 2x as important as the other metrics. 
- """ - - def __init__( - self, - weights_by_metric: Optional[Dict[str, float]] = None, - default_weight: float = 1.0, - ): - self.weights_by_metric: Dict[str, float] = dict(weights_by_metric or {}) - self.default_weight: float = float(default_weight) - - def scalarize(self, scores_by_metric: Dict[str, float]) -> float: - return sum( - self.weights_by_metric.get(name, self.default_weight) * score - for name, score in scores_by_metric.items() - ) - - class AcceptedIterationDict(TypedDict): parent: PromptConfigurationId child: PromptConfigurationId diff --git a/deepeval/optimizer/utils.py b/deepeval/optimizer/utils.py index 15fb3b3ce0..29f12f18d1 100644 --- a/deepeval/optimizer/utils.py +++ b/deepeval/optimizer/utils.py @@ -1,38 +1,34 @@ from __future__ import annotations +import json +import re import inspect import random -import re import statistics from typing import ( Any, - Callable, List, Optional, Protocol, Sequence, Tuple, - TYPE_CHECKING, Union, Dict, - Set, ) - +from deepeval.errors import DeepEvalError +from deepeval.prompt.api import PromptType, PromptMessage +from deepeval.prompt.prompt import Prompt from deepeval.errors import DeepEvalError from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric from deepeval.prompt.prompt import Prompt from deepeval.prompt.api import PromptMessage from deepeval.optimizer.types import ( ModelCallback, - ModuleId, PromptConfigurationId, PromptConfiguration, PromptConfigSnapshot, OptimizationReport, ) - -if TYPE_CHECKING: - from deepeval.dataset.golden import Golden, ConversationalGolden - from deepeval.prompt.api import PromptMessage +from deepeval.dataset.golden import Golden, ConversationalGolden def split_goldens( @@ -94,67 +90,6 @@ def split_goldens( return d_feedback, d_pareto -################################ -# Prompt normalization helpers # -################################ - - -def _slug(text: str) -> str: - slug = text.lower() - slug = re.sub(r"[^a-z0-9]+", "-", slug) - return slug.strip("-") - - -def generate_module_id(prompt: Prompt, index: int, existing: Set[str]) -> str: - """ - Build a human readable module id stable within a single optimization run. - Prefers alias/label; enrich with model settings provider and name; dedupe; cap to 64 chars. - """ - parts: List[str] = [] - if prompt.alias: - parts.append(str(prompt.alias)) - if prompt.label: - parts.append(str(prompt.label)) - - ms = prompt.model_settings - if ms is not None: - if ms.provider is not None: - parts.append(ms.provider.value) - if ms.name: - parts.append(ms.name) - - base = "-".join(_slug(p) for p in parts if p) or f"module-{index+1}" - base = base[:64] or f"module-{index+1}" - - candidate = base - suffix = 2 - while candidate in existing: - candidate = f"{base}-{suffix}" - candidate = candidate[:64] - suffix += 1 - - existing.add(candidate) - return candidate - - -def normalize_seed_prompts( - seed_prompts: Union[Dict[ModuleId, Prompt], List[Prompt]], -) -> Dict[ModuleId, Prompt]: - """ - Accept either {module_id: Prompt} or List[Prompt]. - If a list is given, generate human readable module ids. 
- """ - if isinstance(seed_prompts, dict): - return dict(seed_prompts) # shallow copy - - mapping: Dict[ModuleId, Prompt] = {} - used: Set[str] = set() - for i, prompt in enumerate(seed_prompts): - module_id = generate_module_id(prompt, i, used) - mapping[module_id] = prompt - return mapping - - def invoke_model_callback( *, model_callback: ModelCallback, @@ -475,5 +410,61 @@ def mean_of_all(scores: Sequence[float]) -> float: return statistics.fmean(scores) if scores else 0.0 -def median_of_all(scores: Sequence[float]) -> float: - return statistics.median(scores) if scores else 0.0 +########################### +#### Prompt Utils ######### +########################### + + +def _parse_prompt(prompt: Prompt) -> str: + if prompt.type == PromptType.TEXT: + return prompt.text_template + + elif prompt.type == PromptType.LIST: + messages = [ + {"role": msg.role, "content": msg.content} + for msg in prompt.messages_template + ] + return json.dumps(messages, indent=4) + else: + raise DeepEvalError(f"Invalid prompt type: {prompt.type}") + + +def _create_prompt(old_prompt: Prompt, new_content: str) -> Prompt: + prompt_kwargs = { + "alias": old_prompt.alias, + "model_settings": old_prompt.model_settings, + "output_type": old_prompt.output_type, + "output_schema": old_prompt.output_schema, + "branch": old_prompt.branch, + "interpolation_type": old_prompt.interpolation_type, + "confident_api_key": old_prompt.confident_api_key, + } + + if old_prompt.type == PromptType.TEXT: + prompt_kwargs["text_template"] = new_content + prompt_kwargs["messages_template"] = None + + elif old_prompt.type == PromptType.LIST: + prompt_kwargs["text_template"] = None + + try: + parsed_messages: List[Dict[str, str]] = json.loads(new_content) + + messages_template = [ + PromptMessage(role=msg.get("role"), content=msg.get("content")) + for msg in parsed_messages + ] + prompt_kwargs["messages_template"] = messages_template + + except json.JSONDecodeError as e: + raise DeepEvalError( + f"Failed to parse the LLM's rewritten messages into JSON: {e}" + ) + except Exception as e: + raise DeepEvalError(f"Failed to reconstruct PromptMessages: {e}") + + new_prompt = Prompt(**prompt_kwargs) + + new_prompt.label = old_prompt.label + + return new_prompt diff --git a/docs/content/docs/(algorithms)/prompt-optimization-gepa.mdx b/docs/content/docs/(algorithms)/prompt-optimization-gepa.mdx index 7d0da35098..08f362d7d2 100644 --- a/docs/content/docs/(algorithms)/prompt-optimization-gepa.mdx +++ b/docs/content/docs/(algorithms)/prompt-optimization-gepa.mdx @@ -51,28 +51,108 @@ You can customize GEPA's behavior by passing arguments directly to the `GEPA` co ```python from deepeval.optimizer.algorithms import GEPA -gepa = GEPA(iterations=10, pareto_size=5, minibatch_size=4) +gepa = GEPA( + iterations=10, + pareto_size=5, + minibatch_size=4, + patience=4, + random_seed=42, +) ``` -There are **FIVE** optional parameters when creating a `GEPA` instance: +There are **NINE** optional parameters when creating a `GEPA` instance: - [Optional] `iterations`: total number of mutation attempts. Defaulted to `5`. - [Optional] `pareto_size`: number of goldens in the Pareto validation set (`D_pareto`). Defaulted to `3`. - [Optional] `minibatch_size`: number of goldens drawn for feedback per iteration. Automatically clamped to available data. Defaulted to `8`. -- [Optional] `random_seed`: seed for reproducibility. Controls the randomness in golden splitting, minibatch sampling, Pareto selection, and tie-breaking. 
Set a fixed value (e.g., `42`) to get identical results across runs. Defaulted to `time.time_ns()`. +- [Optional] `patience`: stop early after this many consecutive rejected children. Defaulted to `3`. +- [Optional] `random_seed`: seed for reproducibility. Controls golden splitting, minibatch sampling, Pareto parent selection, and tie-breaking. Set a fixed value (e.g., `42`) to get reproducible runs. Defaulted to `time.time_ns()`. - [Optional] `tie_breaker`: policy for breaking ties (`PREFER_ROOT`, `PREFER_CHILD`, or `RANDOM`). Defaulted to `PREFER_CHILD`. +- [Optional] `aggregate_instances`: function that aggregates a prompt's per-golden Pareto scores into a scalar for ranking/tie handling. Defaulted to `mean_of_all`. +- [Optional] `reflection_model`: LLM used for diagnosis/feedback generation. Defaulted to `"gpt-4o-mini"`. +- [Optional] `mutation_model`: LLM used for rewriting the prompt. Defaulted to `"gpt-4o"`. +- [Optional] `scorer`: custom scorer instance. In most workflows this is injected by `PromptOptimizer`. ## How Does GEPA Work? +```mermaid +flowchart TD + subgraph GEPA [GEPA: Diagnostic Hill-Climbing] + A[Initialize Prompt] --> B[Evaluate on Pareto Archive] + B --> C[Diagnosis: Failures & Successes] + C --> D[Mutate Prompt via Rewriter] + D --> E[Score Minibatch Candidate] + E --> F{Accepted by Pareto?} + F -- Yes --> G[Add to Archive] + F -- No --> H[Discard Candidate] + G --> B + H --> B + end +``` + Rather than forcing a single "best" prompt, GEPA maintains a **diverse population of candidate prompts** and uses [Pareto selection](#step-2-pareto-selection) to balance exploration of different strategies with exploitation of proven improvements. This prevents the optimization from getting stuck at a local maximum. -The algorithm runs for a configurable number of `iterations`. Each iteration attempts to evolve a new prompt variant and decides whether to keep it based on performance. Here's an overview of the five steps: +```mermaid +sequenceDiagram + participant User as User / Prompt + participant GEPA as GEPA Engine + participant Scorer as Scorer + participant Rewriter as Rewriter + + activate GEPA + User->>GEPA: Start iteration + GEPA->>GEPA: Split goldens -> D_feedback + D_pareto + GEPA->>Scorer: score_pareto(root, D_pareto) (once) + activate Scorer + Scorer-->>GEPA: Root Pareto scores + deactivate Scorer + + loop Iterations + GEPA->>GEPA: Pick parent from Pareto frontier + GEPA->>GEPA: Draw minibatch from D_feedback + GEPA->>Scorer: get_minibatch_feedback(parent, minibatch) + activate Scorer + Scorer-->>GEPA: Diagnosis feedback + deactivate Scorer + + GEPA->>Scorer: score_minibatch(parent, minibatch) + Scorer-->>GEPA: Parent minibatch score + + GEPA->>Rewriter: rewrite(parent, feedback) + activate Rewriter + Rewriter-->>GEPA: Child prompt + deactivate Rewriter + + alt child unchanged or type changed + GEPA->>GEPA: Skip iteration + else child valid + GEPA->>Scorer: score_minibatch(child, minibatch) + Scorer-->>GEPA: Child minibatch score + alt child <= parent on minibatch + GEPA->>GEPA: Skip child + else child > parent + GEPA->>Scorer: score_pareto(child, D_pareto) + Scorer-->>GEPA: Child Pareto scores + alt child non-dominated + GEPA->>GEPA: Accept child, update archive, prune dominated + else dominated + GEPA->>GEPA: Reject child (update patience counter) + end + end + end + end + + GEPA-->>User: Best Prompt & OptimizationReport + deactivate GEPA +``` -1. **Golden Splitting** — Split your goldens into a validation set (`D_pareto`) and a feedback set (`D_feedback`) -2. 
**Pareto Selection** — Choose a parent prompt from the Pareto frontier using frequency-weighted sampling -3. **Feedback & Mutation** — Collect metric feedback on a minibatch and use an LLM to rewrite the prompt -4. **Acceptance** — If the child prompt improves over the parent, add it to the candidate pool -5. **Final Selection** — After all iterations, select the best prompt by aggregate score +The algorithm runs for a configurable number of `iterations`. Each iteration tries to evolve one new prompt variant, then decides whether to keep it. Here's the exact high-level flow: + +1. **Golden Splitting** — Split goldens into a fixed validation set (`D_pareto`) and feedback set (`D_feedback`) +2. **Parent Selection** — Sample a parent from the Pareto frontier using frequency-weighted selection +3. **Feedback & Rewrite** — Score a minibatch, collect diagnosis, and generate a child prompt +4. **Filter + Acceptance** — Reject unchanged/weak candidates, then run Pareto acceptance +5. **Final Pick** — Choose the top prompt by aggregate score (with tie-breaker policy) ### Step 1: Golden Splitting @@ -149,27 +229,34 @@ In this example: The Pareto frontier contains **P₂, P₃, and P₄**. Each wins exactly 1 golden, giving them **equal selection probability** (33% each). Despite P₄ having the highest mean score, GEPA might still select P₂ or P₃ as parents to explore their specialized strategies—this is how GEPA avoids local optima and maintains prompt diversity. -### Step 3: Feedback & Mutation +### Step 3: Feedback & Rewrite -Once a parent prompt is selected, GEPA generates a mutated child prompt through **feedback-driven rewriting**: +Once a parent prompt is selected, GEPA creates a child prompt through **feedback-driven rewriting**: -1. **Sample a minibatch**: Draw `minibatch_size` goldens from `D_feedback` -2. **Execute the model**: Run your `model_callback` with the parent prompt on each minibatch golden -3. **Evaluate with metrics**: Score each response using your evaluation metrics -4. **Collect feedback**: Extract the `reason` field from metric evaluations—these contain specific explanations of what went wrong or right -5. **Rewrite the prompt**: An LLM takes the parent prompt plus concatenated feedback and proposes a revised prompt that addresses the identified issues +1. **Sample a minibatch**: Draw up to `minibatch_size` examples from `D_feedback` +2. **Diagnose**: Gather scorer feedback (`get_minibatch_feedback`) on the parent +3. **Baseline score**: Score the parent on that same minibatch +4. **Rewrite**: Use the rewriter to generate a child prompt from the diagnosis +5. **Sanity filter**: Skip the child if it is effectively unchanged or has a different prompt type -The feedback mechanism is key to GEPA's efficiency. Rather than random mutations, the algorithm uses **targeted, metric-driven improvements** based on actual failure cases. +This keeps mutations targeted: changes are driven by metric feedback rather than random prompt edits. ### Step 4: Acceptance -The child prompt is evaluated on the **same minibatch** as the parent. If the child's score exceeds the parent's score by a minimum threshold (`GEPA_MIN_DELTA`), the child is **accepted**: +GEPA applies acceptance in two gates: + +1. **Minibatch gate**: The child must strictly beat the parent on the same minibatch. +2. **Pareto gate**: On `D_pareto`, the child must be non-dominated relative to both: + - the parent prompt configuration + - all existing configurations in the archive + +When accepted, GEPA: -1. 
Added to the candidate pool -2. Scored on all `D_pareto` goldens for future Pareto comparisons -3. Becomes eligible for selection as a parent in subsequent iterations +1. Adds the child to the prompt-configuration graph +2. Inserts the child's Pareto scores into the archive +3. Removes archive entries that are dominated by the new child -If the child doesn't improve sufficiently, it's **discarded**—the pool remains unchanged and the next iteration begins. +If rejected by the Pareto gate, GEPA increments a consecutive-rejection counter and can early-stop once it reaches `patience`. ### Step 5: Final Selection diff --git a/docs/content/docs/(algorithms)/prompt-optimization-miprov2.mdx b/docs/content/docs/(algorithms)/prompt-optimization-miprov2.mdx index 2ace1cfe59..09dffa1482 100644 --- a/docs/content/docs/(algorithms)/prompt-optimization-miprov2.mdx +++ b/docs/content/docs/(algorithms)/prompt-optimization-miprov2.mdx @@ -52,41 +52,106 @@ from deepeval.optimizer.algorithms import MIPROV2 miprov2 = MIPROV2( num_candidates=10, - num_trials=20, + num_trials=30, minibatch_size=25, - max_bootstrapped_demos=4, - max_labeled_demos=4, - num_demo_sets=5 + max_bootstrapped_demonstrations=4, + max_labeled_demonstrations=4, + num_demonstration_sets=5, + random_state=42, ) ``` There are **EIGHT** optional parameters when creating a `MIPROV2` instance: - [Optional] `num_candidates`: number of diverse instruction candidates to generate in the proposal phase. Defaulted to `10`. -- [Optional] `num_trials`: number of Bayesian Optimization trials to run. Each trial evaluates a different (instruction, demo_set) combination. Defaulted to `20`. +- [Optional] `num_trials`: number of Bayesian Optimization trials to run. Each trial evaluates a different (instruction, demo_set) combination. Defaulted to `30`. - [Optional] `minibatch_size`: number of goldens sampled per trial for evaluation. Larger batches give more reliable scores but cost more. Defaulted to `25`. - [Optional] `minibatch_full_eval_steps`: run a full evaluation on all goldens every N trials. This provides accurate score estimates periodically. Defaulted to `10`. -- [Optional] `max_bootstrapped_demos`: maximum number of bootstrapped demonstrations (model-generated outputs that passed validation) per demo set. Defaulted to `4`. -- [Optional] `max_labeled_demos`: maximum number of labeled demonstrations (from `expected_output` in your goldens) per demo set. Defaulted to `4`. -- [Optional] `num_demo_sets`: number of different demo set configurations to create. More sets provide more variety for the optimizer to explore. Defaulted to `5`. -- [Optional] `random_seed`: seed for reproducibility. Controls randomness in candidate generation, demo bootstrapping, and trial sampling. Set a fixed value (e.g., `42`) to get identical results across runs. Defaulted to `time.time_ns()`. +- [Optional] `max_bootstrapped_demonstrations`: maximum number of bootstrapped demonstrations (model-generated outputs that passed validation) per demo set. Defaulted to `4`. +- [Optional] `max_labeled_demonstrations`: maximum number of labeled demonstrations (from `expected_output` in your goldens) per demo set. Defaulted to `4`. +- [Optional] `num_demonstration_sets`: number of different demo set configurations to create. More sets provide more variety for the optimizer to explore. Defaulted to `5`. +- [Optional] `random_state`: reproducibility control. You can pass either an `int` seed or a `random.Random` instance. 
This affects candidate generation, demo bootstrapping, minibatch sampling, and TPE sampling. ## How Does MIPROv2 Work? -MIPROv2 works in **two phases**: a **Proposal Phase** that generates candidates upfront, followed by an **Optimization Phase** that uses Bayesian Optimization to find the best combination. +```mermaid +sequenceDiagram + participant User as User / Prompt + participant MIPRO as MIPROv2 Engine + participant Proposer as InstructionProposer + participant Bootstrapper as DemonstrationBootstrapper + participant Optuna as TPESampler + participant Scorer as Scorer + + activate MIPRO + User->>MIPRO: Start optimization + MIPRO->>MIPRO: Initialize proposer + bootstrapper + MIPRO->>Proposer: propose(Prompt, goldens, num_candidates) + activate Proposer + Proposer-->>MIPRO: Instruction candidates (includes baseline) + deactivate Proposer + + MIPRO->>Bootstrapper: bootstrap(Prompt, goldens) + activate Bootstrapper + Bootstrapper-->>MIPRO: Demonstration sets (includes 0-shot) + deactivate Bootstrapper + + MIPRO->>Optuna: create_study(direction='maximize') + + loop Trials 1..num_trials + MIPRO->>Optuna: suggest(instr_idx, demo_idx) + Optuna-->>MIPRO: Selected Configuration Indices + + MIPRO->>MIPRO: Build config + sample minibatch + MIPRO->>Scorer: score_minibatch(config, minibatch) + activate Scorer + Scorer-->>MIPRO: Minibatch Score + deactivate Scorer + MIPRO->>Optuna: tell(trial, score) + + alt Full eval step or final trial + MIPRO->>MIPRO: Read study.best_trial and build config + MIPRO->>Scorer: score_pareto(best_config, goldens) + activate Scorer + Scorer-->>MIPRO: Full Validation Scores + deactivate Scorer + MIPRO->>MIPRO: Update archive and running best + end + end + + MIPRO->>MIPRO: Pick best full-evaluated config (fallback if needed) + MIPRO-->>User: Optimized Prompt + OptimizationReport + deactivate MIPRO +``` + +MIPROv2 works in **two phases**: a **Proposal Phase** that builds the search space, followed by an **Optimization Phase** that searches that space with Bayesian Optimization. Unlike GEPA which evolves prompts iteratively through mutations, MIPROv2 generates all instruction candidates at once and then intelligently searches the space of (instruction, demonstration) combinations. +```mermaid +flowchart TD + subgraph MIPROv2 [MIPROv2: Bayesian Joint Search] + AA[Initialize Prompt & Goldens] --> BB[Propose Diverse Candidates & Bootstrap Demos] + BB --> CC[Sample TPE Parameters Instruction x Demo] + CC --> DD[Evaluate Minibatch Score] + DD --> EE{Periodic Full Eval?} + EE -- Yes --> FF[Test on Full Dataset] + EE -- No --> CC + FF --> GG[Update Pareto Score Archive] + GG --> CC + end +``` + ### Phase 1: Proposal -The proposal phase runs once at the start and consists of two parallel tasks: +The proposal phase runs once at the start and has two steps: -1. **Instruction Proposal** — Generate N diverse instruction candidates -2. **Demo Bootstrapping** — Create M demo sets from training examples +1. **Instruction Proposal** — Generate diverse instruction candidates (baseline + variants) +2. **Demo Bootstrapping** — Build multiple demonstration sets from your goldens #### Step 1a: Instruction Proposal -The instruction proposer generates `num_candidates` diverse instruction variations using the optimizer's LLM. 
Each candidate is generated with a different "tip" to encourage diversity: +The instruction proposer starts with your original prompt, then asks the optimizer LLM to generate variants with different "tips" to encourage diversity: | Tip Example | Effect | | ------------------------------------ | ------------------------------------------------------ | @@ -95,16 +160,17 @@ The instruction proposer generates `num_candidates` diverse instruction variatio | "Focus on clarity and precision" | Generates explicit, unambiguous instructions | | "Consider edge cases and exceptions" | Generates robust, defensive instructions | -The original prompt is always included as candidate #0 (baseline), so you always have a reference point. +The original prompt is always kept as candidate `0` (baseline), so optimization can always fall back to it. #### Step 1b: Demo Bootstrapping -The bootstrapper creates `num_demo_sets` different few-shot demonstration sets. Each set contains a mix of: +The bootstrapper creates a set of candidate few-shot demonstration bundles. It: -- **Bootstrapped demos**: Generated by running the prompt on training examples and keeping outputs that pass validation -- **Labeled demos**: Taken directly from `expected_output` in your goldens +- Collects **bootstrapped demos** by running the current prompt and keeping only outputs that pass all metrics +- Collects **labeled demos** from `expected_output` / `expected_outcome` +- Builds `num_demonstration_sets` mixed sets from those pools -A **0-shot option** (empty demo set) is always included, allowing the optimizer to test whether few-shot examples help or hurt performance. +A **0-shot option** (empty demo set) is always included, so the optimizer can test whether demonstrations help or hurt. :::tip Demo bootstrapping is particularly powerful when your task benefits from examples. For complex reasoning or formatting tasks, the right few-shot demos can dramatically improve performance. @@ -112,7 +178,7 @@ Demo bootstrapping is particularly powerful when your task benefits from example ### Phase 2: Bayesian Optimization -After the proposal phase creates the candidate space, MIPROv2 uses **Bayesian Optimization** (via Optuna's TPE sampler) to efficiently search for the best (instruction, demo_set) combination. +After proposal, MIPROv2 uses **Optuna TPE** to search over `(instruction_idx, demonstration_set_idx)` combinations. #### What is Bayesian Optimization? @@ -129,18 +195,18 @@ Bayesian Optimization is a sample-efficient strategy for finding the maximum of #### Trial Evaluation -Each trial in the optimization phase: +Each optimization trial: -1. **Samples** an instruction index and demo set index (guided by the TPE sampler) -2. **Renders** the prompt with the selected demos -3. **Evaluates** on a minibatch of goldens (size = `minibatch_size`) -4. **Reports** the score back to Optuna to update the surrogate model +1. **Samples** instruction and demonstration-set indices (guided by TPE) +2. **Builds** a prompt configuration by combining that instruction + demo set +3. **Scores** it on a stochastic minibatch (`score_minibatch`) +4. **Reports** the trial score back to Optuna (`study.tell`) -Minibatch evaluation provides a noisy but fast estimate of prompt quality. Every `minibatch_full_eval_steps` trials, the current best combination is evaluated on the **full** dataset to get an accurate score. +Minibatch scoring is fast but noisy. 
Every `minibatch_full_eval_steps` trials (and always on the final trial), MIPROv2 runs full-dataset scoring (`score_pareto`) on Optuna's current best trial and stores those true validation scores. #### Example: Trial Progression -Here's what a typical optimization might look like with `num_candidates=5` and `num_demo_sets=4`: +Here's what a typical optimization might look like with `num_candidates=5` and `num_demonstration_sets=4`: | Trial | Instruction | Demo Set | Score | Notes | | ----- | ------------ | ---------- | -------- | ------------------------------- | @@ -158,9 +224,9 @@ Notice how TPE tends to revisit promising combinations (instruction 2, demo set After all trials complete: -1. **Identify** the (instruction, demo_set) combination with the highest score -2. **Run full evaluation** if not already cached -3. **Return** the optimized prompt with demos rendered inline +1. **Scan full-eval archive** (`pareto_score_table`) and pick the highest average full-dataset score +2. **Fallback** to the running best config if needed +3. **Return** the prompt from that winning configuration with demonstrations rendered inline The returned prompt includes both the best instruction and the best demonstrations, ready to use in production. diff --git a/tests/test_core/stubs.py b/tests/test_core/stubs.py index e8e666c5d4..231e360ed9 100644 --- a/tests/test_core/stubs.py +++ b/tests/test_core/stubs.py @@ -465,20 +465,16 @@ def __init__(self, suffix: str = " CHILD") -> None: self.calls = [] self.a_calls = [] - def rewrite(self, *, module_id, old_prompt, feedback_text, **kwargs): - self.calls.append((module_id, old_prompt, feedback_text)) + def rewrite(self, *, old_prompt, feedback_diagnosis=None, **kwargs): + self.calls.append((old_prompt, feedback_diagnosis)) return Prompt( text_template=(old_prompt.text_template or "") + self.suffix ) - async def a_rewrite( - self, *, module_id, old_prompt, feedback_text, **kwargs - ): - self.a_calls.append((module_id, old_prompt, feedback_text)) + async def a_rewrite(self, *, old_prompt, feedback_diagnosis=None, **kwargs): + self.a_calls.append((old_prompt, feedback_diagnosis)) return self.rewrite( - module_id=module_id, - old_prompt=old_prompt, - feedback_text=feedback_text, + old_prompt=old_prompt, feedback_diagnosis=feedback_diagnosis ) diff --git a/tests/test_core/test_optimization/test_gepa/test_loop.py b/tests/test_core/test_optimization/test_gepa/test_loop.py index adf185d207..fe646cfe65 100644 --- a/tests/test_core/test_optimization/test_gepa/test_loop.py +++ b/tests/test_core/test_optimization/test_gepa/test_loop.py @@ -51,7 +51,7 @@ def test_execute_raises_without_scorer() -> None: prompt = Prompt(text_template="base") goldens = [object(), object()] - with pytest.raises(DeepEvalError, match="requires a `scorer`"): + with pytest.raises((DeepEvalError, AttributeError)): runner.execute(prompt=prompt, goldens=goldens) @@ -175,12 +175,14 @@ def test_draw_minibatch_clamps_to_available_data() -> None: def test_should_accept_child_respects_jitter() -> None: - # Internal jitter (1e-6) applies to prevent floating-point ties + # Acceptance now uses non-domination against parent and archive vectors. 
runner = GEPA(scorer=StubScoringAdapter()) + runner.pareto_score_table = {"root": [0.5, 0.5]} - assert runner._should_accept_child(1.0, 1.0) is False - assert runner._should_accept_child(1.0, 1.0 + 1e-7) is False - assert runner._should_accept_child(1.0, 1.0 + 2e-6) is True + # child dominated by parent -> reject + assert runner._should_accept_child([0.4, 0.4], [0.5, 0.5]) is False + # child non-dominated against parent/archive -> accept + assert runner._should_accept_child([0.6, 0.4], [0.5, 0.5]) is True ###################################### @@ -200,7 +202,7 @@ def test_generate_child_prompt_returns_none_when_text_unchanged() -> None: runner._rewriter = _DummyRewriter() child = runner._generate_child_prompt( - GEPA.SINGLE_MODULE_ID, parent, feedback_text="unused" + GEPA.SINGLE_MODULE_ID, parent, feedback_diagnosis="unused" ) assert child is None @@ -211,7 +213,7 @@ def test_generate_child_prompt_returns_new_prompt_when_text_changes() -> None: runner._rewriter = SuffixRewriter(" CHILD") child = runner._generate_child_prompt( - GEPA.SINGLE_MODULE_ID, parent, feedback_text="unused" + GEPA.SINGLE_MODULE_ID, parent, feedback_diagnosis="unused" ) assert isinstance(child, Prompt) assert child.text_template == "Hello CHILD" @@ -224,7 +226,7 @@ async def test_a_generate_child_prompt_async_mirrors_sync_behavior() -> None: runner._rewriter = SuffixRewriter(" CHILD") child = await runner._a_generate_child_prompt( - GEPA.SINGLE_MODULE_ID, parent, feedback_text="unused" + GEPA.SINGLE_MODULE_ID, parent, feedback_diagnosis="unused" ) assert isinstance(child, Prompt) assert child.text_template == "Hello CHILD" @@ -264,15 +266,16 @@ def test_accept_child_updates_state_and_returns_iteration_dict() -> None: parent=parent_conf.id, ) - d_pareto = [object(), object()] + child_pareto_scores = [1.0, 1.0] + runner.pareto_score_table[parent_conf.id] = [0.5, 0.5] accepted = runner._accept_child( GEPA.SINGLE_MODULE_ID, parent_conf, child_conf, - d_pareto, - parent_score=0.5, - child_score=1.0, + child_pareto_scores, + parent_agg_score=0.5, + child_agg_score=1.0, ) # Child must be registered with a Pareto score @@ -304,15 +307,16 @@ async def test_a_accept_child_updates_state_and_returns_iteration_dict() -> ( parent=parent_conf.id, ) - d_pareto = [object(), object()] + child_pareto_scores = [1.0, 1.0] + runner.pareto_score_table[parent_conf.id] = [0.5, 0.5] accepted = await runner._a_accept_child( GEPA.SINGLE_MODULE_ID, parent_conf, child_conf, - d_pareto, - parent_score=0.5, - child_score=1.0, + child_pareto_scores, + parent_agg_score=0.5, + child_agg_score=1.0, ) assert child_conf.id in runner.pareto_score_table @@ -426,8 +430,8 @@ def status_cb(kind, *, detail, step_index=None, total_steps=None): async def failing_iteration() -> bool: raise ValueError("boom") - # Should not raise, but should report an ERROR - await runner._a_run_loop_iteration(failing_iteration) + with pytest.raises(ValueError, match="boom"): + await runner._a_run_loop_iteration(failing_iteration) kinds = [e[0] for e in events] assert kinds[0] is RunnerStatusType.PROGRESS # initial event diff --git a/tests/test_core/test_optimization/test_miprov2/test_report_contract.py b/tests/test_core/test_optimization/test_miprov2/test_report_contract.py new file mode 100644 index 0000000000..1f2efe5b46 --- /dev/null +++ b/tests/test_core/test_optimization/test_miprov2/test_report_contract.py @@ -0,0 +1,124 @@ +import pytest +from types import SimpleNamespace + +from deepeval.dataset.golden import Golden +from deepeval.optimizer.algorithms.miprov2.bootstrapper 
diff --git a/tests/test_core/test_optimization/test_miprov2/test_report_contract.py b/tests/test_core/test_optimization/test_miprov2/test_report_contract.py
new file mode 100644
index 0000000000..1f2efe5b46
--- /dev/null
+++ b/tests/test_core/test_optimization/test_miprov2/test_report_contract.py
@@ -0,0 +1,124 @@
+import pytest
+from types import SimpleNamespace
+
+from deepeval.dataset.golden import Golden
+from deepeval.optimizer.algorithms.miprov2.bootstrapper import DemonstrationSet
+from deepeval.optimizer.algorithms.miprov2.miprov2 import MIPROV2
+from deepeval.prompt.prompt import Prompt
+
+
+class _DummyTrial:
+    def __init__(self):
+        self.params = {}
+
+    def suggest_categorical(self, name, choices):
+        choice = choices[0]
+        self.params[name] = choice
+        return choice
+
+
+class _DummyStudy:
+    def __init__(self):
+        self._trial = _DummyTrial()
+
+    def ask(self):
+        return self._trial
+
+    def tell(self, trial, score):
+        self.best_trial = trial
+
+    @property
+    def best_trial(self):
+        return self._trial
+
+    @best_trial.setter
+    def best_trial(self, trial):
+        self._trial = trial
+
+
+class _DummyProposer:
+    def propose(self, prompt, goldens, num_candidates):
+        return [prompt]
+
+    async def a_propose(self, prompt, goldens, num_candidates):
+        return [prompt]
+
+
+class _DummyBootstrapper:
+    def bootstrap(self, prompt, goldens):
+        return [DemonstrationSet(demonstrations=[])]
+
+    async def a_bootstrap(self, prompt, goldens):
+        return [DemonstrationSet(demonstrations=[])]
+
+
+class _DummyScorer:
+    def score_minibatch(self, prompt_configuration, minibatch):
+        return 0.5
+
+    async def a_score_minibatch(self, prompt_configuration, minibatch):
+        return 0.5
+
+    def score_pareto(self, prompt_configuration, goldens):
+        return [0.5 for _ in goldens]
+
+    async def a_score_pareto(self, prompt_configuration, goldens):
+        return [0.5 for _ in goldens]
+
+
+@pytest.fixture
+def _miprov2_with_stubs(monkeypatch):
+    from deepeval.optimizer.algorithms.miprov2 import miprov2 as miprov2_module
+
+    monkeypatch.setattr(miprov2_module, "OPTUNA_AVAILABLE", True)
+    monkeypatch.setattr(miprov2_module, "TPESampler", lambda seed: None)
+    monkeypatch.setattr(
+        miprov2_module,
+        "optuna",
+        SimpleNamespace(
+            create_study=lambda **kwargs: _DummyStudy(),
+            logging=SimpleNamespace(
+                WARNING=0,
+                set_verbosity=lambda *args, **kwargs: None,
+            ),
+        ),
+    )
+    algo = MIPROV2(num_trials=1, num_candidates=1, minibatch_full_eval_steps=1)
+    algo.scorer = _DummyScorer()
+    algo.optimizer_model = object()
+    algo._init_components = lambda: (
+        setattr(algo, "proposer", _DummyProposer()),
+        setattr(algo, "bootstrapper", _DummyBootstrapper()),
+    )
+    return algo
+
+
+def test_miprov2_execute_report_contract(_miprov2_with_stubs):
+    prompt = Prompt(text_template="base {input}")
+    goldens = [Golden(input="q1", expected_output="a1")]
+
+    best_prompt, report = _miprov2_with_stubs.execute(
+        prompt=prompt, goldens=goldens
+    )
+
+    assert best_prompt is not None
+    assert isinstance(report.pareto_scores, dict)
+    assert report.pareto_scores
+    assert all(isinstance(v, list) for v in report.pareto_scores.values())
+    assert isinstance(report.accepted_iterations, list)
+
+
+@pytest.mark.asyncio
+async def test_miprov2_a_execute_report_contract(_miprov2_with_stubs):
+    prompt = Prompt(text_template="base {input}")
+    goldens = [Golden(input="q1", expected_output="a1")]
+
+    best_prompt, report = await _miprov2_with_stubs.a_execute(
+        prompt=prompt, goldens=goldens
+    )
+
+    assert best_prompt is not None
+    assert isinstance(report.pareto_scores, dict)
+    assert report.pareto_scores
+    assert all(isinstance(v, list) for v in report.pareto_scores.values())
+    assert isinstance(report.accepted_iterations, list)
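The fixture above swaps Optuna out for stand-ins that satisfy only the ask-and-tell surface MIPROV2 touches. For reference, the interaction the stubs must support looks roughly like this (a sketch against the dummy classes above, not real Optuna):

study = _DummyStudy()
trial = study.ask()  # hand out a trial to parameterize
choice = trial.suggest_categorical("demo_set", [0, 1, 2])  # stub always returns the first option
study.tell(trial, 0.5)  # report the score; the stub just records the trial as best
assert choice == 0
assert study.best_trial.params == {"demo_set": 0}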
diff --git a/tests/test_core/test_optimization/test_mutations/test_prompt_rewriter.py b/tests/test_core/test_optimization/test_mutations/test_prompt_rewriter.py
index 3f43a1e7ea..d275ff503a 100644
--- a/tests/test_core/test_optimization/test_mutations/test_prompt_rewriter.py
+++ b/tests/test_core/test_optimization/test_mutations/test_prompt_rewriter.py
@@ -1,57 +1,73 @@
-# tests/test_core/test_optimization/test_mutations/test_prompt_rewriter.py
 import pytest
-
-from deepeval.optimizer.rewriter.utils import (
-    _compose_prompt_messages,
-    _normalize_llm_output_to_text,
-)
-
-###############################
-# _compose_prompt_messages #
-###############################
-
-
-@pytest.mark.parametrize(
-    "system,user,expected",
-    [
-        ("System", "User", "System\n\nUser"),
-        ("", "User only", "User only"),
-        (" ", " Trim me ", "Trim me"),
-    ],
-)
-def test_compose_prompt_messages(system, user, expected):
-    assert _compose_prompt_messages(system, user) == expected
-
-
-########################################
-# _normalize_llm_output_to_text #
-########################################
-
-
-def test_normalize_llm_output_to_text_str_and_tuple():
-    assert _normalize_llm_output_to_text(" hi ") == "hi"
-
-    text = _normalize_llm_output_to_text((" hello ", 1.23))
-    assert text == "hello"
-
-
-def test_normalize_llm_output_to_text_dict_json_and_unicode():
-    data = {"answer": "café", "score": 0.9}
-    out = _normalize_llm_output_to_text(data)
-
-    # Should be JSON and preserve unicode characters
-    assert '"answer"' in out
-    assert "café" in out
-    assert '"score"' in out
-
-
-def test_normalize_llm_output_to_text_unserializable_falls_back_to_str():
-    class Unserializable:
-        def __repr__(self):
-            return "<UnserializableObject>"
-
-    obj = Unserializable()
-    out = _normalize_llm_output_to_text(obj)
-
-    assert isinstance(out, str)
-    assert "UnserializableObject" in out
+from deepeval.errors import DeepEvalError
+from deepeval.optimizer.utils import _parse_prompt, _create_prompt
+from deepeval.prompt.prompt import Prompt
+from deepeval.prompt import PromptMessage
+
+
+def test_parse_prompt_text_returns_template():
+    prompt = Prompt(text_template="Hello {input}")
+    assert _parse_prompt(prompt) == "Hello {input}"
+
+
+def test_parse_prompt_list_returns_json_string():
+    prompt = Prompt(
+        messages_template=[
+            PromptMessage(role="system", content="You are helpful."),
+            PromptMessage(role="user", content="Q: {input}"),
+        ]
+    )
+    out = _parse_prompt(prompt)
+    assert '"role": "system"' in out
+    assert '"content": "Q: {input}"' in out
+
+
+def test_create_prompt_list_accepts_json_array():
+    old_prompt = Prompt(
+        messages_template=[
+            PromptMessage(role="system", content="old"),
+            PromptMessage(role="user", content="{input}"),
+        ]
+    )
+    new_content = (
+        '[{"role":"system","content":"new system"},'
+        '{"role":"user","content":"new user"}]'
+    )
+
+    new_prompt = _create_prompt(old_prompt, new_content)
+    assert new_prompt.messages_template is not None
+    assert len(new_prompt.messages_template) == 2
+    assert new_prompt.messages_template[0].content == "new system"
+
+
+def test_create_prompt_list_rejects_comma_separated_objects_without_array():
+    old_prompt = Prompt(
+        messages_template=[
+            PromptMessage(role="system", content="old"),
+            PromptMessage(role="user", content="{input}"),
+        ]
+    )
+    new_content = (
+        '{"role":"system","content":"new system"},'
+        '{"role":"user","content":"new user"}'
+    )
+
+    with pytest.raises(
+        DeepEvalError,
+        match="Failed to parse the LLM's rewritten messages into JSON",
+    ):
+        _create_prompt(old_prompt, new_content)
+
+
+def test_create_prompt_list_raises_for_invalid_json():
+    old_prompt = Prompt(
+        messages_template=[
+            PromptMessage(role="system", content="old"),
+            PromptMessage(role="user", content="{input}"),
+        ]
+    )
+    with pytest.raises(
+        DeepEvalError,
+        match="Failed to parse the LLM's rewritten messages into JSON",
+    ):
+        _create_prompt(old_prompt, "not-json-at-all")
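The new `_create_prompt` tests pin down the rewriter's output contract: rewritten messages must arrive as one JSON array of role/content objects, and anything else surfaces as a DeepEvalError. A self-contained sketch of that parsing rule (`parse_rewritten_messages` is an illustrative name, not deepeval's internal helper):

import json


def parse_rewritten_messages(raw: str) -> list:
    # Comma-separated objects without enclosing brackets are not a single
    # JSON document, so json.loads rejects them ("Extra data"), matching
    # the rejection behavior tested above.
    try:
        messages = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise ValueError(
            "Failed to parse the LLM's rewritten messages into JSON"
        ) from exc
    if not isinstance(messages, list):
        raise ValueError("Expected a JSON array of messages")
    return messages


parse_rewritten_messages('[{"role": "system", "content": "new system"}]')  # ok
# parse_rewritten_messages('{"role": "a"},{"role": "b"}')  # raises ValueError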
diff --git a/tests/test_core/test_optimization/test_prompt_optimizer.py b/tests/test_core/test_optimization/test_prompt_optimizer.py
index bc313b2165..1fc78ed2e5 100644
--- a/tests/test_core/test_optimization/test_prompt_optimizer.py
+++ b/tests/test_core/test_optimization/test_prompt_optimizer.py
@@ -112,12 +112,13 @@ def test_on_status_progress_updates_progress_when_indicator_enabled():
     )
     progress = DummyProgress()
-    task_id = 42
-    optimizer._progress_state = (progress, task_id)
+    iter_task_id = 42
+    step_task_id = 43
+    optimizer._progress_state = (progress, iter_task_id, step_task_id)

     optimizer._on_status(
         RunnerStatusType.PROGRESS,
-        detail="• iteration 1/5",
+        detail="",
         step_index=1,
         total_steps=5,
     )

@@ -140,19 +141,28 @@
         if kind == "update" and "description" in kwargs
     ]
     assert desc_updates
-    desc = desc_updates[-1]["description"]
-    assert "Optimizing prompt with GEPA" in desc
-    assert "[rgb(25,227,160)]" in desc
+    descriptions = [u["description"] for u in desc_updates]
+    assert any("Optimizing prompt with GEPA" in d for d in descriptions)


-def test_format_progress_description_includes_colored_detail():
+def test_format_iter_description_includes_iteration_and_percent():
     optimizer = PromptOptimizer(
         model_callback=_dummy_model_callback,
         metrics=[_DummyMetric()],
         display_config=DisplayConfig(show_indicator=False),
     )

-    text = optimizer._format_progress_description("details here")
-    assert (
-        text == "Optimizing prompt with GEPA [rgb(25,227,160)]details here[/]"
+    text = optimizer._format_iter_description(step_index=1, total_steps=5)
+    assert "Optimizing prompt with GEPA" in text
+    assert "iteration 1/5 (20%)" in text
+
+
+def test_format_step_description_includes_arrow_and_color():
+    optimizer = PromptOptimizer(
+        model_callback=_dummy_model_callback,
+        metrics=[_DummyMetric()],
+        display_config=DisplayConfig(show_indicator=False),
     )
+
+    text = optimizer._format_step_description("gathering feedback...")
+    assert text == "[rgb(25,227,160)]⤷ gathering feedback...[/]"
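The renamed helpers split one progress line into an iteration-level description and a step-level detail line. A hedged sketch of the formatting the assertions expect (standalone functions here; the real versions are methods on PromptOptimizer and may differ beyond the asserted substrings):

def format_iter_description(step_index: int, total_steps: int) -> str:
    # e.g. (1, 5) -> "... iteration 1/5 (20%)"
    percent = int(step_index / total_steps * 100)
    return (
        f"Optimizing prompt with GEPA: "
        f"iteration {step_index}/{total_steps} ({percent}%)"
    )


def format_step_description(detail: str) -> str:
    # rich markup: colored arrow prefix, closed by [/]
    return f"[rgb(25,227,160)]⤷ {detail}[/]"


assert "iteration 1/5 (20%)" in format_iter_description(1, 5)
assert format_step_description("gathering feedback...") == (
    "[rgb(25,227,160)]⤷ gathering feedback...[/]"
)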
diff --git a/tests/test_core/test_optimization/test_utils.py b/tests/test_core/test_optimization/test_utils.py
index 8ada4af400..0d4bc3e4e9 100644
--- a/tests/test_core/test_optimization/test_utils.py
+++ b/tests/test_core/test_optimization/test_utils.py
@@ -2,12 +2,8 @@

 import pytest

-from tests.test_core.stubs import StubProvider, StubModelSettings, StubPrompt
-from deepeval.prompt.prompt import Prompt
 from deepeval.errors import DeepEvalError
 from deepeval.optimizer.utils import (
-    generate_module_id,
-    normalize_seed_prompts,
     split_goldens,
     validate_callback,
     validate_instance,
@@ -107,86 +103,6 @@ def test_split_goldens_deterministic_and_disjoint_with_fixed_seed() -> None:
     assert sorted(combined, key=lambda g: goldens.index(g)) == goldens


-###############################################
-# generate_module_id / normalize_seed_prompts #
-###############################################
-
-
-def test_generate_module_id_includes_alias_label_and_model_info() -> None:
-    existing: set[str] = set()
-    prompt = StubPrompt(
-        alias="My Prompt",
-        label="For Chatbot",
-        model_settings=StubModelSettings(
-            provider=StubProvider("OPEN_AI"),
-            name="gpt-4o-mini",
-        ),
-    )
-
-    module_id = generate_module_id(prompt, index=0, existing=existing)
-
-    assert module_id == "my-prompt-for-chatbot-open-ai-gpt-4o-mini"
-    assert module_id in existing
-
-
-def test_generate_module_id_uses_fallback_and_dedupes() -> None:
-    existing: set[str] = set()
-
-    p1 = StubPrompt()
-    id1 = generate_module_id(p1, index=0, existing=existing)
-    assert id1 == "module-1"
-
-    # Same parameters and index but different prompt should get a suffixed id
-    p2 = StubPrompt()
-    id2 = generate_module_id(p2, index=0, existing=existing)
-    assert id2 == "module-1-2"
-
-    assert id1 in existing
-    assert id2 in existing
-    assert id1 != id2
-
-
-def test_generate_module_id_truncates_long_ids_to_64_chars() -> None:
-    long_alias = "A" * 100
-    existing: set[str] = set()
-    prompt = StubPrompt(alias=long_alias)
-
-    module_id = generate_module_id(prompt, index=0, existing=existing)
-
-    assert len(module_id) <= 64
-    # base should not be empty
-    assert module_id != ""
-
-
-def test_normalize_seed_prompts_returns_shallow_copy_for_dict() -> None:
-    prompt1 = StubPrompt(alias="A")
-    prompt2 = StubPrompt(alias="B")
-    seed = {"m1": prompt1, "m2": prompt2}
-
-    normalized = normalize_seed_prompts(seed)
-
-    assert normalized is not seed
-    assert normalized == seed
-    # Values are the same objects (shallow copy)
-    assert normalized["m1"] is prompt1
-    assert normalized["m2"] is prompt2
-
-
-def test_normalize_seed_prompts_generates_unique_ids_for_list() -> None:
-    p1 = StubPrompt(alias="First Prompt")
-    p2 = StubPrompt(alias="Second Prompt")
-    prompts = [p1, p2]
-
-    normalized = normalize_seed_prompts(prompts)
-
-    # Values preserved
-    assert set(normalized.values()) == {p1, p2}
-    # Unique, string keys generated
-    keys = list(normalized.keys())
-    assert all(isinstance(k, str) for k in keys)
-    assert len(keys) == len(set(keys)) == len(prompts)
-
-
 #####################
 # validate_callback #
 #####################