@@ -151,6 +151,14 @@ def run(
         generate_num_samples: int = 10,
         feedback: bool = True,
         verbose: int = 1,
+        llm_as_judge_criteria: str | list[str] = "relevance",
+        llm_as_judge_custom_templates: Optional[dict[str, str]] = None,
+        llm_as_judge_metric_ceil: int = 10,
+        geval_criteria: Optional[str] = None,
+        geval_evaluation_steps: Optional[list[str]] = None,
+        geval_evaluation_params: Optional[list] = None,
+        geval_strict_mode: bool = False,
+        return_final_prompt: bool = True,
         **kwargs,
-    ) -> str:
+    ) -> str | None:
         """Optimizes prompts using provided model.
@@ -189,6 +197,36 @@ def run(
                 0 - no logging
                 1 - steps logging
                 2 - steps and prompts logging
+            llm_as_judge_criteria (str | list[str]): Criteria for the
+                LLM-as-judge metric when metric == 'llm_as_judge'. Accepts a
+                single criterion (e.g., "relevance") or a list of criteria
+                (e.g., ["relevance", "fluency"]). Built-in keys: "accuracy",
+                "coherence", "fluency", "relevance". Custom names are
+                supported when paired with `llm_as_judge_custom_templates`.
+            llm_as_judge_custom_templates (dict[str, str] | None): Optional
+                mapping from criterion name to a custom judge prompt template.
+                Each template must include the placeholders `{metric_ceil}`,
+                `{request}`, and `{response}`; the judge must return ONLY a
+                single number.
+            llm_as_judge_metric_ceil (int): Maximum integer score expected
+                from the judge (1..ceil). Judge outputs are clipped to
+                [0, ceil] and normalized to [0, 1] for averaging.
+            geval_criteria (str | None): High-level natural-language
+                description of what GEval should evaluate. Mutually exclusive
+                with `geval_evaluation_steps`; if both are provided,
+                GEvalMetric raises a ValueError.
+            geval_evaluation_steps (list[str] | None): Explicit step-by-step
+                instructions for GEval. If provided, `geval_criteria` must be
+                None.
+            geval_evaluation_params (list | None): Optional list of
+                LLMTestCaseParams controlling which fields of each LLMTestCase
+                are visible to GEval. Defaults to [INPUT, ACTUAL_OUTPUT,
+                EXPECTED_OUTPUT] inside GEvalMetric when left as None.
+            geval_strict_mode (bool): When True, GEval runs in strict mode
+                (binary pass/fail with the threshold forced to 1).
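+            return_final_prompt (bool): When True (default), the optimized
+                prompt is returned from this method; when False, the method
+                returns None.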
             **kwargs (dict[str, Any]): other key-word arguments.

         Returns:
@@ -229,7 +267,22 @@ def run(
             problem_description,
             validation_size,
         )
-        metric = validate_and_create_metric(task, metric)
+        metric = validate_and_create_metric(
+            task,
+            metric,
+            model=(
+                self._system_model
+                if metric in ("llm_as_judge", "geval")
+                else None
+            ),
+            llm_as_judge_criteria=llm_as_judge_criteria,
+            llm_as_judge_custom_templates=llm_as_judge_custom_templates,
+            llm_as_judge_metric_ceil=llm_as_judge_metric_ceil,
+            geval_criteria=geval_criteria,
+            geval_evaluation_steps=geval_evaluation_steps,
+            geval_evaluation_params=geval_evaluation_params,
+            geval_strict_mode=geval_strict_mode,
+        )
         evaluator = Evaluator(self._target_model, task, metric)
         final_prompt = ""
         generator = SyntheticDataGenerator(self._system_model)
@@ -329,10 +382,14 @@ def run(
         if feedback:
             prompt_assistant = PromptAssistant(self._target_model)
             self.assistant_feedback = correct(
-                prompt=prompt_assistant.get_feedback(start_prompt, final_prompt),
+                prompt=prompt_assistant.get_feedback(
+                    start_prompt, final_prompt
+                ),
                 rule=LanguageRule(self._system_model),
                 start_prompt=start_prompt,
             )

             logger.info("=== Assistant's feedback ===")
             logger.info(self.assistant_feedback)
+
+        return final_prompt if return_final_prompt else None
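
A minimal usage sketch for the new parameters. The `optimizer` instance and the elided positional arguments are hypothetical; only the keyword arguments shown here come from this diff, and the custom template follows the `{metric_ceil}`/`{request}`/`{response}` placeholder contract documented in the docstring above.

    # Hypothetical optimizer instance; elided arguments (...) are assumptions.
    conciseness_template = (
        "Rate the response for conciseness from 1 to {metric_ceil}.\n"
        "Request: {request}\n"
        "Response: {response}\n"
        "Return ONLY a single number."
    )

    final_prompt = optimizer.run(
        ...,  # task/model arguments elided
        metric="llm_as_judge",
        llm_as_judge_criteria=["relevance", "conciseness"],  # built-in + custom key
        llm_as_judge_custom_templates={"conciseness": conciseness_template},
        llm_as_judge_metric_ceil=10,
    )

    # GEval accepts either a high-level criteria string or explicit steps, never both:
    final_prompt = optimizer.run(
        ...,
        metric="geval",
        geval_evaluation_steps=[
            "Check that the answer addresses the request.",
            "Penalize factual errors.",
        ],
    )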