
Commit 559b810

Author: N Kulin

update for 1.2.0

- added new metrics
- optimized default llm (fp16)
1 parent ecdbb49 commit 559b810

File tree

9 files changed: 1903 additions & 1136 deletions


coolprompt/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+from .assistant import PromptTuner
+
+__all__ = [
+    "PromptTuner",
+]
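With this new `__init__.py`, the tuner is re-exported at the package root, so it can be imported without reaching into `coolprompt.assistant`. A minimal sketch; the constructor arguments are not part of this diff and are omitted here:

# Shorter import path enabled by the package-level re-export.
from coolprompt import PromptTuner
# from coolprompt.assistant import PromptTuner  # the previous path still works

tuner = PromptTuner()  # constructor arguments omitted; not shown in this commit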

coolprompt/assistant.py

Lines changed: 55 additions & 2 deletions
@@ -151,6 +151,14 @@ def run(
         generate_num_samples: int = 10,
         feedback: bool = True,
         verbose: int = 1,
+        llm_as_judge_criteria: str | list[str] = "relevance",
+        llm_as_judge_custom_templates: Optional[dict[str, str]] = None,
+        llm_as_judge_metric_ceil: int = 10,
+        geval_criteria: Optional[str] = None,
+        geval_evaluation_steps: Optional[list[str]] = None,
+        geval_evaluation_params: Optional[list] = None,
+        geval_strict_mode: bool = False,
+        return_final_prompt: bool = True,
         **kwargs,
     ) -> str:
         """Optimizes prompts using provided model.
@@ -189,6 +197,32 @@ def run(
                 0 - no logging
                 1 - steps logging
                 2 - steps and prompts logging
+            llm_as_judge_criteria (str | list[str]): Criteria for LLM-as-judge metric when
+                metric == 'llm_as_judge'. Accepts a single criterion (e.g., "relevance")
+                or a list of criteria (e.g., ["relevance", "fluency"]). Built-in
+                keys: "accuracy", "coherence", "fluency", "relevance". Custom
+                names are supported when paired with `llm_as_judge_custom_templates`.
+            llm_as_judge_custom_templates (dict[str, str] | None): Optional mapping
+                from criterion name to a custom judge prompt template. Each
+                template must include placeholders: `{metric_ceil}`, `{request}`
+                and `{response}`; the judge must return ONLY a single number.
+            llm_as_judge_metric_ceil (int): Maximum integer score expected from the
+                judge (1..ceil). Judge outputs are clipped to [0, ceil] and
+                normalized to [0, 1] for averaging.
+            geval_criteria (str | None): High-level natural language description
+                of what GEval should evaluate. Mutually exclusive with
+                `geval_evaluation_steps`. If both are provided, GEvalMetric
+                will raise a ValueError.
+            geval_evaluation_steps (list[str] | None): Explicit step-by-step
+                instructions for GEval. If provided, `geval_criteria` must be
+                None.
+            geval_evaluation_params (list | None): Optional list of
+                LLMTestCaseParams controlling which fields of each
+                LLMTestCase are visible to GEval. Defaults to
+                [INPUT, ACTUAL_OUTPUT, EXPECTED_OUTPUT] inside GEvalMetric
+                when left as None.
+            geval_strict_mode (bool): When True, GEval behaves in strict mode
+                (binary pass/fail with threshold forced to 1).
             **kwargs (dict[str, Any]): other key-word arguments.

         Returns:
@@ -229,7 +263,22 @@ def run(
             problem_description,
             validation_size,
         )
-        metric = validate_and_create_metric(task, metric)
+        metric = validate_and_create_metric(
+            task,
+            metric,
+            model=(
+                self._system_model
+                if metric in ("llm_as_judge", "geval")
+                else None
+            ),
+            llm_as_judge_criteria=llm_as_judge_criteria,
+            llm_as_judge_custom_templates=llm_as_judge_custom_templates,
+            llm_as_judge_metric_ceil=llm_as_judge_metric_ceil,
+            geval_criteria=geval_criteria,
+            geval_evaluation_steps=geval_evaluation_steps,
+            geval_evaluation_params=geval_evaluation_params,
+            geval_strict_mode=geval_strict_mode,
+        )
         evaluator = Evaluator(self._target_model, task, metric)
         final_prompt = ""
         generator = SyntheticDataGenerator(self._system_model)
@@ -329,10 +378,14 @@ def run(
         if feedback:
             prompt_assistant = PromptAssistant(self._target_model)
             self.assistant_feedback = correct(
-                prompt=prompt_assistant.get_feedback(start_prompt, final_prompt),
+                prompt=prompt_assistant.get_feedback(
+                    start_prompt, final_prompt
+                ),
                 rule=LanguageRule(self._system_model),
                 start_prompt=start_prompt,
             )

             logger.info("=== Assistant's feedback ===")
             logger.info(self.assistant_feedback)
+
+        return final_prompt if return_final_prompt else None
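The judge and GEval settings are plain keyword arguments of `PromptTuner.run`, so enabling the new metrics is a call-site change. A hedged usage sketch; the `start_prompt` and `metric` argument names are assumptions inferred from the docstring, and the custom template below is illustrative rather than shipped with the library:

from coolprompt import PromptTuner

tuner = PromptTuner()

# A custom judge template must contain the {metric_ceil}, {request} and
# {response} placeholders and ask the judge to return ONLY a single number.
CONCISENESS_TEMPLATE = (
    "Rate the conciseness of the response on a scale from 1 to {metric_ceil}.\n"
    "Request: {request}\n"
    "Response: {response}\n"
    "Return ONLY a single number."
)

final_prompt = tuner.run(
    start_prompt="Summarize the following support ticket.",  # assumed name
    metric="llm_as_judge",                                    # assumed name
    llm_as_judge_criteria=["relevance", "conciseness"],
    llm_as_judge_custom_templates={"conciseness": CONCISENESS_TEMPLATE},
    llm_as_judge_metric_ceil=10,
    return_final_prompt=True,
)

# GEval variant: pass either geval_criteria or geval_evaluation_steps, never
# both, otherwise GEvalMetric raises a ValueError.
# tuner.run(..., metric="geval",
#           geval_criteria="Judge whether the answer is factually consistent.")

Per-criterion judge scores are clipped to [0, llm_as_judge_metric_ceil] and normalized to [0, 1] before averaging, so built-in and custom criteria stay on a single scale.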

coolprompt/evaluator/evaluator.py

Lines changed: 2 additions & 2 deletions
@@ -78,7 +78,7 @@ def evaluate(
             a.content if isinstance(a, AIMessage) else a for a in answers
         ]

-        return self.metric.compute(answers, targets)
+        return self.metric.compute(answers, targets, dataset)

     def _get_full_prompt(
         self,
@@ -121,4 +121,4 @@ def _get_default_template(self) -> str:
             case Task.CLASSIFICATION:
                 return CLASSIFICATION_TASK_TEMPLATE
             case Task.GENERATION:
-                return GENERATION_TASK_TEMPLATE
+                return GENERATION_TASK_TEMPLATE
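Because Evaluator.evaluate now calls `self.metric.compute(answers, targets, dataset)`, a user-defined metric must accept the dataset as a third argument even if it ignores it. A minimal sketch of a compatible metric; the class name and the absence of a common base class are assumptions, since the metric interface itself is not shown in this diff:

class ExactMatchMetric:
    """Toy metric matching the new compute(answers, targets, dataset) call."""

    def compute(self, answers, targets, dataset):
        # dataset carries the original inputs for metrics that need them
        # (e.g. LLM-as-judge); this simple metric ignores it.
        matches = sum(a.strip() == t.strip() for a, t in zip(answers, targets))
        return matches / max(len(targets), 1)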
