
Commit 2ccbdc8

feat: integrate panel of judges
1 parent 34cbae4

11 files changed
Lines changed: 600 additions & 41 deletions

src/lightspeed_evaluation/core/constants.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -60,6 +60,9 @@
 
 DEFAULT_API_NUM_RETRIES = 3
 
+# Frameworks that don't require judge LLM (NLP, script-based evaluations)
+NON_LLM_FRAMEWORKS = frozenset({"nlp", "script"})
+
 DEFAULT_LLM_PROVIDER = "openai"
 DEFAULT_LLM_MODEL = "gpt-4o-mini"
 DEFAULT_SSL_VERIFY = True
@@ -96,6 +99,8 @@
     "api_output_tokens",
     "judge_llm_input_tokens",
     "judge_llm_output_tokens",
+    # Per-judge scores (JSON array with one entry for single judge)
+    "judge_scores",
     # Streaming performance metrics
     "time_to_first_token",
     "streaming_duration",
```

src/lightspeed_evaluation/core/llm/manager.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -163,6 +163,22 @@ def get_primary_judge(self) -> "LLMManager":
             return self.judge_managers[0]
         return self
 
+    def get_judges_for_metric(self, metric_identifier: str) -> list["LLMManager"]:
+        """Get list of judges to use for a specific metric.
+
+        Returns all judges if metric should use panel, otherwise returns
+        list with single primary judge. Always returns a list.
+
+        Args:
+            metric_identifier: Metric identifier (e.g., "ragas:faithfulness")
+
+        Returns:
+            List of LLMManager instances to use for this metric
+        """
+        if self.should_use_panel_for_metric(metric_identifier):
+            return self.get_judge_managers()
+        return [self.get_primary_judge()]
+
     def should_use_panel_for_metric(self, metric_identifier: str) -> bool:
         """Determine if a metric should use judge panel based on enabled_metrics.
 
```
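A usage sketch for the new accessor, assuming a hypothetical `score_fn` supplied by the evaluation framework; only `get_judges_for_metric` itself comes from this commit:

```python
from lightspeed_evaluation.core.llm.manager import LLMManager

def collect_judge_scores(
    manager: LLMManager, metric_identifier: str, score_fn
) -> list[float]:
    """Hypothetical driver: fan a metric out to every judge in the panel.

    Because get_judges_for_metric always returns a list, this loop runs
    exactly once (against the primary judge) when panel mode is off for
    the metric.
    """
    judges = manager.get_judges_for_metric(metric_identifier)
    return [score_fn(judge) for judge in judges]
```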
src/lightspeed_evaluation/core/models/__init__.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -8,9 +8,10 @@
 from lightspeed_evaluation.core.models.data import (
     EvaluationData,
     EvaluationRequest,
-    MetricResult,
     EvaluationResult,
     EvaluationScope,
+    JudgeScore,
+    MetricResult,
     TurnData,
 )
 from lightspeed_evaluation.core.models.mixins import StreamingMetricsMixin
@@ -34,6 +35,7 @@
     "TurnData",
     "EvaluationData",
     "EvaluationRequest",
+    "JudgeScore",
     "MetricResult",
     "EvaluationResult",
     "EvaluationScope",
```

src/lightspeed_evaluation/core/models/data.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -419,6 +419,24 @@ def validate_conversation_metrics(
         return v
 
 
+class JudgeScore(BaseModel):
+    """Model for individual judge evaluation score in a judge panel.
+
+    Used when multiple judges evaluate the same metric, storing per-judge
+    details for transparency and analysis.
+    """
+
+    judge_id: str = Field(
+        ..., min_length=1, description="Judge identifier (model ID from llm_pool)"
+    )
+    score: Optional[float] = Field(
+        default=None, ge=0.0, le=1.0, description="Score between 0 and 1"
+    )
+    reason: str = Field(default="", description="Explanation from this judge")
+    input_tokens: int = Field(default=0, ge=0, description="Input tokens used")
+    output_tokens: int = Field(default=0, ge=0, description="Output tokens used")
+
+
 class MetricResult(BaseModel):
     """Model for framework metric result."""
 
@@ -439,6 +457,10 @@ class MetricResult(BaseModel):
     judge_llm_output_tokens: int = Field(
         default=0, ge=0, description="Judge LLM output tokens used"
    )
+    judge_scores: Optional[list[JudgeScore]] = Field(
+        default=None,
+        description="Per-judge scores when using judge panel (for transparency)",
+    )
 
     @field_validator("result")
     @classmethod
```
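The model composes as below; every value is invented for illustration, and the mean is just one way a consumer might aggregate a panel (this excerpt doesn't show the commit's actual aggregation strategy). Note that the `__init__.py` change above makes `JudgeScore` importable from the package root of `core.models`:

```python
from lightspeed_evaluation.core.models import JudgeScore

# Two illustrative per-judge entries (all values made up); in the framework
# a list like this is attached to MetricResult.judge_scores.
panel = [
    JudgeScore(judge_id="gpt-4o-mini", score=0.9,
               reason="Grounded in context", input_tokens=812, output_tokens=54),
    JudgeScore(judge_id="claude-3-haiku", score=0.7,
               reason="One unsupported claim", input_tokens=798, output_tokens=61),
]

# Pydantic enforces the declared constraints: a score outside [0, 1] or an
# empty judge_id raises a ValidationError at construction time.
scored = [js.score for js in panel if js.score is not None]
mean_score = sum(scored) / len(scored)  # 0.8
```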

src/lightspeed_evaluation/core/output/generator.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -196,6 +196,13 @@ def _generate_csv_report(
                 # Special formatting for execution_time
                 if column == "execution_time" and value is not None:
                     row_data.append(f"{value:.3f}")
+                # Convert judge_scores to JSON string
+                elif column == "judge_scores" and value is not None:
+                    row_data.append(
+                        json.dumps(
+                            [js.model_dump() for js in value], default=str
+                        )
+                    )
                 else:
                     row_data.append(value)
             else:
@@ -254,6 +261,12 @@ def _generate_json_summary(  # pylint: disable=too-many-arguments,too-many-posit
                 "execution_time": round(r.execution_time, 3),
                 "judge_llm_input_tokens": r.judge_llm_input_tokens,
                 "judge_llm_output_tokens": r.judge_llm_output_tokens,
+                # Judge panel scores (when using multiple judges)
+                "judge_scores": (
+                    [js.model_dump() for js in r.judge_scores]
+                    if r.judge_scores
+                    else None
+                ),
                 # Streaming performance metrics
                 "time_to_first_token": r.time_to_first_token,
                 "streaming_duration": r.streaming_duration,
```

src/lightspeed_evaluation/core/system/loader.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -16,6 +16,10 @@
     SystemConfig,
     VisualizationConfig,
 )
+from lightspeed_evaluation.core.models.system import (
+    JudgePanelConfig,
+    LLMPoolConfig,
+)
 from lightspeed_evaluation.core.system.setup import (
     setup_environment_variables,
     setup_logging,
@@ -156,6 +160,17 @@ def load_system_config(self, config_path: str) -> SystemConfig:
     def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig:
         """Create SystemConfig object from validated configuration data."""
         metrics_metadata = config_data.get("metrics_metadata", {})
+
+        # Parse llm_pool if present
+        llm_pool = None
+        if "llm_pool" in config_data:
+            llm_pool = LLMPoolConfig(**config_data["llm_pool"])
+
+        # Parse judge_panel if present
+        judge_panel = None
+        if "judge_panel" in config_data:
+            judge_panel = JudgePanelConfig(**config_data["judge_panel"])
+
         return SystemConfig(
             core=CoreConfig(**config_data.get("core", {})),
             llm=LLMConfig(**config_data.get("llm", {})),
@@ -164,6 +179,8 @@ def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig:
             output=OutputConfig(**config_data.get("output", {})),
             logging=LoggingConfig(**config_data.get("logging", {})),
             visualization=VisualizationConfig(**config_data.get("visualization", {})),
+            llm_pool=llm_pool,
+            judge_panel=judge_panel,
             default_turn_metrics_metadata=metrics_metadata.get("turn_level", {}),
             default_conversation_metrics_metadata=metrics_metadata.get(
                 "conversation_level", {}
```
