Skip to content

Commit dfa74f6

Browse files
committed
[LEADS-389] Support user-defined metadata in evaluation data format for GDS quality grading and traceability
1 parent 454d871 commit dfa74f6

14 files changed

Lines changed: 677 additions & 28 deletions

File tree

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ For field tables, full YAML examples (file-only, file + SQLite, file + Postgres)
320320
| Field | Type | Required | Description |
321321
|---------------------------------|----------------|----------|----------------------------------------------------------------------|
322322
| `conversation_group_id` | string | ✅ | Unique identifier for conversation |
323+
| `metadata` | ConversationMetadata | ❌ | User-defined metadata for traceability and quality grading |
323324
| `description` | string | ❌ | Optional description |
324325
| `tag` | string | ❌ | Tag for grouping eval conversations (default: "eval") |
325326
| `setup_script` | string | ❌ | Path to setup script (Optional, used when API is enabled) |
@@ -333,6 +334,7 @@ For field tables, full YAML examples (file-only, file + SQLite, file + Postgres)
333334
| Field | Type | Required | Description | API Populated |
334335
|-----------------------|------------------|----------|--------------------------------------|-----------------------|
335336
| `turn_id` | string | ✅ | Unique identifier for the turn | ❌ |
337+
| `metadata` | TurnMetadata | ❌ | User-defined metadata for traceability and quality grading | ❌ |
336338
| `query` | string | ✅ | The question/prompt to evaluate | ❌ |
337339
| `response` | string | 📋 | Actual response from system | ✅ (if API enabled) |
338340
| `contexts` | list[string] | 📋 | Context information for evaluation | ✅ (if API enabled) |

src/lightspeed_evaluation/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,14 @@
2424
from lightspeed_evaluation.core.llm import LLMManager
2525
from lightspeed_evaluation.core.models import (
2626
APIConfig,
27+
ConversationMetadata,
28+
DatasetMetadata,
2729
EvaluationData,
2830
EvaluationResult,
2931
LLMConfig,
3032
LoggingConfig,
3133
TurnData,
34+
TurnMetadata,
3235
VisualizationConfig,
3336
)
3437
from lightspeed_evaluation.core.models.summary import EvaluationSummary
@@ -79,6 +82,12 @@
7982
"VisualizationConfig": ("lightspeed_evaluation.core.models", "VisualizationConfig"),
8083
"EvaluationData": ("lightspeed_evaluation.core.models", "EvaluationData"),
8184
"TurnData": ("lightspeed_evaluation.core.models", "TurnData"),
85+
"TurnMetadata": ("lightspeed_evaluation.core.models", "TurnMetadata"),
86+
"ConversationMetadata": (
87+
"lightspeed_evaluation.core.models",
88+
"ConversationMetadata",
89+
),
90+
"DatasetMetadata": ("lightspeed_evaluation.core.models", "DatasetMetadata"),
8291
"EvaluationResult": ("lightspeed_evaluation.core.models", "EvaluationResult"),
8392
"EvaluationSummary": (
8493
"lightspeed_evaluation.core.models.summary",

src/lightspeed_evaluation/api.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
print(summary.by_metric)
2424
"""
2525

26-
from typing import Optional
26+
from typing import TYPE_CHECKING, Optional
2727

2828
from lightspeed_evaluation.core.models import (
2929
EvaluationData,
@@ -35,22 +35,31 @@
3535
from lightspeed_evaluation.core.system import ConfigLoader
3636
from lightspeed_evaluation.pipeline.evaluation import EvaluationPipeline
3737

38+
if TYPE_CHECKING:
39+
from lightspeed_evaluation.core.models.data import DatasetMetadata
40+
3841

3942
def evaluate(
4043
config: SystemConfig,
4144
data: list[EvaluationData],
4245
output_dir: Optional[str] = None,
46+
original_data_path: Optional[str] = None,
47+
dataset_metadata: Optional["DatasetMetadata"] = None,
4348
) -> list[EvaluationResult]:
4449
"""Run evaluation on the provided data using the given configuration.
4550
4651
Creates a fully-initialized pipeline from the ``SystemConfig``, runs
47-
evaluation on every conversation in *data*, and returns the raw results.
52+
evaluation on every conversation in *data*, and returns the raw results.
4853
No reports are generated -- file I/O is the caller's responsibility.
4954
5055
Args:
5156
config: A pre-built SystemConfig instance.
5257
data: List of EvaluationData conversations to evaluate.
5358
output_dir: Optional override for the output directory.
59+
original_data_path: Path to the original evaluation data file.
60+
Required for saving amended data when agents are enabled.
61+
dataset_metadata: Optional dataset-level metadata to preserve in
62+
amended output files.
5463
5564
Returns:
5665
List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,7 +70,11 @@ def evaluate(
6170
loader = ConfigLoader.from_config(config)
6271
pipeline = EvaluationPipeline(loader, output_dir)
6372
try:
64-
return pipeline.run_evaluation(data)
73+
return pipeline.run_evaluation(
74+
data,
75+
original_data_path=original_data_path,
76+
dataset_metadata=dataset_metadata,
77+
)
6578
finally:
6679
pipeline.close()
6780

src/lightspeed_evaluation/core/models/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@
1414
AttachmentData,
1515
)
1616
from lightspeed_evaluation.core.models.data import (
17+
ConversationMetadata,
18+
DatasetMetadata,
1719
EvaluationData,
1820
EvaluationRequest,
1921
EvaluationResult,
2022
EvaluationScope,
2123
JudgeScore,
2224
MetricResult,
2325
TurnData,
26+
TurnMetadata,
2427
)
2528
from lightspeed_evaluation.core.models.llm import (
2629
EmbeddingConfig,
@@ -61,7 +64,10 @@
6164
"ProposalAgentConfig",
6265
# Data models
6366
"TurnData",
67+
"TurnMetadata",
6468
"EvaluationData",
69+
"ConversationMetadata",
70+
"DatasetMetadata",
6571
"EvaluationRequest",
6672
"JudgeScore",
6773
"MetricResult",

src/lightspeed_evaluation/core/models/data.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,39 @@
1212
logger = logging.getLogger(__name__)
1313

1414

15+
class TurnMetadata(BaseModel):
16+
"""Optional user-defined metadata for a single turn.
17+
18+
Schema-free: any key-value pairs are accepted.
19+
See the Evaluation Data Collection Guide for recommended fields
20+
(e.g. complexity, data_source, human_verified, persona).
21+
"""
22+
23+
model_config = ConfigDict(extra="allow")
24+
25+
26+
class ConversationMetadata(BaseModel):
27+
"""Optional user-defined metadata for a conversation group.
28+
29+
Schema-free: any key-value pairs are accepted.
30+
See the Evaluation Data Collection Guide for recommended fields
31+
(e.g. scenario_category, use_case, interaction_type, topic).
32+
"""
33+
34+
model_config = ConfigDict(extra="allow")
35+
36+
37+
class DatasetMetadata(BaseModel):
38+
"""Optional user-defined metadata for the entire evaluation dataset.
39+
40+
Schema-free: any key-value pairs are accepted.
41+
See the Evaluation Data Collection Guide for recommended fields
42+
(e.g. team_product, dataset_version, pii_confirmed_removed).
43+
"""
44+
45+
model_config = ConfigDict(extra="allow")
46+
47+
1548
def _validate_and_deduplicate_metrics(
1649
metrics: list[str], metric_type: str = "metric"
1750
) -> list[str]:
@@ -39,6 +72,10 @@ class TurnData(StreamingMetricsMixin):
3972
model_config = ConfigDict(extra="forbid")
4073

4174
turn_id: str = Field(..., min_length=1, description="Turn ID (alphanumeric)")
75+
metadata: Optional[TurnMetadata] = Field(
76+
default=None,
77+
description="User-defined metadata for traceability and quality grading",
78+
)
4279
query: Optional[str] = Field(
4380
default=None,
4481
min_length=1,
@@ -428,6 +465,10 @@ class EvaluationData(BaseModel):
428465
conversation_group_id: str = Field(
429466
..., min_length=1, description="Unique conversation group identifier"
430467
)
468+
metadata: Optional[ConversationMetadata] = Field(
469+
default=None,
470+
description="User-defined metadata for traceability and quality grading",
471+
)
431472
description: Optional[str] = Field(
432473
default=None,
433474
min_length=1,

src/lightspeed_evaluation/core/output/data_persistence.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,28 @@
22

33
from datetime import UTC, datetime
44
from pathlib import Path
5-
from typing import Optional
5+
from typing import Any, Optional
66

77
import yaml
88

99
from lightspeed_evaluation.core.constants import DEFAULT_OUTPUT_DIR
1010
from lightspeed_evaluation.core.models import EvaluationData
11+
from lightspeed_evaluation.core.models.data import DatasetMetadata
1112

1213

13-
# Use caching
1414
def save_evaluation_data(
1515
evaluation_data: list[EvaluationData],
1616
original_data_path: str,
1717
output_dir: str = DEFAULT_OUTPUT_DIR,
18+
dataset_metadata: Optional[DatasetMetadata] = None,
1819
) -> Optional[str]:
19-
"""Save amended evaluation data to output directory with timestamp."""
20+
"""Save amended evaluation data to output directory with timestamp.
21+
22+
When *dataset_metadata* is provided the file is written in the dict
23+
format (``metadata`` + ``conversations`` keys) so that dataset-level
24+
metadata is preserved across amend cycles. Without metadata the
25+
original list format is used for backward compatibility.
26+
"""
2027
original_path = Path(original_data_path)
2128
amended_data_path = None
2229

@@ -33,10 +40,21 @@ def save_evaluation_data(
3340
/ f"{original_path.stem}_amended_{timestamp}{original_path.suffix}"
3441
)
3542

43+
conversations = [
44+
conv_data.model_dump(mode="json") for conv_data in evaluation_data
45+
]
46+
47+
output_data: Any = conversations
48+
if dataset_metadata is not None:
49+
output_data = {
50+
"metadata": dataset_metadata.model_dump(mode="json", exclude_none=True),
51+
"conversations": conversations,
52+
}
53+
3654
# Save amended data to output directory
3755
with open(amended_data_path, "w", encoding="utf-8") as f:
3856
yaml.dump(
39-
[conv_data.model_dump(mode="json") for conv_data in evaluation_data],
57+
output_data,
4058
f,
4159
default_flow_style=False,
4260
sort_keys=False,

src/lightspeed_evaluation/core/system/validator.py

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from pydantic import ValidationError
99

1010
from lightspeed_evaluation.core.models import EvaluationData, TurnData
11+
from lightspeed_evaluation.core.models.data import DatasetMetadata
1112
from lightspeed_evaluation.core.system.exceptions import DataValidationError
1213

1314
if TYPE_CHECKING:
@@ -167,6 +168,7 @@ def __init__(
167168
"""
168169
self.validation_errors: list[str] = []
169170
self.evaluation_data: Optional[list[EvaluationData]] = None
171+
self.dataset_metadata: Optional[DatasetMetadata] = None
170172
self.api_enabled = api_enabled
171173
self.original_data_path: Optional[str] = None
172174
self.fail_on_invalid_data = fail_on_invalid_data
@@ -189,6 +191,15 @@ def _conversation_level_metrics(self) -> set[str]:
189191
def _load_and_parse_yaml(self, data_path: str) -> list[EvaluationData]:
190192
"""Load a YAML file and convert each entry to an EvaluationData model.
191193
194+
Supports two root formats for backward compatibility:
195+
196+
1. **List format** (original): YAML root is a list of conversations.
197+
2. **Dict format** (new): YAML root is a dict with optional ``metadata``
198+
and required ``conversations`` keys.
199+
200+
When the dict format is used, dataset-level metadata is parsed and
201+
stored on ``self.dataset_metadata``.
202+
192203
Args:
193204
data_path: Path to the evaluation data YAML file.
194205
@@ -211,13 +222,12 @@ def _load_and_parse_yaml(self, data_path: str) -> list[EvaluationData]:
211222

212223
if raw_data is None:
213224
raise DataValidationError("Empty or invalid YAML file")
214-
if not isinstance(raw_data, list):
215-
raise DataValidationError(
216-
f"YAML root must be a list, got {type(raw_data).__name__}"
217-
)
225+
226+
self.dataset_metadata = None
227+
raw_conversations = self._extract_conversations_and_metadata(raw_data)
218228

219229
evaluation_data = []
220-
for i, data_dict in enumerate(raw_data):
230+
for i, data_dict in enumerate(raw_conversations):
221231
try:
222232
eval_data = EvaluationData(**data_dict)
223233
evaluation_data.append(eval_data)
@@ -235,6 +245,57 @@ def _load_and_parse_yaml(self, data_path: str) -> list[EvaluationData]:
235245
) from e
236246
return evaluation_data
237247

248+
def _extract_conversations_and_metadata(self, raw_data: object) -> list[dict]:
249+
"""Extract conversation list and optional dataset metadata from raw YAML.
250+
251+
Args:
252+
raw_data: Parsed YAML data (list or dict).
253+
254+
Returns:
255+
List of raw conversation dicts.
256+
257+
Raises:
258+
DataValidationError: If the structure is invalid.
259+
"""
260+
if isinstance(raw_data, list):
261+
return raw_data
262+
263+
if isinstance(raw_data, dict):
264+
if "conversations" not in raw_data:
265+
raise DataValidationError(
266+
"YAML root is a dict but missing required 'conversations' key. "
267+
"Expected either a list of conversations or a dict with "
268+
"'conversations' (and optional 'metadata') keys."
269+
)
270+
271+
metadata_raw = raw_data.get("metadata")
272+
if metadata_raw is not None:
273+
if not isinstance(metadata_raw, dict):
274+
raise DataValidationError(
275+
f"'metadata' must be a mapping, "
276+
f"got {type(metadata_raw).__name__}"
277+
)
278+
try:
279+
self.dataset_metadata = DatasetMetadata(**metadata_raw)
280+
except ValidationError as e:
281+
error_details = format_pydantic_error(e)
282+
raise DataValidationError(
283+
f"Invalid dataset metadata: {error_details}"
284+
) from e
285+
286+
raw_conversations = raw_data["conversations"]
287+
if not isinstance(raw_conversations, list):
288+
raise DataValidationError(
289+
"'conversations' must be a list, "
290+
f"got {type(raw_conversations).__name__}"
291+
)
292+
return raw_conversations
293+
294+
raise DataValidationError(
295+
f"YAML root must be a list or a dict with 'conversations' key, "
296+
f"got {type(raw_data).__name__}"
297+
)
298+
238299
def _apply_metrics_filter(
239300
self, evaluation_data: list[EvaluationData], metrics: list[str]
240301
) -> None:

0 commit comments

Comments
 (0)