Skip to content

Commit f5f6721

Browse files
committed
[LEADS-389] Support user-defined metadata in evaluation data format for GDS quality grading and traceability
1 parent 454d871 commit f5f6721

14 files changed

Lines changed: 761 additions & 28 deletions

File tree

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ For field tables, full YAML examples (file-only, file + SQLite, file + Postgres)
320320
| Field | Type | Required | Description |
321321
|---------------------------------|----------------|----------|----------------------------------------------------------------------|
322322
| `conversation_group_id` | string | ✅ | Unique identifier for conversation |
323+
| `metadata` | ConversationMetadata | ❌ | User-defined metadata for traceability and quality grading |
323324
| `description` | string | ❌ | Optional description |
324325
| `tag` | string | ❌ | Tag for grouping eval conversations (default: "eval") |
325326
| `setup_script` | string | ❌ | Path to setup script (Optional, used when API is enabled) |
@@ -333,6 +334,7 @@ For field tables, full YAML examples (file-only, file + SQLite, file + Postgres)
333334
| Field | Type | Required | Description | API Populated |
334335
|-----------------------|------------------|----------|--------------------------------------|-----------------------|
335336
| `turn_id` | string | ✅ | Unique identifier for the turn | ❌ |
337+
| `metadata` | TurnMetadata | ❌ | User-defined metadata for traceability and quality grading | ❌ |
336338
| `query` | string | ✅ | The question/prompt to evaluate | ❌ |
337339
| `response` | string | 📋 | Actual response from system | ✅ (if API enabled) |
338340
| `contexts` | list[string] | 📋 | Context information for evaluation | ✅ (if API enabled) |

src/lightspeed_evaluation/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,14 @@
2424
from lightspeed_evaluation.core.llm import LLMManager
2525
from lightspeed_evaluation.core.models import (
2626
APIConfig,
27+
ConversationMetadata,
28+
DatasetMetadata,
2729
EvaluationData,
2830
EvaluationResult,
2931
LLMConfig,
3032
LoggingConfig,
3133
TurnData,
34+
TurnMetadata,
3235
VisualizationConfig,
3336
)
3437
from lightspeed_evaluation.core.models.summary import EvaluationSummary
@@ -79,6 +82,12 @@
7982
"VisualizationConfig": ("lightspeed_evaluation.core.models", "VisualizationConfig"),
8083
"EvaluationData": ("lightspeed_evaluation.core.models", "EvaluationData"),
8184
"TurnData": ("lightspeed_evaluation.core.models", "TurnData"),
85+
"TurnMetadata": ("lightspeed_evaluation.core.models", "TurnMetadata"),
86+
"ConversationMetadata": (
87+
"lightspeed_evaluation.core.models",
88+
"ConversationMetadata",
89+
),
90+
"DatasetMetadata": ("lightspeed_evaluation.core.models", "DatasetMetadata"),
8291
"EvaluationResult": ("lightspeed_evaluation.core.models", "EvaluationResult"),
8392
"EvaluationSummary": (
8493
"lightspeed_evaluation.core.models.summary",

src/lightspeed_evaluation/api.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
print(summary.by_metric)
2424
"""
2525

26-
from typing import Optional
26+
from typing import TYPE_CHECKING, Optional
2727

2828
from lightspeed_evaluation.core.models import (
2929
EvaluationData,
@@ -35,22 +35,31 @@
3535
from lightspeed_evaluation.core.system import ConfigLoader
3636
from lightspeed_evaluation.pipeline.evaluation import EvaluationPipeline
3737

38+
if TYPE_CHECKING:
39+
from lightspeed_evaluation.core.models.data import DatasetMetadata
40+
3841

3942
def evaluate(
4043
config: SystemConfig,
4144
data: list[EvaluationData],
4245
output_dir: Optional[str] = None,
46+
original_data_path: Optional[str] = None,
47+
dataset_metadata: Optional["DatasetMetadata"] = None,
4348
) -> list[EvaluationResult]:
4449
"""Run evaluation on the provided data using the given configuration.
4550
4651
Creates a fully-initialized pipeline from the ``SystemConfig``, runs
47-
evaluation on every conversation in *data*, and returns the raw results.
52+
evaluation on every conversation in *data*, and returns the raw results.
4853
No reports are generated -- file I/O is the caller's responsibility.
4954
5055
Args:
5156
config: A pre-built SystemConfig instance.
5257
data: List of EvaluationData conversations to evaluate.
5358
output_dir: Optional override for the output directory.
59+
original_data_path: Path to the original evaluation data file.
60+
Required for saving amended data when agents are enabled.
61+
dataset_metadata: Optional dataset-level metadata to preserve in
62+
amended output files.
5463
5564
Returns:
5665
List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,7 +70,11 @@ def evaluate(
6170
loader = ConfigLoader.from_config(config)
6271
pipeline = EvaluationPipeline(loader, output_dir)
6372
try:
64-
return pipeline.run_evaluation(data)
73+
return pipeline.run_evaluation(
74+
data,
75+
original_data_path=original_data_path,
76+
dataset_metadata=dataset_metadata,
77+
)
6578
finally:
6679
pipeline.close()
6780

src/lightspeed_evaluation/core/models/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@
1414
AttachmentData,
1515
)
1616
from lightspeed_evaluation.core.models.data import (
17+
ConversationMetadata,
18+
DatasetMetadata,
1719
EvaluationData,
1820
EvaluationRequest,
1921
EvaluationResult,
2022
EvaluationScope,
2123
JudgeScore,
2224
MetricResult,
2325
TurnData,
26+
TurnMetadata,
2427
)
2528
from lightspeed_evaluation.core.models.llm import (
2629
EmbeddingConfig,
@@ -61,7 +64,10 @@
6164
"ProposalAgentConfig",
6265
# Data models
6366
"TurnData",
67+
"TurnMetadata",
6468
"EvaluationData",
69+
"ConversationMetadata",
70+
"DatasetMetadata",
6571
"EvaluationRequest",
6672
"JudgeScore",
6773
"MetricResult",

src/lightspeed_evaluation/core/models/data.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,99 @@
1212
logger = logging.getLogger(__name__)
1313

1414

15+
class TurnMetadata(BaseModel):
16+
"""Optional user-defined metadata for a single turn."""
17+
18+
model_config = ConfigDict(extra="forbid")
19+
20+
complexity: Optional[str] = Field(
21+
default=None,
22+
description="Complexity level (e.g. Simple, Moderate, Complex)",
23+
)
24+
data_source: Optional[str] = Field(
25+
default=None,
26+
description="Data source (e.g. Human-written, Production log, Synthetic)",
27+
)
28+
human_verified: Optional[bool] = Field(
29+
default=None, description="Whether a domain expert verified this turn"
30+
)
31+
verified_by: Optional[str] = Field(default=None, description="Verifier name or ID")
32+
negative_type: Optional[str] = Field(
33+
default=None,
34+
description="Negative scenario type (e.g. Out-of-scope, Adversarial, Ambiguous)",
35+
)
36+
persona: Optional[str] = Field(
37+
default=None,
38+
description="User persona represented (e.g. developer, admin, beginner)",
39+
)
40+
date_created: Optional[str] = Field(
41+
default=None, description="When the sample was created (e.g. 2025-06-15)"
42+
)
43+
additional_metadata: Optional[dict[str, Any]] = Field(
44+
default=None, description="Arbitrary key-value pairs for extra metadata"
45+
)
46+
47+
48+
class ConversationMetadata(BaseModel):
49+
"""Optional user-defined metadata for a conversation group."""
50+
51+
model_config = ConfigDict(extra="forbid")
52+
53+
scenario_category: Optional[str] = Field(
54+
default=None,
55+
description="Scenario category (e.g. Core/Happy path, Edge Case, Negative)",
56+
)
57+
use_case: Optional[str] = Field(
58+
default=None,
59+
description="System capability (e.g. RAG, Agent/Tools)",
60+
)
61+
interaction_type: Optional[str] = Field(
62+
default=None,
63+
description="Interaction type (e.g. Single-turn, Multi-turn)",
64+
)
65+
topic: Optional[str] = Field(
66+
default=None,
67+
description="Domain subject area (e.g. networking, storage)",
68+
)
69+
jtbd_reference: Optional[str] = Field(
70+
default=None, description="Jobs-to-be-done reference (Job/Task)"
71+
)
72+
notes: Optional[str] = Field(
73+
default=None, description="Free-text notes about the conversation"
74+
)
75+
additional_metadata: Optional[dict[str, Any]] = Field(
76+
default=None, description="Arbitrary key-value pairs for extra metadata"
77+
)
78+
79+
80+
class DatasetMetadata(BaseModel):
81+
"""Optional user-defined metadata for the entire evaluation dataset."""
82+
83+
model_config = ConfigDict(extra="forbid")
84+
85+
team_product: Optional[str] = Field(
86+
default=None, description="Owning team or product (with contact details)"
87+
)
88+
dataset_version: Optional[str] = Field(
89+
default=None, description="Dataset version for tracking iterations"
90+
)
91+
pii_confirmed_removed: Optional[bool] = Field(
92+
default=None, description="Whether PII has been confirmed removed"
93+
)
94+
generation_tools: Optional[list[str]] = Field(
95+
default=None, description="Tools used for synthetic data generation"
96+
)
97+
llms_used: Optional[list[str]] = Field(
98+
default=None, description="LLMs used in the generation pipeline"
99+
)
100+
last_updated: Optional[str] = Field(
101+
default=None, description="Date the dataset was last updated (e.g. 2025-06-15)"
102+
)
103+
additional_metadata: Optional[dict[str, Any]] = Field(
104+
default=None, description="Arbitrary key-value pairs for extra metadata"
105+
)
106+
107+
15108
def _validate_and_deduplicate_metrics(
16109
metrics: list[str], metric_type: str = "metric"
17110
) -> list[str]:
@@ -39,6 +132,10 @@ class TurnData(StreamingMetricsMixin):
39132
model_config = ConfigDict(extra="forbid")
40133

41134
turn_id: str = Field(..., min_length=1, description="Turn ID (alphanumeric)")
135+
metadata: Optional[TurnMetadata] = Field(
136+
default=None,
137+
description="User-defined metadata for traceability and quality grading",
138+
)
42139
query: Optional[str] = Field(
43140
default=None,
44141
min_length=1,
@@ -428,6 +525,10 @@ class EvaluationData(BaseModel):
428525
conversation_group_id: str = Field(
429526
..., min_length=1, description="Unique conversation group identifier"
430527
)
528+
metadata: Optional[ConversationMetadata] = Field(
529+
default=None,
530+
description="User-defined metadata for traceability and quality grading",
531+
)
431532
description: Optional[str] = Field(
432533
default=None,
433534
min_length=1,

src/lightspeed_evaluation/core/output/data_persistence.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,28 @@
22

33
from datetime import UTC, datetime
44
from pathlib import Path
5-
from typing import Optional
5+
from typing import Any, Optional
66

77
import yaml
88

99
from lightspeed_evaluation.core.constants import DEFAULT_OUTPUT_DIR
1010
from lightspeed_evaluation.core.models import EvaluationData
11+
from lightspeed_evaluation.core.models.data import DatasetMetadata
1112

1213

13-
# Use caching
1414
def save_evaluation_data(
1515
evaluation_data: list[EvaluationData],
1616
original_data_path: str,
1717
output_dir: str = DEFAULT_OUTPUT_DIR,
18+
dataset_metadata: Optional[DatasetMetadata] = None,
1819
) -> Optional[str]:
19-
"""Save amended evaluation data to output directory with timestamp."""
20+
"""Save amended evaluation data to output directory with timestamp.
21+
22+
When *dataset_metadata* is provided the file is written in the dict
23+
format (``metadata`` + ``conversations`` keys) so that dataset-level
24+
metadata is preserved across amend cycles. Without metadata the
25+
original list format is used for backward compatibility.
26+
"""
2027
original_path = Path(original_data_path)
2128
amended_data_path = None
2229

@@ -33,10 +40,21 @@ def save_evaluation_data(
3340
/ f"{original_path.stem}_amended_{timestamp}{original_path.suffix}"
3441
)
3542

43+
conversations = [
44+
conv_data.model_dump(mode="json") for conv_data in evaluation_data
45+
]
46+
47+
output_data: Any = conversations
48+
if dataset_metadata is not None:
49+
output_data = {
50+
"metadata": dataset_metadata.model_dump(mode="json", exclude_none=True),
51+
"conversations": conversations,
52+
}
53+
3654
# Save amended data to output directory
3755
with open(amended_data_path, "w", encoding="utf-8") as f:
3856
yaml.dump(
39-
[conv_data.model_dump(mode="json") for conv_data in evaluation_data],
57+
output_data,
4058
f,
4159
default_flow_style=False,
4260
sort_keys=False,

0 commit comments

Comments
 (0)