# LightSpeed Evaluation Framework Configuration

# Core evaluation parameters
core:
  max_threads: 50  # Maximum number of threads; set to null for the Python default. 50 is fine for larger datasets
  fail_on_invalid_data: true  # If false, don't fail on invalid conversations (e.g., missing context for some metrics)
  skip_on_failure: false  # If true, skip remaining turns when a turn evaluation fails (can be overridden per conversation)

# LLM as a judge configuration (Legacy)
# Deprecated: the top-level llm: block (single judge) will be removed; use llm_pool + judge_panel instead.
# llm:
#   provider: "openai"              # LLM provider (openai, watsonx, gemini, hosted_vllm, etc.)
#   model: "gpt-4o-mini"            # Model name for the provider
#   ssl_verify: true                # Verify SSL certificates for the specified provider
#   ssl_cert_file: null             # Path to a custom CA
#   temperature: 0.0                # Generation temperature
#   max_tokens: 512                 # Maximum tokens in the response
#   timeout: 300                    # Request timeout in seconds
#   num_retries: 3                  # Retry attempts
#   cache_dir: ".caches/llm_cache"  # Directory for the LLM cache
#   cache_enabled: true             # Enable the LLM cache

# Pool of named models (judges reference these IDs)
# Defaults merge into each model; the parameters block accepts extra provider keys; a null value removes an inherited parameter.
llm_pool:
  defaults:
    cache_enabled: true
    cache_dir: ".caches/llm_cache"
    timeout: 300
    num_retries: 3
    parameters:
      temperature: 0.0
      max_completion_tokens: 1024
  models:
    judge_gpt_4o_mini:
      # Uses the default parameters
      provider: openai
      model: gpt-4o-mini
      # timeout: 360
      # ssl_verify: true
      # ssl_cert_file: null
    judge_gpt_4_1_mini:
      provider: openai
      model: gpt-4.1-mini
      parameters:
        # Add, remove, or override model-specific parameters
        temperature: null            # Removes temperature inherited from the defaults
        max_completion_tokens: 2048  # Overrides the default
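    # Illustrative (commented) example of registering one more judge in the pool;
    # the ID and model name below are placeholders, not part of the shipped config.
    # judge_example:
    #   provider: watsonx              # Any supported provider (openai, watsonx, gemini, hosted_vllm, ...)
    #   model: example-judge-model     # Placeholder model name
    #   parameters:
    #     max_completion_tokens: 512   # Overrides the pool default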

# Judge Panel: multiple judges from the pool
# Their scores are combined. The first judge listed is the fallback when the full panel is not used for a metric.
judge_panel:
  judges:
    - judge_gpt_4_1_mini
    - judge_gpt_4o_mini
  # enabled_metrics: a non-empty list applies the panel only to the listed metrics (other metrics use the first judge);
  # omit or set to null to apply the panel to all metrics; [] means no metric uses the panel (first judge only).
  enabled_metrics:
    - "custom:answer_correctness"
  aggregation_strategy: max  # Other strategies: average, majority_vote
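  # Illustrative (commented) variants, following the rules stated above:
  # enabled_metrics: null   # every metric is scored by the full panel
  # enabled_metrics: []     # no metric uses the panel; only the first judge scores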

# Default embedding (for LLM as a judge) configuration:
embedding:
  provider: "openai"
  model: "text-embedding-3-small"
  provider_kwargs: {}
  cache_dir: ".caches/embedding_cache"
  cache_enabled: true

# Lightspeed-stack API Configuration
# Used to fetch real-time data. Currently the lightspeed-stack API is supported,
# but other APIs can be integrated with minimal changes.
api:
  enabled: true                     # Enable API calls instead of using pre-filled data
  api_base: http://localhost:8080   # Base API URL (without version)
  version: v1                       # API version (e.g., v1, v2)
  endpoint_type: streaming          # Use the "streaming" or "query" endpoint
  timeout: 300                      # API request timeout in seconds
  # Retry configuration for 429 Too Many Requests API errors
  num_retries: 3                    # Number of retry attempts (default 3)
  # API input configuration
  provider: "openai"                # LLM provider for queries
  model: "gpt-4o-mini"              # Model to use for queries
  no_tools: null                    # Whether to bypass tools and MCP servers (optional)
  system_prompt: null               # System prompt (default None)
  # Extra parameters merged into the API request payload (a per-turn override in eval.yaml takes priority)
  # Example:
  # extra_request_params:
  #   mode: troubleshooting
  extra_request_params: null
  cache_dir: ".caches/api_cache"    # Directory for the lightspeed-stack cache
  cache_enabled: true               # Enable the lightspeed-stack cache

# MCP Server Authentication Configuration
mcp_headers:
  enabled: false        # Enable MCP headers functionality
  servers:              # MCP server configurations
    filesystem-tools:
      env_var: API_KEY  # Environment variable containing the token/key
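    # Illustrative (commented) second server entry; the server name and variable
    # below are placeholders, not part of the shipped configuration.
    # another-mcp-server:
    #   env_var: ANOTHER_MCP_TOKEN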
# Legacy authentication (fallback when mcp_headers is not configured or disabled):
# uses the API_KEY environment variable alone for the MCP server, without a per-server name.

# Quality Score Configuration
# Aggregated score from selected metrics for overall system quality assessment
quality_score:
  metrics:
    - "ragas:faithfulness"
    - "ragas:context_precision_with_reference"
    - "custom:tool_eval"
    - "custom:answer_correctness"
  default: true  # If true, all metrics in this list get default: true

# Default metrics metadata
metrics_metadata:
  # Turn-level metrics metadata
  turn_level:
    # Ragas Response Evaluation metrics
    "ragas:response_relevancy":
      threshold: 0.8
      description: "How relevant the response is to the question"
      default: true  # This metric is applied by default when no turn_metrics are specified
    "ragas:faithfulness":
      threshold: 0.8
      description: "How faithful the response is to the provided context"
      default: false  # The default value is false
    # Ragas Context/Retrieval Evaluation metrics
    "ragas:context_recall":
      threshold: 0.8
      description: "Did we fetch every fact the answer needs?"
    "ragas:context_precision_with_reference":
      threshold: 0.7
      description: "How precise the retrieved context is (with reference)"
    "ragas:context_precision_without_reference":
      threshold: 0.7
      description: "How precise the retrieved context is (without reference)"
    "ragas:context_relevance":
      threshold: 0.7
      description: "Is what we retrieved actually relevant to the user query?"
    # Custom metrics
    "custom:keywords_eval":  # boolean eval (either 0 or 1)
      description: "Keywords (ALL) matching evaluation with alternative sets"
    "custom:answer_correctness":
      threshold: 0.75
      description: "Correctness vs expected answer using custom LLM evaluation"
    "custom:intent_eval":
      threshold: 1  # boolean eval (either 0 or 1)
      description: "Intent alignment evaluation using custom LLM evaluation"
    "custom:tool_eval":
      description: "Tool call evaluation comparing expected vs actual tool calls"
      ordered: true     # true (default): sequence order matters; false: any order allowed
      full_match: true  # true (default): exact 1:1 match; false: expected tools must be found in the actual calls (extras allowed)
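      # Illustrative (commented) alternative, per the flag descriptions above:
      # ordered: false      # any order of tool calls is allowed
      # full_match: false   # expected tools only need to appear among the actual calls; extras are allowed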
    # Script-based metrics
    "script:action_eval":
      description: "Script-based evaluation for infrastructure/environment validation"
    # NLP-based metrics (non-LLM text comparison)
    "nlp:bleu":
      threshold: 0.5
      description: "BLEU score - measures n-gram overlap between response and expected_response"
      max_ngram: 4  # Options: 1 (unigrams), 2 (bigrams), 3 (trigrams), 4 (standard BLEU-4)
      default: false
    "nlp:rouge":
      threshold: 0.3  # Note: ROUGE measures n-gram overlap, not semantic meaning. Meaningful responses with different wording may score low.
      description: "ROUGE score - measures n-gram overlap. Returns fmeasure (F1) with precision/recall in reason."
      rouge_type: "rougeL"  # Options: rouge1, rouge2, rougeL, rougeLsum
      default: false
    "nlp:semantic_similarity_distance":
      threshold: 0.7
      description: "String distance metrics (Levenshtein, Jaro, etc.) - NOT recommended for LLM outputs, as they measure character similarity rather than semantic meaning"
      distance_measure: "levenshtein"  # Options: levenshtein, hamming, jaro, jaro_winkler
      default: false  # Use custom:answer_correctness for semantic comparison instead
    # GEval turn-level metrics (criteria = required; evaluation_steps, rubrics = optional)
    "geval:technical_accuracy":
      criteria: |  # required
        Assess whether the response provides technically accurate information,
        commands, code, syntax, and follows relevant industry or
        domain-specific best practices. The response should
        contain valid syntax and use appropriate functions, modules, or tools.
      evaluation_params:
        - query
        - response
        - expected_response
      evaluation_steps:  # optional: how to evaluate; if omitted, GEval generates steps from the criteria
        - "Verify that the provided syntax (e.g., code, commands, configuration) is valid and follows the language/tool's formatting rules."
        - "Check if the response uses appropriate modules, functions, libraries, or parameters for the given task."
        - "Assess whether the solution aligns with relevant official documentation or established best practices for the specific domain."
        - "Verify the response directly and accurately addresses the user's specific query or task."
        - "Check for potential security issues, significant inefficiencies, or anti-patterns."
      # rubrics:  # optional: score ranges 0-10, non-overlapping, but the final score is 0-1; same style as evaluation_steps
      #   - score_range: [0, 3]
      #     expected_outcome: "Incorrect or invalid."
      #   - score_range: [4, 7]
      #     expected_outcome: "Partially correct or has issues."
      #   - score_range: [8, 10]
      #     expected_outcome: "Technically correct and follows best practices."
      threshold: 0.7
      description: "General technical accuracy of provided commands, code, or technical information"

  # Conversation-level metrics metadata
  conversation_level:
    # DeepEval metrics
    "deepeval:conversation_completeness":
      threshold: 0.8
      description: "How completely the conversation addresses user intentions"
      default: false
    "deepeval:conversation_relevancy":
      threshold: 0.7
      description: "How relevant the conversation is to the topic/context"
    "deepeval:knowledge_retention":
      threshold: 0.7
      description: "How well the model retains information from previous turns"
    # GEval conversation-level metrics (criteria = required; evaluation_steps, rubrics = optional)
    "geval:conversation_coherence":
      criteria: |  # required
        Evaluate whether the conversation maintains context and provides coherent
        responses across multiple turns. The assistant should reference previous
        exchanges and build upon earlier context.
      evaluation_params:
        - query
        - response
      evaluation_steps:  # optional
        - "Check if the assistant remembers information from previous turns"
        - "Verify responses build logically on previous context"
        - "Assess whether the conversation flows naturally"
        - "Check for contradictions with earlier statements"
      threshold: 0.6
      description: "Context maintenance and coherence across conversation turns"

# Storage Configuration
# Configures how evaluation results are stored (file outputs and an optional database)
storage:
  # File backend - outputs evaluation results to CSV, JSON, and TXT files
  - type: "file"
    output_dir: "./eval_output"
    base_filename: "evaluation"
    enabled_outputs:  # Enable specific output types
      - csv   # Detailed results CSV
      - json  # Summary JSON with statistics
      - txt   # Human-readable summary
    # CSV columns to include
    csv_columns:
      - "conversation_group_id"
      - "turn_id"
      - "metric_identifier"
      - "metric_metadata"
      - "result"
      - "score"
      - "threshold"
      - "reason"
      - "execution_time"
      - "query"
      - "response"
      - "api_input_tokens"
      - "api_output_tokens"
      - "api_latency"
      # Streaming performance metrics (only populated when using the streaming endpoint)
      - "time_to_first_token"  # Time to first token in seconds
      - "streaming_duration"   # Total streaming duration in seconds
      - "tokens_per_second"    # Output tokens per second throughput
      - "judge_llm_input_tokens"
      - "judge_llm_output_tokens"
      - "embedding_tokens"
      - "judge_scores"
      - "tool_calls"
      - "contexts"
      - "expected_response"
      - "expected_intent"
      - "expected_keywords"
      - "expected_tool_calls"
    summary_config_sections:  # Config sections saved to the JSON report
      - core
      - llm_pool
      - judge_panel
      - embedding
      - api
  # Database backend (optional) - stores results incrementally to a database
  # Uncomment below to enable SQLite storage:
  # - type: "sqlite"
  #   database: "./eval_results.db"
  #   table_name: "evaluation_results"

# Visualization settings
visualization:
  figsize: [12, 8]  # Graph size (width, height)
  dpi: 300          # Image resolution
  # Graph types to generate
  enabled_graphs:
    - "pass_rates"            # Pass rate bar chart
    - "score_distribution"    # Score distribution box plot
    - "conversation_heatmap"  # Heatmap of conversation performance
    - "status_breakdown"      # Pie chart for pass/fail/error breakdown

# Environment Variables - set automatically before any imports
environment:
  DEEPEVAL_TELEMETRY_OPT_OUT: "YES"     # Disable DeepEval telemetry
  DEEPEVAL_DISABLE_PROGRESS_BAR: "YES"  # Disable DeepEval progress bars
  LITELLM_LOG: ERROR                    # Suppress verbose LiteLLM logging

# Logging Configuration
logging:
  # Source code logging level
  source_level: INFO  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  # Package logging level (imported libraries)
  package_level: ERROR
  # Log format and display options
  log_format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  show_timestamps: true
  # Specific package log levels (override package_level for specific libraries)
  package_overrides:
    httpx: ERROR
    urllib3: ERROR
    requests: ERROR
    matplotlib: ERROR
    LiteLLM: WARNING
    DeepEval: WARNING
    ragas: WARNING