Skip to content

Commit a2b5d92

Browse files
committed
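fix evaluate: build hyperparameters per test case and pass them to evaluate() instead of logging them via auto_log_hyperparameters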
1 parent 1b94326 commit a2b5d92

3 files changed

Lines changed: 60 additions & 39 deletions

File tree

deepeval/openai/evaluate.py

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,41 @@
1-
from ast import keyword
1+
from dataclasses import dataclass
2+
from typing import List, Dict, Any
23
import asyncio
34
import atexit
4-
from dataclasses import dataclass
5-
from typing import List, Optional, Dict
6-
from collections import defaultdict
7-
from deepeval import evaluate
8-
from deepeval.test_case import LLMTestCase
9-
from deepeval.metrics import BaseMetric
5+
106
from deepeval.openai.extractors import InputParameters
11-
from deepeval.test_run import auto_log_hyperparameters
7+
from deepeval.test_case import LLMTestCase
128
from deepeval.evaluate import AsyncConfig
9+
from deepeval.metrics import BaseMetric
10+
from deepeval import evaluate
1311

1412
@dataclass
1513
class TestCaseMetricPair:
1614
test_case: LLMTestCase
1715
metrics: List[BaseMetric]
16+
hyperparameters: Dict[str, Any]
1817

1918
@dataclass
2019
class TestCasesMetricSet:
2120
test_cases: List[LLMTestCase]
2221
metrics: List[BaseMetric]
22+
hyperparameters: Dict[str, Any]
2323

2424
test_case_pairs: List[TestCaseMetricPair] = []
2525

2626

27-
def add_test_case(test_case: LLMTestCase, metrics: List[BaseMetric]):
28-
test_case_pairs.append(TestCaseMetricPair(test_case=test_case, metrics=metrics))
27+
def add_test_case(
28+
test_case: LLMTestCase,
29+
metrics: List[BaseMetric],
30+
input_parameters: InputParameters,
31+
):
32+
test_case_pairs.append(
33+
TestCaseMetricPair(
34+
test_case=test_case,
35+
metrics=metrics,
36+
hyperparameters=create_hyperparameters_map(input_parameters)
37+
)
38+
)
2939

3040
##############################################
3141
# Evaluation
@@ -41,13 +51,17 @@ async def evaluate_async():
4151
if key not in grouped:
4252
grouped[key] = TestCasesMetricSet(
4353
test_cases=[pair.test_case],
44-
metrics=pair.metrics
54+
metrics=pair.metrics,
55+
hyperparameters=pair.hyperparameters
4556
)
4657
else:
4758
grouped[key].test_cases.append(pair.test_case)
4859
for key, cases in grouped.items():
49-
evaluate(test_cases=cases.test_cases, metrics=cases.metrics)
50-
60+
evaluate(
61+
test_cases=cases.test_cases,
62+
metrics=cases.metrics,
63+
hyperparameters=cases.hyperparameters
64+
)
5165

5266
def evaluate_sync():
5367
sync_config = AsyncConfig(run_async=False)
@@ -60,12 +74,18 @@ def evaluate_sync():
6074
if key not in grouped:
6175
grouped[key] = TestCasesMetricSet(
6276
test_cases=[pair.test_case],
63-
metrics=pair.metrics
77+
metrics=pair.metrics,
78+
hyperparameters=pair.hyperparameters
6479
)
6580
else:
6681
grouped[key].test_cases.append(pair.test_case)
6782
for key, cases in grouped.items():
68-
evaluate(test_cases=cases.test_cases, metrics=cases.metrics, async_config=sync_config)
83+
evaluate(
84+
test_cases=cases.test_cases,
85+
metrics=cases.metrics,
86+
hyperparameters=cases.hyperparameters,
87+
async_config=sync_config
88+
)
6989

7090
@atexit.register
7191
def run_evaluations_atexit():
@@ -80,11 +100,12 @@ def run_evaluations_atexit():
80100
except Exception as e:
81101
print("⚠️ Could not schedule async evaluation in atexit: ", e)
82102

103+
83104
##############################################
84105
# Hyperparameters
85106
##############################################
86107

87-
def log_hyperparameters(input_parameters: InputParameters):
108+
def create_hyperparameters_map(input_parameters: InputParameters):
88109
hyperparameters = {"model": input_parameters.model}
89110
if input_parameters.instructions:
90111
hyperparameters["system_prompt"] = input_parameters.instructions
@@ -94,4 +115,4 @@ def log_hyperparameters(input_parameters: InputParameters):
94115
hyperparameters["system_prompt"] = (
95116
system_messages[0] if len(system_messages) == 1 else str(system_messages)
96117
)
97-
auto_log_hyperparameters(hyperparameters)
118+
return hyperparameters

deepeval/openai/patch.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
import inspect
44
import uuid
55

6-
from deepeval.openai.evaluate import log_hyperparameters, add_test_case
76
from deepeval.tracing.attributes import LlmAttributes, ToolAttributes
87
from deepeval.openai.utils import get_attr_path, set_attr_path
98
from deepeval.test_case import LLMTestCase, ToolCall
109
from deepeval.tracing import trace_manager, observe
1110
from deepeval.metrics.base_metric import BaseMetric
11+
from deepeval.openai.evaluate import add_test_case
1212

1313
from deepeval.tracing.types import (
1414
TraceSpanStatus,
@@ -113,7 +113,6 @@ async def llm_generation(*args, **kwargs):
113113
return await llm_generation(*args, **kwargs)
114114
else:
115115
response = await orig_method(*args, **kwargs)
116-
log_hyperparameters(input_parameters)
117116
output_parameters = extract_output_parameters(is_completion_method, response, input_parameters)
118117
test_case = LLMTestCase(
119118
input=input_parameters.input,
@@ -124,7 +123,7 @@ async def llm_generation(*args, **kwargs):
124123
tools_called=output_parameters.tools_called,
125124
expected_tools=expected_tools
126125
)
127-
add_test_case(test_case=test_case, metrics=metrics)
126+
add_test_case(test_case=test_case, metrics=metrics, input_parameters=input_parameters)
128127
return response
129128

130129
return patched_async_openai_method
@@ -171,7 +170,6 @@ def llm_generation(*args, **kwargs):
171170
return llm_generation(*args, **kwargs)
172171
else:
173172
response = orig_method(*args, **kwargs)
174-
log_hyperparameters(input_parameters)
175173
output_parameters = extract_output_parameters(is_completion_method, response, input_parameters)
176174
test_case = LLMTestCase(
177175
input=input_parameters.input,
@@ -182,7 +180,7 @@ def llm_generation(*args, **kwargs):
182180
tools_called=output_parameters.tools_called,
183181
expected_tools=expected_tools
184182
)
185-
add_test_case(test_case=test_case, metrics=metrics)
183+
add_test_case(test_case=test_case, metrics=metrics, input_parameters=input_parameters)
186184
return response
187185

188186
return patched_sync_openai_method
Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
1+
from deepeval.metrics import AnswerRelevancyMetric, BiasMetric
12
from deepeval.openai import OpenAI
2-
from deepeval.metrics import AnswerRelevancyMetric
33

44
client = OpenAI()
55

6-
client.chat.completions.create(
7-
model="gpt-4o",
8-
messages=[
9-
{"role": "system", "content": "You are a helpful assistant."},
10-
{"role": "user", "content": "Hello, how are you?"}
11-
],
12-
metrics=[AnswerRelevancyMetric()]
13-
)
6+
for i in range(5):
7+
client.chat.completions.create(
8+
model="gpt-4o",
9+
messages=[
10+
{"role": "system", "content": "You are a helpful assistant."},
11+
{"role": "user", "content": "Hello, how are you?"},
12+
],
13+
metrics=[AnswerRelevancyMetric()],
14+
)
1415

15-
client.chat.completions.create(
16-
model="gpt-4o",
17-
messages=[
18-
{"role": "system", "content": "You are a helpful assistant."},
19-
{"role": "user", "content": "hiihi"}
20-
],
21-
metrics=[AnswerRelevancyMetric()]
22-
)
16+
for i in range(5):
17+
client.chat.completions.create(
18+
model="gpt-4o",
19+
messages=[
20+
{"role": "system", "content": "You are a helpful chatbot."},
21+
{"role": "user", "content": "Hello!"},
22+
],
23+
metrics=[AnswerRelevancyMetric(), BiasMetric()],
24+
)

0 commit comments

Comments
 (0)