Skip to content

Commit 4563bde

Browse files
committed
.
1 parent a2b5d92 commit 4563bde

4 files changed

Lines changed: 134 additions & 14 deletions

File tree

deepeval/openai/evaluate.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def add_test_case(
3737
)
3838
)
3939

40+
4041
##############################################
4142
# Evaluation
4243
##############################################

deepeval/openai/patch.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ async def patched_async_openai_method(
8686
is_observed = len(trace_manager.traces) > 0
8787

8888
if is_observed:
89-
@observe(type="llm", model=input_parameters.model)
89+
@observe(type="llm", model=input_parameters.model, metrics=metrics)
9090
async def llm_generation(*args, **kwargs):
9191
response = await orig_method(*args, **kwargs)
9292
output_parameters = extract_output_parameters(is_completion_method, response, input_parameters)
@@ -143,7 +143,7 @@ def patched_sync_openai_method(
143143
is_observed = len(trace_manager.traces) > 0
144144

145145
if is_observed:
146-
@observe(type="llm", model=input_parameters.model)
146+
@observe(type="llm", model=input_parameters.model, metrics=metrics)
147147
def llm_generation(*args, **kwargs):
148148
response = orig_method(*args, **kwargs)
149149
output_parameters = extract_output_parameters(is_completion_method, response, input_parameters)
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from deepeval.metrics import AnswerRelevancyMetric, BiasMetric
2+
from deepeval.openai import OpenAI
3+
from deepeval.tracing import observe
4+
5+
client = OpenAI()
6+
7+
##############################################
8+
# Test end-to-end Evaluation
9+
##############################################
10+
11+
def test_end_to_end_evaluation():
12+
for i in range(5):
13+
client.chat.completions.create(
14+
model="gpt-4o",
15+
messages=[
16+
{"role": "system", "content": "You are a helpful assistant."},
17+
{"role": "user", "content": "Hello, how are you?"},
18+
],
19+
metrics=[AnswerRelevancyMetric()],
20+
)
21+
22+
for i in range(5):
23+
client.chat.completions.create(
24+
model="gpt-4o",
25+
messages=[
26+
{"role": "system", "content": "You are a helpful chatbot."},
27+
{"role": "user", "content": "Hello!"},
28+
],
29+
metrics=[AnswerRelevancyMetric(), BiasMetric()],
30+
)
31+
32+
33+
##############################################
34+
# Test tracing
35+
##############################################
36+
37+
@observe()
38+
def llm_app(input: str):
39+
response = client.chat.completions.create(
40+
model="gpt-4o",
41+
messages=[
42+
{"role": "system", "content": "You are a helpful chatbot."},
43+
{"role": "user", "content": input},
44+
],
45+
metrics=[AnswerRelevancyMetric(), BiasMetric()],
46+
)
47+
return response.choices[0].message.content
48+
49+
llm_app("hi")
50+
51+
##############################################
52+
# Test tracing
53+
##############################################
54+
55+
@observe()
56+
def llm_app(input: str):
57+
response = client.chat.completions.create(
58+
model="gpt-4o",
59+
messages=[
60+
{"role": "system", "content": "You are a helpful chatbot."},
61+
{"role": "user", "content": input},
62+
],
63+
metrics=[AnswerRelevancyMetric(), BiasMetric()],
64+
)
65+
return response.choices[0].message.content
66+
67+
llm_app("hi")
Lines changed: 64 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,76 @@
11
from deepeval.metrics import AnswerRelevancyMetric, BiasMetric
2+
from deepeval.tracing import observe
3+
from deepeval.dataset import Golden
24
from deepeval.openai import OpenAI
5+
from deepeval import evaluate
36

47
client = OpenAI()
58

6-
for i in range(5):
7-
client.chat.completions.create(
8-
model="gpt-4o",
9-
messages=[
10-
{"role": "system", "content": "You are a helpful assistant."},
11-
{"role": "user", "content": "Hello, how are you?"},
12-
],
13-
metrics=[AnswerRelevancyMetric()],
14-
)
9+
##############################################
10+
# Test end-to-end Evaluation
11+
##############################################
12+
13+
def test_end_to_end_evaluation():
14+
for i in range(5):
15+
client.chat.completions.create(
16+
model="gpt-4o",
17+
messages=[
18+
{"role": "system", "content": "You are a helpful assistant."},
19+
{"role": "user", "content": "Hello, how are you?"},
20+
],
21+
metrics=[AnswerRelevancyMetric()],
22+
)
23+
24+
for i in range(5):
25+
client.chat.completions.create(
26+
model="gpt-4o",
27+
messages=[
28+
{"role": "system", "content": "You are a helpful chatbot."},
29+
{"role": "user", "content": "Hello!"},
30+
],
31+
metrics=[AnswerRelevancyMetric(), BiasMetric()],
32+
)
33+
34+
# test_end_to_end_evaluation()
35+
1536

16-
for i in range(5):
17-
client.chat.completions.create(
37+
##############################################
38+
# Test tracing
39+
##############################################
40+
41+
@observe()
42+
def llm_app(input: str):
43+
response = client.chat.completions.create(
1844
model="gpt-4o",
1945
messages=[
2046
{"role": "system", "content": "You are a helpful chatbot."},
21-
{"role": "user", "content": "Hello!"},
47+
{"role": "user", "content": input},
2248
],
2349
metrics=[AnswerRelevancyMetric(), BiasMetric()],
2450
)
51+
return response.choices[0].message.content
52+
53+
def test_tracing():
54+
llm_app("hi")
55+
llm_app("hello")
56+
llm_app("how are you?")
57+
llm_app("what is the capital of France?")
58+
59+
# test_tracing()
60+
61+
##############################################
62+
# Test traceable evaluate
63+
##############################################
64+
65+
def test_traceable_evaluate():
66+
evaluate(
67+
observed_callback=llm_app,
68+
goldens=[
69+
Golden(input="hi"),
70+
Golden(input="hello"),
71+
Golden(input="how are you?"),
72+
Golden(input="what is the capital of France?"),
73+
],
74+
)
75+
76+
test_traceable_evaluate()

0 commit comments

Comments (0)