11from deepeval .metrics import AnswerRelevancyMetric , BiasMetric
2+ from deepeval .tracing import observe
3+ from deepeval .dataset import Golden
24from deepeval .openai import OpenAI
5+ from deepeval import evaluate
36
# Module-level client shared by the calls below. Note that OpenAI is imported
# from deepeval.openai rather than from the upstream openai package.
client = OpenAI()
58
6- for i in range (5 ):
7- client .chat .completions .create (
8- model = "gpt-4o" ,
9- messages = [
10- {"role" : "system" , "content" : "You are a helpful assistant." },
11- {"role" : "user" , "content" : "Hello, how are you?" },
12- ],
13- metrics = [AnswerRelevancyMetric ()],
14- )
9+ ##############################################
10+ # Test end-to-end Evaluation
11+ ##############################################
12+
def test_end_to_end_evaluation():
    """Issue two batches of five chat completions with metrics attached.

    The first batch uses the "helpful assistant" system prompt with an
    ``AnswerRelevancyMetric``; the second uses the "helpful chatbot" system
    prompt with both ``AnswerRelevancyMetric`` and ``BiasMetric``. The
    responses themselves are discarded.
    """
    repeats = 5
    # (system prompt, user prompt, metric classes) for each batch, in run order.
    scenarios = (
        (
            "You are a helpful assistant.",
            "Hello, how are you?",
            (AnswerRelevancyMetric,),
        ),
        (
            "You are a helpful chatbot.",
            "Hello!",
            (AnswerRelevancyMetric, BiasMetric),
        ),
    )
    for system_prompt, user_prompt, metric_classes in scenarios:
        for _ in range(repeats):
            client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                # Instantiate the metrics anew for every request so that no
                # metric object is shared between calls.
                metrics=[metric_cls() for metric_cls in metric_classes],
            )
33+
34+ # test_end_to_end_evaluation()
35+
1536
16- for i in range (5 ):
17- client .chat .completions .create (
37+ ##############################################
38+ # Test tracing
39+ ##############################################
40+
@observe()
def llm_app(input: str):
    """Send ``input`` as the user message to gpt-4o and return the reply text.

    The request carries a fixed "helpful chatbot" system prompt and passes
    fresh ``AnswerRelevancyMetric`` and ``BiasMetric`` instances through the
    ``metrics`` keyword. The return value is the ``content`` of the first
    choice's message.
    """
    # NOTE: the parameter name shadows the builtin ``input``; it is kept
    # because renaming it would change the function's signature.
    conversation = [
        {"role": "system", "content": "You are a helpful chatbot."},
        {"role": "user", "content": input},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        metrics=[AnswerRelevancyMetric(), BiasMetric()],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
52+
def test_tracing():
    """Call ``llm_app`` once per sample prompt, in order, ignoring the replies."""
    sample_prompts = (
        "hi",
        "hello",
        "how are you?",
        "what is the capital of France?",
    )
    for prompt in sample_prompts:
        llm_app(prompt)
58+
59+ # test_tracing()
60+
61+ ##############################################
62+ # Test traceable evaluate
63+ ##############################################
64+
def test_traceable_evaluate():
    """Run ``evaluate`` with ``llm_app`` as the observed callback.

    One ``Golden`` is built per sample prompt and the resulting list is
    passed as ``goldens``.
    """
    sample_prompts = (
        "hi",
        "hello",
        "how are you?",
        "what is the capital of France?",
    )
    golden_cases = [Golden(input=prompt) for prompt in sample_prompts]
    evaluate(observed_callback=llm_app, goldens=golden_cases)
75+
# Run the traceable-evaluate scenario only when this file is executed as a
# script, so that merely importing the module (e.g. during test collection)
# does not kick off an evaluation as a side effect.
if __name__ == "__main__":
    test_traceable_evaluate()
0 commit comments