1+ # pylint: disable=line-too-long,useless-suppression
2+ # ------------------------------------
3+ # Copyright (c) Microsoft Corporation.
4+ # Licensed under the MIT License.
5+ # ------------------------------------
6+
7+ """
8+ DESCRIPTION:
9+ Given an AIProjectClient, this sample demonstrates how to evaluate an
10+ agent from its traces by filtering traces from Application Insights using an
11+ agent name/version or agent ID, with smart filtering.
12+
Two agent filter forms are supported:
- agent_name + agent_version: Specify the agent by name and version separately.
- agent_id: Specify the agent as a single "name:version" string.
In both forms the sample sets filter_strategy="smart_filtering" to bias trace
selection toward more interesting conversations.
18+
19+ USAGE:
20+ python sample_agent_trace_evaluation_smart_filter.py
21+ python sample_agent_trace_evaluation_smart_filter.py --agent-id "my-agent:1"
22+
23+ Before running the sample:
24+
25+ pip install "azure-ai-projects>=2.2.0" python-dotenv
26+
27+ Set these environment variables with your own values:
28+ 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint.
29+ 2) FOUNDRY_MODEL_NAME - Required. The model deployment name for AI-assisted evaluators.
30+ 3) FOUNDRY_AGENT_NAME - Required. The name of the agent whose traces to evaluate.
31+ 4) FOUNDRY_AGENT_VERSION - Optional. The agent version. If not set, latest is used.
32+ """
33+
34+ import argparse
35+ import os
36+ import time
37+ from pprint import pprint
38+ from dotenv import load_dotenv
39+ from azure .identity import DefaultAzureCredential
40+ from azure .ai .projects import AIProjectClient
41+ from azure .ai .projects .models import TestingCriterionAzureAIEvaluator
42+
# Load variables from a local .env file (if present) before reading the environment.
load_dotenv()

# Parse the command line first so that `--help` and argument errors are reported
# even when the environment variables below have not been configured yet.
parser = argparse.ArgumentParser(description="Evaluate agent traces using agent filter.")
parser.add_argument("--agent-id", default=None, help='Agent ID in "name:version" format')
parser.add_argument("--max-traces", type=int, default=5, help="Max traces to evaluate (default: 5)")
parser.add_argument("--lookback-hours", type=int, default=24, help="Hours to look back (default: 24)")
args = parser.parse_args()

# Reject values that cannot describe a meaningful query: the look-back is
# subtracted from "now" to get the window start, and the trace cap is sent as-is.
if args.max_traces < 1:
    parser.error("--max-traces must be a positive integer")
if args.lookback_hours < 1:
    parser.error("--lookback-hours must be a positive integer")

# Required settings: a missing variable raises KeyError naming the variable.
endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"]

# The agent name is only consumed when no --agent-id is given, so it is only
# mandatory in that case.
if args.agent_id:
    agent_name = os.environ.get("FOUNDRY_AGENT_NAME", "")
else:
    agent_name = os.environ["FOUNDRY_AGENT_NAME"]

# Optional: an empty string means "do not send an agent_version filter".
agent_version = os.environ.get("FOUNDRY_AGENT_VERSION", "")
55+
# Open the credential, the project client and its OpenAI client together so
# that all three are closed when the block exits, including on error.
with (
    DefaultAzureCredential() as credential,
    AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
    project_client.get_openai_client() as client,
):
    # Eval group for trace-based evaluations
    data_source_config = {
        "type": "azure_ai_source",
        "scenario": "traces",
    }

    # (criterion name, built-in evaluator name). Every criterion uses the same
    # model deployment and the same query/response mapping.
    evaluators = (
        ("task_completion", "builtin.task_completion"),
        ("conversation_coherence", "builtin.coherence"),
        ("groundedness", "builtin.groundedness"),
        ("violence", "builtin.violence"),
    )
    testing_criteria = [
        TestingCriterionAzureAIEvaluator(
            type="azure_ai_evaluator",
            name=criterion_name,
            evaluator_name=evaluator_name,
            initialization_parameters={"model": model_deployment_name},
            data_mapping={
                "query": "{{item.query}}",
                "response": "{{item.response}}",
            },
        )
        for criterion_name, evaluator_name in evaluators
    ]

    print("Creating trace-based evaluation group")
    eval_object = client.evals.create(
        name="Trace Evaluation (Agent Smart Filter)",
        data_source_config=data_source_config,  # type: ignore
        testing_criteria=testing_criteria,
    )
    print(f"Evaluation created (id: {eval_object.id} )")

    # From here on the evaluation exists on the service; the finally clause
    # below deletes it even if creating or polling the run raises.
    try:
        # Compute time window in unix seconds
        # Pad end_time by +600s (10 min) to avoid ingestion-delay edge exclusion
        now_unix = int(time.time())
        end_time = now_unix + 600
        start_time = now_unix - (args.lookback_hours * 3600)

        # Build trace_source based on mode
        trace_source: dict = {
            "type": "agent_filter",
            "start_time": start_time,
            "end_time": end_time,
            "max_traces": args.max_traces,
            "filter_strategy": "smart_filtering",
        }

        if args.agent_id:
            # A single "name:version" identifier takes precedence over the
            # name/version pair read from the environment.
            trace_source["agent_id"] = args.agent_id
            print(f"Using agent_id filter: {args.agent_id} ")
        else:
            trace_source["agent_name"] = agent_name
            # The version key is omitted entirely when no version is configured.
            if agent_version:
                trace_source["agent_version"] = agent_version
            print(f"Using agent filter: {agent_name} v{agent_version or '(latest)'} ")

        data_source = {
            "type": "azure_ai_trace_data_source_preview",
            "trace_source": trace_source,
        }

        eval_run = client.evals.runs.create(
            eval_id=eval_object.id,
            name="trace-evaluation-agent-smart-filter-run",
            data_source=data_source,  # type: ignore
        )
        print(f"Evaluation run created (id: {eval_run.id} )")

        # Poll until the run reaches a terminal state. "canceled" is included so
        # that a run cancelled on the service side cannot keep this loop alive.
        terminal_statuses = ("completed", "failed", "canceled")
        while True:
            run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
            if run.status in terminal_statuses:
                break
            print(f"Waiting for eval run to complete... current status: {run.status} ")
            time.sleep(5)

        if run.status == "completed":
            print("\n ✓ Evaluation run completed successfully!")
            print(f"Result Counts: {run.result_counts} ")

            output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
            print(f"\n OUTPUT ITEMS (Total: {len(output_items)} )")
            print(f"{'-' * 60} ")
            pprint(output_items)
            print(f"{'-' * 60} ")

            print(f"\n Eval Run Report URL: {run.report_url} ")
        elif run.status == "failed":
            print(f"\n ✗ Evaluation run failed: {run.error} ")
        else:
            print(f"\n ✗ Evaluation run ended with status: {run.status}")
    finally:
        # Always remove the evaluation created above so the sample leaves
        # nothing behind, whether it succeeded, failed or was interrupted.
        client.evals.delete(eval_id=eval_object.id)
        print("Evaluation deleted")
0 commit comments