-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluate.sh
More file actions
executable file
·331 lines (283 loc) · 8.87 KB
/
evaluate.sh
File metadata and controls
executable file
·331 lines (283 loc) · 8.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
#!/bin/bash
# Llama Stack Agent Evaluation Runner Script
# This script helps run evaluations with different configurations
# Abort the script as soon as a command outside a condition exits non-zero.
set -o errexit

# Run configuration. CSV_FILE, MODEL, TOOLS, STACK_URL and VERBOSE are
# overwritten by the command-line options parsed further down.
CSV_FILE=
MODEL=
TOOLS=   # no default: an empty value means tool groups are auto-discovered
STACK_URL='http://localhost:8080'
OUTPUT_DIR='evaluation_results'
VERBOSE='false'

# ANSI colour sequences, kept as literal backslash text; the print_* helpers
# expand them at output time.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
NC="\033[0m"   # reset ("no colour")
# Function to print colored output
# Print an informational message on stdout, prefixed with a blue "[INFO]" tag.
# Globals:   BLUE, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
print_info() {
  # %b expands the backslash escapes stored in the colour variables, while %s
  # keeps the message literal so a "\t" or "\n" inside a path or URL is not
  # turned into a control character (as `echo -e` would do).
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"
}
# Print a success message on stdout, prefixed with a green "[SUCCESS]" tag.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
print_success() {
  # Only the colour variables go through %b; the message uses %s so any
  # backslashes it contains are printed as-is instead of being interpreted.
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1"
}
# Print a warning message on stdout, prefixed with a yellow "[WARNING]" tag.
# Globals:   YELLOW, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
print_warning() {
  # Only the colour variables go through %b; the message uses %s so any
  # backslashes it contains are printed as-is instead of being interpreted.
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
# Print an error message on stdout, prefixed with a red "[ERROR]" tag.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
print_error() {
  # Only the colour variables go through %b; the message uses %s so any
  # backslashes it contains (e.g. in a file name echoed back to the user) are
  # printed as-is instead of being interpreted.
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1"
}
# Probe "$STACK_URL/health" with curl and report whether the request succeeded.
# Globals:   STACK_URL (read)
# Outputs:   progress, result and troubleshooting hints via the print_* helpers
# Returns:   0 when curl exits successfully, 1 otherwise
check_llama_stack() {
  local health_endpoint="$STACK_URL/health"
  local hint

  print_info "Checking Llama Stack connectivity..."

  # Quiet probe with a 5 second connect timeout; only curl's exit status matters.
  if ! curl -s --connect-timeout 5 "$health_endpoint" > /dev/null 2>&1; then
    print_error "Cannot connect to Llama Stack at $STACK_URL"
    for hint in \
      "Please ensure Llama Stack is running:" \
      " 1. Check if the server is started" \
      " 2. Verify the URL is correct" \
      " 3. Check firewall/network settings"; do
      print_info "$hint"
    done
    return 1
  fi

  print_success "Llama Stack is running at $STACK_URL"
  return 0
}
# Check that deepeval and llama_stack_client can be imported under
# `uv run python`; if the import fails, try to install them with `uv sync`.
# Outputs:   progress messages via the print_* helpers (plus uv sync's own output)
# Returns:   1 when `uv sync` fails; otherwise the status of the final message
check_dependencies() {
  print_info "Checking Python dependencies with uv..."

  # Happy path first: the import probe succeeded, nothing to install.
  if uv run python -c "import deepeval, llama_stack_client" > /dev/null 2>&1; then
    print_success "All dependencies are available"
    return
  fi

  print_warning "Missing required dependencies"
  print_info "Installing dependencies using uv..."

  if ! uv sync; then
    print_error "Failed to sync dependencies"
    print_info "Try running: uv add deepeval llama-stack-client pandas pyyaml"
    return 1
  fi

  print_success "Dependencies synced successfully"
}
# Validate that a CSV file exists and that its header line looks usable.
#
# Only the "question" and "expected_answer" column names are actually checked
# (as substrings of the first line); the longer list in required_headers is
# what gets shown to the user when the check fails.
#
# Arguments: $1 - path to the CSV file
# Outputs:   progress and result messages via the print_* helpers
# Returns:   0 when the file exists and the header check passes, 1 otherwise
validate_csv() {
  local csv_file="$1"
  local required_headers="question,expected_answer,tool_name,tool_parameters,evaluation_criteria,category"
  local headers test_count

  print_info "Validating CSV file: $csv_file"

  if [[ ! -f "$csv_file" ]]; then
    print_error "CSV file not found: $csv_file"
    return 1
  fi

  # Declared separately from the assignment so `local` cannot mask a failure;
  # an unreadable file simply falls through to the header error below.
  headers=$(head -n 1 < "$csv_file") || headers=""

  if [[ "$headers" != *"question"* || "$headers" != *"expected_answer"* ]]; then
    print_error "CSV file missing required headers"
    print_info "Required headers: $required_headers"
    return 1
  fi

  print_success "CSV file format appears valid"

  # Count data rows: every non-blank line after the header. Unlike
  # `wc -l` minus one, this counts a final row that lacks a trailing newline,
  # never goes negative, and ignores blank (or CR-only) lines.
  test_count=$(awk 'NR > 1 && $0 !~ /^[ \t\r]*$/ { n++ } END { print n + 0 }' < "$csv_file")
  print_info "Found $test_count test cases in CSV"
  return 0
}
# Prepare the evaluation environment: create the output directory and run the
# optional Python-side setup hook.
# Globals:   OUTPUT_DIR (read) - directory created with `mkdir -p`
# Outputs:   progress messages via the print_* helpers
setup_environment() {
  # One-liner handed to `uv run python -c`; its stderr is discarded and a
  # failure is treated as "hook not available", not as an error.
  local python_hook="from evaluate.utils import setup_evaluation_environment; setup_evaluation_environment()"

  print_info "Setting up evaluation environment..."

  mkdir -p "$OUTPUT_DIR"

  if ! uv run python -c "$python_hook" 2>/dev/null; then
    print_info "Skipped Python environment setup (not available)"
  else
    print_success "Python environment setup completed"
  fi

  print_success "Evaluation environment ready"
}
# Run `uv run -m evaluate` with the given settings and report the outcome.
#
# The command is assembled as an argv array and executed directly, so values
# containing spaces, quotes, `$(...)` or backticks reach the evaluator exactly
# as given instead of being re-parsed by the shell.
#
# Arguments:
#   $1 - CSV file path
#   $2 - model ID
#   $3 - whitespace-separated tool groups; empty means pass no --tools flags
#   $4 - Llama Stack URL
#   $5 - output file path; empty means pass no --output flag
#   $6 - "true" to pass --verbose
# Outputs:   progress messages via the print_* helpers, plus the evaluator's output
# Returns:   0 when the evaluator exits successfully, 1 otherwise
run_evaluation() {
  local csv_file="$1"
  local model="$2"
  local tools="$3"
  local stack_url="$4"
  local output_file="$5"
  local verbose="$6"
  local -a cmd=()
  local -a tool_list=()
  local tool printable

  print_info "Starting evaluation..."
  print_info " CSV file: $csv_file"
  print_info " Model: $model"
  print_info " Tools: $tools"
  print_info " Stack URL: $stack_url"
  print_info " Output: $output_file"

  cmd=(uv run -m evaluate "$csv_file" --model "$model" --stack-url "$stack_url")

  if [[ -n "$output_file" ]]; then
    cmd+=(--output "$output_file")
  fi

  if [[ "$verbose" == true ]]; then
    cmd+=(--verbose)
  fi

  # One --tools flag per group. `read -a` splits on whitespace without the
  # pathname expansion an unquoted $tools would trigger (e.g. on "mcp::*").
  # read returns non-zero at end of input with -d '', hence the `|| true`.
  if [[ -n "$tools" ]]; then
    IFS=$' \t\n' read -r -d '' -a tool_list <<< "$tools" || true
    for tool in "${tool_list[@]}"; do
      cmd+=(--tools "$tool")
    done
  fi

  # %q renders each argument shell-quoted so the logged line is copy-pasteable.
  printf -v printable '%q ' "${cmd[@]}"
  print_info "Running command: ${printable% }"

  if "${cmd[@]}"; then
    print_success "Evaluation completed successfully"
    if [[ -n "$output_file" && -f "$output_file" ]]; then
      print_info "Results saved to: $output_file"
    fi
    return 0
  else
    print_error "Evaluation failed"
    return 1
  fi
}
# Quick connectivity smoke test: delegates to check_llama_stack and summarises
# the result.
# Outputs:   messages via the print_* helpers (plus check_llama_stack's own)
# Returns:   1 when check_llama_stack fails; otherwise the status of the final
#            success message
run_quick_test() {
  print_info "Running quick connectivity test..."

  if ! check_llama_stack; then
    print_error "Quick test failed - cannot connect to Llama Stack"
    return 1
  fi

  print_success "Quick test passed - Llama Stack is accessible"
}
# Print the command-line help text on stdout.
# Globals:   STACK_URL (read) - shown as the current default for --url
# Outputs:   usage, commands, options and examples, one line each
show_usage() {
  local script="$0"
  local -a help_lines=(
    "Usage: $script [OPTIONS] [COMMAND]"
    ""
    "Commands:"
    " run Run evaluation (default)"
    " test Run quick connectivity test"
    " setup Setup evaluation environment"
    " validate Validate CSV file only"
    ""
    "Options:"
    " --csv FILE CSV file path (required)"
    " --model MODEL Model ID (required)"
    " -t, --tools TOOLS Space-separated mcp::* tool groups (optional, auto-discovers if not specified)"
    " -u, --url URL Llama Stack URL (default: $STACK_URL)"
    " -o, --output FILE Output file path"
    " -v, --verbose Enable verbose output"
    " -h, --help Show this help"
    ""
    "Examples:"
    " $script run --csv scratch/compatibility.csv --model llama-3-1-8b-w4a16 -v"
    " $script run --csv scratch/compatibility.csv --model llama-4-scout-17b -o results.json"
    " $script test"
    " $script setup"
  )

  # A single printf emits every entry followed by a newline.
  printf '%s\n' "${help_lines[@]}"
}
# Parse command line arguments
COMMAND="run"
OUTPUT_FILE=""

# Abort with a clear message when a value-taking option is the last argument.
# Without this check `shift 2` fails and, under `set -e`, the script would
# exit silently.
# Arguments: the remaining command-line arguments, option first
_require_option_value() {
  if [[ $# -lt 2 ]]; then
    print_error "Option $1 requires a value"
    show_usage
    exit 1
  fi
}

# Walk the argument list and record options and the command in globals.
# Globals:   CSV_FILE, MODEL, TOOLS, STACK_URL, OUTPUT_FILE, VERBOSE, COMMAND
#            (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help for -h/--help; 1 on an unknown option or a
#            value-taking option with no value
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --csv)
        _require_option_value "$@"
        CSV_FILE="$2"
        shift 2
        ;;
      --model)
        _require_option_value "$@"
        MODEL="$2"
        shift 2
        ;;
      -t|--tools)
        _require_option_value "$@"
        TOOLS="$2"
        shift 2
        ;;
      -u|--url)
        _require_option_value "$@"
        STACK_URL="$2"
        shift 2
        ;;
      -o|--output)
        _require_option_value "$@"
        OUTPUT_FILE="$2"
        shift 2
        ;;
      -v|--verbose)
        VERBOSE=true
        shift
        ;;
      -h|--help)
        show_usage
        exit 0
        ;;
      run|test|setup|validate)
        COMMAND="$1"
        shift
        ;;
      *)
        print_error "Unknown option: $1"
        show_usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Main execution
#
# Dispatch on the global COMMAND:
#   test     - connectivity check only
#   setup    - prepare the evaluation environment only
#   validate - check the CSV given via CSV_FILE
#   run      - setup, CSV validation, connectivity check, then the evaluation
# Globals:   COMMAND, CSV_FILE, MODEL, TOOLS, STACK_URL, OUTPUT_DIR, VERBOSE
#            (read); OUTPUT_FILE (read, and written when empty); timestamp
#            (written)
# Exits:     1 on a missing required setting, a failed step or an unknown
#            command
main() {
  print_info "Llama Stack Agent Evaluation Runner"
  print_info "Command: $COMMAND"

  case "$COMMAND" in
    test)
      run_quick_test
      ;;

    setup)
      setup_environment
      ;;

    validate)
      if [[ -z "$CSV_FILE" ]]; then
        print_error "CSV file is required for validation. Use --csv FILE"
        show_usage
        exit 1
      fi
      validate_csv "$CSV_FILE"
      ;;

    run)
      # Both the CSV path and the model ID are mandatory for a run.
      if [[ -z "$CSV_FILE" ]]; then
        print_error "CSV file is required. Use --csv FILE"
        show_usage
        exit 1
      fi
      if [[ -z "$MODEL" ]]; then
        print_error "Model is required. Use --model MODEL"
        show_usage
        exit 1
      fi

      # Full evaluation run; each gate aborts the script with status 1.
      setup_environment
      validate_csv "$CSV_FILE" || exit 1
      # NOTE: check_dependencies is not invoked in this flow (call disabled).
      check_llama_stack || exit 1

      # Default to a timestamped results file inside OUTPUT_DIR.
      if [[ -z "$OUTPUT_FILE" ]]; then
        timestamp=$(date +"%Y%m%d_%H%M%S")
        OUTPUT_FILE="$OUTPUT_DIR/evaluation_results_${timestamp}.json"
      fi

      run_evaluation "$CSV_FILE" "$MODEL" "$TOOLS" "$STACK_URL" "$OUTPUT_FILE" "$VERBOSE" \
        || exit 1

      print_success "Evaluation completed successfully!"
      # Kept as an `if` so a missing results file does not turn into a
      # non-zero return status.
      if [[ -f "$OUTPUT_FILE" ]]; then
        print_info "Results available at: $OUTPUT_FILE"
      fi
      ;;

    *)
      print_error "Unknown command: $COMMAND"
      show_usage
      exit 1
      ;;
  esac
}
# Run main function
# Entry point. main is called without arguments: the command-line options have
# already been parsed into globals at top level, and main reads those globals.
main