llama-stack-sandbox/evaluate.sh at main · alpha-hack-program/llama-stack-sandbox · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
#!/bin/bash

# Llama Stack Agent Evaluation Runner Script
# This script helps run evaluations with different configurations

# Abort the script as soon as an unguarded command fails.
set -o errexit

# ANSI color sequences for the print_* helpers. They are stored as literal
# backslash sequences and only turned into escape bytes when printed.
NC='\033[0m' # No Color (reset)
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Default settings; most are overridden by the command-line options parsed
# further down in this script.
VERBOSE=false
OUTPUT_DIR="evaluation_results"
STACK_URL="http://localhost:8080"
TOOLS=""  # empty: no --tools flag is passed (usage text: tools are auto-discovered)
MODEL=""
CSV_FILE=""

# Print an informational message to stdout with a blue "[INFO]" tag.
# Globals:   BLUE, NC (read) - color escape sequences
# Arguments: $1 - message text, printed verbatim
print_info() {
    # %b expands the escapes stored in the color variables, while %s keeps the
    # message literal: backslashes in file names or URLs are no longer
    # reinterpreted as escape sequences the way `echo -e` did.
    printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}

# Print a success message to stdout with a green "[SUCCESS]" tag.
# Globals:   GREEN, NC (read) - color escape sequences
# Arguments: $1 - message text, printed verbatim
print_success() {
    # Only the color variables go through %b; the message is passed as %s so
    # backslashes in it are printed as-is instead of being expanded.
    printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"
}

# Print a warning message to stdout with a yellow "[WARNING]" tag.
# Globals:   YELLOW, NC (read) - color escape sequences
# Arguments: $1 - message text, printed verbatim
print_warning() {
    # Only the color variables go through %b; the message is passed as %s so
    # backslashes in it are printed as-is instead of being expanded.
    printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"
}

# Print an error message with a red "[ERROR]" tag.
# The message is written to stdout, like the other print_* helpers.
# Globals:   RED, NC (read) - color escape sequences
# Arguments: $1 - message text, printed verbatim
print_error() {
    # Only the color variables go through %b; the message is passed as %s so
    # backslashes in it are printed as-is instead of being expanded.
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
}

# Probe the Llama Stack server with an HTTP request to "$STACK_URL/health".
# Globals: STACK_URL (read)
# Returns: 0 when curl succeeds, 1 (after printing troubleshooting hints)
#          when it does not.
# Note: curl is invoked without --fail, so this shows that something answered
# the request within the 5 second connect timeout; the HTTP status itself is
# not inspected.
check_llama_stack() {
    local health_url="$STACK_URL/health"

    print_info "Checking Llama Stack connectivity..."

    # Handle the unreachable case first so the success path reads straight through.
    if ! curl -s --connect-timeout 5 "$health_url" > /dev/null 2>&1; then
        print_error "Cannot connect to Llama Stack at $STACK_URL"
        print_info "Please ensure Llama Stack is running:"
        print_info "  1. Check if the server is started"
        print_info "  2. Verify the URL is correct"
        print_info "  3. Check firewall/network settings"
        return 1
    fi

    print_success "Llama Stack is running at $STACK_URL"
    return 0
}

# Make sure the Python packages used by the evaluator can be imported.
# First tries `import deepeval, llama_stack_client` through `uv run`; if that
# fails, runs `uv sync` to install the project dependencies.
# Returns: 1 when `uv sync` fails; otherwise the status of the final
#          print_success call.
check_dependencies() {
    print_info "Checking Python dependencies with uv..."

    # Fast path: everything already importable, nothing to install.
    if uv run python -c "import deepeval, llama_stack_client" > /dev/null 2>&1; then
        print_success "All dependencies are available"
        return
    fi

    print_warning "Missing required dependencies"
    print_info "Installing dependencies using uv..."

    if ! uv sync; then
        print_error "Failed to sync dependencies"
        print_info "Try running: uv add deepeval llama-stack-client pandas pyyaml"
        return 1
    fi

    print_success "Dependencies synced successfully"
}

# Check that a CSV file exists and that its first line mentions the
# "question" and "expected_answer" columns, then report how many test cases
# (non-blank lines after the header) it contains.
# Arguments: $1 - path to the CSV file
# Returns:   0 when the file looks usable, 1 otherwise
validate_csv() {
    local csv_file="$1"

    print_info "Validating CSV file: $csv_file"

    if [ ! -f "$csv_file" ]; then
        print_error "CSV file not found: $csv_file"
        return 1
    fi

    # Full header list shown in the error message. Only "question" and
    # "expected_answer" are actually enforced below.
    local required_headers="question,expected_answer,tool_name,tool_parameters,evaluation_criteria,category"

    # Declared separately from the assignment so that a failing `head`
    # (e.g. unreadable file) is not masked by the exit status of `local`.
    local headers
    if ! headers=$(head -n 1 "$csv_file"); then
        print_error "Cannot read CSV file: $csv_file"
        return 1
    fi

    # Substring match on the header line, as before.
    if [[ "$headers" != *"question"* || "$headers" != *"expected_answer"* ]]; then
        print_error "CSV file missing required headers"
        print_info "Required headers: $required_headers"
        return 1
    fi

    print_success "CSV file format appears valid"

    # Count data rows with awk instead of `wc -l`: wc counts newline
    # characters, so a file without a trailing newline was reported one row
    # short, and blank lines were counted as test cases.
    local test_count
    test_count=$(awk 'NR > 1 && /[^ \t\r]/ { n++ } END { print n + 0 }' "$csv_file")
    print_info "Found $test_count test cases in CSV"
    return 0
}

# Prepare the local environment for an evaluation run.
# Creates $OUTPUT_DIR (including parents) and then tries the optional Python
# helper evaluate.utils.setup_evaluation_environment via `uv run`; a failure
# of that helper is reported as "skipped" and is not treated as an error.
# Globals: OUTPUT_DIR (read)
setup_environment() {
    # Python one-liner for the optional setup hook; its stderr is discarded.
    local py_setup="from evaluate.utils import setup_evaluation_environment; setup_evaluation_environment()"

    print_info "Setting up evaluation environment..."

    mkdir -p "$OUTPUT_DIR"

    if ! uv run python -c "$py_setup" 2>/dev/null; then
        print_info "Skipped Python environment setup (not available)"
    else
        print_success "Python environment setup completed"
    fi

    print_success "Evaluation environment ready"
}

# Run the evaluator (`uv run -m evaluate ...`) with the given settings.
# Arguments:
#   $1 - CSV file with the test cases
#   $2 - model ID
#   $3 - whitespace-separated tool groups; empty means no --tools flags
#   $4 - Llama Stack URL
#   $5 - output file path; empty means no --output flag
#   $6 - "true" to add --verbose
# Returns: 0 when the evaluator exits successfully, 1 otherwise
run_evaluation() {
    local csv_file="$1"
    local model="$2"
    local tools="$3"
    local stack_url="$4"
    local output_file="$5"
    local verbose="$6"

    print_info "Starting evaluation..."
    print_info "  CSV file: $csv_file"
    print_info "  Model: $model"
    print_info "  Tools: $tools"
    print_info "  Stack URL: $stack_url"
    print_info "  Output: $output_file"

    # Build the command as an argument array rather than a string passed to
    # eval: every value reaches the evaluator exactly as given, so spaces,
    # quotes, `$` or backticks in paths and model names are never
    # re-interpreted by the shell.
    local -a cmd=(uv run -m evaluate "$csv_file" --model "$model" --stack-url "$stack_url")

    if [[ -n "$output_file" ]]; then
        cmd+=(--output "$output_file")
    fi

    if [[ "$verbose" == true ]]; then
        cmd+=(--verbose)
    fi

    # Add tools only if specified: one --tools flag per whitespace-separated
    # entry. `read -a` splits without glob expansion, so an entry such as
    # "mcp::*" is passed through literally. read returns non-zero at end of
    # input, hence the `|| true`.
    if [[ -n "$tools" ]]; then
        local -a tool_list=()
        local tool
        IFS=$' \t\n' read -r -d '' -a tool_list <<< "$tools" || true
        for tool in "${tool_list[@]}"; do
            cmd+=(--tools "$tool")
        done
    fi

    # %q renders each argument shell-quoted so the logged line can be pasted
    # back into a terminal unchanged.
    local rendered
    printf -v rendered '%q ' "${cmd[@]}"
    print_info "Running command: ${rendered% }"

    # Execute evaluation
    if "${cmd[@]}"; then
        print_success "Evaluation completed successfully"

        if [[ -n "$output_file" && -f "$output_file" ]]; then
            print_info "Results saved to: $output_file"
        fi
        return 0
    else
        print_error "Evaluation failed"
        return 1
    fi
}

# Quick smoke test: only verifies that the Llama Stack server can be reached
# (delegates the actual probe to check_llama_stack).
# Returns: 1 when the server is unreachable; otherwise the status of the
#          final print_success call.
run_quick_test() {
    print_info "Running quick connectivity test..."

    if ! check_llama_stack; then
        print_error "Quick test failed - cannot connect to Llama Stack"
        return 1
    fi

    print_success "Quick test passed - Llama Stack is accessible"
}

# Print the command-line help text to stdout.
# The script name ($0) and the current value of STACK_URL are interpolated
# into the text.
show_usage() {
    # One printf call; each argument becomes one output line.
    printf '%s\n' \
        "Usage: $0 [OPTIONS] [COMMAND]" \
        "" \
        "Commands:" \
        "  run                 Run evaluation (default)" \
        "  test                Run quick connectivity test" \
        "  setup               Setup evaluation environment" \
        "  validate            Validate CSV file only" \
        "" \
        "Options:" \
        "  --csv FILE          CSV file path (required)" \
        "  --model MODEL       Model ID (required)" \
        "  -t, --tools TOOLS   Space-separated mcp::* tool groups (optional, auto-discovers if not specified)" \
        "  -u, --url URL       Llama Stack URL (default: $STACK_URL)" \
        "  -o, --output FILE   Output file path" \
        "  -v, --verbose       Enable verbose output" \
        "  -h, --help          Show this help" \
        "" \
        "Examples:" \
        "  $0 run --csv scratch/compatibility.csv --model llama-3-1-8b-w4a16 -v" \
        "  $0 run --csv scratch/compatibility.csv --model llama-4-scout-17b -o results.json" \
        "  $0 test" \
        "  $0 setup"
}

# Parse command line arguments
COMMAND="run"
OUTPUT_FILE=""

# Parse options and the optional command word into the script's globals.
# Globals (written): CSV_FILE, MODEL, TOOLS, STACK_URL, OUTPUT_FILE, VERBOSE,
#                    COMMAND
# Arguments: the script's command-line arguments
# Exits: status 0 after printing help for -h/--help; status 1 (after an error
#        message and the usage text) for an unknown option or a value-taking
#        option that is missing its value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        # Value-taking options need a second word. Without this check
        # `shift 2` fails when the value is missing, which ends the script
        # silently under `set -e` and loops forever without it.
        case "$1" in
            --csv|--model|-t|--tools|-u|--url|-o|--output)
                if [[ $# -lt 2 ]]; then
                    print_error "Option $1 requires an argument"
                    show_usage
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            --csv)
                CSV_FILE="$2"
                shift 2
                ;;
            --model)
                MODEL="$2"
                shift 2
                ;;
            -t|--tools)
                TOOLS="$2"
                shift 2
                ;;
            -u|--url)
                STACK_URL="$2"
                shift 2
                ;;
            -o|--output)
                OUTPUT_FILE="$2"
                shift 2
                ;;
            -v|--verbose)
                VERBOSE=true
                shift
                ;;
            -h|--help)
                show_usage
                exit 0
                ;;
            run|test|setup|validate)
                COMMAND="$1"
                shift
                ;;
            *)
                print_error "Unknown option: $1"
                show_usage
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Dispatch on $COMMAND and drive the selected workflow.
# Globals (read):    COMMAND, CSV_FILE, MODEL, TOOLS, STACK_URL, OUTPUT_DIR,
#                    VERBOSE
# Globals (written): OUTPUT_FILE (given a timestamped default for "run" when
#                    empty), timestamp
# Exits with status 1 on missing required options, an unknown command, or a
# failed step of the "run" workflow.
main() {
    print_info "Llama Stack Agent Evaluation Runner"
    print_info "Command: $COMMAND"

    case "$COMMAND" in
        test)
            run_quick_test
            ;;
        setup)
            setup_environment
            ;;
        validate)
            if [[ -z "$CSV_FILE" ]]; then
                print_error "CSV file is required for validation. Use --csv FILE"
                show_usage
                exit 1
            fi
            validate_csv "$CSV_FILE"
            ;;
        run)
            # Both the CSV file and the model must be given.
            if [[ -z "$CSV_FILE" ]]; then
                print_error "CSV file is required. Use --csv FILE"
                show_usage
                exit 1
            fi
            if [[ -z "$MODEL" ]]; then
                print_error "Model is required. Use --model MODEL"
                show_usage
                exit 1
            fi

            # Full run: prepare, validate input, check the server, evaluate.
            setup_environment
            validate_csv "$CSV_FILE" || exit 1

            # The dependency check is currently disabled:
            # if ! check_dependencies; then
            #     exit 1
            # fi

            check_llama_stack || exit 1

            # Default the output file to a timestamped name in OUTPUT_DIR.
            if [[ -z "$OUTPUT_FILE" ]]; then
                timestamp=$(date +"%Y%m%d_%H%M%S")
                OUTPUT_FILE="$OUTPUT_DIR/evaluation_results_${timestamp}.json"
            fi

            run_evaluation "$CSV_FILE" "$MODEL" "$TOOLS" "$STACK_URL" "$OUTPUT_FILE" "$VERBOSE" || exit 1

            print_success "Evaluation completed successfully!"
            # Kept as an if-statement so a missing results file does not
            # turn into a non-zero return status.
            if [[ -f "$OUTPUT_FILE" ]]; then
                print_info "Results available at: $OUTPUT_FILE"
            fi
            ;;
        *)
            print_error "Unknown command: $COMMAND"
            show_usage
            exit 1
            ;;
    esac
}

# Entry point. main takes no arguments: it reads the globals (COMMAND,
# CSV_FILE, MODEL, ...) that the argument parsing above has already filled in.
main