ClawWork/run_test_agent.sh at main · anchapin/ClawWork · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/bin/bash

# Quick Test Script for LiveBench Dashboard
# Runs an agent with specified config to populate the dashboard
#
# Usage:
#   ./run_test_agent.sh                                    # Uses default config
#   ./run_test_agent.sh livebench/configs/test_glm47.json  # Custom config
#
# Exhaust Mode:
#   ./run_test_agent.sh --exhaust                                    # Default config, exhaust all tasks
#   ./run_test_agent.sh --exhaust livebench/configs/test_glm47.json  # Custom config, exhaust all tasks
#
# Exhaust mode keeps the agent running until every GDPVal task has been attempted.
# The date advances past the config end_date as needed.
# Tasks that hit API errors are retried up to 10 times before being abandoned.

# Parse flags
EXHAUST_FLAG=""
if [ "$1" = "--exhaust" ]; then
    EXHAUST_FLAG="--exhaust"
    shift  # Remove --exhaust from args so $1 is now the config (if provided)
fi

# Get config file from argument or use default
CONFIG_FILE=${1:-"livebench/configs/test_gpt4o.json"}

echo "🎯 LiveBench Agent Test"
echo "===================================="
echo ""
echo "📋 Config: $CONFIG_FILE"
if [ -n "$EXHAUST_FLAG" ]; then
    echo "🔥 Mode: EXHAUST (run all GDPVal tasks)"
fi
echo ""

# Activate conda environment
echo "🔧 Activating livebench conda environment..."
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate livebench
echo "   Using Python: $(which python)"
echo ""

# Load environment variables from .env if it exists
if [ -f ".env" ]; then
    echo "📝 Loading environment variables from .env..."
    source .env
    echo ""
fi

# Validate config file exists
if [ ! -f "$CONFIG_FILE" ]; then
    echo "❌ Config file not found: $CONFIG_FILE"
    echo ""
    echo "Available configs:"
    ls -1 livebench/configs/*.json 2>/dev/null || echo "  (none found)"
    echo ""
    exit 1
fi
echo "✓ Config file found"
echo ""

# Check environment variables
echo "🔍 Checking environment..."

if [ -z "$OPENAI_API_KEY" ]; then
    echo "❌ OPENAI_API_KEY not set"
    echo "   Please set it: export OPENAI_API_KEY='your-key-here'"
    exit 1
fi
echo "✓ OPENAI_API_KEY set"

if [ -z "$WEB_SEARCH_API_KEY" ]; then
    echo "❌ WEB_SEARCH_API_KEY not set"
    echo "   Please set it: export WEB_SEARCH_API_KEY='your-key-here'"
    echo "   You can also set WEB_SEARCH_PROVIDER (default: tavily)"
    exit 1
fi
echo "✓ WEB_SEARCH_API_KEY set"

echo ""

# Set MCP port if not set
export LIVEBENCH_HTTP_PORT=${LIVEBENCH_HTTP_PORT:-8010}

# Add project root to PYTHONPATH to ensure imports work
export PYTHONPATH="/root/-Live-Bench:$PYTHONPATH"

# Extract agent info from config (basic parsing)
AGENT_NAME=$(grep -oP '"signature"\s*:\s*"\K[^"]+' "$CONFIG_FILE" | head -1)
BASEMODEL=$(grep -oP '"basemodel"\s*:\s*"\K[^"]+' "$CONFIG_FILE" | head -1)
INIT_DATE=$(grep -oP '"init_date"\s*:\s*"\K[^"]+' "$CONFIG_FILE" | head -1)
END_DATE=$(grep -oP '"end_date"\s*:\s*"\K[^"]+' "$CONFIG_FILE" | head -1)
INITIAL_BALANCE=$(grep -oP '"initial_balance"\s*:\s*\K[0-9.]+' "$CONFIG_FILE" | head -1)

echo "===================================="
echo "🤖 Running Agent"
echo "===================================="
echo ""
echo "Configuration:"
echo "  - Config: $(basename $CONFIG_FILE)"
echo "  - Agent: ${AGENT_NAME:-unknown}"
echo "  - Model: ${BASEMODEL:-unknown}"
if [ -n "$EXHAUST_FLAG" ]; then
    echo "  - Mode: EXHAUST (start: ${INIT_DATE:-N/A}, runs until all tasks done)"
else
    echo "  - Date Range: ${INIT_DATE:-N/A} to ${END_DATE:-N/A}"
fi
echo "  - Initial Balance: \$${INITIAL_BALANCE:-1000}"
echo ""
echo "Note: The agent will handle MCP service internally"
echo ""
if [ -n "$EXHAUST_FLAG" ]; then
    echo "This will run until ALL GDPVal tasks are conducted (can take a long time)..."
else
    echo "This will take a few minutes..."
fi
echo ""
echo "===================================="
echo ""

# Run the agent with specified config (and optional --exhaust flag)
python livebench/main.py "$CONFIG_FILE" $EXHAUST_FLAG

echo ""
echo "===================================="
echo "✅ Test completed!"
echo "===================================="
echo ""
echo "📊 View results in dashboard:"
echo "   http://localhost:3000"
echo ""
echo "🔧 API endpoints:"
echo "   http://localhost:8000/api/agents"
echo "   http://localhost:8000/docs"
echo ""