agent-skills/scripts/eval.sh at main · couchbaselabs/agent-skills · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env bash
# Run eval cases from examples/examples.md against an LLM.
#
# Each case sends the `input` field to the LLM with the skill loaded and
# asserts that all `expect` terms appear and no `reject` terms appear in
# the response. The `code_block` field (if set) asserts a fenced code block
# of that language is present. The `threshold` field (if set) relaxes the
# expect check to require only that many matches.
#
# Usage:
#   ./scripts/eval.sh --dry-run [--skill SKILL]
#   ./scripts/eval.sh --execute --model MODEL --api-key KEY [--skill SKILL]
#
# Options:
#   --dry-run          List cases that would be evaluated without calling any LLM.
#   --execute          Run cases against the LLM (requires --model and --api-key).
#   --skill SKILL      Evaluate only the named skill (default: all skills).
#   --model MODEL      LLM model identifier (e.g. claude-3-5-sonnet-20241022).
#   --api-key KEY      API key for the LLM provider.
#
# Exit codes:
#   0  All cases passed (or --dry-run completed).
#   1  One or more cases failed.
#   2  Usage error.
#
# The --execute path calls the Anthropic Messages API (requires jq and curl).
# Each case loads the skill's SKILL.md as the system prompt, sends `input` as
# the user message, then asserts expect/reject/code_block/threshold against
# the response text.

# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Repository root is the parent of the directory that holds this script;
# every skill lives in its own directory below "skills".
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
SKILLS_DIR="${REPO_ROOT}/skills"

# Command-line state, filled in by option parsing. All start out empty.
MODE="" FILTER_SKILL="" MODEL="" API_KEY=""

# Print the script's leading comment block with the "# " prefix stripped, then
# exit with status 2.
#
# The header is located structurally (the first run of "#" lines, skipping the
# shebang) rather than by hard-coded line numbers, so the first and last header
# lines are always included and the output stays correct when the header grows
# or shrinks.
# Outputs: the help text on stdout
usage() {
    awk '
        /^#!/     { next }
        /^#/      { sub(/^# ?/, ""); print; in_header = 1; next }
        in_header { exit }
    ' "$0"
    exit 2
}

# Parse the command-line options into MODE, FILTER_SKILL, MODEL and API_KEY.
#
# --skill, --model and --api-key consume the argument that follows them. When
# one of them is the last argument there is nothing left to consume; that is
# reported as a usage error here, because otherwise the trailing `shift` fails
# and `set -e` terminates the script with no message at all. An explicitly
# empty value (e.g. --skill "") is still accepted.
# Globals:   MODE, FILTER_SKILL, MODEL, API_KEY (written)
# Arguments: the script's command-line arguments
# Returns:   does not return on error or --help (usage exits with status 2)
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --dry-run)  MODE="dry-run" ;;
            --execute)  MODE="execute" ;;
            --skill|--model|--api-key)
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a value" >&2
                    usage
                fi
                case "$1" in
                    --skill)    FILTER_SKILL="$2" ;;
                    --model)    MODEL="$2" ;;
                    --api-key)  API_KEY="$2" ;;
                esac
                shift
                ;;
            -h|--help)  usage ;;
            *) echo "Unknown option: $1" >&2; usage ;;
        esac
        shift
    done
}

parse_args "$@"

# A run mode is mandatory: show the usage text (which exits) when neither
# --dry-run nor --execute was given.
[[ -n "$MODE" ]] || {
    echo "Error: --dry-run or --execute required" >&2
    usage
}

# ── execute: run cases against Anthropic API ─────────────────────────────────

if [[ "$MODE" == "execute" ]]; then
    # Execute mode cannot run without a model and an API key; each missing
    # value is reported and then the usage text is shown (usage exits).
    [[ -n "$MODEL" ]] || {
        echo "Error: --model required for --execute" >&2
        usage
    }
    [[ -n "$API_KEY" ]] || {
        echo "Error: --api-key required for --execute" >&2
        usage
    }

    # Running tallies for the summary line printed after all cases have run.
    ERRORS=0
    FAIL=0
    PASS=0

    # Send one message to the Anthropic Messages API and print the text response.
    # Args: $1=model $2=api_key $3=system_prompt $4=user_message
    # Outputs: the response text on stdout (no trailing newline added);
    #          diagnostics on stderr.
    # Returns: 0 on success; 1 when the request cannot be built, curl fails, or
    #          the response carries no text (which includes API error replies).
    call_llm() {
        local model="$1" api_key="$2" system_prompt="$3" user_message="$4"
        local model_json system_json user_json payload response text

        # JSON-encode every interpolated value, the model name included, so a
        # quote or backslash in any of them cannot corrupt the request body.
        if ! model_json=$(printf '%s' "$model" | jq -Rs .) \
            || ! system_json=$(printf '%s' "$system_prompt" | jq -Rs .) \
            || ! user_json=$(printf '%s' "$user_message" | jq -Rs .); then
            echo "ERROR: could not JSON-encode the request fields (is jq installed?)" >&2
            return 1
        fi
        printf -v payload \
            '{"model":%s,"max_tokens":2048,"system":%s,"messages":[{"role":"user","content":%s}]}' \
            "$model_json" "$system_json" "$user_json"

        # -sS keeps the progress meter off but lets curl print its own error
        # message on stderr. -f is deliberately not used: it would discard the
        # JSON error body, which is the only explanation of an HTTP failure.
        # The body is sent on stdin so a large system prompt cannot exceed the
        # operating system's per-argument size limit.
        response=$(printf '%s' "$payload" | curl -sS \
            -H "x-api-key: $api_key" \
            -H "anthropic-version: 2023-06-01" \
            -H "content-type: application/json" \
            --data-binary @- \
            "https://api.anthropic.com/v1/messages") || {
            echo "ERROR: curl failed" >&2
            return 1
        }

        # Concatenate every text block of the reply; anything that is not a
        # well-formed reply (including an API error object) yields no text.
        text=$(printf '%s' "$response" \
            | jq -r '[.content[]? | select(.type == "text") | .text] | join("")' 2>/dev/null) \
            || text=""
        if [[ -z "$text" ]]; then
            echo "ERROR: empty or unexpected API response: $response" >&2
            return 1
        fi
        printf '%s' "$text"
    }

    # Print every eval case declared in the YAML frontmatter (the lines between
    # the first two "---" markers) of an examples.md file. A case is its
    # "  - id:" line plus all lines up to the next "  - id:" line or the end of
    # the frontmatter. Each case is written followed by an ASCII record
    # separator (octal 036) instead of a newline, so that multi-line cases can
    # be read back one whole case at a time.
    # Args: $1=path to examples.md
    split_case_blocks() {
        awk '
            /^---$/     { fence++; next }
            fence >= 2  { exit }
            fence != 1  { next }
            /^  - id:/  { if (block != "") printf "%s\036", block; block = $0; next }
            block != "" { block = block "\n" $0 }
            END         { if (block != "") printf "%s\036", block }
        ' "$1"
    }

    # Print the items of one list field of a case, one per line, with a leading
    # and a trailing double quote removed from each item.
    # Args: $1=field name (expect or reject) $2=case text
    case_list_items() {
        awk -v key="$1" '
            index($0, "    " key ":") == 1 { in_list = 1; next }
            in_list && /^      - / {
                sub(/^      - /, "")
                gsub(/^"|"$/, "")
                print
                next    # $0 was rewritten above and must not end the list
            }
            in_list && !/^      / { in_list = 0 }
        ' <<< "$2"
    }

    # Print the first word that follows "    <field>:" in a case.
    # Args: $1=field name (code_block or threshold) $2=case text
    case_scalar() {
        awk -v key="$1" 'index($0, "    " key ":") == 1 { print $2; exit }' <<< "$2"
    }

    for examples_file in "$SKILLS_DIR"/*/examples/examples.md; do
        [[ -f "$examples_file" ]] || continue
        skill_dir=$(dirname "$(dirname "$examples_file")")
        skill=$(basename "$skill_dir")

        if [[ -n "$FILTER_SKILL" && "$skill" != "$FILTER_SKILL" ]]; then
            continue
        fi

        # The skill's SKILL.md is the system prompt; a skill without one is skipped.
        skill_md="$skill_dir/SKILL.md"
        [[ -f "$skill_md" ]] || continue
        system_prompt=$(<"$skill_md")

        echo "=== $skill ==="

        # Cases arrive on file descriptor 3, one per record, so nothing that
        # runs inside the loop body can swallow the remaining cases from stdin.
        while IFS= read -r -d $'\036' -u 3 block; do
            case_id=$(awk '/^  - id:/{print $3; exit}' <<< "$block")
            input=$(awk '/^    input:/{sub(/^    input: /,""); gsub(/^"|"$/,""); print; exit}' <<< "$block")
            code_block=$(case_scalar code_block "$block")
            threshold=$(case_scalar threshold "$block")
            mapfile -t expect_terms < <(case_list_items expect "$block")
            mapfile -t reject_terms < <(case_list_items reject "$block")

            # A case without an id or an input cannot be run.
            if [[ -z "$case_id" || -z "$input" ]]; then
                continue
            fi

            printf "  %-50s " "$case_id"

            response=$(call_llm "$MODEL" "$API_KEY" "$system_prompt" "$input") || {
                echo "ERROR (API call failed)"
                ERRORS=$((ERRORS + 1))
                continue
            }

            case_pass=true
            fail_reasons=()

            # Check expect terms. Each term is a case-insensitive grep basic
            # regular expression; "--" keeps a term that starts with "-" from
            # being parsed as a grep option. Empty terms are ignored and do not
            # count towards the total.
            matched=0
            total_expect=0
            missing_terms=()
            for term in "${expect_terms[@]}"; do
                [[ -z "$term" ]] && continue
                total_expect=$((total_expect + 1))
                if grep -qi -- "$term" <<< "$response"; then
                    matched=$((matched + 1))
                else
                    missing_terms+=("$term")
                fi
            done
            if [[ $total_expect -gt 0 ]]; then
                if [[ -n "$threshold" ]]; then
                    # A threshold relaxes "all terms" to "at least N terms".
                    if [[ ! "$threshold" =~ ^[0-9]+$ ]]; then
                        case_pass=false
                        fail_reasons+=("invalid threshold: '$threshold'")
                    elif [[ $matched -lt $threshold ]]; then
                        case_pass=false
                        fail_reasons+=("expect: $matched/$total_expect matched, need $threshold")
                    fi
                elif [[ $matched -lt $total_expect ]]; then
                    case_pass=false
                    for term in "${missing_terms[@]}"; do
                        fail_reasons+=("missing: '$term'")
                    done
                fi
            fi

            # Check reject terms: any hit fails the case.
            for term in "${reject_terms[@]}"; do
                [[ -z "$term" ]] && continue
                if grep -qi -- "$term" <<< "$response"; then
                    case_pass=false
                    fail_reasons+=("rejected term present: '$term'")
                fi
            done

            # Check code_block language: some line must open a fence whose info
            # string starts with the requested language.
            if [[ -n "$code_block" ]]; then
                if ! grep -q -- "^\`\`\`${code_block}" <<< "$response"; then
                    case_pass=false
                    fail_reasons+=("no \`\`\`${code_block} code block in response")
                fi
            fi

            if [[ "$case_pass" == true ]]; then
                echo "PASS"
                PASS=$((PASS + 1))
            else
                echo "FAIL"
                for reason in "${fail_reasons[@]}"; do
                    echo "    ✗ $reason"
                done
                FAIL=$((FAIL + 1))
            fi
        done 3< <(split_case_blocks "$examples_file")
        echo ""
    done

    # Summarise the run; the exit status is 0 only when nothing failed and no
    # API call errored, 1 otherwise.
    echo "Results: $PASS passed | $FAIL failed | $ERRORS errors"
    if [[ $FAIL -eq 0 && $ERRORS -eq 0 ]]; then
        exit 0
    fi
    exit 1
fi

# ── dry-run: list all eval cases ─────────────────────────────────────────────

# List, for every selected skill, the case IDs declared in the YAML frontmatter
# of its examples/examples.md, followed by a grand total. No LLM is called.
#
# The IDs are collected into an array instead of being piped through
# `while ... [ -n "$id" ] && echo`: for a skill with no cases that pipeline
# ends with a non-zero status, which aborts the script under
# `set -e` / `pipefail`.
# Globals: SKILLS_DIR (read), FILTER_SKILL (read)
# Outputs: the listing and the totals on stdout
dry_run() {
    local examples_file skill_dir skill id
    local total_skills=0 total_cases=0
    local -a case_ids

    for examples_file in "$SKILLS_DIR"/*/examples/examples.md; do
        [[ -f "$examples_file" ]] || continue
        skill_dir=$(dirname "$(dirname "$examples_file")")
        skill=$(basename "$skill_dir")

        if [[ -n "$FILTER_SKILL" && "$skill" != "$FILTER_SKILL" ]]; then
            continue
        fi

        # Extract case IDs from frontmatter (lines between --- markers);
        # entries with an empty ID are not listed or counted.
        case_ids=()
        while IFS= read -r id; do
            if [[ -n "$id" ]]; then
                case_ids+=("$id")
            fi
        done < <(awk '
            /^---$/ { count++; next }
            count == 1 && /^  - id:/ { print $3 }
        ' "$examples_file")

        echo "$skill (${#case_ids[@]} cases):"
        if [[ ${#case_ids[@]} -gt 0 ]]; then
            printf '  - %s\n' "${case_ids[@]}"
        fi
        total_skills=$((total_skills + 1))
        total_cases=$((total_cases + ${#case_ids[@]}))
    done

    echo ""
    echo "$total_skills skill(s), $total_cases case(s) — use --execute to run against an LLM"
}

dry_run