-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval.sh
More file actions
executable file
·264 lines (225 loc) · 9.23 KB
/
eval.sh
File metadata and controls
executable file
·264 lines (225 loc) · 9.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env bash
# Run eval cases from examples/examples.md against an LLM.
#
# Each case sends the `input` field to the LLM with the skill loaded and
# asserts that all `expect` terms appear and no `reject` terms appear in
# the response. The `code_block` field (if set) asserts a fenced code block
# of that language is present. The `threshold` field (if set) relaxes the
# expect check to require only that many matches.
#
# Usage:
# ./scripts/eval.sh --dry-run [--skill SKILL]
# ./scripts/eval.sh --execute --model MODEL --api-key KEY [--skill SKILL]
#
# Options:
# --dry-run List cases that would be evaluated without calling any LLM.
# --execute Run cases against the LLM (requires --model and --api-key).
# --skill SKILL Evaluate only the named skill (default: all skills).
# --model MODEL LLM model identifier (e.g. claude-3-5-sonnet-20241022).
# --api-key KEY API key for the LLM provider.
#
# Exit codes:
# 0 All cases passed (or --dry-run completed).
# 1 One or more cases failed.
# 2 Usage error.
#
# The --execute path calls the Anthropic Messages API (requires jq and curl).
# Each case loads the skill's SKILL.md as the system prompt, sends `input` as
# the user message, then asserts expect/reject/code_block/threshold against
# the response text.
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# The repository root is the parent of the directory containing this script.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
SKILLS_DIR="${REPO_ROOT}/skills"

# Command-line state; every value starts out empty.
MODE="" FILTER_SKILL="" MODEL="" API_KEY=""
# Print lines 3-28 of this script (where the header comment's help text is
# expected to live) with the leading "#" and at most one following space
# removed, then exit with status 2.
usage() {
  sed -n '3,28{
    s/^# \{0,1\}//
    p
  }' "$0"
  exit 2
}
# Parse command-line options into MODE, FILTER_SKILL, MODEL and API_KEY.
#
# --skill, --model and --api-key each consume the following argument. When
# one of them is the last argument there is nothing to consume: report that
# as a usage error (exit 2) instead of letting the extra `shift` fail, which
# under `set -e` would terminate the script with status 1 and no message.
# An explicitly empty value (e.g. --skill "") is still accepted.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run) MODE="dry-run" ;;
    --execute) MODE="execute" ;;
    --skill | --model | --api-key)
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        usage
      fi
      case "$1" in
        --skill) FILTER_SKILL="$2" ;;
        --model) MODEL="$2" ;;
        --api-key) API_KEY="$2" ;;
      esac
      shift
      ;;
    -h | --help) usage ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      ;;
  esac
  shift
done

# One of --dry-run / --execute is mandatory.
if [[ -z "$MODE" ]]; then
  echo "Error: --dry-run or --execute required" >&2
  usage
fi
# ── execute: run cases against Anthropic API ─────────────────────────────────
if [[ "$MODE" == "execute" ]]; then
# --execute needs both a model identifier and an API key; anything less is a
# usage error.
[[ -n "$MODEL" ]] || {
  echo "Error: --model required for --execute" >&2
  usage
}
[[ -n "$API_KEY" ]] || {
  echo "Error: --api-key required for --execute" >&2
  usage
}

# Tallies of passed cases, failed cases, and cases whose API call failed.
PASS=0 FAIL=0 ERRORS=0
# Send one message to the Anthropic Messages API and print the text response.
#
# Arguments:
#   $1 - model identifier
#   $2 - API key, sent in the x-api-key header
#   $3 - system prompt
#   $4 - user message
# Outputs:
#   stdout - text of the first content block of the response
#   stderr - an "ERROR: ..." line on failure
# Returns:
#   0 on success; 1 if the payload cannot be built, curl fails, or the
#   response contains no text.
call_llm() {
  local model="$1" api_key="$2" system_prompt="$3" user_message="$4"
  local payload response text

  # Let jq assemble the whole request so that every field, including the
  # model name, is JSON-escaped instead of being spliced into a template.
  payload=$(jq -cn \
    --arg model "$model" \
    --arg system "$system_prompt" \
    --arg user "$user_message" \
    '{model: $model, max_tokens: 2048, system: $system, messages: [{role: "user", content: $user}]}') || {
    echo "ERROR: could not build request payload" >&2
    return 1
  }

  # -sS hides the progress meter but keeps curl's own error text, which is
  # captured through 2>&1 and echoed below; -f makes HTTP errors a failure.
  response=$(curl -sS -f \
    -H "x-api-key: $api_key" \
    -H "anthropic-version: 2023-06-01" \
    -H "content-type: application/json" \
    -d "$payload" \
    "https://api.anthropic.com/v1/messages" 2>&1) || {
    echo "ERROR: curl failed: $response" >&2
    return 1
  }

  # A non-JSON or text-less response leaves $text empty and is reported
  # below; the explicit fallback keeps a jq parse error from aborting the
  # caller when errexit is active.
  text=$(jq -r '.content[0].text // empty' <<< "$response" 2>/dev/null) || text=""
  if [[ -z "$text" ]]; then
    echo "ERROR: empty or unexpected API response: $response" >&2
    return 1
  fi
  printf '%s' "$text"
}
# Run every eval case of every selected skill and tally the outcome in
# PASS / FAIL / ERRORS. A skill is evaluated when it has both
# examples/examples.md and SKILL.md; SKILL.md is used as the system prompt.
for examples_file in "$SKILLS_DIR"/*/examples/examples.md; do
  [ -f "$examples_file" ] || continue
  skill_dir=$(dirname "$(dirname "$examples_file")")
  skill=$(basename "$skill_dir")
  if [[ -n "$FILTER_SKILL" && "$skill" != "$FILTER_SKILL" ]]; then
    continue
  fi
  skill_md="$skill_dir/SKILL.md"
  [ -f "$skill_md" ] || continue
  system_prompt=$(cat "$skill_md")
  echo "=== $skill ==="

  # Parse cases from YAML frontmatter (the text between the first two "---"
  # lines). Each case block starts with " - id:" and ends before the next
  # " - id:" or the end of the frontmatter. Fields extracted per case:
  # id, input, expect (list), reject (list), code_block, threshold.
  frontmatter=$(awk '/^---$/{c++;next} c==1{print} c==2{exit}' "$examples_file")

  # Case blocks span several lines, so they cannot be collected with a
  # line-oriented `mapfile -t` (that would turn every line into its own
  # "block" and no case would ever have both an id and an input). awk
  # terminates each block with an ASCII record separator (octal 036) and
  # `read -d` splits on that character instead.
  case_blocks=()
  while IFS= read -r -d $'\036' block; do
    case_blocks+=("$block")
  done < <(awk '
    /^ - id:/ { if (block != "") printf "%s\036", block; block = $0; next }
    block != "" { block = block "\n" $0 }
    END { if (block != "") printf "%s\036", block }
  ' <<< "$frontmatter")

  # The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array
  # without tripping `set -u` on bash releases older than 4.4.
  for block in ${case_blocks[@]+"${case_blocks[@]}"}; do
    case_id=$(awk '/^ - id:/{print $3; exit}' <<< "$block")
    input=$(awk '/^ input:/{sub(/^ input: /,""); gsub(/^"|"$/,""); print; exit}' <<< "$block")
    code_block=$(awk '/^ code_block:/{print $2; exit}' <<< "$block")
    threshold=$(awk '/^ threshold:/{print $2; exit}' <<< "$block")

    # Extract expect list items
    mapfile -t expect_terms < <(awk '
      /^ expect:/{in_e=1;next}
      in_e && /^ - /{sub(/^ - /,""); gsub(/^"|"$/,""); print}
      in_e && !/^ /{in_e=0}
    ' <<< "$block")

    # Extract reject list items
    mapfile -t reject_terms < <(awk '
      /^ reject:/{in_r=1;next}
      in_r && /^ - /{sub(/^ - /,""); gsub(/^"|"$/,""); print}
      in_r && !/^ /{in_r=0}
    ' <<< "$block")

    # A case without an id or an input cannot be run.
    if [[ -z "$case_id" || -z "$input" ]]; then
      continue
    fi

    printf " %-50s " "$case_id"
    response=$(call_llm "$MODEL" "$API_KEY" "$system_prompt" "$input") || {
      echo "ERROR (API call failed)"
      ERRORS=$((ERRORS + 1))
      continue
    }

    case_pass=true
    fail_reasons=()

    # Check expect terms (case-insensitive). Empty terms are ignored both
    # when matching and when counting, so they cannot make a case
    # unpassable. `-e` keeps a term that starts with "-" from being parsed
    # as a grep option.
    matched=0
    total_expect=0
    missing_terms=()
    for term in ${expect_terms[@]+"${expect_terms[@]}"}; do
      [[ -z "$term" ]] && continue
      total_expect=$((total_expect + 1))
      if grep -qi -e "$term" <<< "$response"; then
        matched=$((matched + 1))
      else
        missing_terms+=("$term")
      fi
    done

    if [[ $total_expect -gt 0 ]]; then
      if [[ -n "$threshold" ]]; then
        # The threshold comes from the examples file; only a plain
        # non-negative integer may reach the arithmetic comparison.
        if [[ ! "$threshold" =~ ^[0-9]+$ ]]; then
          case_pass=false
          fail_reasons+=("invalid threshold: '$threshold'")
        elif [[ $matched -lt $threshold ]]; then
          case_pass=false
          fail_reasons+=("expect: $matched/$total_expect matched, need $threshold")
        fi
      elif [[ $matched -lt $total_expect ]]; then
        case_pass=false
        for term in "${missing_terms[@]}"; do
          fail_reasons+=("missing: '$term'")
        done
      fi
    fi

    # Check reject terms
    for term in ${reject_terms[@]+"${reject_terms[@]}"}; do
      [[ -z "$term" ]] && continue
      if grep -qi -e "$term" <<< "$response"; then
        case_pass=false
        fail_reasons+=("rejected term present: '$term'")
      fi
    done

    # Check code_block language: a line must open a fence with that language.
    if [[ -n "$code_block" ]]; then
      if ! grep -q "^\`\`\`${code_block}" <<< "$response"; then
        case_pass=false
        fail_reasons+=("no \`\`\`${code_block} code block in response")
      fi
    fi

    if $case_pass; then
      echo "PASS"
      PASS=$((PASS + 1))
    else
      echo "FAIL"
      for reason in ${fail_reasons[@]+"${fail_reasons[@]}"}; do
        echo " ✗ $reason"
      done
      FAIL=$((FAIL + 1))
    fi
  done
  echo ""
done
# Print the summary line, then exit 1 if any case failed or errored and
# 0 otherwise.
echo "Results: $PASS passed | $FAIL failed | $ERRORS errors"
if [[ $FAIL -ne 0 || $ERRORS -ne 0 ]]; then
  exit 1
fi
exit 0
fi
# ── dry-run: list all eval cases ─────────────────────────────────────────────
# List, per selected skill, the case ids found in the frontmatter of its
# examples/examples.md, then print the overall skill and case counts.
total_skills=0
total_cases=0
for examples_file in "$SKILLS_DIR"/*/examples/examples.md; do
  [ -f "$examples_file" ] || continue
  skill_dir=$(dirname "$(dirname "$examples_file")")
  skill=$(basename "$skill_dir")
  if [[ -n "$FILTER_SKILL" && "$skill" != "$FILTER_SKILL" ]]; then
    continue
  fi

  # Extract case IDs from frontmatter (lines after the first --- marker and
  # before the second).
  case_ids=$(awk '
    /^---$/ { count++; next }
    count == 1 && /^ - id:/ { print $3 }
  ' "$examples_file")

  # grep -c exits 1 when it counts zero lines; `|| true` keeps the "0" it
  # printed without tripping errexit/pipefail.
  n=$(grep -c . <<< "$case_ids" || true)
  echo "$skill ($n cases):"

  # Use an explicit `if` and feed the loop from a here-string: a trailing
  # `[ -n "$id" ] && echo ...` would make the loop, and the pipeline it sat
  # in, return 1 for a skill with no cases and abort the script under
  # `set -e`.
  while IFS= read -r id; do
    if [[ -n "$id" ]]; then
      echo " - $id"
    fi
  done <<< "$case_ids"

  total_skills=$((total_skills + 1))
  total_cases=$((total_cases + n))
done
echo ""
echo "$total_skills skill(s), $total_cases case(s) — use --execute to run against an LLM"