|
#!/usr/bin/env bash
# run_benchmark.sh — runs a full 4-strategy benchmark campaign against a
# locally running SentinelCore instance and saves the report to results/.
#
# NOTE(review): the create request below names three strategy types; the
# fourth strategy is presumably a baseline the server adds itself — confirm.
#
# Prerequisites:
#   - docker compose up -d                                   (PostgreSQL)
#   - mvn spring-boot:run -Dspring-boot.run.profiles=local   (app on :8080)
#   - jq installed (brew install jq)
#   - curl installed
#
# Usage:
#   ./scripts/run_benchmark.sh
#   ./scripts/run_benchmark.sh --model gemini-2.0-flash
#   ./scripts/run_benchmark.sh --model claude-haiku-4-5-20251001
#
# Output (written to results/<timestamp>_<model>/):
#   01_create.json   response of POST /api/benchmarks
#   02_execute.json  response of POST /api/benchmarks/<id>/execute
#   03_report.json   response of GET  /api/benchmarks/<id>/report
#
# Exit status: 0 when the benchmark reports status COMPLETED and the report
# was saved; 1 on any failure (diagnostics go to stderr).

set -euo pipefail

readonly BASE_URL="http://localhost:8080"
readonly DEFAULT_MODEL="gemini-2.0-flash"
readonly EXECUTE_TIMEOUT_SECS=600
RESULTS_DIR="$(dirname "$0")/../results"
readonly RESULTS_DIR

#######################################
# Print a message to stderr and abort the script.
# Arguments: $* - message text
# Returns:   never; exits with status 1
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Pretty-print a JSON payload into a file.
# If the payload is not valid JSON the raw text is stored instead, so it can
# still be inspected, and the script aborts.
# Arguments: $1 - payload text
#            $2 - destination file
#######################################
save_json() {
  local payload=$1
  local dest=$2
  if ! jq . <<<"$payload" >"$dest"; then
    printf '%s\n' "$payload" >"$dest"
    die "Response is not valid JSON; raw body saved to $dest"
  fi
}

#######################################
# Entry point: parse arguments, verify prerequisites, then create, execute
# and report on one benchmark.
# Arguments: [--model NAME]  model identifier sent to the API
#######################################
main() {
  local model=$DEFAULT_MODEL

  while (( $# > 0 )); do
    case "$1" in
      --model)
        # ${2:-} keeps `set -u` from aborting with a bare "unbound variable"
        # when the flag is given without a value.
        [[ -n "${2:-}" ]] || die "--model requires a value"
        model=$2
        shift 2
        ;;
      *)
        die "Unknown argument: $1"
        ;;
    esac
  done

  command -v jq >/dev/null 2>&1 \
    || die "jq is required but not installed. Run: brew install jq"
  command -v curl >/dev/null 2>&1 \
    || die "curl is required but not installed."

  # Either endpoint answering successfully counts as "the app is up".
  if ! curl -sf "$BASE_URL/actuator/health" >/dev/null 2>&1 \
    && ! curl -sf "$BASE_URL/v3/api-docs" >/dev/null 2>&1; then
    die "SentinelCore does not appear to be running on $BASE_URL"
  fi

  local timestamp out_dir
  timestamp=$(date +"%Y%m%d_%H%M%S")
  # The model name becomes a single path component: anything outside
  # [A-Za-z0-9._-] (notably "/") is replaced so the results always land
  # directly under RESULTS_DIR.
  out_dir="$RESULTS_DIR/${timestamp}_${model//[^A-Za-z0-9._-]/_}"
  mkdir -p "$out_dir"

  echo "=== SentinelCore Benchmark Campaign ==="
  echo "Model: $model"
  echo "Output dir: $out_dir"
  echo ""

  # ── 1. Create benchmark ───────────────────────────────────────────────────
  echo "[1/3] Creating benchmark..."
  local create_payload create_response benchmark_id
  # Build the body with jq so quotes/backslashes in the model name are
  # escaped instead of corrupting the JSON.
  create_payload=$(jq -n --arg model "$model" \
    '{model: $model, strategyTypes: ["INPUT_FILTER", "INPUT_OUTPUT", "PROMPT_HARDENING"]}')

  # curl -sf is silent and returns non-zero on HTTP errors, so report the
  # failure explicitly rather than letting `set -e` exit without a message.
  create_response=$(curl -sf -X POST "$BASE_URL/api/benchmarks" \
    -H "Content-Type: application/json" \
    -d "$create_payload") \
    || die "Failed to create benchmark (POST $BASE_URL/api/benchmarks)."
  save_json "$create_response" "$out_dir/01_create.json"

  # `// empty` turns a missing/null id into an empty string instead of the
  # literal text "null", which would otherwise be used in the next URL.
  benchmark_id=$(jq -r '.benchmarkId // empty' <<<"$create_response")
  [[ -n "$benchmark_id" ]] \
    || die "Create response has no benchmarkId. Check $out_dir/01_create.json."
  echo " Benchmark ID: $benchmark_id"

  # ── 2. Execute benchmark (synchronous — may take several minutes) ─────────
  echo "[2/3] Executing benchmark (runs all cases for each strategy — please wait)..."
  local execute_response status
  execute_response=$(curl -sf -X POST \
    "$BASE_URL/api/benchmarks/$benchmark_id/execute" \
    --max-time "$EXECUTE_TIMEOUT_SECS") \
    || die "Execute request failed or exceeded ${EXECUTE_TIMEOUT_SECS}s."
  save_json "$execute_response" "$out_dir/02_execute.json"

  status=$(jq -r '.status' <<<"$execute_response")
  echo " Status: $status"
  if [[ "$status" != "COMPLETED" ]]; then
    die "Benchmark did not complete successfully (status=$status). Check $out_dir/02_execute.json."
  fi

  # ── 3. Fetch report ───────────────────────────────────────────────────────
  echo "[3/3] Fetching report..."
  local report
  report=$(curl -sf "$BASE_URL/api/benchmarks/$benchmark_id/report") \
    || die "Failed to fetch report for benchmark $benchmark_id."
  save_json "$report" "$out_dir/03_report.json"

  # ── Summary table ─────────────────────────────────────────────────────────
  echo ""
  echo "=== Results ==="
  local table
  # One TSV row per entry of .runs[], preceded by a header row.
  # NOTE(review): the doubled ".metrics.metrics" path is kept exactly as
  # written — confirm against the report schema.
  table=$(jq -r '
    ["Strategy", "AttackSuccess", "FalsePositive", "Refusal", "AvgLatencyMs"],
    (.runs[] | [
      .strategyType,
      (.metrics.metrics.attackSuccessRate | tostring),
      (.metrics.metrics.falsePositiveRate | tostring),
      (.metrics.metrics.refusalRate | tostring),
      (.metrics.metrics.avgLatencyMs | tostring)
    ]) | @tsv' <<<"$report") \
    || die "Report does not have the expected shape. Check $out_dir/03_report.json."

  # `column` only aligns the output; fall back to raw TSV when it is absent
  # so a finished run is not reported as a failure.
  if command -v column >/dev/null 2>&1; then
    column -t <<<"$table"
  else
    printf '%s\n' "$table"
  fi

  echo ""
  echo "Full report saved to: $out_dir/03_report.json"
  echo ""
  echo "To add these results to the README, copy the numbers from the table above"
  echo "into the 'Benchmark Results' section in README.md."
}

main "$@"
0 commit comments