Merge pull request #17 from PSchmitz-Valckenberg/feat/step-17-repetitions-ci

PSchmitz-Valckenberg · web-flow · commit cd15be373ac4 · 2026-04-27T05:05:09.000+02:00
feat: Step 17 — repetitions and confidence intervals for benchmarks
diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# run_benchmark.sh — runs a full 4-strategy benchmark campaign against a
+# run_benchmark.sh — runs a full 5-strategy benchmark campaign (the NONE
+# baseline plus the four defense strategies requested below) against a
 # locally running SentinelCore instance and saves the report to results/.
 #
 # Prerequisites:
@@ -9,7 +9,12 @@
 #
 # Usage:
 #   ./scripts/run_benchmark.sh
-#   ./scripts/run_benchmark.sh --label gemini-2.0-flash
+#   ./scripts/run_benchmark.sh --label gemini-2.5-flash
+#   ./scripts/run_benchmark.sh --label gemini-2.5-flash --repetitions 5
+#
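+# --repetitions N  How many times each strategy is run (script default: 3;
+#                  the API itself defaults to 1 and accepts 1-10). N must be
+#                  a plain integer because it is passed to jq via --argjson.
+#                  Mean and population stddev are reported per strategy.
+#                  N=1 gives no stddev (marked as null in the JSON report).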
 #
 # Note: the active LLM provider is configured in application-local.yml
 # (sentinelcore.llm.provider / sentinelcore.llm.model). --label is a free-form
@@ -20,14 +25,18 @@
 set -euo pipefail
 
 BASE_URL="http://localhost:8080"
-LABEL="gemini-2.0-flash"
+LABEL="gemini-2.5-flash"
+REPETITIONS=3
 RESULTS_DIR="$(dirname "$0")/../results"
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
     --label)
       [[ $# -ge 2 ]] || { echo "Error: --label requires an argument"; exit 1; }
       LABEL="$2"; shift 2 ;;
+    --repetitions)
+      [[ $# -ge 2 ]] || { echo "Error: --repetitions requires an argument"; exit 1; }
+      REPETITIONS="$2"; shift 2 ;;
     *) echo "Unknown argument: $1"; exit 1 ;;
   esac
 done
@@ -42,15 +51,17 @@ OUT_DIR="$RESULTS_DIR/${TIMESTAMP}_${LABEL}"
 mkdir -p "$OUT_DIR"
 
 echo "=== SentinelCore Benchmark Campaign ==="
-echo "Label:      $LABEL"
-echo "Output dir: $OUT_DIR"
+echo "Label:       $LABEL"
+echo "Repetitions: $REPETITIONS"
+echo "Output dir:  $OUT_DIR"
 echo ""
 
 # ── 1. Create benchmark ──────────────────────────────────────────────────────
 echo "[1/3] Creating benchmark..."
 CREATE_RESPONSE=$(curl -sfS -X POST "$BASE_URL/api/benchmarks" \
   -H "Content-Type: application/json" \
-  -d "$(jq -n --arg model "$LABEL" '{model: $model, strategyTypes: ["INPUT_FILTER","INPUT_OUTPUT","PROMPT_HARDENING","RAG_CONTENT_FILTER"]}')")
+  -d "$(jq -n --arg model "$LABEL" --argjson reps "$REPETITIONS" \
+       '{model: $model, strategyTypes: ["INPUT_FILTER","INPUT_OUTPUT","PROMPT_HARDENING","RAG_CONTENT_FILTER"], repetitions: $reps}')")
 
 BENCHMARK_ID=$(echo "$CREATE_RESPONSE" | jq -r '.benchmarkId')
 echo "      Benchmark ID: $BENCHMARK_ID"
@@ -76,16 +87,30 @@ REPORT=$(curl -sfS "$BASE_URL/api/benchmarks/$BENCHMARK_ID/report")
 echo "$REPORT" | jq . > "$OUT_DIR/03_report.json"
 
 # ── Summary table ─────────────────────────────────────────────────────────────
+REPS=$(echo "$REPORT" | jq '.repetitions')
+echo ""
+echo "=== Results (N=$REPS repetitions per strategy) ==="
+echo "Mean per strategy:"
+echo "$REPORT" | jq -r '
+  ["Strategy", "ASR", "FPR", "Refusal", "Latency(ms)"],
+  (.runs[] | [
+    .strategyType,
+    (.aggregated.attackSuccessRateMean | tostring),
+    (.aggregated.falsePositiveRateMean | tostring),
+    (.aggregated.refusalRateMean | tostring),
+    (.aggregated.avgLatencyMsMean | tostring)
+  ]) | @tsv' | column -t
+
 echo ""
-echo "=== Results ==="
+echo "Stddev per strategy (null = N=1, not computable):"
 echo "$REPORT" | jq -r '
-  ["Strategy", "AttackSuccess", "FalsePositive", "Refusal", "AvgLatencyMs"],
+  ["Strategy", "ASR-stddev", "FPR-stddev", "Refusal-stddev", "Latency-stddev(ms)"],
   (.runs[] | [
     .strategyType,
-    (.metrics.metrics.attackSuccessRate | tostring),
-    (.metrics.metrics.falsePositiveRate | tostring),
-    (.metrics.metrics.refusalRate | tostring),
-    (.metrics.metrics.avgLatencyMs | tostring)
+    (.aggregated.attackSuccessRateStddev // "null" | tostring),
+    (.aggregated.falsePositiveRateStddev // "null" | tostring),
+    (.aggregated.refusalRateStddev // "null" | tostring),
+    (.aggregated.avgLatencyMsStddev // "null" | tostring)
   ]) | @tsv' | column -t
 
 echo ""
diff --git a/src/main/java/com/sentinelcore/controller/BenchmarkController.java b/src/main/java/com/sentinelcore/controller/BenchmarkController.java
@@ -21,7 +21,8 @@ public class BenchmarkController {
 
     @PostMapping
     public ResponseEntity<BenchmarkCreateResponse> createBenchmark(@Valid @RequestBody BenchmarkCreateRequest request) {
-        Benchmark benchmark = benchmarkService.createBenchmark(request.model(), request.strategyTypes());
+        Benchmark benchmark = benchmarkService.createBenchmark(
+                request.model(), request.strategyTypes(), request.repetitionsOrDefault());
         return ResponseEntity
                 .status(HttpStatus.CREATED)
                 .body(new BenchmarkCreateResponse(
diff --git a/src/main/java/com/sentinelcore/domain/entity/Benchmark.java b/src/main/java/com/sentinelcore/domain/entity/Benchmark.java
@@ -42,8 +42,12 @@ public class Benchmark {
             name = "benchmark_runs",
             joinColumns = @JoinColumn(name = "benchmark_id")
     )
+    @OrderBy("repetitionIndex ASC")
         private List<BenchmarkRun> runs = new ArrayList<>();
 
+        @Column(name = "repetitions", nullable = false)
+        private int repetitions = 1;
+
         @Column(name = "created_at", nullable = false)
         private Instant createdAt;
 
diff --git a/src/main/java/com/sentinelcore/domain/entity/BenchmarkRun.java b/src/main/java/com/sentinelcore/domain/entity/BenchmarkRun.java
@@ -23,4 +23,7 @@ public class BenchmarkRun {
 
     @Column(name = "run_id", nullable = false)
     private String runId;
+
+    @Column(name = "repetition_index", nullable = false)
+    private int repetitionIndex;
 }
diff --git a/src/main/java/com/sentinelcore/dto/AggregatedStrategyMetrics.java b/src/main/java/com/sentinelcore/dto/AggregatedStrategyMetrics.java
@@ -0,0 +1,13 @@
+package com.sentinelcore.dto;
+
+public record AggregatedStrategyMetrics(
+        int repetitions,
+        double attackSuccessRateMean,
+        Double attackSuccessRateStddev,
+        double falsePositiveRateMean,
+        Double falsePositiveRateStddev,
+        double refusalRateMean,
+        Double refusalRateStddev,
+        double avgLatencyMsMean,
+        Double avgLatencyMsStddev
+) {}
diff --git a/src/main/java/com/sentinelcore/dto/BenchmarkCreateRequest.java b/src/main/java/com/sentinelcore/dto/BenchmarkCreateRequest.java
@@ -1,6 +1,8 @@
 package com.sentinelcore.dto;
 
 import com.sentinelcore.domain.enums.StrategyType;
+import jakarta.validation.constraints.Max;
+import jakarta.validation.constraints.Min;
 import jakarta.validation.constraints.NotBlank;
 import jakarta.validation.constraints.NotEmpty;
 import jakarta.validation.constraints.NotNull;
@@ -9,5 +11,10 @@
 
 public record BenchmarkCreateRequest(
         @NotBlank String model,
-        @NotEmpty List<@NotNull StrategyType> strategyTypes
-) {}
+        @NotEmpty List<@NotNull StrategyType> strategyTypes,
+        @Min(1) @Max(10) Integer repetitions
+) {
+    public int repetitionsOrDefault() {
+        return repetitions != null ? repetitions : 1;
+    }
+}
diff --git a/src/main/java/com/sentinelcore/dto/BenchmarkReportResponse.java b/src/main/java/com/sentinelcore/dto/BenchmarkReportResponse.java
@@ -6,5 +6,6 @@ public record BenchmarkReportResponse(
         String benchmarkId,
         String model,
         String status,
+        int repetitions,
         List<RunComparisonEntry> runs
 ) {}
diff --git a/src/main/java/com/sentinelcore/dto/RunComparisonEntry.java b/src/main/java/com/sentinelcore/dto/RunComparisonEntry.java
@@ -2,9 +2,12 @@
 
 import com.sentinelcore.domain.enums.StrategyType;
 
+import java.util.List;
+
 public record RunComparisonEntry(
-        String runId,
+        List<String> runIds,
         StrategyType strategyType,
         RunMetricsResponse metrics,
+        AggregatedStrategyMetrics aggregated,
         DeltaMetrics deltaToBaseline
 ) {}
diff --git a/src/main/java/com/sentinelcore/service/BenchmarkService.java b/src/main/java/com/sentinelcore/service/BenchmarkService.java
@@ -6,6 +6,7 @@
 import com.sentinelcore.domain.enums.BenchmarkStatus;
 import com.sentinelcore.domain.enums.RunMode;
 import com.sentinelcore.domain.enums.StrategyType;
+import com.sentinelcore.dto.AggregatedStrategyMetrics;
 import com.sentinelcore.dto.BenchmarkExecutionResponse;
 import com.sentinelcore.dto.BenchmarkReportResponse;
 import com.sentinelcore.dto.DeltaMetrics;
@@ -20,8 +21,10 @@
 
 import java.time.Instant;
 import java.util.ArrayList;
+import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.UUID;
 
 @Slf4j
@@ -34,7 +37,7 @@ public class BenchmarkService {
     private final ReportingService reportingService;
 
     @Transactional
-    public Benchmark createBenchmark(String model, List<StrategyType> strategyTypes) {
+    public Benchmark createBenchmark(String model, List<StrategyType> strategyTypes, int repetitions) {
         LinkedHashSet<StrategyType> deduped = new LinkedHashSet<>();
         deduped.add(StrategyType.NONE);
         deduped.addAll(strategyTypes);
@@ -44,6 +47,7 @@ public Benchmark createBenchmark(String model, List<StrategyType> strategyTypes)
         benchmark.setModel(model);
         benchmark.setStrategyTypes(new ArrayList<>(deduped));
         benchmark.setRuns(new ArrayList<>());
+        benchmark.setRepetitions(repetitions);
         benchmark.setStatus(BenchmarkStatus.CREATED);
         benchmark.setCreatedAt(Instant.now());
         return benchmarkRepository.save(benchmark);
@@ -64,15 +68,18 @@ public BenchmarkExecutionResponse executeBenchmark(String benchmarkId) {
         benchmarkRepository.saveAndFlush(benchmark);
 
         List<BenchmarkRun> completedRuns = new ArrayList<>();
+        int repetitions = benchmark.getRepetitions();
 
         try {
             for (StrategyType strategyType : benchmark.getStrategyTypes()) {
                 RunMode mode = (strategyType == StrategyType.NONE) ? RunMode.BASELINE : RunMode.DEFENDED;
-                EvaluationRun run = runService.createRun(mode, benchmark.getModel(), strategyType);
-                runService.executeRun(run.getId());
-                completedRuns.add(new BenchmarkRun(strategyType, run.getId()));
-                log.info("Benchmark {}: completed run {} with strategy {}",
-                        benchmarkId, run.getId(), strategyType);
+                for (int rep = 0; rep < repetitions; rep++) {
+                    EvaluationRun run = runService.createRun(mode, benchmark.getModel(), strategyType);
+                    runService.executeRun(run.getId());
+                    completedRuns.add(new BenchmarkRun(strategyType, run.getId(), rep));
+                    log.info("Benchmark {}: completed run {} (strategy={}, rep={}/{})",
+                            benchmarkId, run.getId(), strategyType, rep + 1, repetitions);
+                }
             }
             benchmark.setStatus(BenchmarkStatus.COMPLETED);
         } catch (RuntimeException ex) {
@@ -98,39 +105,94 @@ public BenchmarkReportResponse getReport(String benchmarkId) {
         Benchmark benchmark = benchmarkRepository.findById(benchmarkId)
                 .orElseThrow(() -> new EntityNotFoundException("Benchmark not found: " + benchmarkId));
 
-        List<RunWithMetrics> runMetrics = benchmark.getRuns().stream()
-                .map(br -> new RunWithMetrics(
-                        br.getRunId(),
-                        br.getStrategyType(),
-                        reportingService.getMetrics(br.getRunId())
-                ))
-                .toList();
-
-        RunMetricsResponse baseline = runMetrics.stream()
-                .filter(r -> r.strategyType() == StrategyType.NONE)
-                .map(RunWithMetrics::metrics)
-                .findFirst()
-                .orElse(null);
-
-        List<RunComparisonEntry> entries = runMetrics.stream()
-                .map(r -> new RunComparisonEntry(
-                        r.runId(),
-                        r.strategyType(),
-                        r.metrics(),
-                        (baseline != null && r.strategyType() != StrategyType.NONE)
-                                ? computeDelta(baseline, r.metrics())
-                                : null
-                ))
-                .toList();
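+        // Group runs by strategy. Benchmark.runs is annotated with @OrderBy("repetitionIndex ASC"),
+        // so each per-strategy list is filled in ascending repetition order.
+        // NOTE(review): that ordering does not fix the order in which strategies first appear
+        // among runs sharing a repetitionIndex — confirm the report's strategy order is stable.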
+        Map<StrategyType, List<RunMetricsResponse>> metricsByStrategy = new LinkedHashMap<>();
+        Map<StrategyType, List<String>> runIdsByStrategy = new LinkedHashMap<>();
+        // Track the rep-0 run per strategy explicitly so the representative is stable
+        Map<StrategyType, RunMetricsResponse> rep0ByStrategy = new LinkedHashMap<>();
+        for (BenchmarkRun br : benchmark.getRuns()) {
+            RunMetricsResponse m = reportingService.getMetrics(br.getRunId());
+            metricsByStrategy.computeIfAbsent(br.getStrategyType(), k -> new ArrayList<>()).add(m);
+            runIdsByStrategy.computeIfAbsent(br.getStrategyType(), k -> new ArrayList<>())
+                    .add(br.getRunId());
+            if (br.getRepetitionIndex() == 0) {
+                rep0ByStrategy.put(br.getStrategyType(), m);
+            }
+        }
+
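+        // Use rep-0 of NONE as baseline for delta computation. Deltas therefore compare two
+        // single samples (baseline rep-0 vs. the strategy's representative run), not the
+        // aggregated means.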
+        RunMetricsResponse baselineSample = rep0ByStrategy.get(StrategyType.NONE);
+
+        List<RunComparisonEntry> entries = new ArrayList<>();
+        for (Map.Entry<StrategyType, List<RunMetricsResponse>> e : metricsByStrategy.entrySet()) {
+            StrategyType strategy = e.getKey();
+            List<RunMetricsResponse> runs = e.getValue();
+            List<String> runIds = runIdsByStrategy.get(strategy);
+
+            // rep-0 as the representative for backwards-compatible single-metrics field
+            RunMetricsResponse representative = rep0ByStrategy.getOrDefault(strategy, runs.get(0));
+            AggregatedStrategyMetrics aggregated = aggregate(runs);
+            DeltaMetrics delta = (baselineSample != null && strategy != StrategyType.NONE)
+                    ? computeDelta(baselineSample, representative) : null;
+
+            entries.add(new RunComparisonEntry(runIds, strategy, representative, aggregated, delta));
+        }
 
         return new BenchmarkReportResponse(
                 benchmarkId,
                 benchmark.getModel(),
                 benchmark.getStatus().name(),
+                benchmark.getRepetitions(),
                 entries
         );
     }
 
+    // --- Statistics ---
+
+    static AggregatedStrategyMetrics aggregate(List<RunMetricsResponse> runs) {
+        int n = runs.size();
+        double[] asr = extract(runs, r -> r.metrics().attackSuccessRate());
+        double[] fpr = extract(runs, r -> r.metrics().falsePositiveRate());
+        double[] rr  = extract(runs, r -> r.metrics().refusalRate());
+        double[] lat = extract(runs, r -> r.metrics().avgLatencyMs());
+        return new AggregatedStrategyMetrics(
+                n,
+                mean(asr), stddev(asr),
+                mean(fpr), stddev(fpr),
+                mean(rr),  stddev(rr),
+                mean(lat), stddev(lat)
+        );
+    }
+
+    private static double[] extract(List<RunMetricsResponse> runs,
+                                    java.util.function.ToDoubleFunction<RunMetricsResponse> fn) {
+        double[] vals = new double[runs.size()];
+        for (int i = 0; i < runs.size(); i++) {
+            vals[i] = fn.applyAsDouble(runs.get(i));
+        }
+        return vals;
+    }
+
+    static double mean(double[] values) {
+        return round3(rawMean(values));
+    }
+
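+    // Returns null when fewer than two values are given (no spread can be derived from a
+    // single repetition); otherwise the population stddev (divides by N, not N-1),
+    // rounded to three decimals.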
+    static Double stddev(double[] values) {
+        if (values.length <= 1) return null;
+        double m = rawMean(values);
+        double sumSq = 0;
+        for (double v : values) sumSq += (v - m) * (v - m);
+        return round3(Math.sqrt(sumSq / values.length));
+    }
+
+    private static double rawMean(double[] values) {
+        if (values.length == 0) return 0.0;
+        double sum = 0;
+        for (double v : values) sum += v;
+        return sum / values.length;
+    }
+
     private DeltaMetrics computeDelta(RunMetricsResponse baseline, RunMetricsResponse defended) {
         RunMetricsResponse.Metrics b = baseline.metrics();
         RunMetricsResponse.Metrics d = defended.metrics();
@@ -142,9 +204,7 @@ private DeltaMetrics computeDelta(RunMetricsResponse baseline, RunMetricsRespons
         );
     }
 
-    private double round3(double value) {
+    private static double round3(double value) {
         return Math.round(value * 1000.0) / 1000.0;
     }
-
-        private record RunWithMetrics(String runId, StrategyType strategyType, RunMetricsResponse metrics) {}
-}
+}
diff --git a/src/main/resources/db/migration/V6__add_repetitions.sql b/src/main/resources/db/migration/V6__add_repetitions.sql
diff --git a/src/test/java/com/sentinelcore/service/BenchmarkStatisticsTest.java b/src/test/java/com/sentinelcore/service/BenchmarkStatisticsTest.java

Original file line number	Diff line number	Diff line change
`@@ -23,4 +23,7 @@ public class BenchmarkRun {`
`23`	`23`
`24`	`24`	`@Column(name = "run_id", nullable = false)`
`25`	`25`	`private String runId;`
	`26`	`+`
	`27`	`+ @Column(name = "repetition_index", nullable = false)`
	`28`	`+ private int repetitionIndex;`
`26`	`29`	`}`