66import com .sentinelcore .domain .enums .BenchmarkStatus ;
77import com .sentinelcore .domain .enums .RunMode ;
88import com .sentinelcore .domain .enums .StrategyType ;
9+ import com .sentinelcore .dto .AggregatedStrategyMetrics ;
910import com .sentinelcore .dto .BenchmarkExecutionResponse ;
1011import com .sentinelcore .dto .BenchmarkReportResponse ;
1112import com .sentinelcore .dto .DeltaMetrics ;
2021
2122import java .time .Instant ;
2223import java .util .ArrayList ;
24+ import java .util .LinkedHashMap ;
2325import java .util .LinkedHashSet ;
2426import java .util .List ;
27+ import java .util .Map ;
2528import java .util .UUID ;
2629
2730@ Slf4j
@@ -34,7 +37,7 @@ public class BenchmarkService {
3437 private final ReportingService reportingService ;
3538
3639 @ Transactional
37- public Benchmark createBenchmark (String model , List <StrategyType > strategyTypes ) {
40+ public Benchmark createBenchmark (String model , List <StrategyType > strategyTypes , int repetitions ) {
3841 LinkedHashSet <StrategyType > deduped = new LinkedHashSet <>();
3942 deduped .add (StrategyType .NONE );
4043 deduped .addAll (strategyTypes );
@@ -44,6 +47,7 @@ public Benchmark createBenchmark(String model, List<StrategyType> strategyTypes)
4447 benchmark .setModel (model );
4548 benchmark .setStrategyTypes (new ArrayList <>(deduped ));
4649 benchmark .setRuns (new ArrayList <>());
50+ benchmark .setRepetitions (repetitions );
4751 benchmark .setStatus (BenchmarkStatus .CREATED );
4852 benchmark .setCreatedAt (Instant .now ());
4953 return benchmarkRepository .save (benchmark );
@@ -64,15 +68,18 @@ public BenchmarkExecutionResponse executeBenchmark(String benchmarkId) {
6468 benchmarkRepository .saveAndFlush (benchmark );
6569
6670 List <BenchmarkRun > completedRuns = new ArrayList <>();
71+ int repetitions = benchmark .getRepetitions ();
6772
6873 try {
6974 for (StrategyType strategyType : benchmark .getStrategyTypes ()) {
7075 RunMode mode = (strategyType == StrategyType .NONE ) ? RunMode .BASELINE : RunMode .DEFENDED ;
71- EvaluationRun run = runService .createRun (mode , benchmark .getModel (), strategyType );
72- runService .executeRun (run .getId ());
73- completedRuns .add (new BenchmarkRun (strategyType , run .getId ()));
74- log .info ("Benchmark {}: completed run {} with strategy {}" ,
75- benchmarkId , run .getId (), strategyType );
76+ for (int rep = 0 ; rep < repetitions ; rep ++) {
77+ EvaluationRun run = runService .createRun (mode , benchmark .getModel (), strategyType );
78+ runService .executeRun (run .getId ());
79+ completedRuns .add (new BenchmarkRun (strategyType , run .getId (), rep ));
80+ log .info ("Benchmark {}: completed run {} (strategy={}, rep={}/{})" ,
81+ benchmarkId , run .getId (), strategyType , rep + 1 , repetitions );
82+ }
7683 }
7784 benchmark .setStatus (BenchmarkStatus .COMPLETED );
7885 } catch (RuntimeException ex ) {
@@ -98,39 +105,94 @@ public BenchmarkReportResponse getReport(String benchmarkId) {
98105 Benchmark benchmark = benchmarkRepository .findById (benchmarkId )
99106 .orElseThrow (() -> new EntityNotFoundException ("Benchmark not found: " + benchmarkId ));
100107
101- List <RunWithMetrics > runMetrics = benchmark .getRuns ().stream ()
102- .map (br -> new RunWithMetrics (
103- br .getRunId (),
104- br .getStrategyType (),
105- reportingService .getMetrics (br .getRunId ())
106- ))
107- .toList ();
108-
109- RunMetricsResponse baseline = runMetrics .stream ()
110- .filter (r -> r .strategyType () == StrategyType .NONE )
111- .map (RunWithMetrics ::metrics )
112- .findFirst ()
113- .orElse (null );
114-
115- List <RunComparisonEntry > entries = runMetrics .stream ()
116- .map (r -> new RunComparisonEntry (
117- r .runId (),
118- r .strategyType (),
119- r .metrics (),
120- (baseline != null && r .strategyType () != StrategyType .NONE )
121- ? computeDelta (baseline , r .metrics ())
122- : null
123- ))
124- .toList ();
108+ // Group runs by strategy in repetition_index order (guaranteed by @OrderBy on Benchmark.runs)
109+ Map <StrategyType , List <RunMetricsResponse >> metricsByStrategy = new LinkedHashMap <>();
110+ Map <StrategyType , List <String >> runIdsByStrategy = new LinkedHashMap <>();
111+ // Track the rep-0 run per strategy explicitly so representative is stable
112+ Map <StrategyType , RunMetricsResponse > rep0ByStrategy = new LinkedHashMap <>();
113+ for (BenchmarkRun br : benchmark .getRuns ()) {
114+ RunMetricsResponse m = reportingService .getMetrics (br .getRunId ());
115+ metricsByStrategy .computeIfAbsent (br .getStrategyType (), k -> new ArrayList <>()).add (m );
116+ runIdsByStrategy .computeIfAbsent (br .getStrategyType (), k -> new ArrayList <>())
117+ .add (br .getRunId ());
118+ if (br .getRepetitionIndex () == 0 ) {
119+ rep0ByStrategy .put (br .getStrategyType (), m );
120+ }
121+ }
122+
123+ // Use rep-0 of NONE as baseline for delta computation
124+ RunMetricsResponse baselineSample = rep0ByStrategy .get (StrategyType .NONE );
125+
126+ List <RunComparisonEntry > entries = new ArrayList <>();
127+ for (Map .Entry <StrategyType , List <RunMetricsResponse >> e : metricsByStrategy .entrySet ()) {
128+ StrategyType strategy = e .getKey ();
129+ List <RunMetricsResponse > runs = e .getValue ();
130+ List <String > runIds = runIdsByStrategy .get (strategy );
131+
132+ // rep-0 as the representative for backwards-compatible single-metrics field
133+ RunMetricsResponse representative = rep0ByStrategy .getOrDefault (strategy , runs .get (0 ));
134+ AggregatedStrategyMetrics aggregated = aggregate (runs );
135+ DeltaMetrics delta = (baselineSample != null && strategy != StrategyType .NONE )
136+ ? computeDelta (baselineSample , representative ) : null ;
137+
138+ entries .add (new RunComparisonEntry (runIds , strategy , representative , aggregated , delta ));
139+ }
125140
126141 return new BenchmarkReportResponse (
127142 benchmarkId ,
128143 benchmark .getModel (),
129144 benchmark .getStatus ().name (),
145+ benchmark .getRepetitions (),
130146 entries
131147 );
132148 }
133149
150+ // --- Statistics ---
151+
152+ static AggregatedStrategyMetrics aggregate (List <RunMetricsResponse > runs ) {
153+ int n = runs .size ();
154+ double [] asr = extract (runs , r -> r .metrics ().attackSuccessRate ());
155+ double [] fpr = extract (runs , r -> r .metrics ().falsePositiveRate ());
156+ double [] rr = extract (runs , r -> r .metrics ().refusalRate ());
157+ double [] lat = extract (runs , r -> r .metrics ().avgLatencyMs ());
158+ return new AggregatedStrategyMetrics (
159+ n ,
160+ mean (asr ), stddev (asr ),
161+ mean (fpr ), stddev (fpr ),
162+ mean (rr ), stddev (rr ),
163+ mean (lat ), stddev (lat )
164+ );
165+ }
166+
167+ private static double [] extract (List <RunMetricsResponse > runs ,
168+ java .util .function .ToDoubleFunction <RunMetricsResponse > fn ) {
169+ double [] vals = new double [runs .size ()];
170+ for (int i = 0 ; i < runs .size (); i ++) {
171+ vals [i ] = fn .applyAsDouble (runs .get (i ));
172+ }
173+ return vals ;
174+ }
175+
176+ static double mean (double [] values ) {
177+ return round3 (rawMean (values ));
178+ }
179+
180+ // Returns null for N=1 (not computable), otherwise population stddev
181+ static Double stddev (double [] values ) {
182+ if (values .length <= 1 ) return null ;
183+ double m = rawMean (values );
184+ double sumSq = 0 ;
185+ for (double v : values ) sumSq += (v - m ) * (v - m );
186+ return round3 (Math .sqrt (sumSq / values .length ));
187+ }
188+
189+ private static double rawMean (double [] values ) {
190+ if (values .length == 0 ) return 0.0 ;
191+ double sum = 0 ;
192+ for (double v : values ) sum += v ;
193+ return sum / values .length ;
194+ }
195+
134196 private DeltaMetrics computeDelta (RunMetricsResponse baseline , RunMetricsResponse defended ) {
135197 RunMetricsResponse .Metrics b = baseline .metrics ();
136198 RunMetricsResponse .Metrics d = defended .metrics ();
@@ -142,9 +204,7 @@ private DeltaMetrics computeDelta(RunMetricsResponse baseline, RunMetricsRespons
142204 );
143205 }
144206
145- private double round3 (double value ) {
207+ private static double round3 (double value ) {
146208 return Math .round (value * 1000.0 ) / 1000.0 ;
147209 }
148-
149- private record RunWithMetrics (String runId , StrategyType strategyType , RunMetricsResponse metrics ) {}
150- }
210+ }
0 commit comments