-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.html
More file actions
727 lines (685 loc) · 30.6 KB
/
Copy pathindex.html
File metadata and controls
727 lines (685 loc) · 30.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>The Changing North Star of AI Control</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Fraunces:ital,opsz,wght@0,9..144,300;0,9..144,500;0,9..144,700;1,9..144,400&family=JetBrains+Mono:wght@400;500;700&display=swap" rel="stylesheet">
<style>
:root {
--bg: #faf8f3;
--surface: #ffffff;
--surface-alt: #f2efe7;
--text: #1c1b18;
--text-dim: #595752;
--text-faint: #8a8780;
--accent: #0f6e56;
--accent-dim: rgba(15, 110, 86, 0.12);
--warn: #a32d2d;
--warn-dim: rgba(163, 45, 45, 0.08);
--input: #fff8d6;
--input-border: #e6d47c;
--border: rgba(28, 27, 24, 0.12);
--border-strong: rgba(28, 27, 24, 0.28);
}
@media (prefers-color-scheme: dark) {
:root {
--bg: #14140f;
--surface: #1d1d18;
--surface-alt: #252520;
--text: #f0ebdc;
--text-dim: #a8a49a;
--text-faint: #6d6a61;
--accent: #4db895;
--accent-dim: rgba(77, 184, 149, 0.15);
--warn: #e07a7a;
--warn-dim: rgba(224, 122, 122, 0.1);
--input: #3a3620;
--input-border: #7a6d2e;
--border: rgba(240, 235, 220, 0.12);
--border-strong: rgba(240, 235, 220, 0.28);
}
}
* { box-sizing: border-box; }
html, body { margin: 0; padding: 0; }
body {
background: var(--bg);
color: var(--text);
font-family: 'Fraunces', Georgia, serif;
font-weight: 400;
font-size: 16px;
line-height: 1.5;
font-feature-settings: "ss01", "ss02";
}
.app {
max-width: 1200px;
margin: 0 auto;
padding: 3rem 2rem 5rem;
}
header {
margin-bottom: 3rem;
padding-bottom: 2rem;
border-bottom: 0.5px solid var(--border);
}
h1 {
font-family: 'Fraunces', Georgia, serif;
font-weight: 500;
font-size: clamp(1.8rem, 4vw, 2.75rem);
line-height: 1.1;
letter-spacing: -0.02em;
margin: 0 0 0.5rem;
}
.subtitle {
font-size: 1rem;
color: var(--text-dim);
font-style: italic;
margin: 0;
}
.intro {
margin-top: 1.5rem;
font-size: 0.95rem;
color: var(--text-dim);
max-width: 70ch;
line-height: 1.6;
}
main {
display: grid;
grid-template-columns: 1fr 1.2fr;
gap: 3rem;
}
@media (max-width: 900px) {
main { grid-template-columns: 1fr; gap: 2rem; }
}
.panel {
background: var(--surface);
border: 0.5px solid var(--border);
border-radius: 4px;
padding: 1.75rem;
}
.panel h2 {
font-family: 'Fraunces', Georgia, serif;
font-weight: 500;
font-size: 0.78rem;
letter-spacing: 0.18em;
text-transform: uppercase;
color: var(--text-faint);
margin: 0 0 1.5rem;
padding-bottom: 0.6rem;
border-bottom: 0.5px solid var(--border);
}
.section-header {
font-family: 'Fraunces', Georgia, serif;
font-weight: 500;
font-size: 0.9rem;
color: var(--text);
margin: 1.5rem 0 0.75rem;
padding-bottom: 0.3rem;
border-bottom: 0.25px solid var(--border);
}
.section-header:first-of-type { margin-top: 0; }
.input-row {
display: grid;
grid-template-columns: 1fr auto;
gap: 0.75rem;
align-items: center;
padding: 0.55rem 0;
}
.input-row label {
font-size: 0.875rem;
color: var(--text-dim);
line-height: 1.35;
}
.input-row .hint {
font-size: 0.75rem;
color: var(--text-faint);
font-style: italic;
display: block;
margin-top: 0.15rem;
}
.input-row input {
font-family: 'JetBrains Mono', monospace;
font-size: 0.85rem;
font-weight: 500;
padding: 0.4rem 0.7rem;
background: var(--input);
border: 0.5px solid var(--input-border);
border-radius: 3px;
color: var(--text);
width: 140px;
text-align: right;
transition: all 0.15s ease;
}
.input-row input:focus {
outline: none;
border-color: var(--accent);
box-shadow: 0 0 0 2px var(--accent-dim);
}
.result-row {
display: grid;
grid-template-columns: 1fr auto;
gap: 1rem;
align-items: baseline;
padding: 0.6rem 0;
border-bottom: 0.25px dotted var(--border);
}
.result-row:last-child { border-bottom: none; }
.result-row .label {
font-size: 0.875rem;
color: var(--text-dim);
line-height: 1.35;
}
.result-row .label .hint {
display: block;
font-size: 0.72rem;
color: var(--text-faint);
font-style: italic;
margin-top: 0.1rem;
}
.result-row .value {
font-family: 'JetBrains Mono', monospace;
font-size: 0.9rem;
font-weight: 500;
color: var(--text);
white-space: nowrap;
}
.result-row.highlight .value {
color: var(--warn);
font-weight: 700;
font-size: 1rem;
}
.result-row.mega-highlight {
margin-top: 1rem;
padding: 1rem;
background: var(--warn-dim);
border-radius: 4px;
border: 0.5px solid var(--warn);
}
.result-row.mega-highlight .label {
color: var(--text);
font-weight: 500;
}
.result-row.mega-highlight .value {
color: var(--warn);
font-weight: 700;
font-size: 1.3rem;
}
.scenario-pill {
display: inline-block;
font-size: 0.7rem;
font-family: 'JetBrains Mono', monospace;
letter-spacing: 0.05em;
padding: 0.15rem 0.5rem;
background: var(--accent-dim);
color: var(--accent);
border-radius: 3px;
margin-left: 0.5rem;
font-style: normal;
}
.presets {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
margin-bottom: 1.5rem;
}
.preset-btn {
font-family: 'Fraunces', Georgia, serif;
font-size: 0.78rem;
font-weight: 500;
padding: 0.4rem 0.85rem;
background: transparent;
border: 0.5px solid var(--border-strong);
border-radius: 3px;
color: var(--text-dim);
cursor: pointer;
transition: all 0.15s ease;
}
.preset-btn:hover {
background: var(--accent-dim);
color: var(--accent);
border-color: var(--accent);
}
footer {
margin-top: 4rem;
padding-top: 2rem;
border-top: 0.5px solid var(--border);
font-size: 0.825rem;
color: var(--text-faint);
line-height: 1.7;
}
footer strong {
color: var(--text-dim);
font-weight: 500;
}
footer a {
color: var(--accent);
text-decoration: none;
border-bottom: 0.5px solid var(--accent-dim);
}
footer a:hover { border-bottom-color: var(--accent); }
.footnote-block { margin-bottom: 1rem; }
.divider-quote {
font-family: 'Fraunces', Georgia, serif;
font-style: italic;
font-size: 0.95rem;
color: var(--text-dim);
padding: 1.25rem 0 0.5rem;
border-left: 2px solid var(--accent);
padding-left: 1rem;
margin: 1.5rem 0 0.5rem;
}
.narrative {
max-width: 70ch;
margin: 0 0 3rem;
}
.narrative p {
font-size: 0.95rem;
color: var(--text-dim);
line-height: 1.7;
margin: 0 0 1rem;
}
.narrative p:last-child { margin-bottom: 0; }
.narrative .section-heading {
font-family: 'Fraunces', Georgia, serif;
font-weight: 500;
font-size: 1.15rem;
color: var(--text);
margin: 2rem 0 0.75rem;
}
.narrative .section-heading:first-child { margin-top: 0; }
.calculator-intro {
max-width: 70ch;
margin: 2.5rem 0 1.5rem;
padding-top: 2rem;
border-top: 0.5px solid var(--border);
}
.calculator-intro p {
font-size: 0.95rem;
color: var(--text-dim);
line-height: 1.7;
margin: 0;
}
.unit-toggle {
display: inline-flex;
font-family: 'JetBrains Mono', monospace;
font-size: 0.7rem;
margin-left: 0.5rem;
}
.unit-toggle button {
background: transparent;
border: 0.5px solid var(--border);
padding: 0.15rem 0.4rem;
color: var(--text-faint);
cursor: pointer;
}
.unit-toggle button.active {
background: var(--accent);
color: var(--bg);
border-color: var(--accent);
}
</style>
</head>
<body>
<div class="app">
<header>
<h1>The Changing North Star<br>of AI Control</h1>
<p class="subtitle">A scaling-math scenario for the AI control agenda</p>
</header>
<section class="narrative">
<p>
On December 1st, 2025, the GDM mech interp team published a LessWrong article
declaring a pivot to a pragmatic approach to interpretability. Much time had been
lost chasing numbers in SAE reconstruction loss; they argued that optimizing such
proxies did not get them closer to their north star: making AI go well through
understanding how deep neural networks process information mechanistically. Has
AI control gone the way of mech interp? Is its north star of stopping harmful
actions from human-level AI on track?
</p>
<p>
In mech interp, SAE reconstruction loss turned out to be a number you could
optimize without getting closer to understanding models. My worry is that Pareto
safety–usefulness frontiers are becoming the same kind of number for control:
optimizable, publishable, and only loosely coupled to how fast we can catch a
scheming model in production.
</p>
<p class="section-heading">A real-world precedent</p>
<p>
On December 31st, 2025, Alibaba reported an incident where, without instructions
to do so, their models in their RL runs circumvented security policies and diverted
training GPUs for crypto-mining [1]. The authors took actions to prevent this from
recurring, and called on the broader community to take such failure modes seriously.
AI control can help mitigate such incidents, but its north star is set on
post-deployment.
</p>
<p class="section-heading">The proxy trap</p>
<p>
I recently spent three months on an AI control project, stacking defensive
protocols to optimize a safety–usefulness trade-off. Then I read an article on
AI-assisted red-teaming. AI control is supposed to defend against human-level AI,
but the line between a human red team and an AI-augmented one has started to blur.
And we're still optimizing our proxy task of safety–usefulness frontiers as if that
line were clean.
</p>
<p class="section-heading">The detection-to-pull gap</p>
<p>
Even when control works as intended post-deployment and catches an attack, the
technical job ends where the organizational one begins. Between the time a model
is flagged and the time it is pulled, there are weeks of discovery, escalation,
and legal review during which a frontier model keeps acting on the world
(<a href="https://reinthal.github.io/cost-to-detection/">damage calculator</a>).
That window is the thing we should be trying to shrink.
</p>
<p>
The damage done would be larger without AI control. But the proxy task of optimizing
Pareto frontiers is far removed from what actually matters, and, like SAE
reconstruction loss before it, risks becoming a number we optimize because it's
easier than measuring what we care about: minimizing the time window a bad model
has before detection. The sooner we intervene, the less damage it can do, and the
more context we have to understand its behavior. And as the crypto-mining incident shows, such
behavior can be detected and stopped as early as training/RL.
</p>
<p class="section-heading">Pre-deployment interventions</p>
<p>
To be clear, pre-deployment interventions already exist in the control
literature. Korbak et al. (2025) sketch a "control safety case" that models how
safe an agent would be to deploy [2]. Lindner et al. (2025) explicitly identify
latency as one of three safety conditions for frontier monitoring [3]. Yet most
papers still default to a post-deployment north star and proxy-task optimization,
without asking where in the lifecycle interventions would be most effective.
</p>
<p>
We should lift our gaze from proxy tasks and reconsider where our efforts have
the most impact. The crypto-mining incident shows why pre-deployment matters: it
was detected and stopped during training, not after release.
</p>
</section>
<div class="calculator-intro">
<p>
The calculator below takes published monitor performance numbers from the Redwood-lineage AI control agenda
(benchmarks like BashArena and ControlArena) and runs them at real deployment scale, then compares the annual
cost of operating the detection infrastructure against the damage from a single undetected incident.
All yellow inputs are editable; everything else recalculates live.
</p>
</div>
<div class="presets">
<button class="preset-btn" data-preset="conservative">Conservative default</button>
<button class="preset-btn" data-preset="benchmark">Benchmark operating point (4% FPR)</button>
<button class="preset-btn" data-preset="optimistic">Optimistic (TM works better)</button>
<button class="preset-btn" data-preset="patient">Patient adversary (0.001%)</button>
<button class="preset-btn" data-preset="aggressive">Aggressive timeline (14d pull)</button>
</div>
<main>
<section class="panel">
<h2>Inputs</h2>
<div class="section-header">Traffic</div>
<div class="input-row">
<label>Total requests per day <span class="hint">OpenAI publicly reports ~2.5B/day</span></label>
<input type="number" id="total_requests" value="2500000000" step="100000000">
</div>
<div class="input-row">
<label>% routed through agentic model <span class="hint">Subset of total traffic</span></label>
<input type="number" id="agent_fraction" value="10" step="1" min="0" max="100">
</div>
<div class="section-header">Scheming behavior</div>
<div class="input-row">
<label>Scheming attack rate (%) <span class="hint">0.01% = 1 attempt per 10,000 requests</span></label>
<input type="number" id="scheming_rate" value="0.01" step="0.001" min="0">
</div>
<div class="section-header">Monitor performance</div>
<div class="input-row">
<label>Monitor TPR (%) <span class="hint">Logit-ROC extrapolation at 10⁻⁵ step-FPR, slope 0.8 → ~1.1%</span></label>
<input type="number" id="tpr" value="1.1" step="0.1" min="0" max="100">
</div>
<div class="input-row">
<label>Monitor FPR (%) <span class="hint">10⁻⁵ = 0.001%, operationally realistic low-FPR point</span></label>
<input type="number" id="fpr" value="0.001" step="0.0001" min="0">
</div>
<div class="section-header">Detection timeline</div>
<div class="input-row">
<label>Days from first attack to model pulled <span class="hint">Enterprise incident response typically 6–8 weeks</span></label>
<input type="number" id="days_to_pull" value="42" step="1" min="1">
</div>
<div class="section-header">SOC cost model</div>
<div class="input-row">
<label>Tokens per alert (agent trajectory) <span class="hint">Context window to review</span></label>
<input type="number" id="tokens_per_alert" value="100000" step="10000">
</div>
<div class="input-row">
<label>Analyst reading speed (wpm) <span class="hint">Careful review rate</span></label>
<input type="number" id="read_speed" value="250" step="25">
</div>
<div class="input-row">
<label>Trusted-monitor assist reduction (×) <span class="hint">0.01 = TM makes review 100× faster</span></label>
<input type="number" id="tm_reduction" value="0.01" step="0.005" min="0" max="1">
</div>
<div class="input-row">
<label>Productive analyst hours per day <span class="hint">After meetings, breaks, context switching</span></label>
<input type="number" id="productive_hours" value="5" step="0.5" min="1" max="12">
</div>
<div class="input-row">
<label>Analyst fully-loaded cost ($/yr) <span class="hint">Salary + benefits + overhead</span></label>
<input type="number" id="analyst_salary" value="100000" step="5000">
</div>
<div class="input-row">
<label>Manager fully-loaded cost ($/yr) <span class="hint">One manager per N analysts</span></label>
<input type="number" id="manager_salary" value="200000" step="5000">
</div>
<div class="input-row">
<label>Analysts per manager <span class="hint">Span of control</span></label>
<input type="number" id="analysts_per_mgr" value="10" step="1" min="1">
</div>
<div class="section-header">Damage model</div>
<div class="input-row">
<label>Tokens per attack output <span class="hint">Agent trajectory length</span></label>
<input type="number" id="tokens_per_attack" value="100000" step="10000">
</div>
<div class="input-row">
<label>Chars per token <span class="hint">Code tokenization ratio</span></label>
<input type="number" id="chars_per_token" value="4" step="0.5">
</div>
<div class="input-row">
<label>Chars per LOC <span class="hint">Avg line length</span></label>
<input type="number" id="chars_per_loc" value="80" step="5">
</div>
<div class="input-row">
<label>Forensic cost per LOC ($) <span class="hint">Engineer time to review and classify</span></label>
<input type="number" id="loc_cost" value="5" step="0.5">
</div>
<div class="input-row">
<label>Affected customers <span class="hint">Incident blast radius</span></label>
<input type="number" id="affected_customers" value="100000" step="1000">
</div>
<div class="input-row">
<label>Ransomware baseline per customer ($) <span class="hint">IBM/Ponemon 2024 avg ~$4.5M</span></label>
<input type="number" id="ransomware_baseline" value="4500000" step="100000">
</div>
<div class="input-row">
<label>Rogue cloud investigation per customer ($) <span class="hint">Tracing unauthorized infra</span></label>
<input type="number" id="rogue_cloud" value="250000" step="10000">
</div>
</section>
<section class="panel">
<h2>Results</h2>
<div class="section-header">Per-day breakdown</div>
<div class="result-row"><div class="label">Agent requests/day</div><div class="value" id="r_agent_requests">—</div></div>
<div class="result-row"><div class="label">Attack attempts/day</div><div class="value" id="r_attack_attempts">—</div></div>
<div class="result-row"><div class="label">Attacks caught/day</div><div class="value" id="r_caught">—</div></div>
<div class="result-row highlight"><div class="label">Attacks missed/day <span class="hint">Successful undetected</span></div><div class="value" id="r_missed">—</div></div>
<div class="result-row"><div class="label">False positives/day</div><div class="value" id="r_fp">—</div></div>
<div class="result-row"><div class="label">Audit queue/day (flagged)</div><div class="value" id="r_queue">—</div></div>
<div class="result-row"><div class="label">Queue true-positive rate <span class="hint">Real attacks as % of queue</span></div><div class="value" id="r_tprate">—</div></div>
<div class="section-header">Cumulative by model-pull decision</div>
<div class="result-row"><div class="label">Attack attempts before pull</div><div class="value" id="r_total_attempts">—</div></div>
<div class="result-row"><div class="label">Attacks caught before pull</div><div class="value" id="r_total_caught">—</div></div>
<div class="result-row highlight"><div class="label">Successful undetected attacks before pull</div><div class="value" id="r_total_missed">—</div></div>
<div class="section-header">SOC cost (to operate detection)</div>
<div class="result-row"><div class="label">Minutes per alert (manual)</div><div class="value" id="r_manual_min">—</div></div>
<div class="result-row"><div class="label">Minutes per alert (TM-assisted)</div><div class="value" id="r_assisted_min">—</div></div>
<div class="result-row"><div class="label">Analysts needed (manual)</div><div class="value" id="r_manual_analysts">—</div></div>
<div class="result-row"><div class="label">Analysts needed (TM-assisted)</div><div class="value" id="r_assisted_analysts">—</div></div>
<div class="result-row"><div class="label">SOC cost / year (manual)</div><div class="value" id="r_manual_cost">—</div></div>
<div class="result-row highlight"><div class="label">SOC cost / year (TM-assisted)</div><div class="value" id="r_assisted_cost">—</div></div>
<div class="section-header">Damage from undetected attacks</div>
<div class="result-row"><div class="label">Lines of malicious code written</div><div class="value" id="r_total_loc">—</div></div>
<div class="result-row"><div class="label">LOC forensic cleanup cost</div><div class="value" id="r_loc_cost">—</div></div>
<div class="result-row"><div class="label">Customer-incident damage</div><div class="value" id="r_customer_cost">—</div></div>
<div class="result-row"><div class="label">Rogue cloud investigation cost</div><div class="value" id="r_rogue_cost">—</div></div>
<div class="result-row mega-highlight"><div class="label">Total estimated damage <span class="hint">Single incident, assuming the model eventually does get pulled</span></div><div class="value" id="r_total_damage">—</div></div>
<p class="divider-quote">
By the time "pull the model" is executed, the forensic audit to untangle what was done is months-long and mixes
malicious artifacts with legitimate developer work across customer environments the lab has no visibility into.
</p>
<div class="result-row"><div class="label">Damage / annual SOC cost ratio <span class="hint">Higher = more cost-effective to invest in detection</span></div><div class="value" id="r_ratio">—</div></div>
</section>
</main>
<footer>
<div class="footnote-block">
<strong>Monitor TPR (1.1%) derivation.</strong> Anchored on the BashArena operating point: 74% trajectory-wise TPR at 4% trajectory-wise FPR
(GPT-4.1 mini monitor with n=3 ensemble, Claude Sonnet 4.5 attacker), equivalent to step-wise FPR ≈ 8×10⁻⁴.
Logit-linear extrapolation at slope 0.8 (plausible-low regime) gives ~1.1% TPR at step-FPR 10⁻⁵. The real curve
likely bends below the linear fit outside the validated 10⁻⁴ to 10⁻² FPR range due to the null-component effect and
quantization — so 1.1% is an optimistic upper bound.
</div>
<div class="footnote-block">
<strong>Citations.</strong><br>
[1] Wang, Xu, An, Dai, Gao et al. (2025), <em>Let It Flow: Building the ROME Model within an Open Agentic Learning Ecosystem</em>,
<a href="https://arxiv.org/abs/2512.24873">arXiv:2512.24873</a>.<br>
[2] Korbak et al. (2025), <em>A sketch of an AI control safety case</em>,
<a href="https://arxiv.org/abs/2501.17315">arXiv:2501.17315</a>.<br>
[3] Lindner et al. (2025), <em>Practical challenges of control monitoring in frontier AI deployments</em>,
<a href="https://arxiv.org/abs/2512.22154">arXiv:2512.22154</a>.
</div>
<div class="footnote-block" style="margin-top: 1.5rem; opacity: 0.7;">
Companion scenario to "The Changing North Star of AI Control."
</div>
</footer>
</div>
<script>
// Named scenarios behind the preset buttons. Each preset maps an input
// element id to the value that should be written into that input.
const PRESETS = (() => {
  // Values shared by every scenario; individual presets override a few keys.
  const baseline = {
    total_requests: 2500000000,
    agent_fraction: 10,
    scheming_rate: 0.01,
    tpr: 1.1,
    fpr: 0.001,
    days_to_pull: 42,
    tokens_per_alert: 100000,
    read_speed: 250,
    tm_reduction: 0.01,
    productive_hours: 5,
    analyst_salary: 100000,
    manager_salary: 200000,
    analysts_per_mgr: 10,
    tokens_per_attack: 100000,
    chars_per_token: 4,
    chars_per_loc: 80,
    loc_cost: 5,
    affected_customers: 100000,
    ransomware_baseline: 4500000,
    rogue_cloud: 250000,
  };

  // Overriding an existing key keeps its original position, so every preset
  // exposes the same keys in the same order as the baseline.
  const derive = (overrides) => ({ ...baseline, ...overrides });

  return {
    conservative: derive({}),
    benchmark: derive({ tpr: 74, fpr: 4 }),
    optimistic: derive({ tpr: 25, fpr: 0.01, days_to_pull: 21, tm_reduction: 0.005 }),
    patient: derive({ scheming_rate: 0.001, days_to_pull: 90 }),
    aggressive: derive({ days_to_pull: 14 }),
  };
})();
// Ids of the editable input elements, listed in the order they appear in the
// form. Each one gets an 'input' listener that triggers a recalculation.
const inputs = [
  // Traffic
  'total_requests',
  'agent_fraction',
  // Scheming behavior
  'scheming_rate',
  // Monitor performance
  'tpr',
  'fpr',
  // Detection timeline
  'days_to_pull',
  // SOC cost model
  'tokens_per_alert',
  'read_speed',
  'tm_reduction',
  'productive_hours',
  'analyst_salary',
  'manager_salary',
  'analysts_per_mgr',
  // Damage model
  'tokens_per_attack',
  'chars_per_token',
  'chars_per_loc',
  'loc_cost',
  'affected_customers',
  'ransomware_baseline',
  'rogue_cloud'
];
/**
 * Reads the numeric value of the input element with the given id.
 *
 * @param {string} id - Element id to look up via document.getElementById.
 * @returns {number} The parsed value, or 0 when the field is blank,
 *   non-numeric, or parses to zero.
 */
function val(id) {
  const field = document.getElementById(id);
  const parsed = parseFloat(field.value);
  // NaN (and -0) are falsy, so both collapse to a plain 0.
  return parsed ? parsed : 0;
}
/**
 * Formats a count for display.
 *
 * Magnitudes of a million or more are abbreviated with two decimals and an
 * M/B/T suffix; smaller values use en-US digit grouping, with no decimals
 * from 1,000 upward and at most one decimal below that.
 *
 * @param {number} n - Value to format.
 * @returns {string} Formatted text, or an em dash when n is not finite.
 */
function fmtInt(n) {
  if (!isFinite(n)) return '—';

  const magnitude = Math.abs(n);
  const scales = [
    [1e12, 'T'],
    [1e9, 'B'],
    [1e6, 'M'],
  ];
  for (const [unit, suffix] of scales) {
    if (magnitude >= unit) return `${(n / unit).toFixed(2)}${suffix}`;
  }

  const fractionDigits = magnitude >= 1e3 ? 0 : 1;
  return n.toLocaleString('en-US', { maximumFractionDigits: fractionDigits });
}
/**
 * Formats a dollar amount for display.
 *
 * Amounts are abbreviated by magnitude: two decimals with T/B/M, one decimal
 * with K, and whole dollars below 1,000.
 *
 * @param {number} n - Amount in dollars.
 * @returns {string} '$'-prefixed text, or an em dash when n is not finite.
 */
function fmtDollar(n) {
  if (!isFinite(n)) return '—';

  const magnitude = Math.abs(n);
  // [threshold, suffix, decimals], checked from largest to smallest.
  const scales = [
    [1e12, 'T', 2],
    [1e9, 'B', 2],
    [1e6, 'M', 2],
    [1e3, 'K', 1],
  ];
  const match = scales.find(([unit]) => magnitude >= unit);
  if (match === undefined) {
    return `$${n.toFixed(0)}`;
  }
  const [unit, suffix, decimals] = match;
  return `$${(n / unit).toFixed(decimals)}${suffix}`;
}
/**
 * Formats a fraction as a percentage.
 *
 * Fractions below 0.01 (under 1%) get three decimals so small rates stay
 * visible; everything else gets one decimal.
 *
 * @param {number} n - Fraction to format (0.5 means 50%).
 * @returns {string} Percentage text, or an em dash when n is not finite.
 */
function fmtPct(n) {
  if (!isFinite(n)) return '—';
  const decimals = n < 0.01 ? 3 : 1;
  const percent = n * 100;
  return `${percent.toFixed(decimals)}%`;
}
/**
 * Formats a multiplier such as the damage-to-cost ratio.
 *
 * @param {number} n - Ratio to format.
 * @returns {string} The ratio followed by '×': grouped whole numbers from
 *   1,000 upward, one decimal below that, or an em dash when n is not finite.
 */
function fmtRatio(n) {
  if (!isFinite(n)) return '—';

  const isLarge = Math.abs(n) >= 1000;
  const digits = isLarge
    ? n.toLocaleString('en-US', { maximumFractionDigits: 0 })
    : n.toFixed(1);
  return `${digits}×`;
}
/**
 * Restricts a fraction to the closed interval [0, 1].
 *
 * The percent inputs declare min/max attributes, but those do not stop a
 * user from typing an out-of-range number, so the model clamps explicitly.
 *
 * @param {number} x - Fraction to clamp.
 * @returns {number} x when it lies in [0, 1], otherwise the nearest bound.
 */
function clampFraction(x) {
  return Math.min(1, Math.max(0, x));
}

/**
 * Formats a duration in minutes with a fixed number of decimals.
 *
 * @param {number} minutes - Duration to format.
 * @param {number} digits - Decimal places to show.
 * @returns {string} e.g. '300 min', or an em dash when minutes is not finite.
 */
function fmtMinutes(minutes, digits) {
  return Number.isFinite(minutes) ? minutes.toFixed(digits) + ' min' : '—';
}

/**
 * Recomputes every result row from the current input values.
 *
 * Reads all inputs through val(), derives the per-day traffic breakdown, the
 * cumulative counts up to the model pull, the SOC staffing cost and the
 * damage estimate, then writes the formatted figures into the r_* elements.
 * Takes no meaningful arguments (it is also used directly as an event
 * listener) and returns nothing.
 */
function recalc() {
  // --- Traffic and monitor rates (percent inputs converted to fractions) ---
  const totalReq = val('total_requests');
  // Clamped so that values above 100% or below 0% cannot produce negative
  // counts further down (e.g. negative missed attacks when TPR > 100%).
  const agentFrac = clampFraction(val('agent_fraction') / 100);
  const schemingRate = clampFraction(val('scheming_rate') / 100);
  const tpr = clampFraction(val('tpr') / 100);
  const fpr = clampFraction(val('fpr') / 100);
  const days = val('days_to_pull');

  // --- Per-day breakdown ---
  const agentRequests = totalReq * agentFrac;
  const attackAttempts = agentRequests * schemingRate;
  const benignRequests = agentRequests - attackAttempts;
  const caught = attackAttempts * tpr;
  const missed = attackAttempts * (1 - tpr);
  const falsePositives = benignRequests * fpr;
  // Everything the monitor flags lands in the audit queue.
  const queue = caught + falsePositives;
  const queueTPRate = queue > 0 ? caught / queue : 0;

  // --- Cumulative totals over the detection-to-pull window ---
  const totalAttempts = attackAttempts * days;
  const totalCaught = caught * days;
  const totalMissed = missed * days;

  // --- SOC cost ---
  const tokensPerAlert = val('tokens_per_alert');
  const readSpeed = val('read_speed');
  // Fixed tokens-to-words conversion used to turn trajectory length into
  // reading time.
  const wordsPerToken = 0.75;
  const manualMinutes = (tokensPerAlert * wordsPerToken) / readSpeed;
  const tmReduction = val('tm_reduction');
  const assistedMinutes = manualMinutes * tmReduction;
  const productiveMin = val('productive_hours') * 60;
  const analystSalary = val('analyst_salary');
  const mgrSalary = val('manager_salary');
  const analystsPerMgr = val('analysts_per_mgr');
  // Headcount needed to clear one day's queue within one productive day.
  const manualAnalysts = Math.ceil((queue * manualMinutes) / productiveMin);
  const assistedAnalysts = Math.ceil((queue * assistedMinutes) / productiveMin);
  // Yearly cost: analysts plus one manager per (partial) group of analysts.
  const manualCost = manualAnalysts * analystSalary + Math.ceil(manualAnalysts / analystsPerMgr) * mgrSalary;
  const assistedCost = assistedAnalysts * analystSalary + Math.ceil(assistedAnalysts / analystsPerMgr) * mgrSalary;

  // --- Damage ---
  const tokensPerAttack = val('tokens_per_attack');
  const charsPerToken = val('chars_per_token');
  const charsPerLoc = val('chars_per_loc');
  const locPerAttack = (tokensPerAttack * charsPerToken) / charsPerLoc;
  const totalLoc = totalMissed * locPerAttack;
  const locCleanup = totalLoc * val('loc_cost');
  // Customer and rogue-cloud costs scale with the number of affected
  // customers only; they do not depend on the missed-attack count.
  const customerCost = val('affected_customers') * val('ransomware_baseline');
  const rogueCost = val('affected_customers') * val('rogue_cloud');
  const totalDamage = locCleanup + customerCost + rogueCost;
  // A non-positive or undefined SOC cost yields Infinity, shown as an em dash.
  const ratio = assistedCost > 0 ? totalDamage / assistedCost : Infinity;

  // --- Render ---
  const s = (id, v) => {
    document.getElementById(id).textContent = v;
  };
  s('r_agent_requests', fmtInt(agentRequests));
  s('r_attack_attempts', fmtInt(attackAttempts));
  s('r_caught', fmtInt(caught));
  s('r_missed', fmtInt(missed));
  s('r_fp', fmtInt(falsePositives));
  s('r_queue', fmtInt(queue));
  s('r_tprate', fmtPct(queueTPRate));
  s('r_total_attempts', fmtInt(totalAttempts));
  s('r_total_caught', fmtInt(totalCaught));
  s('r_total_missed', fmtInt(totalMissed));
  // A zero or blank reading speed makes these Infinity/NaN; show an em dash
  // like every other row instead of "Infinity min".
  s('r_manual_min', fmtMinutes(manualMinutes, 0));
  s('r_assisted_min', fmtMinutes(assistedMinutes, 2));
  s('r_manual_analysts', fmtInt(manualAnalysts));
  s('r_assisted_analysts', fmtInt(assistedAnalysts));
  s('r_manual_cost', fmtDollar(manualCost));
  s('r_assisted_cost', fmtDollar(assistedCost));
  s('r_total_loc', fmtInt(totalLoc));
  s('r_loc_cost', fmtDollar(locCleanup));
  s('r_customer_cost', fmtDollar(customerCost));
  s('r_rogue_cost', fmtDollar(rogueCost));
  s('r_total_damage', fmtDollar(totalDamage));
  s('r_ratio', fmtRatio(ratio));
}
// Recalculate live whenever any editable input changes.
for (const id of inputs) {
  document.getElementById(id).addEventListener('input', recalc);
}

// Each preset button copies its scenario into the matching inputs (keys with
// no matching element are skipped) and then refreshes the results.
for (const btn of document.querySelectorAll('.preset-btn')) {
  btn.addEventListener('click', () => {
    const preset = PRESETS[btn.dataset.preset];
    for (const [key, value] of Object.entries(preset)) {
      const el = document.getElementById(key);
      if (el) {
        el.value = value;
      }
    }
    recalc();
  });
}

// Initial render using the values present in the markup.
recalc();
</script>
</body>
</html>