Skip to content

Commit abe4f30

Browse files
committed
Update docs with Microsoft repo links and Hands-on results (formatted)
1 parent 207a3b8 commit abe4f30

File tree

3 files changed

+48
-45
lines changed

3 files changed

+48
-45
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,13 @@
44
<a href="https://arxiv.org/abs/2409.13082"><img src="https://img.shields.io/badge/AutoVerus-arXiv%202409.13082-b31b1b.svg?style=for-the-badge"></a>
55
<a href="https://arxiv.org/abs/2512.18436"><img src="https://img.shields.io/badge/VeruSAGE-arXiv%202512.18436-b31b1b.svg?style=for-the-badge"></a>
66
<a href="https://www.microsoft.com/en-us/research/project/practical-system-verification/"><img src="https://img.shields.io/badge/Website-blue.svg?style=for-the-badge"></a>
7+
<a href="https://microsoft.github.io/verus-proof-synthesis/"><img src="https://img.shields.io/badge/🏆_Leaderboard-View_Results-6366f1.svg?style=for-the-badge"></a>
78
</p>
89

910
This repository contains code and artifacts for automated [Verus](https://github.com/verus-lang/verus) proof synthesis using LLM-based approaches. It includes two proof synthesis systems and two benchmark suites.
1011

12+
> 🏆 **[View the Leaderboard](https://microsoft.github.io/verus-proof-synthesis/)** — Compare proof synthesis systems on our benchmarks!
13+
1114
---
1215

1316
## � Repository Contents

leaderboard/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ A web-based leaderboard for tracking progress in LLM-based formal verification f
44

55
## 🌐 Live Website
66

7-
Visit the leaderboard at: **[https://yangchenyuan.github.io/verus-proof-synthesis/](https://yangchenyuan.github.io/verus-proof-synthesis/)**
7+
Visit the leaderboard at: **[https://microsoft.github.io/verus-proof-synthesis/](https://microsoft.github.io/verus-proof-synthesis/)**
88

99
## 📁 Directory Structure
1010

leaderboard/data/verusage-bench-results.json

Lines changed: 44 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -122,24 +122,24 @@
122122
"paper_url": "https://arxiv.org/abs/2512.18436",
123123
"code_url": "https://github.com/microsoft/verus-proof-synthesis",
124124
"verified": true,
125-
"notes": "Table 6: Hands-off mode - Best overall (67.5%)"
125+
"notes": "Table 6: Hands-off mode - Best hands-off (67.5%)"
126126
},
127127
{
128128
"submission_id": "verusage-handson-sonnet45-2026",
129129
"system_name": "VeruSAGE (Hands-on)",
130130
"model": "Sonnet 4.5",
131131
"date": "2026-01-05",
132132
"results": {
133-
"solved": 569,
133+
"solved": 687,
134134
"total": 849,
135-
"percent_solved": 67.0,
135+
"percent_solved": 80.9,
136136
"avg_time_minutes": 6.9,
137137
"avg_cost_usd": 1.47
138138
},
139139
"breakdown": [
140140
{
141141
"category": "AL",
142-
"solved": 86,
142+
"solved": 104,
143143
"total": 104
144144
},
145145
{
@@ -149,44 +149,44 @@
149149
},
150150
{
151151
"category": "IR",
152-
"solved": 79,
152+
"solved": 99,
153153
"total": 118
154154
},
155155
{
156156
"category": "MA",
157-
"solved": 75,
157+
"solved": 80,
158158
"total": 89
159159
},
160160
{
161161
"category": "NO",
162-
"solved": 28,
162+
"solved": 29,
163163
"total": 29
164164
},
165165
{
166166
"category": "NR",
167-
"solved": 122,
167+
"solved": 151,
168168
"total": 204
169169
},
170170
{
171171
"category": "OS",
172-
"solved": 97,
172+
"solved": 130,
173173
"total": 157
174174
},
175175
{
176176
"category": "ST",
177-
"solved": 45,
177+
"solved": 49,
178178
"total": 63
179179
},
180180
{
181181
"category": "VE",
182-
"solved": 17,
182+
"solved": 22,
183183
"total": 22
184184
}
185185
],
186186
"paper_url": "https://arxiv.org/abs/2512.18436",
187187
"code_url": "https://github.com/microsoft/verus-proof-synthesis",
188188
"verified": true,
189-
"notes": "Table 7: Hands-on mode with Sonnet 4.5"
189+
"notes": "Table 7: Hands-on mode - Best overall (81%)"
190190
},
191191
{
192192
"submission_id": "verusage-handsoff-sonnet4-2026",
@@ -258,26 +258,26 @@
258258
"model": "Sonnet 4",
259259
"date": "2026-01-05",
260260
"results": {
261-
"solved": 492,
261+
"solved": 544,
262262
"total": 849,
263-
"percent_solved": 58.0,
263+
"percent_solved": 64.1,
264264
"avg_time_minutes": 7.1,
265265
"avg_cost_usd": 1.72
266266
},
267267
"breakdown": [
268268
{
269269
"category": "AL",
270-
"solved": 72,
270+
"solved": 89,
271271
"total": 104
272272
},
273273
{
274274
"category": "AC",
275-
"solved": 13,
275+
"solved": 15,
276276
"total": 63
277277
},
278278
{
279279
"category": "IR",
280-
"solved": 63,
280+
"solved": 81,
281281
"total": 118
282282
},
283283
{
@@ -287,27 +287,27 @@
287287
},
288288
{
289289
"category": "NO",
290-
"solved": 29,
290+
"solved": 25,
291291
"total": 29
292292
},
293293
{
294294
"category": "NR",
295-
"solved": 114,
295+
"solved": 113,
296296
"total": 204
297297
},
298298
{
299299
"category": "OS",
300-
"solved": 85,
300+
"solved": 96,
301301
"total": 157
302302
},
303303
{
304304
"category": "ST",
305-
"solved": 34,
305+
"solved": 40,
306306
"total": 63
307307
},
308308
{
309309
"category": "VE",
310-
"solved": 14,
310+
"solved": 18,
311311
"total": 22
312312
}
313313
],
@@ -322,56 +322,56 @@
322322
"model": "GPT-5",
323323
"date": "2026-01-05",
324324
"results": {
325-
"solved": 467,
325+
"solved": 425,
326326
"total": 849,
327-
"percent_solved": 55.0,
327+
"percent_solved": 50.1,
328328
"avg_time_minutes": 12.7,
329329
"avg_cost_usd": 0.52
330330
},
331331
"breakdown": [
332332
{
333333
"category": "AL",
334-
"solved": 82,
334+
"solved": 80,
335335
"total": 104
336336
},
337337
{
338338
"category": "AC",
339-
"solved": 20,
339+
"solved": 10,
340340
"total": 63
341341
},
342342
{
343343
"category": "IR",
344-
"solved": 52,
344+
"solved": 73,
345345
"total": 118
346346
},
347347
{
348348
"category": "MA",
349-
"solved": 64,
349+
"solved": 53,
350350
"total": 89
351351
},
352352
{
353353
"category": "NO",
354-
"solved": 24,
354+
"solved": 27,
355355
"total": 29
356356
},
357357
{
358358
"category": "NR",
359-
"solved": 98,
359+
"solved": 79,
360360
"total": 204
361361
},
362362
{
363363
"category": "OS",
364-
"solved": 71,
364+
"solved": 68,
365365
"total": 157
366366
},
367367
{
368368
"category": "ST",
369-
"solved": 39,
369+
"solved": 21,
370370
"total": 63
371371
},
372372
{
373373
"category": "VE",
374-
"solved": 16,
374+
"solved": 14,
375375
"total": 22
376376
}
377377
],
@@ -450,56 +450,56 @@
450450
"model": "o4-mini",
451451
"date": "2026-01-05",
452452
"results": {
453-
"solved": 348,
453+
"solved": 144,
454454
"total": 849,
455-
"percent_solved": 41.0,
455+
"percent_solved": 17.0,
456456
"avg_time_minutes": 12.8,
457457
"avg_cost_usd": 0.67
458458
},
459459
"breakdown": [
460460
{
461461
"category": "AL",
462-
"solved": 50,
462+
"solved": 26,
463463
"total": 104
464464
},
465465
{
466466
"category": "AC",
467-
"solved": 12,
467+
"solved": 7,
468468
"total": 63
469469
},
470470
{
471471
"category": "IR",
472-
"solved": 41,
472+
"solved": 29,
473473
"total": 118
474474
},
475475
{
476476
"category": "MA",
477-
"solved": 55,
477+
"solved": 13,
478478
"total": 89
479479
},
480480
{
481481
"category": "NO",
482-
"solved": 21,
482+
"solved": 14,
483483
"total": 29
484484
},
485485
{
486486
"category": "NR",
487-
"solved": 61,
487+
"solved": 29,
488488
"total": 204
489489
},
490490
{
491491
"category": "OS",
492-
"solved": 58,
492+
"solved": 14,
493493
"total": 157
494494
},
495495
{
496496
"category": "ST",
497-
"solved": 31,
497+
"solved": 6,
498498
"total": 63
499499
},
500500
{
501501
"category": "VE",
502-
"solved": 15,
502+
"solved": 6,
503503
"total": 22
504504
}
505505
],

0 commit comments

Comments
 (0)