|
122 | 122 | "paper_url": "https://arxiv.org/abs/2512.18436", |
123 | 123 | "code_url": "https://github.com/microsoft/verus-proof-synthesis", |
124 | 124 | "verified": true, |
125 | | - "notes": "Table 6: Hands-off mode - Best overall (67.5%)" |
| 125 | + "notes": "Table 6: Hands-off mode - Best hands-off (67.5%)" |
126 | 126 | }, |
127 | 127 | { |
128 | 128 | "submission_id": "verusage-handson-sonnet45-2026", |
129 | 129 | "system_name": "VeruSAGE (Hands-on)", |
130 | 130 | "model": "Sonnet 4.5", |
131 | 131 | "date": "2026-01-05", |
132 | 132 | "results": { |
133 | | - "solved": 569, |
| 133 | + "solved": 687, |
134 | 134 | "total": 849, |
135 | | - "percent_solved": 67.0, |
| 135 | + "percent_solved": 80.9, |
136 | 136 | "avg_time_minutes": 6.9, |
137 | 137 | "avg_cost_usd": 1.47 |
138 | 138 | }, |
139 | 139 | "breakdown": [ |
140 | 140 | { |
141 | 141 | "category": "AL", |
142 | | - "solved": 86, |
| 142 | + "solved": 104, |
143 | 143 | "total": 104 |
144 | 144 | }, |
145 | 145 | { |
|
149 | 149 | }, |
150 | 150 | { |
151 | 151 | "category": "IR", |
152 | | - "solved": 79, |
| 152 | + "solved": 99, |
153 | 153 | "total": 118 |
154 | 154 | }, |
155 | 155 | { |
156 | 156 | "category": "MA", |
157 | | - "solved": 75, |
| 157 | + "solved": 80, |
158 | 158 | "total": 89 |
159 | 159 | }, |
160 | 160 | { |
161 | 161 | "category": "NO", |
162 | | - "solved": 28, |
| 162 | + "solved": 29, |
163 | 163 | "total": 29 |
164 | 164 | }, |
165 | 165 | { |
166 | 166 | "category": "NR", |
167 | | - "solved": 122, |
| 167 | + "solved": 151, |
168 | 168 | "total": 204 |
169 | 169 | }, |
170 | 170 | { |
171 | 171 | "category": "OS", |
172 | | - "solved": 97, |
| 172 | + "solved": 130, |
173 | 173 | "total": 157 |
174 | 174 | }, |
175 | 175 | { |
176 | 176 | "category": "ST", |
177 | | - "solved": 45, |
| 177 | + "solved": 49, |
178 | 178 | "total": 63 |
179 | 179 | }, |
180 | 180 | { |
181 | 181 | "category": "VE", |
182 | | - "solved": 17, |
| 182 | + "solved": 22, |
183 | 183 | "total": 22 |
184 | 184 | } |
185 | 185 | ], |
186 | 186 | "paper_url": "https://arxiv.org/abs/2512.18436", |
187 | 187 | "code_url": "https://github.com/microsoft/verus-proof-synthesis", |
188 | 188 | "verified": true, |
189 | | - "notes": "Table 7: Hands-on mode with Sonnet 4.5" |
| 189 | + "notes": "Table 7: Hands-on mode - Best overall (80.9%)" |
190 | 190 | }, |
191 | 191 | { |
192 | 192 | "submission_id": "verusage-handsoff-sonnet4-2026", |
|
258 | 258 | "model": "Sonnet 4", |
259 | 259 | "date": "2026-01-05", |
260 | 260 | "results": { |
261 | | - "solved": 492, |
| 261 | + "solved": 544, |
262 | 262 | "total": 849, |
263 | | - "percent_solved": 58.0, |
| 263 | + "percent_solved": 64.1, |
264 | 264 | "avg_time_minutes": 7.1, |
265 | 265 | "avg_cost_usd": 1.72 |
266 | 266 | }, |
267 | 267 | "breakdown": [ |
268 | 268 | { |
269 | 269 | "category": "AL", |
270 | | - "solved": 72, |
| 270 | + "solved": 89, |
271 | 271 | "total": 104 |
272 | 272 | }, |
273 | 273 | { |
274 | 274 | "category": "AC", |
275 | | - "solved": 13, |
| 275 | + "solved": 15, |
276 | 276 | "total": 63 |
277 | 277 | }, |
278 | 278 | { |
279 | 279 | "category": "IR", |
280 | | - "solved": 63, |
| 280 | + "solved": 81, |
281 | 281 | "total": 118 |
282 | 282 | }, |
283 | 283 | { |
|
287 | 287 | }, |
288 | 288 | { |
289 | 289 | "category": "NO", |
290 | | - "solved": 29, |
| 290 | + "solved": 25, |
291 | 291 | "total": 29 |
292 | 292 | }, |
293 | 293 | { |
294 | 294 | "category": "NR", |
295 | | - "solved": 114, |
| 295 | + "solved": 113, |
296 | 296 | "total": 204 |
297 | 297 | }, |
298 | 298 | { |
299 | 299 | "category": "OS", |
300 | | - "solved": 85, |
| 300 | + "solved": 96, |
301 | 301 | "total": 157 |
302 | 302 | }, |
303 | 303 | { |
304 | 304 | "category": "ST", |
305 | | - "solved": 34, |
| 305 | + "solved": 40, |
306 | 306 | "total": 63 |
307 | 307 | }, |
308 | 308 | { |
309 | 309 | "category": "VE", |
310 | | - "solved": 14, |
| 310 | + "solved": 18, |
311 | 311 | "total": 22 |
312 | 312 | } |
313 | 313 | ], |
|
322 | 322 | "model": "GPT-5", |
323 | 323 | "date": "2026-01-05", |
324 | 324 | "results": { |
325 | | - "solved": 467, |
| 325 | + "solved": 425, |
326 | 326 | "total": 849, |
327 | | - "percent_solved": 55.0, |
| 327 | + "percent_solved": 50.1, |
328 | 328 | "avg_time_minutes": 12.7, |
329 | 329 | "avg_cost_usd": 0.52 |
330 | 330 | }, |
331 | 331 | "breakdown": [ |
332 | 332 | { |
333 | 333 | "category": "AL", |
334 | | - "solved": 82, |
| 334 | + "solved": 80, |
335 | 335 | "total": 104 |
336 | 336 | }, |
337 | 337 | { |
338 | 338 | "category": "AC", |
339 | | - "solved": 20, |
| 339 | + "solved": 10, |
340 | 340 | "total": 63 |
341 | 341 | }, |
342 | 342 | { |
343 | 343 | "category": "IR", |
344 | | - "solved": 52, |
| 344 | + "solved": 73, |
345 | 345 | "total": 118 |
346 | 346 | }, |
347 | 347 | { |
348 | 348 | "category": "MA", |
349 | | - "solved": 64, |
| 349 | + "solved": 53, |
350 | 350 | "total": 89 |
351 | 351 | }, |
352 | 352 | { |
353 | 353 | "category": "NO", |
354 | | - "solved": 24, |
| 354 | + "solved": 27, |
355 | 355 | "total": 29 |
356 | 356 | }, |
357 | 357 | { |
358 | 358 | "category": "NR", |
359 | | - "solved": 98, |
| 359 | + "solved": 79, |
360 | 360 | "total": 204 |
361 | 361 | }, |
362 | 362 | { |
363 | 363 | "category": "OS", |
364 | | - "solved": 71, |
| 364 | + "solved": 68, |
365 | 365 | "total": 157 |
366 | 366 | }, |
367 | 367 | { |
368 | 368 | "category": "ST", |
369 | | - "solved": 39, |
| 369 | + "solved": 21, |
370 | 370 | "total": 63 |
371 | 371 | }, |
372 | 372 | { |
373 | 373 | "category": "VE", |
374 | | - "solved": 16, |
| 374 | + "solved": 14, |
375 | 375 | "total": 22 |
376 | 376 | } |
377 | 377 | ], |
|
450 | 450 | "model": "o4-mini", |
451 | 451 | "date": "2026-01-05", |
452 | 452 | "results": { |
453 | | - "solved": 348, |
| 453 | + "solved": 144, |
454 | 454 | "total": 849, |
455 | | - "percent_solved": 41.0, |
| 455 | + "percent_solved": 17.0, |
456 | 456 | "avg_time_minutes": 12.8, |
457 | 457 | "avg_cost_usd": 0.67 |
458 | 458 | }, |
459 | 459 | "breakdown": [ |
460 | 460 | { |
461 | 461 | "category": "AL", |
462 | | - "solved": 50, |
| 462 | + "solved": 26, |
463 | 463 | "total": 104 |
464 | 464 | }, |
465 | 465 | { |
466 | 466 | "category": "AC", |
467 | | - "solved": 12, |
| 467 | + "solved": 7, |
468 | 468 | "total": 63 |
469 | 469 | }, |
470 | 470 | { |
471 | 471 | "category": "IR", |
472 | | - "solved": 41, |
| 472 | + "solved": 29, |
473 | 473 | "total": 118 |
474 | 474 | }, |
475 | 475 | { |
476 | 476 | "category": "MA", |
477 | | - "solved": 55, |
| 477 | + "solved": 13, |
478 | 478 | "total": 89 |
479 | 479 | }, |
480 | 480 | { |
481 | 481 | "category": "NO", |
482 | | - "solved": 21, |
| 482 | + "solved": 14, |
483 | 483 | "total": 29 |
484 | 484 | }, |
485 | 485 | { |
486 | 486 | "category": "NR", |
487 | | - "solved": 61, |
| 487 | + "solved": 29, |
488 | 488 | "total": 204 |
489 | 489 | }, |
490 | 490 | { |
491 | 491 | "category": "OS", |
492 | | - "solved": 58, |
| 492 | + "solved": 14, |
493 | 493 | "total": 157 |
494 | 494 | }, |
495 | 495 | { |
496 | 496 | "category": "ST", |
497 | | - "solved": 31, |
| 497 | + "solved": 6, |
498 | 498 | "total": 63 |
499 | 499 | }, |
500 | 500 | { |
501 | 501 | "category": "VE", |
502 | | - "solved": 15, |
| 502 | + "solved": 6, |
503 | 503 | "total": 22 |
504 | 504 | } |
505 | 505 | ], |
|
0 commit comments