|
26 | 26 | "validity_check": 1.0 |
27 | 27 | }, |
28 | 28 | "composite_score": 0.9797 |
| 29 | + }, |
| 30 | + { |
| 31 | + "label": "gpt-5-eval-2026-04-02", |
| 32 | + "model_name": "gpt-5", |
| 33 | + "provider": "openai", |
| 34 | + "model_version": "2026-04-02", |
| 35 | + "benchmark_name": "ABC-GenBench", |
| 36 | + "benchmark_version": "0.2.0-eval", |
| 37 | + "benchmark_split": "eval", |
| 38 | + "run_type": "community", |
| 39 | + "submission_date": "2026-04-01", |
| 40 | + "notes": "", |
| 41 | + "aggregate_scores": { |
| 42 | + "constraint_following": 0.95, |
| 43 | + "editing_continuation": 0.8926, |
| 44 | + "validity_renderability": 0.99 |
| 45 | + }, |
| 46 | + "task_type_scores": { |
| 47 | + "controlled_generation": 0.95, |
| 48 | + "error_correction": 1.0, |
| 49 | + "free_continuation": 0.6574, |
| 50 | + "middle_infilling": 0.9875, |
| 51 | + "next_bar_choice": 1.0, |
| 52 | + "style_variation": 0.9215, |
| 53 | + "validity_check": 0.99 |
| 54 | + }, |
| 55 | + "composite_score": 0.9442 |
| 56 | + }, |
| 57 | + { |
| 58 | + "label": "gpt-5-mini-eval-2026-04-02", |
| 59 | + "model_name": "gpt-5-mini", |
| 60 | + "provider": "openai", |
| 61 | + "model_version": "2026-04-02", |
| 62 | + "benchmark_name": "ABC-GenBench", |
| 63 | + "benchmark_version": "0.2.0-eval", |
| 64 | + "benchmark_split": "eval", |
| 65 | + "run_type": "community", |
| 66 | + "submission_date": "2026-04-01", |
| 67 | + "notes": "", |
| 68 | + "aggregate_scores": { |
| 69 | + "constraint_following": 0.9175, |
| 70 | + "editing_continuation": 0.9133, |
| 71 | + "validity_renderability": 0.8918 |
| 72 | + }, |
| 73 | + "task_type_scores": { |
| 74 | + "controlled_generation": 0.9175, |
| 75 | + "error_correction": 0.9536, |
| 76 | + "free_continuation": 0.7676, |
| 77 | + "middle_infilling": 0.9792, |
| 78 | + "next_bar_choice": 1.0, |
| 79 | + "style_variation": 0.9305, |
| 80 | + "validity_check": 0.8918 |
| 81 | + }, |
| 82 | + "composite_score": 0.9075 |
| 83 | + }, |
| 84 | + { |
| 85 | + "label": "gpt-5-nano-eval-2026-04-02", |
| 86 | + "model_name": "gpt-5-nano", |
| 87 | + "provider": "openai", |
| 88 | + "model_version": "2026-04-02", |
| 89 | + "benchmark_name": "ABC-GenBench", |
| 90 | + "benchmark_version": "0.2.0-eval", |
| 91 | + "benchmark_split": "eval", |
| 92 | + "run_type": "community", |
| 93 | + "submission_date": "2026-04-01", |
| 94 | + "notes": "", |
| 95 | + "aggregate_scores": { |
| 96 | + "constraint_following": 0.88, |
| 97 | + "editing_continuation": 0.788, |
| 98 | + "validity_renderability": 0.7019 |
| 99 | + }, |
| 100 | + "task_type_scores": { |
| 101 | + "controlled_generation": 0.88, |
| 102 | + "error_correction": 0.9745, |
| 103 | + "free_continuation": 0.3235, |
| 104 | + "middle_infilling": 0.8671, |
| 105 | + "next_bar_choice": 1.0, |
| 106 | + "style_variation": 0.934, |
| 107 | + "validity_check": 0.7019 |
| 108 | + }, |
| 109 | + "composite_score": 0.79 |
29 | 110 | } |
30 | 111 | ] |
31 | 112 | } |
0 commit comments