-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_eval_160M_minipile_table.txt
141 lines (141 loc) · 18.4 KB
/
02_eval_160M_minipile_table.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
| Tasks |Version|Filter|n-shot| Metric | | Value | | Stderr |
|------------------------------------------------------------|------:|------|-----:|----------|---|------------:|---|-----------:|
|arc_challenge | 1|none | 0|acc |↑ | 0.2125|± | 0.0120|
| | |none | 0|acc_norm |↑ | 0.2679|± | 0.0129|
|blimp | 2|none | |acc |↑ | 0.5194|± | 0.0018|
| - blimp_adjunct_island | 1|none | 0|acc |↑ | 0.6290|± | 0.0153|
| - blimp_anaphor_gender_agreement | 1|none | 0|acc |↑ | 0.7590|± | 0.0135|
| - blimp_anaphor_number_agreement | 1|none | 0|acc |↑ | 0.5730|± | 0.0156|
| - blimp_animate_subject_passive | 1|none | 0|acc |↑ | 0.6220|± | 0.0153|
| - blimp_animate_subject_trans | 1|none | 0|acc |↑ | 0.7790|± | 0.0131|
| - blimp_causative | 1|none | 0|acc |↑ | 0.4080|± | 0.0155|
| - blimp_complex_NP_island | 1|none | 0|acc |↑ | 0.4910|± | 0.0158|
| - blimp_coordinate_structure_constraint_complex_left_branch| 1|none | 0|acc |↑ | 0.6770|± | 0.0148|
| - blimp_coordinate_structure_constraint_object_extraction | 1|none | 0|acc |↑ | 0.3380|± | 0.0150|
| - blimp_determiner_noun_agreement_1 | 1|none | 0|acc |↑ | 0.5100|± | 0.0158|
| - blimp_determiner_noun_agreement_2 | 1|none | 0|acc |↑ | 0.5120|± | 0.0158|
| - blimp_determiner_noun_agreement_irregular_1 | 1|none | 0|acc |↑ | 0.4870|± | 0.0158|
| - blimp_determiner_noun_agreement_irregular_2 | 1|none | 0|acc |↑ | 0.5090|± | 0.0158|
| - blimp_determiner_noun_agreement_with_adj_2 | 1|none | 0|acc |↑ | 0.5060|± | 0.0158|
| - blimp_determiner_noun_agreement_with_adj_irregular_1 | 1|none | 0|acc |↑ | 0.4490|± | 0.0157|
| - blimp_determiner_noun_agreement_with_adj_irregular_2 | 1|none | 0|acc |↑ | 0.5120|± | 0.0158|
| - blimp_determiner_noun_agreement_with_adjective_1 | 1|none | 0|acc |↑ | 0.4800|± | 0.0158|
| - blimp_distractor_agreement_relational_noun | 1|none | 0|acc |↑ | 0.5190|± | 0.0158|
| - blimp_distractor_agreement_relative_clause | 1|none | 0|acc |↑ | 0.5520|± | 0.0157|
| - blimp_drop_argument | 1|none | 0|acc |↑ | 0.6990|± | 0.0145|
| - blimp_ellipsis_n_bar_1 | 1|none | 0|acc |↑ | 0.4710|± | 0.0158|
| - blimp_ellipsis_n_bar_2 | 1|none | 0|acc |↑ | 0.3220|± | 0.0148|
| - blimp_existential_there_object_raising | 1|none | 0|acc |↑ | 0.7090|± | 0.0144|
| - blimp_existential_there_quantifiers_1 | 1|none | 0|acc |↑ | 0.3270|± | 0.0148|
| - blimp_existential_there_quantifiers_2 | 1|none | 0|acc |↑ | 0.6870|± | 0.0147|
| - blimp_existential_there_subject_raising | 1|none | 0|acc |↑ | 0.4200|± | 0.0156|
| - blimp_expletive_it_object_raising | 1|none | 0|acc |↑ | 0.6060|± | 0.0155|
| - blimp_inchoative | 1|none | 0|acc |↑ | 0.4190|± | 0.0156|
| - blimp_intransitive | 1|none | 0|acc |↑ | 0.6020|± | 0.0155|
| - blimp_irregular_past_participle_adjectives | 1|none | 0|acc |↑ | 0.7440|± | 0.0138|
| - blimp_irregular_past_participle_verbs | 1|none | 0|acc |↑ | 0.4470|± | 0.0157|
| - blimp_irregular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.4930|± | 0.0158|
| - blimp_irregular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.5420|± | 0.0158|
| - blimp_left_branch_island_echo_question | 1|none | 0|acc |↑ | 0.5630|± | 0.0157|
| - blimp_left_branch_island_simple_question | 1|none | 0|acc |↑ | 0.5160|± | 0.0158|
| - blimp_matrix_question_npi_licensor_present | 1|none | 0|acc |↑ | 0.6320|± | 0.0153|
| - blimp_npi_present_1 | 1|none | 0|acc |↑ | 0.6440|± | 0.0151|
| - blimp_npi_present_2 | 1|none | 0|acc |↑ | 0.5790|± | 0.0156|
| - blimp_only_npi_licensor_present | 1|none | 0|acc |↑ | 0.5420|± | 0.0158|
| - blimp_only_npi_scope | 1|none | 0|acc |↑ | 0.3320|± | 0.0149|
| - blimp_passive_1 | 1|none | 0|acc |↑ | 0.7070|± | 0.0144|
| - blimp_passive_2 | 1|none | 0|acc |↑ | 0.5820|± | 0.0156|
| - blimp_principle_A_c_command | 1|none | 0|acc |↑ | 0.3080|± | 0.0146|
| - blimp_principle_A_case_1 | 1|none | 0|acc |↑ | 0.5540|± | 0.0157|
| - blimp_principle_A_case_2 | 1|none | 0|acc |↑ | 0.4770|± | 0.0158|
| - blimp_principle_A_domain_1 | 1|none | 0|acc |↑ | 0.4730|± | 0.0158|
| - blimp_principle_A_domain_2 | 1|none | 0|acc |↑ | 0.4620|± | 0.0158|
| - blimp_principle_A_domain_3 | 1|none | 0|acc |↑ | 0.5200|± | 0.0158|
| - blimp_principle_A_reconstruction | 1|none | 0|acc |↑ | 0.3920|± | 0.0154|
| - blimp_regular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.4400|± | 0.0157|
| - blimp_regular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.5130|± | 0.0158|
| - blimp_sentential_negation_npi_licensor_present | 1|none | 0|acc |↑ | 0.5270|± | 0.0158|
| - blimp_sentential_negation_npi_scope | 1|none | 0|acc |↑ | 0.2560|± | 0.0138|
| - blimp_sentential_subject_island | 1|none | 0|acc |↑ | 0.4310|± | 0.0157|
| - blimp_superlative_quantifiers_1 | 1|none | 0|acc |↑ | 0.5080|± | 0.0158|
| - blimp_superlative_quantifiers_2 | 1|none | 0|acc |↑ | 0.9110|± | 0.0090|
| - blimp_tough_vs_raising_1 | 1|none | 0|acc |↑ | 0.2720|± | 0.0141|
| - blimp_tough_vs_raising_2 | 1|none | 0|acc |↑ | 0.7090|± | 0.0144|
| - blimp_transitive | 1|none | 0|acc |↑ | 0.5390|± | 0.0158|
| - blimp_wh_island | 1|none | 0|acc |↑ | 0.8840|± | 0.0101|
| - blimp_wh_questions_object_gap | 1|none | 0|acc |↑ | 0.0680|± | 0.0080|
| - blimp_wh_questions_subject_gap | 1|none | 0|acc |↑ | 0.2390|± | 0.0135|
| - blimp_wh_questions_subject_gap_long_distance | 1|none | 0|acc |↑ | 0.4310|± | 0.0157|
| - blimp_wh_vs_that_no_gap | 1|none | 0|acc |↑ | 0.4310|± | 0.0157|
| - blimp_wh_vs_that_no_gap_long_distance | 1|none | 0|acc |↑ | 0.4740|± | 0.0158|
| - blimp_wh_vs_that_with_gap | 1|none | 0|acc |↑ | 0.5410|± | 0.0158|
| - blimp_wh_vs_that_with_gap_long_distance | 1|none | 0|acc |↑ | 0.5420|± | 0.0158|
|hellaswag | 1|none | 0|acc |↑ | 0.2582|± | 0.0044|
| | |none | 0|acc_norm |↑ | 0.2616|± | 0.0044|
|lambada_openai | 1|none | 0|acc |↑ | 0.0000|± | 0.0000|
| | |none | 0|perplexity|↓ | 3033175.2693|± | 288926.5827|
|lambada_standard | 1|none | 0|acc |↑ | 0.0000|± | 0.0000|
| | |none | 0|perplexity|↓ |27067951.3461|± |2710040.1909|
|mmlu | 2|none | |acc |↑ | 0.2465|± | 0.0036|
| - humanities | 2|none | |acc |↑ | 0.2451|± | 0.0063|
| - formal_logic | 1|none | 0|acc |↑ | 0.1984|± | 0.0357|
| - high_school_european_history | 1|none | 0|acc |↑ | 0.2424|± | 0.0335|
| - high_school_us_history | 1|none | 0|acc |↑ | 0.2353|± | 0.0298|
| - high_school_world_history | 1|none | 0|acc |↑ | 0.2616|± | 0.0286|
| - international_law | 1|none | 0|acc |↑ | 0.2479|± | 0.0394|
| - jurisprudence | 1|none | 0|acc |↑ | 0.2963|± | 0.0441|
| - logical_fallacies | 1|none | 0|acc |↑ | 0.2454|± | 0.0338|
| - moral_disputes | 1|none | 0|acc |↑ | 0.2457|± | 0.0232|
| - moral_scenarios | 1|none | 0|acc |↑ | 0.2425|± | 0.0143|
| - philosophy | 1|none | 0|acc |↑ | 0.2733|± | 0.0253|
| - prehistory | 1|none | 0|acc |↑ | 0.2654|± | 0.0246|
| - professional_law | 1|none | 0|acc |↑ | 0.2392|± | 0.0109|
| - world_religions | 1|none | 0|acc |↑ | 0.2105|± | 0.0313|
| - other | 2|none | |acc |↑ | 0.2687|± | 0.0079|
| - business_ethics | 1|none | 0|acc |↑ | 0.2600|± | 0.0441|
| - clinical_knowledge | 1|none | 0|acc |↑ | 0.2679|± | 0.0273|
| - college_medicine | 1|none | 0|acc |↑ | 0.2081|± | 0.0310|
| - global_facts | 1|none | 0|acc |↑ | 0.3100|± | 0.0465|
| - human_aging | 1|none | 0|acc |↑ | 0.3767|± | 0.0325|
| - management | 1|none | 0|acc |↑ | 0.2524|± | 0.0430|
| - marketing | 1|none | 0|acc |↑ | 0.2564|± | 0.0286|
| - medical_genetics | 1|none | 0|acc |↑ | 0.2600|± | 0.0441|
| - miscellaneous | 1|none | 0|acc |↑ | 0.2874|± | 0.0162|
| - nutrition | 1|none | 0|acc |↑ | 0.2288|± | 0.0241|
| - professional_accounting | 1|none | 0|acc |↑ | 0.2553|± | 0.0260|
| - professional_medicine | 1|none | 0|acc |↑ | 0.2022|± | 0.0244|
| - virology | 1|none | 0|acc |↑ | 0.3193|± | 0.0363|
| - social sciences | 2|none | |acc |↑ | 0.2343|± | 0.0076|
| - econometrics | 1|none | 0|acc |↑ | 0.2807|± | 0.0423|
| - high_school_geography | 1|none | 0|acc |↑ | 0.2172|± | 0.0294|
| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.2073|± | 0.0293|
| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.2205|± | 0.0210|
| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.2311|± | 0.0274|
| - high_school_psychology | 1|none | 0|acc |↑ | 0.2367|± | 0.0182|
| - human_sexuality | 1|none | 0|acc |↑ | 0.2290|± | 0.0369|
| - professional_psychology | 1|none | 0|acc |↑ | 0.2565|± | 0.0177|
| - public_relations | 1|none | 0|acc |↑ | 0.3455|± | 0.0455|
| - security_studies | 1|none | 0|acc |↑ | 0.1714|± | 0.0241|
| - sociology | 1|none | 0|acc |↑ | 0.2388|± | 0.0301|
| - us_foreign_policy | 1|none | 0|acc |↑ | 0.2100|± | 0.0409|
| - stem | 2|none | |acc |↑ | 0.2388|± | 0.0076|
| - abstract_algebra | 1|none | 0|acc |↑ | 0.2600|± | 0.0441|
| - anatomy | 1|none | 0|acc |↑ | 0.2519|± | 0.0375|
| - astronomy | 1|none | 0|acc |↑ | 0.1842|± | 0.0315|
| - college_biology | 1|none | 0|acc |↑ | 0.2222|± | 0.0348|
| - college_chemistry | 1|none | 0|acc |↑ | 0.2100|± | 0.0409|
| - college_computer_science | 1|none | 0|acc |↑ | 0.1500|± | 0.0359|
| - college_mathematics | 1|none | 0|acc |↑ | 0.2300|± | 0.0423|
| - college_physics | 1|none | 0|acc |↑ | 0.1961|± | 0.0395|
| - computer_security | 1|none | 0|acc |↑ | 0.2400|± | 0.0429|
| - conceptual_physics | 1|none | 0|acc |↑ | 0.3234|± | 0.0306|
| - electrical_engineering | 1|none | 0|acc |↑ | 0.2207|± | 0.0346|
| - elementary_mathematics | 1|none | 0|acc |↑ | 0.2566|± | 0.0225|
| - high_school_biology | 1|none | 0|acc |↑ | 0.2548|± | 0.0248|
| - high_school_chemistry | 1|none | 0|acc |↑ | 0.2709|± | 0.0313|
| - high_school_computer_science | 1|none | 0|acc |↑ | 0.2300|± | 0.0423|
| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2630|± | 0.0268|
| - high_school_physics | 1|none | 0|acc |↑ | 0.1987|± | 0.0326|
| - high_school_statistics | 1|none | 0|acc |↑ | 0.1620|± | 0.0251|
| - machine_learning | 1|none | 0|acc |↑ | 0.2857|± | 0.0429|
|winogrande | 1|none | 0|acc |↑ | 0.4933|± | 0.0141|