-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04_eval_1.4B_minipile_table.txt
143 lines (143 loc) · 18.4 KB
/
04_eval_1.4B_minipile_table.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
| Tasks |Version|Filter|n-shot| Metric | | Value | | Stderr |
|------------------------------------------------------------|------:|------|-----:|----------|---|-----------:|---|----------:|
|arc_challenge | 1|none | 0|acc |↑ | 0.1903|± | 0.0115|
| | |none | 0|acc_norm |↑ | 0.2201|± | 0.0121|
|arc_easy | 1|none | 0|acc |↑ | 0.2715|± | 0.0091|
| | |none | 0|acc_norm |↑ | 0.2685|± | 0.0091|
|blimp | 2|none | |acc |↑ | 0.5483|± | 0.0017|
| - blimp_adjunct_island | 1|none | 0|acc |↑ | 0.6120|± | 0.0154|
| - blimp_anaphor_gender_agreement | 1|none | 0|acc |↑ | 0.1920|± | 0.0125|
| - blimp_anaphor_number_agreement | 1|none | 0|acc |↑ | 0.3580|± | 0.0152|
| - blimp_animate_subject_passive | 1|none | 0|acc |↑ | 0.6260|± | 0.0153|
| - blimp_animate_subject_trans | 1|none | 0|acc |↑ | 0.8010|± | 0.0126|
| - blimp_causative | 1|none | 0|acc |↑ | 0.5030|± | 0.0158|
| - blimp_complex_NP_island | 1|none | 0|acc |↑ | 0.4510|± | 0.0157|
| - blimp_coordinate_structure_constraint_complex_left_branch| 1|none | 0|acc |↑ | 0.1270|± | 0.0105|
| - blimp_coordinate_structure_constraint_object_extraction | 1|none | 0|acc |↑ | 0.5320|± | 0.0158|
| - blimp_determiner_noun_agreement_1 | 1|none | 0|acc |↑ | 0.5080|± | 0.0158|
| - blimp_determiner_noun_agreement_2 | 1|none | 0|acc |↑ | 0.5070|± | 0.0158|
| - blimp_determiner_noun_agreement_irregular_1 | 1|none | 0|acc |↑ | 0.4800|± | 0.0158|
| - blimp_determiner_noun_agreement_irregular_2 | 1|none | 0|acc |↑ | 0.5330|± | 0.0158|
| - blimp_determiner_noun_agreement_with_adj_2 | 1|none | 0|acc |↑ | 0.4790|± | 0.0158|
| - blimp_determiner_noun_agreement_with_adj_irregular_1 | 1|none | 0|acc |↑ | 0.4380|± | 0.0157|
| - blimp_determiner_noun_agreement_with_adj_irregular_2 | 1|none | 0|acc |↑ | 0.5380|± | 0.0158|
| - blimp_determiner_noun_agreement_with_adjective_1 | 1|none | 0|acc |↑ | 0.4950|± | 0.0158|
| - blimp_distractor_agreement_relational_noun | 1|none | 0|acc |↑ | 0.3090|± | 0.0146|
| - blimp_distractor_agreement_relative_clause | 1|none | 0|acc |↑ | 0.2890|± | 0.0143|
| - blimp_drop_argument | 1|none | 0|acc |↑ | 0.6960|± | 0.0146|
| - blimp_ellipsis_n_bar_1 | 1|none | 0|acc |↑ | 0.1200|± | 0.0103|
| - blimp_ellipsis_n_bar_2 | 1|none | 0|acc |↑ | 0.3470|± | 0.0151|
| - blimp_existential_there_object_raising | 1|none | 0|acc |↑ | 0.7980|± | 0.0127|
| - blimp_existential_there_quantifiers_1 | 1|none | 0|acc |↑ | 0.8510|± | 0.0113|
| - blimp_existential_there_quantifiers_2 | 1|none | 0|acc |↑ | 0.4380|± | 0.0157|
| - blimp_existential_there_subject_raising | 1|none | 0|acc |↑ | 0.5020|± | 0.0158|
| - blimp_expletive_it_object_raising | 1|none | 0|acc |↑ | 0.6990|± | 0.0145|
| - blimp_inchoative | 1|none | 0|acc |↑ | 0.3940|± | 0.0155|
| - blimp_intransitive | 1|none | 0|acc |↑ | 0.5870|± | 0.0156|
| - blimp_irregular_past_participle_adjectives | 1|none | 0|acc |↑ | 0.7740|± | 0.0132|
| - blimp_irregular_past_participle_verbs | 1|none | 0|acc |↑ | 0.7510|± | 0.0137|
| - blimp_irregular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.5540|± | 0.0157|
| - blimp_irregular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.4920|± | 0.0158|
| - blimp_left_branch_island_echo_question | 1|none | 0|acc |↑ | 0.7910|± | 0.0129|
| - blimp_left_branch_island_simple_question | 1|none | 0|acc |↑ | 0.2040|± | 0.0127|
| - blimp_matrix_question_npi_licensor_present | 1|none | 0|acc |↑ | 0.0060|± | 0.0024|
| - blimp_npi_present_1 | 1|none | 0|acc |↑ | 0.5800|± | 0.0156|
| - blimp_npi_present_2 | 1|none | 0|acc |↑ | 0.5470|± | 0.0157|
| - blimp_only_npi_licensor_present | 1|none | 0|acc |↑ | 0.2230|± | 0.0132|
| - blimp_only_npi_scope | 1|none | 0|acc |↑ | 0.3190|± | 0.0147|
| - blimp_passive_1 | 1|none | 0|acc |↑ | 0.7340|± | 0.0140|
| - blimp_passive_2 | 1|none | 0|acc |↑ | 0.6170|± | 0.0154|
| - blimp_principle_A_c_command | 1|none | 0|acc |↑ | 0.6420|± | 0.0152|
| - blimp_principle_A_case_1 | 1|none | 0|acc |↑ | 1.0000|± | 0.0000|
| - blimp_principle_A_case_2 | 1|none | 0|acc |↑ | 0.5150|± | 0.0158|
| - blimp_principle_A_domain_1 | 1|none | 0|acc |↑ | 0.9990|± | 0.0010|
| - blimp_principle_A_domain_2 | 1|none | 0|acc |↑ | 0.5890|± | 0.0156|
| - blimp_principle_A_domain_3 | 1|none | 0|acc |↑ | 0.5080|± | 0.0158|
| - blimp_principle_A_reconstruction | 1|none | 0|acc |↑ | 0.4220|± | 0.0156|
| - blimp_regular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.6720|± | 0.0149|
| - blimp_regular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.5690|± | 0.0157|
| - blimp_sentential_negation_npi_licensor_present | 1|none | 0|acc |↑ | 1.0000|± | 0.0000|
| - blimp_sentential_negation_npi_scope | 1|none | 0|acc |↑ | 0.6790|± | 0.0148|
| - blimp_sentential_subject_island | 1|none | 0|acc |↑ | 0.6260|± | 0.0153|
| - blimp_superlative_quantifiers_1 | 1|none | 0|acc |↑ | 0.5400|± | 0.0158|
| - blimp_superlative_quantifiers_2 | 1|none | 0|acc |↑ | 0.2340|± | 0.0134|
| - blimp_tough_vs_raising_1 | 1|none | 0|acc |↑ | 0.3020|± | 0.0145|
| - blimp_tough_vs_raising_2 | 1|none | 0|acc |↑ | 0.7140|± | 0.0143|
| - blimp_transitive | 1|none | 0|acc |↑ | 0.6530|± | 0.0151|
| - blimp_wh_island | 1|none | 0|acc |↑ | 0.6970|± | 0.0145|
| - blimp_wh_questions_object_gap | 1|none | 0|acc |↑ | 0.6210|± | 0.0153|
| - blimp_wh_questions_subject_gap | 1|none | 0|acc |↑ | 0.9780|± | 0.0046|
| - blimp_wh_questions_subject_gap_long_distance | 1|none | 0|acc |↑ | 0.9730|± | 0.0051|
| - blimp_wh_vs_that_no_gap | 1|none | 0|acc |↑ | 1.0000|± | 0.0000|
| - blimp_wh_vs_that_no_gap_long_distance | 1|none | 0|acc |↑ | 1.0000|± | 0.0000|
| - blimp_wh_vs_that_with_gap | 1|none | 0|acc |↑ | 0.0000|± | 0.0000|
| - blimp_wh_vs_that_with_gap_long_distance | 1|none | 0|acc |↑ | 0.0000|± | 0.0000|
|hellaswag | 1|none | 0|acc |↑ | 0.2579|± | 0.0044|
| | |none | 0|acc_norm |↑ | 0.2558|± | 0.0044|
|lambada_openai | 1|none | 0|acc |↑ | 0.0000|± | 0.0000|
| | |none | 0|perplexity|↓ |1564928.5258|± |118691.4565|
|lambada_standard | 1|none | 0|acc |↑ | 0.0000|± | 0.0000|
| | |none | 0|perplexity|↓ |8848600.9409|± |745031.8900|
|mmlu | 2|none | |acc |↑ | 0.2295|± | 0.0035|
| - humanities | 2|none | |acc |↑ | 0.2421|± | 0.0062|
| - formal_logic | 1|none | 0|acc |↑ | 0.2857|± | 0.0404|
| - high_school_european_history | 1|none | 0|acc |↑ | 0.2182|± | 0.0323|
| - high_school_us_history | 1|none | 0|acc |↑ | 0.2500|± | 0.0304|
| - high_school_world_history | 1|none | 0|acc |↑ | 0.2700|± | 0.0289|
| - international_law | 1|none | 0|acc |↑ | 0.2397|± | 0.0390|
| - jurisprudence | 1|none | 0|acc |↑ | 0.2593|± | 0.0424|
| - logical_fallacies | 1|none | 0|acc |↑ | 0.2209|± | 0.0326|
| - moral_disputes | 1|none | 0|acc |↑ | 0.2486|± | 0.0233|
| - moral_scenarios | 1|none | 0|acc |↑ | 0.2380|± | 0.0142|
| - philosophy | 1|none | 0|acc |↑ | 0.1865|± | 0.0221|
| - prehistory | 1|none | 0|acc |↑ | 0.2160|± | 0.0229|
| - professional_law | 1|none | 0|acc |↑ | 0.2458|± | 0.0110|
| - world_religions | 1|none | 0|acc |↑ | 0.3216|± | 0.0358|
| - other | 2|none | |acc |↑ | 0.2398|± | 0.0076|
| - business_ethics | 1|none | 0|acc |↑ | 0.3000|± | 0.0461|
| - clinical_knowledge | 1|none | 0|acc |↑ | 0.2151|± | 0.0253|
| - college_medicine | 1|none | 0|acc |↑ | 0.2081|± | 0.0310|
| - global_facts | 1|none | 0|acc |↑ | 0.1800|± | 0.0386|
| - human_aging | 1|none | 0|acc |↑ | 0.3139|± | 0.0311|
| - management | 1|none | 0|acc |↑ | 0.1748|± | 0.0376|
| - marketing | 1|none | 0|acc |↑ | 0.2906|± | 0.0297|
| - medical_genetics | 1|none | 0|acc |↑ | 0.3000|± | 0.0461|
| - miscellaneous | 1|none | 0|acc |↑ | 0.2375|± | 0.0152|
| - nutrition | 1|none | 0|acc |↑ | 0.2255|± | 0.0239|
| - professional_accounting | 1|none | 0|acc |↑ | 0.2340|± | 0.0253|
| - professional_medicine | 1|none | 0|acc |↑ | 0.1838|± | 0.0235|
| - virology | 1|none | 0|acc |↑ | 0.2831|± | 0.0351|
| - social sciences | 2|none | |acc |↑ | 0.2171|± | 0.0074|
| - econometrics | 1|none | 0|acc |↑ | 0.2368|± | 0.0400|
| - high_school_geography | 1|none | 0|acc |↑ | 0.1768|± | 0.0272|
| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.1969|± | 0.0287|
| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.2026|± | 0.0204|
| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.2101|± | 0.0265|
| - high_school_psychology | 1|none | 0|acc |↑ | 0.1927|± | 0.0169|
| - human_sexuality | 1|none | 0|acc |↑ | 0.2595|± | 0.0384|
| - professional_psychology | 1|none | 0|acc |↑ | 0.2500|± | 0.0175|
| - public_relations | 1|none | 0|acc |↑ | 0.2182|± | 0.0396|
| - security_studies | 1|none | 0|acc |↑ | 0.1878|± | 0.0250|
| - sociology | 1|none | 0|acc |↑ | 0.2438|± | 0.0304|
| - us_foreign_policy | 1|none | 0|acc |↑ | 0.2800|± | 0.0451|
| - stem | 2|none | |acc |↑ | 0.2125|± | 0.0073|
| - abstract_algebra | 1|none | 0|acc |↑ | 0.2200|± | 0.0416|
| - anatomy | 1|none | 0|acc |↑ | 0.1852|± | 0.0336|
| - astronomy | 1|none | 0|acc |↑ | 0.1776|± | 0.0311|
| - college_biology | 1|none | 0|acc |↑ | 0.2569|± | 0.0365|
| - college_chemistry | 1|none | 0|acc |↑ | 0.2000|± | 0.0402|
| - college_computer_science | 1|none | 0|acc |↑ | 0.2600|± | 0.0441|
| - college_mathematics | 1|none | 0|acc |↑ | 0.2100|± | 0.0409|
| - college_physics | 1|none | 0|acc |↑ | 0.2157|± | 0.0409|
| - computer_security | 1|none | 0|acc |↑ | 0.2800|± | 0.0451|
| - conceptual_physics | 1|none | 0|acc |↑ | 0.2638|± | 0.0288|
| - electrical_engineering | 1|none | 0|acc |↑ | 0.2414|± | 0.0357|
| - elementary_mathematics | 1|none | 0|acc |↑ | 0.2090|± | 0.0209|
| - high_school_biology | 1|none | 0|acc |↑ | 0.1774|± | 0.0217|
| - high_school_chemistry | 1|none | 0|acc |↑ | 0.1527|± | 0.0253|
| - high_school_computer_science | 1|none | 0|acc |↑ | 0.2500|± | 0.0435|
| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2111|± | 0.0249|
| - high_school_physics | 1|none | 0|acc |↑ | 0.1987|± | 0.0326|
| - high_school_statistics | 1|none | 0|acc |↑ | 0.1528|± | 0.0245|
| - machine_learning | 1|none | 0|acc |↑ | 0.3125|± | 0.0440|
|winogrande | 1|none | 0|acc |↑ | 0.5185|± | 0.0140|