-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_eval_160M_pile-dedup_table.txt
141 lines (141 loc) · 16.9 KB
/
02_eval_160M_pile-dedup_table.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|------------------------------------------------------------|------:|------|-----:|----------|---|-------:|---|-----:|
|arc_challenge | 1|none | 0|acc |↑ | 0.1997|± |0.0117|
| | |none | 0|acc_norm |↑ | 0.2398|± |0.0125|
|blimp | 2|none | |acc |↑ | 0.7294|± |0.0015|
| - blimp_adjunct_island | 1|none | 0|acc |↑ | 0.8150|± |0.0123|
| - blimp_anaphor_gender_agreement | 1|none | 0|acc |↑ | 0.9640|± |0.0059|
| - blimp_anaphor_number_agreement | 1|none | 0|acc |↑ | 0.9810|± |0.0043|
| - blimp_animate_subject_passive | 1|none | 0|acc |↑ | 0.7870|± |0.0130|
| - blimp_animate_subject_trans | 1|none | 0|acc |↑ | 0.7310|± |0.0140|
| - blimp_causative | 1|none | 0|acc |↑ | 0.7060|± |0.0144|
| - blimp_complex_NP_island | 1|none | 0|acc |↑ | 0.5360|± |0.0158|
| - blimp_coordinate_structure_constraint_complex_left_branch| 1|none | 0|acc |↑ | 0.3500|± |0.0151|
| - blimp_coordinate_structure_constraint_object_extraction | 1|none | 0|acc |↑ | 0.7390|± |0.0139|
| - blimp_determiner_noun_agreement_1 | 1|none | 0|acc |↑ | 0.9670|± |0.0057|
| - blimp_determiner_noun_agreement_2 | 1|none | 0|acc |↑ | 0.9480|± |0.0070|
| - blimp_determiner_noun_agreement_irregular_1 | 1|none | 0|acc |↑ | 0.8330|± |0.0118|
| - blimp_determiner_noun_agreement_irregular_2 | 1|none | 0|acc |↑ | 0.8510|± |0.0113|
| - blimp_determiner_noun_agreement_with_adj_2 | 1|none | 0|acc |↑ | 0.8850|± |0.0101|
| - blimp_determiner_noun_agreement_with_adj_irregular_1 | 1|none | 0|acc |↑ | 0.8050|± |0.0125|
| - blimp_determiner_noun_agreement_with_adj_irregular_2 | 1|none | 0|acc |↑ | 0.8390|± |0.0116|
| - blimp_determiner_noun_agreement_with_adjective_1 | 1|none | 0|acc |↑ | 0.9170|± |0.0087|
| - blimp_distractor_agreement_relational_noun | 1|none | 0|acc |↑ | 0.7400|± |0.0139|
| - blimp_distractor_agreement_relative_clause | 1|none | 0|acc |↑ | 0.5460|± |0.0158|
| - blimp_drop_argument | 1|none | 0|acc |↑ | 0.6880|± |0.0147|
| - blimp_ellipsis_n_bar_1 | 1|none | 0|acc |↑ | 0.6770|± |0.0148|
| - blimp_ellipsis_n_bar_2 | 1|none | 0|acc |↑ | 0.9250|± |0.0083|
| - blimp_existential_there_object_raising | 1|none | 0|acc |↑ | 0.8740|± |0.0105|
| - blimp_existential_there_quantifiers_1 | 1|none | 0|acc |↑ | 0.9700|± |0.0054|
| - blimp_existential_there_quantifiers_2 | 1|none | 0|acc |↑ | 0.1860|± |0.0123|
| - blimp_existential_there_subject_raising | 1|none | 0|acc |↑ | 0.8090|± |0.0124|
| - blimp_expletive_it_object_raising | 1|none | 0|acc |↑ | 0.7800|± |0.0131|
| - blimp_inchoative | 1|none | 0|acc |↑ | 0.5650|± |0.0157|
| - blimp_intransitive | 1|none | 0|acc |↑ | 0.6870|± |0.0147|
| - blimp_irregular_past_participle_adjectives | 1|none | 0|acc |↑ | 0.7060|± |0.0144|
| - blimp_irregular_past_participle_verbs | 1|none | 0|acc |↑ | 0.7900|± |0.0129|
| - blimp_irregular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.8690|± |0.0107|
| - blimp_irregular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.7530|± |0.0136|
| - blimp_left_branch_island_echo_question | 1|none | 0|acc |↑ | 0.3540|± |0.0151|
| - blimp_left_branch_island_simple_question | 1|none | 0|acc |↑ | 0.2840|± |0.0143|
| - blimp_matrix_question_npi_licensor_present | 1|none | 0|acc |↑ | 0.2440|± |0.0136|
| - blimp_npi_present_1 | 1|none | 0|acc |↑ | 0.6470|± |0.0151|
| - blimp_npi_present_2 | 1|none | 0|acc |↑ | 0.6300|± |0.0153|
| - blimp_only_npi_licensor_present | 1|none | 0|acc |↑ | 1.0000|± |0.0000|
| - blimp_only_npi_scope | 1|none | 0|acc |↑ | 0.8510|± |0.0113|
| - blimp_passive_1 | 1|none | 0|acc |↑ | 0.8830|± |0.0102|
| - blimp_passive_2 | 1|none | 0|acc |↑ | 0.8660|± |0.0108|
| - blimp_principle_A_c_command | 1|none | 0|acc |↑ | 0.5160|± |0.0158|
| - blimp_principle_A_case_1 | 1|none | 0|acc |↑ | 1.0000|± |0.0000|
| - blimp_principle_A_case_2 | 1|none | 0|acc |↑ | 0.9450|± |0.0072|
| - blimp_principle_A_domain_1 | 1|none | 0|acc |↑ | 0.9910|± |0.0030|
| - blimp_principle_A_domain_2 | 1|none | 0|acc |↑ | 0.8200|± |0.0122|
| - blimp_principle_A_domain_3 | 1|none | 0|acc |↑ | 0.5840|± |0.0156|
| - blimp_principle_A_reconstruction | 1|none | 0|acc |↑ | 0.1610|± |0.0116|
| - blimp_regular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.8770|± |0.0104|
| - blimp_regular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.8610|± |0.0109|
| - blimp_sentential_negation_npi_licensor_present | 1|none | 0|acc |↑ | 0.9990|± |0.0010|
| - blimp_sentential_negation_npi_scope | 1|none | 0|acc |↑ | 0.6850|± |0.0147|
| - blimp_sentential_subject_island | 1|none | 0|acc |↑ | 0.3300|± |0.0149|
| - blimp_superlative_quantifiers_1 | 1|none | 0|acc |↑ | 0.6370|± |0.0152|
| - blimp_superlative_quantifiers_2 | 1|none | 0|acc |↑ | 0.7620|± |0.0135|
| - blimp_tough_vs_raising_1 | 1|none | 0|acc |↑ | 0.5340|± |0.0158|
| - blimp_tough_vs_raising_2 | 1|none | 0|acc |↑ | 0.8340|± |0.0118|
| - blimp_transitive | 1|none | 0|acc |↑ | 0.8020|± |0.0126|
| - blimp_wh_island | 1|none | 0|acc |↑ | 0.6700|± |0.0149|
| - blimp_wh_questions_object_gap | 1|none | 0|acc |↑ | 0.6840|± |0.0147|
| - blimp_wh_questions_subject_gap | 1|none | 0|acc |↑ | 0.8820|± |0.0102|
| - blimp_wh_questions_subject_gap_long_distance | 1|none | 0|acc |↑ | 0.8260|± |0.0120|
| - blimp_wh_vs_that_no_gap | 1|none | 0|acc |↑ | 0.9730|± |0.0051|
| - blimp_wh_vs_that_no_gap_long_distance | 1|none | 0|acc |↑ | 0.9700|± |0.0054|
| - blimp_wh_vs_that_with_gap | 1|none | 0|acc |↑ | 0.4010|± |0.0155|
| - blimp_wh_vs_that_with_gap_long_distance | 1|none | 0|acc |↑ | 0.1500|± |0.0113|
|hellaswag | 1|none | 0|acc |↑ | 0.2903|± |0.0045|
| | |none | 0|acc_norm |↑ | 0.3135|± |0.0046|
|lambada_openai | 1|none | 0|acc |↑ | 0.3691|± |0.0067|
| | |none | 0|perplexity|↓ | 31.2589|± |1.1594|
|lambada_standard | 1|none | 0|acc |↑ | 0,|± |0.0059|
| | |none | 0|perplexity|↓ |172.7619|± |7.7265|
|mmlu | 2|none | |acc |↑ | 0.2299|± |0.0035|
| - humanities | 2|none | |acc |↑ | 0.2417|± |0.0062|
| - formal_logic | 1|none | 0|acc |↑ | 0.2778|± |0.0401|
| - high_school_european_history | 1|none | 0|acc |↑ | 0.2242|± |0.0326|
| - high_school_us_history | 1|none | 0|acc |↑ | 0.2500|± |0.0304|
| - high_school_world_history | 1|none | 0|acc |↑ | 0.2700|± |0.0289|
| - international_law | 1|none | 0|acc |↑ | 0.2397|± |0.0390|
| - jurisprudence | 1|none | 0|acc |↑ | 0.2593|± |0.0424|
| - logical_fallacies | 1|none | 0|acc |↑ | 0.2209|± |0.0326|
| - moral_disputes | 1|none | 0|acc |↑ | 0.2486|± |0.0233|
| - moral_scenarios | 1|none | 0|acc |↑ | 0.2380|± |0.0142|
| - philosophy | 1|none | 0|acc |↑ | 0.1865|± |0.0221|
| - prehistory | 1|none | 0|acc |↑ | 0.2130|± |0.0228|
| - professional_law | 1|none | 0|acc |↑ | 0.2458|± |0.0110|
| - world_religions | 1|none | 0|acc |↑ | 0.3158|± |0.0357|
| - other | 2|none | |acc |↑ | 0.2401|± |0.0076|
| - business_ethics | 1|none | 0|acc |↑ | 0.3000|± |0.0461|
| - clinical_knowledge | 1|none | 0|acc |↑ | 0.2151|± |0.0253|
| - college_medicine | 1|none | 0|acc |↑ | 0.2081|± |0.0310|
| - global_facts | 1|none | 0|acc |↑ | 0.1800|± |0.0386|
| - human_aging | 1|none | 0|acc |↑ | 0.3094|± |0.0310|
| - management | 1|none | 0|acc |↑ | 0.1748|± |0.0376|
| - marketing | 1|none | 0|acc |↑ | 0.2906|± |0.0297|
| - medical_genetics | 1|none | 0|acc |↑ | 0.3000|± |0.0461|
| - miscellaneous | 1|none | 0|acc |↑ | 0.2375|± |0.0152|
| - nutrition | 1|none | 0|acc |↑ | 0.2190|± |0.0237|
| - professional_accounting | 1|none | 0|acc |↑ | 0.2340|± |0.0253|
| - professional_medicine | 1|none | 0|acc |↑ | 0.1985|± |0.0242|
| - virology | 1|none | 0|acc |↑ | 0.2831|± |0.0351|
| - social sciences | 2|none | |acc |↑ | 0.2187|± |0.0074|
| - econometrics | 1|none | 0|acc |↑ | 0.2368|± |0.0400|
| - high_school_geography | 1|none | 0|acc |↑ | 0.1869|± |0.0278|
| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.1969|± |0.0287|
| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.2051|± |0.0205|
| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.2101|± |0.0265|
| - high_school_psychology | 1|none | 0|acc |↑ | 0.1927|± |0.0169|
| - human_sexuality | 1|none | 0|acc |↑ | 0.2595|± |0.0384|
| - professional_psychology | 1|none | 0|acc |↑ | 0.2516|± |0.0176|
| - public_relations | 1|none | 0|acc |↑ | 0.2182|± |0.0396|
| - security_studies | 1|none | 0|acc |↑ | 0.1878|± |0.0250|
| - sociology | 1|none | 0|acc |↑ | 0.2488|± |0.0306|
| - us_foreign_policy | 1|none | 0|acc |↑ | 0.2800|± |0.0451|
| - stem | 2|none | |acc |↑ | 0.2131|± |0.0073|
| - abstract_algebra | 1|none | 0|acc |↑ | 0.2200|± |0.0416|
| - anatomy | 1|none | 0|acc |↑ | 0.1926|± |0.0341|
| - astronomy | 1|none | 0|acc |↑ | 0.1776|± |0.0311|
| - college_biology | 1|none | 0|acc |↑ | 0.2639|± |0.0369|
| - college_chemistry | 1|none | 0|acc |↑ | 0.1900|± |0.0394|
| - college_computer_science | 1|none | 0|acc |↑ | 0.2600|± |0.0441|
| - college_mathematics | 1|none | 0|acc |↑ | 0.2100|± |0.0409|
| - college_physics | 1|none | 0|acc |↑ | 0.2157|± |0.0409|
| - computer_security | 1|none | 0|acc |↑ | 0.2800|± |0.0451|
| - conceptual_physics | 1|none | 0|acc |↑ | 0.2596|± |0.0287|
| - electrical_engineering | 1|none | 0|acc |↑ | 0.2345|± |0.0353|
| - elementary_mathematics | 1|none | 0|acc |↑ | 0.2090|± |0.0209|
| - high_school_biology | 1|none | 0|acc |↑ | 0.1742|± |0.0216|
| - high_school_chemistry | 1|none | 0|acc |↑ | 0.1724|± |0.0266|
| - high_school_computer_science | 1|none | 0|acc |↑ | 0.2600|± |0.0441|
| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2074|± |0.0247|
| - high_school_physics | 1|none | 0|acc |↑ | 0.1987|± |0.0326|
| - high_school_statistics | 1|none | 0|acc |↑ | 0.1528|± |0.0245|
| - machine_learning | 1|none | 0|acc |↑ | 0.3125|± |0.0440|
|winogrande | 1|none | 0|acc |↑ | 0.4957|± |0.0141|