-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04_eval_1.4B_pile-dedup_table.txt
138 lines (138 loc) · 16.4 KB
/
04_eval_1.4B_pile-dedup_table.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|------------------------------------------------------------|------:|------|-----:|----------|---|------:|---|-----:|
|blimp | 2|none | |acc |↑ | 0.8154|± |0.0013|
| - blimp_adjunct_island | 1|none | 0|acc |↑ | 0.9150|± |0.0088|
| - blimp_anaphor_gender_agreement | 1|none | 0|acc |↑ | 0.9880|± |0.0034|
| - blimp_anaphor_number_agreement | 1|none | 0|acc |↑ | 0.9950|± |0.0022|
| - blimp_animate_subject_passive | 1|none | 0|acc |↑ | 0.7900|± |0.0129|
| - blimp_animate_subject_trans | 1|none | 0|acc |↑ | 0.9190|± |0.0086|
| - blimp_causative | 1|none | 0|acc |↑ | 0.7490|± |0.0137|
| - blimp_complex_NP_island | 1|none | 0|acc |↑ | 0.5710|± |0.0157|
| - blimp_coordinate_structure_constraint_complex_left_branch| 1|none | 0|acc |↑ | 0.7650|± |0.0134|
| - blimp_coordinate_structure_constraint_object_extraction | 1|none | 0|acc |↑ | 0.8570|± |0.0111|
| - blimp_determiner_noun_agreement_1 | 1|none | 0|acc |↑ | 0.9920|± |0.0028|
| - blimp_determiner_noun_agreement_2 | 1|none | 0|acc |↑ | 0.9800|± |0.0044|
| - blimp_determiner_noun_agreement_irregular_1 | 1|none | 0|acc |↑ | 0.9400|± |0.0075|
| - blimp_determiner_noun_agreement_irregular_2 | 1|none | 0|acc |↑ | 0.9540|± |0.0066|
| - blimp_determiner_noun_agreement_with_adj_2 | 1|none | 0|acc |↑ | 0.9550|± |0.0066|
| - blimp_determiner_noun_agreement_with_adj_irregular_1 | 1|none | 0|acc |↑ | 0.9160|± |0.0088|
| - blimp_determiner_noun_agreement_with_adj_irregular_2 | 1|none | 0|acc |↑ | 0.9280|± |0.0082|
| - blimp_determiner_noun_agreement_with_adjective_1 | 1|none | 0|acc |↑ | 0.9700|± |0.0054|
| - blimp_distractor_agreement_relational_noun | 1|none | 0|acc |↑ | 0.8710|± |0.0106|
| - blimp_distractor_agreement_relative_clause | 1|none | 0|acc |↑ | 0.7700|± |0.0133|
| - blimp_drop_argument | 1|none | 0|acc |↑ | 0.7040|± |0.0144|
| - blimp_ellipsis_n_bar_1 | 1|none | 0|acc |↑ | 0.8790|± |0.0103|
| - blimp_ellipsis_n_bar_2 | 1|none | 0|acc |↑ | 0.8880|± |0.0100|
| - blimp_existential_there_object_raising | 1|none | 0|acc |↑ | 0.8790|± |0.0103|
| - blimp_existential_there_quantifiers_1 | 1|none | 0|acc |↑ | 0.9780|± |0.0046|
| - blimp_existential_there_quantifiers_2 | 1|none | 0|acc |↑ | 0.4030|± |0.0155|
| - blimp_existential_there_subject_raising | 1|none | 0|acc |↑ | 0.9010|± |0.0094|
| - blimp_expletive_it_object_raising | 1|none | 0|acc |↑ | 0.8100|± |0.0124|
| - blimp_inchoative | 1|none | 0|acc |↑ | 0.6600|± |0.0150|
| - blimp_intransitive | 1|none | 0|acc |↑ | 0.7850|± |0.0130|
| - blimp_irregular_past_participle_adjectives | 1|none | 0|acc |↑ | 0.9570|± |0.0064|
| - blimp_irregular_past_participle_verbs | 1|none | 0|acc |↑ | 0.9470|± |0.0071|
| - blimp_irregular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.9130|± |0.0089|
| - blimp_irregular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.9050|± |0.0093|
| - blimp_left_branch_island_echo_question | 1|none | 0|acc |↑ | 0.5670|± |0.0157|
| - blimp_left_branch_island_simple_question | 1|none | 0|acc |↑ | 0.8430|± |0.0115|
| - blimp_matrix_question_npi_licensor_present | 1|none | 0|acc |↑ | 0.4580|± |0.0158|
| - blimp_npi_present_1 | 1|none | 0|acc |↑ | 0.6120|± |0.0154|
| - blimp_npi_present_2 | 1|none | 0|acc |↑ | 0.5400|± |0.0158|
| - blimp_only_npi_licensor_present | 1|none | 0|acc |↑ | 0.9090|± |0.0091|
| - blimp_only_npi_scope | 1|none | 0|acc |↑ | 0.5840|± |0.0156|
| - blimp_passive_1 | 1|none | 0|acc |↑ | 0.9010|± |0.0094|
| - blimp_passive_2 | 1|none | 0|acc |↑ | 0.9060|± |0.0092|
| - blimp_principle_A_c_command | 1|none | 0|acc |↑ | 0.7270|± |0.0141|
| - blimp_principle_A_case_1 | 1|none | 0|acc |↑ | 1.0000|± |0.0000|
| - blimp_principle_A_case_2 | 1|none | 0|acc |↑ | 0.9360|± |0.0077|
| - blimp_principle_A_domain_1 | 1|none | 0|acc |↑ | 0.9930|± |0.0026|
| - blimp_principle_A_domain_2 | 1|none | 0|acc |↑ | 0.8950|± |0.0097|
| - blimp_principle_A_domain_3 | 1|none | 0|acc |↑ | 0.7620|± |0.0135|
| - blimp_principle_A_reconstruction | 1|none | 0|acc |↑ | 0.3200|± |0.0148|
| - blimp_regular_plural_subject_verb_agreement_1 | 1|none | 0|acc |↑ | 0.9470|± |0.0071|
| - blimp_regular_plural_subject_verb_agreement_2 | 1|none | 0|acc |↑ | 0.9160|± |0.0088|
| - blimp_sentential_negation_npi_licensor_present | 1|none | 0|acc |↑ | 0.9930|± |0.0026|
| - blimp_sentential_negation_npi_scope | 1|none | 0|acc |↑ | 0.6580|± |0.0150|
| - blimp_sentential_subject_island | 1|none | 0|acc |↑ | 0.3050|± |0.0146|
| - blimp_superlative_quantifiers_1 | 1|none | 0|acc |↑ | 0.9130|± |0.0089|
| - blimp_superlative_quantifiers_2 | 1|none | 0|acc |↑ | 0.9240|± |0.0084|
| - blimp_tough_vs_raising_1 | 1|none | 0|acc |↑ | 0.6280|± |0.0153|
| - blimp_tough_vs_raising_2 | 1|none | 0|acc |↑ | 0.8850|± |0.0101|
| - blimp_transitive | 1|none | 0|acc |↑ | 0.8760|± |0.0104|
| - blimp_wh_island | 1|none | 0|acc |↑ | 0.7590|± |0.0135|
| - blimp_wh_questions_object_gap | 1|none | 0|acc |↑ | 0.8670|± |0.0107|
| - blimp_wh_questions_subject_gap | 1|none | 0|acc |↑ | 0.9270|± |0.0082|
| - blimp_wh_questions_subject_gap_long_distance | 1|none | 0|acc |↑ | 0.8710|± |0.0106|
| - blimp_wh_vs_that_no_gap | 1|none | 0|acc |↑ | 0.9810|± |0.0043|
| - blimp_wh_vs_that_no_gap_long_distance | 1|none | 0|acc |↑ | 0.9610|± |0.0061|
| - blimp_wh_vs_that_with_gap | 1|none | 0|acc |↑ | 0.4930|± |0.0158|
| - blimp_wh_vs_that_with_gap_long_distance | 1|none | 0|acc |↑ | 0.3420|± |0.0150|
|hellaswag | 1|none | 0|acc |↑ | 0.4177|± |0.0049|
| | |none | 0|acc_norm |↑ | 0.5433|± |0.0050|
|lambada_openai | 1|none | 0|acc |↑ | 0.6202|± |0.0068|
| | |none | 0|perplexity|↓ | 6.1041|± |0.1531|
|lambada_standard | 1|none | 0|acc |↑ | 0.4898|± |0.0070|
| | |none | 0|perplexity|↓ |11.2448|± |0.3305|
|mmlu | 2|none | |acc |↑ | 0.2388|± |0.0036|
| - humanities | 2|none | |acc |↑ | 0.2429|± |0.0062|
| - formal_logic | 1|none | 0|acc |↑ | 0.1825|± |0.0346|
| - high_school_european_history | 1|none | 0|acc |↑ | 0.1939|± |0.0309|
| - high_school_us_history | 1|none | 0|acc |↑ | 0.2598|± |0.0308|
| - high_school_world_history | 1|none | 0|acc |↑ | 0.2321|± |0.0275|
| - international_law | 1|none | 0|acc |↑ | 0.3058|± |0.0421|
| - jurisprudence | 1|none | 0|acc |↑ | 0.2778|± |0.0433|
| - logical_fallacies | 1|none | 0|acc |↑ | 0.2147|± |0.0323|
| - moral_disputes | 1|none | 0|acc |↑ | 0.2688|± |0.0239|
| - moral_scenarios | 1|none | 0|acc |↑ | 0.2659|± |0.0148|
| - philosophy | 1|none | 0|acc |↑ | 0.1865|± |0.0221|
| - prehistory | 1|none | 0|acc |↑ | 0.2068|± |0.0225|
| - professional_law | 1|none | 0|acc |↑ | 0.2425|± |0.0109|
| - world_religions | 1|none | 0|acc |↑ | 0.2924|± |0.0349|
| - other | 2|none | |acc |↑ | 0.2440|± |0.0077|
| - business_ethics | 1|none | 0|acc |↑ | 0.2800|± |0.0451|
| - clinical_knowledge | 1|none | 0|acc |↑ | 0.2151|± |0.0253|
| - college_medicine | 1|none | 0|acc |↑ | 0.2139|± |0.0313|
| - global_facts | 1|none | 0|acc |↑ | 0.2100|± |0.0409|
| - human_aging | 1|none | 0|acc |↑ | 0.3139|± |0.0311|
| - management | 1|none | 0|acc |↑ | 0.1942|± |0.0392|
| - marketing | 1|none | 0|acc |↑ | 0.3034|± |0.0301|
| - medical_genetics | 1|none | 0|acc |↑ | 0.2300|± |0.0423|
| - miscellaneous | 1|none | 0|acc |↑ | 0.2554|± |0.0156|
| - nutrition | 1|none | 0|acc |↑ | 0.2255|± |0.0239|
| - professional_accounting | 1|none | 0|acc |↑ | 0.2305|± |0.0251|
| - professional_medicine | 1|none | 0|acc |↑ | 0.1985|± |0.0242|
| - virology | 1|none | 0|acc |↑ | 0.2590|± |0.0341|
| - social sciences | 2|none | |acc |↑ | 0.2291|± |0.0076|
| - econometrics | 1|none | 0|acc |↑ | 0.2719|± |0.0419|
| - high_school_geography | 1|none | 0|acc |↑ | 0.2071|± |0.0289|
| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.2228|± |0.0300|
| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.2051|± |0.0205|
| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.2437|± |0.0279|
| - high_school_psychology | 1|none | 0|acc |↑ | 0.2275|± |0.0180|
| - human_sexuality | 1|none | 0|acc |↑ | 0.2366|± |0.0373|
| - professional_psychology | 1|none | 0|acc |↑ | 0.2549|± |0.0176|
| - public_relations | 1|none | 0|acc |↑ | 0.2091|± |0.0390|
| - security_studies | 1|none | 0|acc |↑ | 0.1837|± |0.0248|
| - sociology | 1|none | 0|acc |↑ | 0.2239|± |0.0295|
| - us_foreign_policy | 1|none | 0|acc |↑ | 0.2800|± |0.0451|
| - stem | 2|none | |acc |↑ | 0.2369|± |0.0076|
| - abstract_algebra | 1|none | 0|acc |↑ | 0.2400|± |0.0429|
| - anatomy | 1|none | 0|acc |↑ | 0.2222|± |0.0359|
| - astronomy | 1|none | 0|acc |↑ | 0.1842|± |0.0315|
| - college_biology | 1|none | 0|acc |↑ | 0.2917|± |0.0380|
| - college_chemistry | 1|none | 0|acc |↑ | 0.2200|± |0.0416|
| - college_computer_science | 1|none | 0|acc |↑ | 0.2500|± |0.0435|
| - college_mathematics | 1|none | 0|acc |↑ | 0.2300|± |0.0423|
| - college_physics | 1|none | 0|acc |↑ | 0.2353|± |0.0422|
| - computer_security | 1|none | 0|acc |↑ | 0.3000|± |0.0461|
| - conceptual_physics | 1|none | 0|acc |↑ | 0.2766|± |0.0292|
| - electrical_engineering | 1|none | 0|acc |↑ | 0.2345|± |0.0353|
| - elementary_mathematics | 1|none | 0|acc |↑ | 0.2407|± |0.0220|
| - high_school_biology | 1|none | 0|acc |↑ | 0.2323|± |0.0240|
| - high_school_chemistry | 1|none | 0|acc |↑ | 0.2167|± |0.0290|
| - high_school_computer_science | 1|none | 0|acc |↑ | 0.2500|± |0.0435|
| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2148|± |0.0250|
| - high_school_physics | 1|none | 0|acc |↑ | 0.2185|± |0.0337|
| - high_school_statistics | 1|none | 0|acc |↑ | 0.1759|± |0.0260|
| - machine_learning | 1|none | 0|acc |↑ | 0.3482|± |0.0452|