forked from wellcometrust/grants_tagger
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdvc.lock
337 lines (337 loc) · 12.2 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
schema: '2.0'
stages:
preprocess_wellcome_science:
cmd: grants_tagger preprocess wellcome-science data/raw/science_tags_full_version.xlsx
data/processed/science_grants_tagged_title_synopsis.jsonl models/label_binarizer.pkl
deps:
- path: data/raw/science_tags_full_version.xlsx
md5: 74da2bf7a507e52b8b677ddce19156a9
size: 2638299
- path: grants_tagger/preprocess_wellcome.py
md5: 738ccb78c7ee261c7e934cd4196e9b46
size: 6654
params:
params.yaml:
preprocess_wellcome_science.meta_cols: Grant_ID,Title
preprocess_wellcome_science.text_cols: Title,Synopsis
outs:
- path: data/processed/science_grants_tagged_title_synopsis.jsonl
md5: 2fb37d57daeece50a0190e16e229647a
size: 2809039
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
train:
cmd: grants_tagger train data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl models/tfidf-svm-2020.05.2.pkl --approach tfidf-svm
deps:
- path: data/processed/science_grants_tagged_title_synopsis.jsonl
md5: 2fb37d57daeece50a0190e16e229647a
size: 2809039
- path: grants_tagger/train.py
md5: 162e8650a0a7e970420f48eb5c253f82
size: 3970
params:
params.yaml:
train.class_weight: balanced
train.min_df: 5
train.ngram_range:
- 1
- 2
outs:
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/tfidf-svm-2020.05.2.pkl
md5: eceaf3846999e47380ca670c096b810a
size: 17768856
train_tfidf_svm:
cmd: grants_tagger train data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl models/tfidf-svm.pkl --approach tfidf-svm --train-info
results/tfidf_svm_train_info.json
deps:
- path: data/processed/science_grants_tagged_title_synopsis.jsonl
md5: 2fb37d57daeece50a0190e16e229647a
size: 2809039
- path: grants_tagger/train.py
md5: 7ad0632959accc481d8d5300f3c9fd84
size: 5996
params:
params.yaml:
train.tfidf-svm.svm__estimator.class_weight: balanced
train.tfidf-svm.tfidf.min_df: 5
train.tfidf-svm.tfidf.ngram_range:
- 1
- 2
outs:
- path: models/tfidf-svm.pkl
md5: 38f35b3381d116adad18dc5e2a7dab03
size: 17768857
- path: results/tfidf_svm_train_info.json
md5: 6f98a7bec4325ece260581465a1d9847
size: 62
evaluate:
cmd: grants_tagger evaluate model tfidf-svm models/tfidf-svm-2020.05.2.pkl data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl
deps:
- path: grants_tagger/evaluate_model.py
md5: cd583e8d10e0c889647834cab217ce2f
size: 1872
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/tfidf-svm-2020.05.2.pkl
md5: f24b224be6a867f96400b3df2ad26ac9
size: 17768856
outs:
- path: results.json
md5: 1d0d4fb63ae1d1b911373cc558147737
size: 89
evaluate_tfidf_svm:
cmd: grants_tagger evaluate model tfidf-svm models/tfidf-svm.pkl data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl --results-path results/tfidf_svm.json
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/tfidf-svm.pkl
md5: 38f35b3381d116adad18dc5e2a7dab03
size: 17768857
outs:
- path: results/tfidf_svm.json
md5: b2f118de649f2da71cb40b5f8694fcf1
size: 120
train_scibert:
cmd: grants_tagger train data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl models/scibert --approach scibert --train-info results/scibert_train_info.json
deps:
- path: data/processed/science_grants_tagged_title_synopsis.jsonl
md5: 2fb37d57daeece50a0190e16e229647a
size: 2809039
- path: grants_tagger/train.py
md5: 7ad0632959accc481d8d5300f3c9fd84
size: 5996
params:
params.yaml:
train.scibert.epochs: 10
train.scibert.learning_rate: 2e-05
train.scibert.validation_split: 0.1
outs:
- path: models/scibert
md5: 5cf7e7f8e11a1e00d1c214d637618b85.dir
size: 440020006
nfiles: 2
evaluate_scibert:
cmd: grants_tagger evaluate model scibert models/scibert data/processed/science_grants_tagged_title_synopsis.jsonl
models/label_binarizer.pkl --results-path results/scibert.json
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/scibert
md5: 5cf7e7f8e11a1e00d1c214d637618b85.dir
size: 440020006
nfiles: 2
outs:
- path: results/scibert.json
md5: 936e3229c02ea1a562eeb90298d1ecca
size: 120
preprocess_bioasq_mesh:
cmd: grants_tagger preprocess bioasq-mesh data/raw/allMeSH_2021.json data/processed/train_mesh2021.jsonl
models/xlinear/label_binarizer.pkl --test-split 0.01 --test-output-path data/processed/test_mesh2021.jsonl
deps:
- path: data/raw/allMeSH_2021.json
md5: e827a6b8062d1312664dcf075c12d89f
size: 27547042745
- path: grants_tagger/preprocess_mesh.py
md5: 5c458714cc114c85dfc1c569f297b1c9
size: 5124
outs:
- path: data/processed/test_mesh2021.jsonl
md5: 2d6f73d29ec6f98a3cae6df2c618f077
size: 257607574
- path: data/processed/train_mesh2021.jsonl
md5: 3fb0ddb59e11c40c0c34fd104b1815fb
size: 25488153802
- path: models/xlinear/label_binarizer.pkl
md5: 67d759ed4142feab2e575dc9bd3d5f54
size: 827793
evaluate_science_ensemble:
cmd: grants_tagger evaluate model science-ensemble models/tfidf-svm.pkl,models/scibert
data/processed/science_grants_tagged_title_synopsis.jsonl models/label_binarizer.pkl
--results-path results/science_ensemble.json
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/label_binarizer.pkl
md5: 220319a004a60bd6475e2fb617c2c1f3
size: 1067
- path: models/scibert
md5: 5cf7e7f8e11a1e00d1c214d637618b85.dir
size: 440020006
nfiles: 2
- path: models/tfidf-svm.pkl
md5: 38f35b3381d116adad18dc5e2a7dab03
size: 17768857
outs:
- path: results/science_ensemble.json
md5: a9fb045af166aa9901e7cd89678fed25
size: 120
train_mesh_cnn:
cmd: grants_tagger train data/processed/disease_mesh.jsonl models/disease_mesh_label_binarizer-2021.06.0.pkl
models/disease_mesh_cnn-2021.06.0 --approach mesh-cnn --sparse-labels
deps:
- path: data/processed/disease_mesh.jsonl
md5: f4463f861869e7516caef78ff75600ac
size: 12179582610
- path: grants_tagger/models.py
md5: 234124a679b6f761241c4ff7ba7b2fd7
size: 25324
- path: grants_tagger/train.py
md5: 78683e5785fe5f403b4d1f58211b5dc8
size: 3971
params:
params.yaml:
train.mesh-cnn.cnn.attention: true
train.mesh-cnn.cnn.batch_size: 256
train.mesh-cnn.cnn.dense_size: 10000
train.mesh-cnn.cnn.dropout: 0.1
train.mesh-cnn.cnn.hidden_size: 400
train.mesh-cnn.cnn.l2: 7e-07
train.mesh-cnn.cnn.learning_rate: 0.0001
train.mesh-cnn.cnn.learning_rate_decay: 0.8
train.mesh-cnn.cnn.multilabel: true
train.mesh-cnn.cnn.nb_epochs: 10
train.mesh-cnn.vec.sequence_length: 400
train.mesh-cnn.vec.tokenizer_library: transformers
train.mesh-cnn.vec.vocab_size: 30000
outs:
- path: models/disease_mesh_cnn-2021.06.0
md5: 08ec776180c1d45ff0f35024833f843d.dir
size: 808278043
nfiles: 5
- path: models/disease_mesh_label_binarizer-2021.06.0.pkl
md5: 85be50a39457aa83f1dd4fbb2b1f26b6
size: 147371
filter_mesh_tags:
cmd: python grants_tagger/filter_mesh_tags.py data/raw/desc2021.xml data/processed/mesh_disease_tags.csv
deps:
- path: data/raw/desc2021.xml
md5: 8663a7dd8e1895dd22525d42b80cd2df
size: 300410104
outs:
- path: data/processed/mesh_disease_tags.csv
md5: 4311b12fb4f381ffab1d76f55069683d
size: 260080
train_mesh_xlinear:
cmd: grants_tagger train data/processed/train_mesh2021.jsonl models/xlinear/label_binarizer.pkl
models/xlinear/model --approach mesh-xlinear --sparse-labels --train-info results/mesh_xlinear_train_info.json
deps:
- path: data/processed/test_mesh2021.jsonl
md5: 2d6f73d29ec6f98a3cae6df2c618f077
size: 257607574
- path: grants_tagger/models/mesh_xlinear.py
md5: a8c4abd11e2d6204c7dcfc0252521c6a
size: 5705
- path: grants_tagger/train.py
md5: 7ad0632959accc481d8d5300f3c9fd84
size: 5996
params:
params.yaml:
train.mesh-xlinear.beam_size: 30
train.mesh-xlinear.cluster_chain: true
train.mesh-xlinear.lowercase: true
train.mesh-xlinear.max_df: 1.0
train.mesh-xlinear.max_features: 400000
train.mesh-xlinear.min_df: 5
train.mesh-xlinear.min_weight_value: 0.1
train.mesh-xlinear.negative_sampling_scheme: tfn
train.mesh-xlinear.ngram_range:
- 1
- 1
train.mesh-xlinear.only_topk: 200
train.mesh-xlinear.stop_words: english
train.mesh-xlinear.threshold: 0.2
train.mesh-xlinear.vectorizer_library: pecos
outs:
- path: models/xlinear/model
md5: 4939257c1b5185f3eb53d20fafcf11f4.dir
size: 6753545796
nfiles: 38
evaluate_mesh_xlinear_on_grants:
cmd: grants_tagger evaluate grants mesh-xlinear models/xlinear/model data/raw/disease_tags_validation_grants.xlsx
models/xlinear/label_binarizer.pkl --results-path results/mesh_xlinear_on_grants.json
--mesh-tags-path data/processed/mesh_disease_tags.csv
deps:
- path: data/processed/mesh_disease_tags.csv
md5: 4311b12fb4f381ffab1d76f55069683d
size: 260080
- path: data/raw/disease_tags_validation_grants.xlsx
md5: 71554cf90758773fb996351000384d4f
size: 615751
- path: grants_tagger/evaluate_mesh_on_grants.py
md5: 3ed963696ba23e55310f84579be6c3ec
size: 4143
- path: models/xlinear/label_binarizer.pkl
md5: 67d759ed4142feab2e575dc9bd3d5f54
size: 827793
- path: models/xlinear/model
md5: 4939257c1b5185f3eb53d20fafcf11f4.dir
size: 6753545796
nfiles: 38
outs:
- path: results/mesh_xlinear_on_grants.json
md5: 4fb584aaed6106d4cf9ade0ce2f3c1a7
size: 26
evaluate_mesh_xlinear:
cmd: grants_tagger evaluate model mesh-xlinear models/xlinear/model data/processed/test_mesh2021.jsonl
models/xlinear/label_binarizer.pkl --results-path results/mesh_xlinear.json
--full-report-path results/mesh_xlinear_full_report.json --no-split-data
deps:
- path: grants_tagger/evaluate_model.py
md5: d184ff6cda5a492977586d85d9b19328
size: 5774
- path: models/xlinear/label_binarizer.pkl
md5: 67d759ed4142feab2e575dc9bd3d5f54
size: 827793
- path: models/xlinear/model
md5: 4939257c1b5185f3eb53d20fafcf11f4.dir
size: 6753545796
nfiles: 38
outs:
- path: results/mesh_xlinear.json
md5: b6f13676eb8af3b55a70b28477dea250
size: 120
generate_validation_data_xlinear:
cmd: python scripts/generate_validation_data_xlinear.py
deps:
- path: data/interim/mesh_pipeline_result.csv
md5: 10639bbe244b986919efc8f7866b98b4
size: 138862818
- path: data/raw/grants.csv
md5: 9732c21dd1954cce8baaf3746f301ead
size: 152523849
- path: scripts/generate_validation_data_xlinear.py
md5: e3eb14350c3ea5f7692cce5a48fed5fe
size: 2734
outs:
- path: data/processed/merged_mesh_predictions_mesh_xlinear_for_validation.xlsx
md5: f174b02bf804885f769816cc27540b89
size: 40795153
get_grants:
cmd: python scripts/get_grants.py
deps:
- path: scripts/get_grants.py
md5: 50c0cf255eb0252fd4f3412920430d19
size: 1010
outs:
- path: data/raw/grants.csv
md5: 9732c21dd1954cce8baaf3746f301ead
size: 152523849