1
1
from .pipeline import Pipeline
2
2
from .tpipeline import TPipeline
3
3
from .pipeline import supported_langs , langwithner , remove_with_path
4
+ from .utils .base_utils import download , trankit2conllu
5
+ from .utils .tbinfo import supported_embeddings , supported_langs , saved_model_version
6
+ import os
7
+ from shutil import copyfile
4
8
5
# Package version; bumped from 1.0.1 when multi-embedding support was added.
__version__ = "1.1.0"
6
10
7
11
8
def download_missing_files(category, save_dir, embedding_name, language):
    """Fill in missing model files of a customized pipeline with pretrained ones.

    Checks which model files of the customized pipeline ``category`` are absent
    under ``save_dir/embedding_name/category``, downloads the pretrained models
    for ``language``, copies the pretrained counterparts of the missing files
    into the customized pipeline's directory, then deletes the downloaded
    pretrained directory.

    Args:
        category: one of 'customized', 'customized-ner', 'customized-mwt',
            'customized-mwt-ner'; also used as the filename stem.
        save_dir: cache directory holding pipeline subdirectories.
        embedding_name: name of the pretrained embedding (must be in
            ``supported_embeddings``).
        language: pretrained language whose files are borrowed (must be in
            ``supported_langs``).

    Raises:
        AssertionError: if ``language``, ``embedding_name`` or ``category``
            is not supported.
    """
    assert language in supported_langs, '{} is not a pretrained language. Current pretrained languages: {}'.format(language, supported_langs)
    assert embedding_name in supported_embeddings, '{} has not been supported. Current supported embeddings: {}'.format(embedding_name, supported_embeddings)

    assert category in {'customized', 'customized-ner', 'customized-mwt',
                        'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"

    # Filename templates shared by every category; '-mwt' and '-ner'
    # categories add their extra model files.  The '{}' slot is filled with
    # `category` for the customized pipeline and with `language` for the
    # pretrained files borrowed below.
    templates = ['{}.tokenizer.mdl', '{}.tagger.mdl', '{}.vocabs.json', '{}_lemmatizer.pt']
    if 'mwt' in category:
        # keep the original ordering: the MWT expander comes right after the tokenizer
        templates.insert(1, '{}_mwt_expander.pt')
    if 'ner' in category:
        templates += ['{}.ner.mdl', '{}.ner-vocab.json']

    pipeline_dir = os.path.join(save_dir, embedding_name, category)
    file_list = [(template, os.path.join(pipeline_dir, template.format(category)))
                 for template in templates]

    missing_filenames = []
    for filename, filepath in file_list:
        if not os.path.exists(filepath):
            print('Missing {}'.format(filepath))
            missing_filenames.append(filename)

    download(
        cache_dir=save_dir,
        language=language,
        saved_model_version=saved_model_version,  # manually set this to avoid duplicated storage
        embedding_name=embedding_name
    )
    # borrow pretrained files
    src_dir = os.path.join(save_dir, embedding_name, language)
    tgt_dir = os.path.join(save_dir, embedding_name, category)
    for fname in missing_filenames:
        copyfile(os.path.join(src_dir, fname.format(language)), os.path.join(tgt_dir, fname.format(category)))
        print('Copying {} to {}'.format(
            os.path.join(src_dir, fname.format(language)),
            os.path.join(tgt_dir, fname.format(category))
        ))
    # the pretrained directory is only needed as a source of copies
    remove_with_path(src_dir)
77
+
78
+
79
def verify_customized_pipeline(category, save_dir, embedding_name):
    """Verify that a trained customized pipeline has all of its model files.

    Looks for every model file that the pipeline ``category`` requires under
    ``save_dir/embedding_name/category``.  If all files exist, writes the
    ``{category}.downloaded`` marker file, removes training leftovers, and
    prints instructions for loading the pipeline; otherwise prints each
    missing file.

    Args:
        category: one of 'customized', 'customized-ner', 'customized-mwt',
            'customized-mwt-ner'; also used as the filename stem.
        save_dir: cache directory holding pipeline subdirectories.
        embedding_name: name of the pretrained embedding (must be in
            ``supported_embeddings``).

    Raises:
        AssertionError: if ``embedding_name`` or ``category`` is not supported.
    """
    assert embedding_name in supported_embeddings, '{} has not been supported. Current supported embeddings: {}'.format(
        embedding_name, supported_embeddings)
    assert category in {'customized', 'customized-ner', 'customized-mwt',
                        'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
    # Filename templates shared by every category; '-mwt' and '-ner'
    # categories add their extra model files.
    templates = ['{}.tokenizer.mdl', '{}.tagger.mdl', '{}.vocabs.json', '{}_lemmatizer.pt']
    if 'mwt' in category:
        # keep the original ordering: the MWT expander comes right after the tokenizer
        templates.insert(1, '{}_mwt_expander.pt')
    if 'ner' in category:
        templates += ['{}.ner.mdl', '{}.ner-vocab.json']

    pipeline_dir = os.path.join(save_dir, embedding_name, category)
    file_list = [os.path.join(pipeline_dir, template.format(category)) for template in templates]

    # NOTE(review): the loop below was elided from the diff context I reviewed;
    # reconstructed from the surrounding context lines — confirm against the repo.
    verified = True
    for filepath in file_list:
        if not os.path.exists(filepath):
            verified = False
            print('Missing {}'.format(filepath))
    if verified:
        # marker file telling Pipeline that this model set is complete
        with open(os.path.join(save_dir, embedding_name, category, '{}.downloaded'.format(category)), 'w') as f:
            f.write('')
        # drop training artifacts that are not needed at inference time
        remove_with_path(os.path.join(save_dir, embedding_name, category, 'train.txt.character'))
        remove_with_path(os.path.join(save_dir, embedding_name, category, 'logs'))
        remove_with_path(os.path.join(save_dir, embedding_name, category, 'preds'))
        print(
            "Customized pipeline is ready to use!\nIt can be initialized as follows:\n-----------------------------------\nfrom trankit import Pipeline\np = Pipeline(lang='{}', cache_dir='{}')".format(
                category, save_dir))
0 commit comments