Skip to content

Commit e9f1802

Browse files
author
Guanheng Zhang
committed
Merge branch 'master' into xlmr_mlm
2 parents 213f0ef + 9053d95 commit e9f1802

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed

+985
-897
lines changed

.circleci/unittest/linux/scripts/environment.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ dependencies:
1717
- sphinx
1818
- sphinx-rtd-theme
1919
- tqdm
20-
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
21-
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
20+
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
21+
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0

.circleci/unittest/linux/scripts/install.sh

+3
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,6 @@ conda install -y -c "pytorch-${UPLOAD_CHANNEL}" pytorch cpuonly
1616
printf "* Installing torchtext\n"
1717
git submodule update --init --recursive
1818
python setup.py develop
19+
20+
printf "* Installing parameterized\n"
21+
pip install parameterized

.circleci/unittest/linux/scripts/setup_env.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,6 @@ fi
4545

4646
# 4. Download
4747
printf "* Downloading SpaCy English models\n"
48-
python -m spacy download en
48+
python -m spacy download en_core_web_sm
49+
printf "* Downloading SpaCy German models\n"
50+
python -m spacy download de_core_news_sm

.circleci/unittest/windows/scripts/environment.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,5 @@ dependencies:
1919
- tqdm
2020
- certifi
2121
- future
22-
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
23-
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
22+
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0
23+
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0

.circleci/unittest/windows/scripts/install.sh

+3
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,6 @@ conda install -y -c "pytorch-${UPLOAD_CHANNEL}" pytorch cpuonly
2121
printf "* Installing torchtext\n"
2222
git submodule update --init --recursive
2323
"$root_dir/packaging/vc_env_helper.bat" python setup.py develop
24+
25+
printf "* Installing parameterized\n"
26+
pip install parameterized

.circleci/unittest/windows/scripts/setup_env.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune
3939

4040
# 4. Download
4141
printf "* Downloading SpaCy English models\n"
42-
python -m spacy download en
42+
python -m spacy download en_core_web_sm
43+
printf "* Downloading SpaCy German models\n"
44+
python -m spacy download de_core_news_sm

benchmark/benchmark_basic_english_normalize.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def _run_benchmark_lookup(train, tokenizer):
1515

1616
existing_basic_english_tokenizer = get_tokenizer("basic_english")
1717
experimental_basic_english_normalize = basic_english_normalize()
18-
experimental_jit_basic_english_normalize = torch.jit.script(experimental_basic_english_normalize.to_ivalue())
18+
experimental_jit_basic_english_normalize = torch.jit.script(experimental_basic_english_normalize)
1919

2020
# existing eager lookup
2121
train, _ = AG_NEWS()

benchmark/benchmark_experimental_vectors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def _run_benchmark_lookup(tokens, vector):
4242

4343
# experimental FastText jit lookup
4444
print("FastText Experimental - Jit Mode")
45-
jit_fast_text_experimental = torch.jit.script(fast_text_experimental.to_ivalue())
45+
jit_fast_text_experimental = torch.jit.script(fast_text_experimental)
4646
_run_benchmark_lookup(tokens, jit_fast_text_experimental)
4747

4848

benchmark/benchmark_experimental_vocab.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True,
6767
print("Loading from raw text file with basic_english_normalize tokenizer")
6868
for _ in range(num_iters):
6969
tokenizer = basic_english_normalize()
70-
jited_tokenizer = torch.jit.script(tokenizer.to_ivalue())
70+
jited_tokenizer = torch.jit.script(tokenizer)
7171
build_vocab_from_text_file(f, jited_tokenizer, num_cpus=1)
7272
print("Construction time:", time.monotonic() - t0)
7373
else:
@@ -140,7 +140,7 @@ def token_iterator(file_path):
140140
t0 = time.monotonic()
141141
v_experimental = VocabExperimental(ordered_dict)
142142
print("Construction time:", time.monotonic() - t0)
143-
jit_v_experimental = torch.jit.script(v_experimental.to_ivalue())
143+
jit_v_experimental = torch.jit.script(v_experimental)
144144

145145
# existing Vocab eager lookup
146146
print("Vocab - Eager Mode")
@@ -154,7 +154,7 @@ def token_iterator(file_path):
154154
_run_benchmark_lookup([tokens], v_experimental)
155155
_run_benchmark_lookup(tokens_lists, v_experimental)
156156

157-
jit_v_experimental = torch.jit.script(v_experimental.to_ivalue())
157+
jit_v_experimental = torch.jit.script(v_experimental)
158158
# experimental Vocab jit lookup
159159
print("Vocab Experimental - Jit Mode")
160160
_run_benchmark_lookup(tokens, jit_v_experimental)

benchmark/benchmark_pytext_vocab.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def benchmark_experimental_vocab():
150150
t0 = time.monotonic()
151151
experimental_script_vocab = ExperimentalScriptVocabulary(ordered_dict, unk_token="<unk>")
152152
print("Construction time:", time.monotonic() - t0)
153-
jit_experimental_script_vocab = torch.jit.script(experimental_script_vocab.to_ivalue())
153+
jit_experimental_script_vocab = torch.jit.script(experimental_script_vocab)
154154

155155
# pytext Vocab eager lookup
156156
print("Pytext Vocabulary - Eager Mode")

docs/source/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ popular datasets for natural language.
4444
experimental_transforms
4545
experimental_vectors
4646
experimental_vocab
47+
models_utils
4748
examples <examples>
4849

4950
.. automodule:: torchtext

docs/source/models_utils.rst

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
.. role:: hidden
2+
:class: hidden-section
3+
4+
torchtext.experimental.models.utils
5+
===================================
6+
7+
.. automodule:: torchtext.experimental.models.utils
8+
.. currentmodule:: torchtext.experimental.models.utils
9+
10+
:hidden:`count_model_param`
11+
~~~~~~~~~~~~~~~~~~~~~~~~~~~
12+
13+
.. autofunction:: count_model_param

examples/BERT/model.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def __init__(self, ntoken, ninp, dropout=0.5):
4343
self.norm = LayerNorm(ninp)
4444
self.dropout = Dropout(dropout)
4545

46-
def forward(self, src, token_type_input):
46+
def forward(self, seq_inputs):
47+
src, token_type_input = seq_inputs
4748
src = self.embed(src) + self.pos_embed(src) \
4849
+ self.tok_type_embed(src, token_type_input)
4950
return self.dropout(self.norm(src))
@@ -114,16 +115,16 @@ def forward(self, src, src_mask=None, src_key_padding_mask=None):
114115
class BertModel(nn.Module):
115116
"""Contain a transformer encoder."""
116117

117-
def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
118+
def __init__(self, ntoken, ninp, nhead, nhid, nlayers, embed_layer, dropout=0.5):
118119
super(BertModel, self).__init__()
119120
self.model_type = 'Transformer'
120-
self.bert_embed = BertEmbedding(ntoken, ninp)
121+
self.bert_embed = embed_layer
121122
encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
122123
self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
123124
self.ninp = ninp
124125

125-
def forward(self, src, token_type_input):
126-
src = self.bert_embed(src, token_type_input)
126+
def forward(self, seq_inputs):
127+
src = self.bert_embed(seq_inputs)
127128
output = self.transformer_encoder(src)
128129
return output
129130

@@ -150,15 +151,16 @@ class MLMTask(nn.Module):
150151

151152
def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
152153
super(MLMTask, self).__init__()
153-
self.bert_model = BertModel(ntoken, ninp, nhead, nhid, nlayers, dropout=0.5)
154+
embed_layer = BertEmbedding(ntoken, ninp)
155+
self.bert_model = BertModel(ntoken, ninp, nhead, nhid, nlayers, embed_layer, dropout=0.5)
154156
self.mlm_span = Linear(ninp, ninp)
155157
self.activation = F.gelu
156158
self.norm_layer = LayerNorm(ninp, eps=1e-12)
157159
self.mlm_head = Linear(ninp, ntoken)
158160

159161
def forward(self, src, token_type_input=None):
160162
src = src.transpose(0, 1) # Wrap up by nn.DataParallel
161-
output = self.bert_model(src, token_type_input)
163+
output = self.bert_model((src, token_type_input))
162164
output = self.mlm_span(output)
163165
output = self.activation(output)
164166
output = self.norm_layer(output)
@@ -199,7 +201,7 @@ def __init__(self, bert_model):
199201

200202
def forward(self, src, token_type_input):
201203
src = src.transpose(0, 1) # Wrap up by nn.DataParallel
202-
output = self.bert_model(src, token_type_input)
204+
output = self.bert_model((src, token_type_input))
203205
# Send the first <'cls'> seq to a classifier
204206
output = self.activation(self.linear_layer(output[0]))
205207
output = self.ns_span(output)
@@ -216,7 +218,7 @@ def __init__(self, bert_model):
216218
self.qa_span = Linear(bert_model.ninp, 2)
217219

218220
def forward(self, src, token_type_input):
219-
output = self.bert_model(src, token_type_input)
221+
output = self.bert_model((src, token_type_input))
220222
# transpose output (S, N, E) to (N, S, E)
221223
output = output.transpose(0, 1)
222224
output = self.activation(output)

examples/BERT/ns_task.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import torch.nn as nn
66
from torch.nn.parallel import DistributedDataParallel as DDP
77
from torch.utils.data import DataLoader
8-
from model import NextSentenceTask, BertModel
8+
from model import NextSentenceTask, BertModel, BertEmbedding
99
from utils import run_demo, run_ddp, wrap_up
1010

1111

@@ -149,7 +149,8 @@ def run_main(args, rank=None):
149149
if args.checkpoint != 'None':
150150
model = torch.load(args.checkpoint)
151151
else:
152-
pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout)
152+
embed_layer = BertEmbedding(len(vocab), args.emsize)
153+
pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid, args.nlayers, embed_layer, args.dropout)
153154
pretrained_bert.load_state_dict(torch.load(args.bert_model))
154155
model = NextSentenceTask(pretrained_bert)
155156

examples/BERT/qa_task.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from model import QuestionAnswerTask
1010
from metrics import compute_qa_exact, compute_qa_f1
1111
from utils import print_loss_log
12-
from model import BertModel
12+
from model import BertModel, BertEmbedding
1313

1414

1515
def process_raw_data(data):
@@ -174,7 +174,8 @@ def train():
174174
train_dataset = process_raw_data(train_dataset)
175175
dev_dataset = process_raw_data(dev_dataset)
176176
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
177-
pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout)
177+
embed_layer = BertEmbedding(len(vocab), args.emsize)
178+
pretrained_bert = BertModel(len(vocab), args.emsize, args.nhead, args.nhid, args.nlayers, embed_layer, args.dropout)
178179
pretrained_bert.load_state_dict(torch.load(args.bert_model))
179180
model = QuestionAnswerTask(pretrained_bert).to(device)
180181
criterion = nn.CrossEntropyLoss()

examples/data_pipeline/pipelines.py

+10-11
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,10 @@ def build_sp_pipeline(spm_file):
3232
vocab = PretrainedSPVocab(load_sp_model(spm_file))
3333

3434
# Insert token in vocab to match a pretrained vocab
35-
vocab.insert_token('<pad>', 1)
3635
pipeline = TextSequentialTransforms(tokenizer, vocab)
37-
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
36+
jit_pipeline = torch.jit.script(pipeline)
3837
print('jit sentencepiece pipeline success!')
39-
return pipeline, pipeline.to_ivalue(), jit_pipeline
38+
return pipeline, pipeline, jit_pipeline
4039

4140

4241
def build_legacy_torchtext_vocab_pipeline(vocab_file):
@@ -59,9 +58,9 @@ def build_experimental_torchtext_pipeline(hf_vocab_file):
5958
with open(hf_vocab_file, 'r') as f:
6059
vocab = load_vocab_from_file(f)
6160
pipeline = TextSequentialTransforms(tokenizer, vocab)
62-
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
61+
jit_pipeline = torch.jit.script(pipeline)
6362
print('jit experimental torchtext pipeline success!')
64-
return pipeline, pipeline.to_ivalue(), jit_pipeline
63+
return pipeline, pipeline, jit_pipeline
6564

6665

6766
def build_legacy_batch_torchtext_vocab_pipeline(vocab_file):
@@ -104,9 +103,9 @@ def build_legacy_pytext_script_vocab_pipeline(vocab_file):
104103
vocab_list.insert(0, "<unk>")
105104
pipeline = TextSequentialTransforms(tokenizer,
106105
PyTextScriptVocabTransform(ScriptVocabulary(vocab_list)))
107-
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
106+
jit_pipeline = torch.jit.script(pipeline)
108107
print('jit legacy PyText pipeline success!')
109-
return pipeline, pipeline.to_ivalue(), jit_pipeline
108+
return pipeline, pipeline, jit_pipeline
110109

111110

112111
def build_experimental_pytext_script_pipeline(vocab_file):
@@ -125,9 +124,9 @@ def build_experimental_pytext_script_pipeline(vocab_file):
125124
# Insert token in vocab to match a pretrained vocab
126125
pipeline = TextSequentialTransforms(tokenizer,
127126
PyTextScriptVocabTransform(script_vocab(ordered_dict)))
128-
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
127+
jit_pipeline = torch.jit.script(pipeline)
129128
print('jit legacy PyText pipeline success!')
130-
return pipeline, pipeline.to_ivalue(), jit_pipeline
129+
return pipeline, pipeline, jit_pipeline
131130

132131

133132
def build_legacy_fasttext_vector_pipeline():
@@ -143,10 +142,10 @@ def build_experimental_fasttext_vector_pipeline():
143142
vector = FastTextExperimental()
144143

145144
pipeline = TextSequentialTransforms(tokenizer, vector)
146-
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
145+
jit_pipeline = torch.jit.script(pipeline)
147146

148147
print('jit legacy fasttext pipeline success!')
149-
return pipeline, pipeline.to_ivalue(), jit_pipeline
148+
return pipeline, pipeline, jit_pipeline
150149

151150

152151
def run_benchmark_lookup(text_classification_dataset, pipeline):

examples/data_pipeline/transforms.py

-14
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,6 @@ def forward(self, tokens: List[str]) -> List[int]:
2424
def insert_token(self, token: str, index: int) -> None:
2525
self.vocab.insert_token(token, index)
2626

27-
def to_ivalue(self):
28-
if hasattr(self.vocab, 'to_ivalue'):
29-
sp_model = self.sp_model
30-
new_module = PretrainedSPVocab(sp_model)
31-
new_module.vocab = self.vocab.to_ivalue()
32-
return new_module
33-
return self
34-
3527

3628
class PyTextVocabTransform(nn.Module):
3729
r"""PyTextVocabTransform transform
@@ -57,12 +49,6 @@ def __init__(self, vocab):
5749
def forward(self, tokens: List[str]) -> List[int]:
5850
return self.vocab.lookup_indices_1d(tokens)
5951

60-
def to_ivalue(self):
61-
if hasattr(self.vocab, 'to_ivalue'):
62-
vocab = self.vocab.to_ivalue()
63-
return PyTextScriptVocabTransform(vocab)
64-
return self
65-
6652

6753
class ToLongTensor(nn.Module):
6854
r"""Convert a list of integers to long tensor

0 commit comments

Comments
 (0)