-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
122 lines (81 loc) · 3.32 KB
/
model.py
File metadata and controls
122 lines (81 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import random
import spacy
import torch
import torch.nn as nn
from torchtext.data import BucketIterator, Field
from torchtext.datasets import Multi30k
# Prefer GPU when available. NOTE: valid torch device strings are
# "cuda"/"cpu" — the original "gpu" is not a recognized device name and
# would make any later `.to(device)` raise at runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the bare "de"/"en" shortcuts only resolve on spaCy < 3 with
# shortcut links installed; on spaCy 3+ these would be "de_core_news_sm" /
# "en_core_web_sm" — confirm which spaCy version this project pins.
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")


def tokernizer_ger(text):
    """Tokenize a German sentence into a list of token strings."""
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokernizer_eng(text):
    """Tokenize an English sentence into a list of token strings."""
    return [tok.text for tok in spacy_eng.tokenizer(text)]


# Fields lower-case the text and wrap every sequence in <sos>/<eos> markers.
german = Field(
    tokenize=tokernizer_ger, lower=True, init_token="<sos>", eos_token="<eos>"
)
english = Field(
    tokenize=tokernizer_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)

# Multi30k German -> English translation splits; the fields above supply the
# tokenization for each side.
train_data, val_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

# Vocabularies are built from the training split only; tokens must appear at
# least twice, capped at 10k entries per language.
german.build_vocab(train_data, max_size=10_000, min_freq=2)
english.build_vocab(train_data, max_size=10_000, min_freq=2)
class Encoder(nn.Module):
    """LSTM encoder: embeds a source token sequence and returns the final
    hidden/cell states, which seed the decoder."""

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        """Encode token ids of shape (seq_len, N).

        Returns (hidden, cell), each of shape (num_layers, N, hidden_size).
        """
        embedded = self.embedding(x)          # (seq_len, N, embedding_size)
        embedded = self.dropout(embedded)
        _, final_state = self.rnn(embedded)   # discard per-step outputs
        hidden, cell = final_state
        return hidden, cell
class Decoder(nn.Module):
    """Single-step LSTM decoder: consumes one token per call plus the previous
    hidden/cell state and emits vocabulary logits for the next token."""

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Decode one step.

        x: (N,) token ids; hidden/cell: (num_layers, N, hidden_size).
        Returns (predictions, hidden, cell) with predictions of shape
        (N, output_size).
        """
        x = x.unsqueeze(0)
        # x.shape: (1, N)
        embedding = self.dropout(self.embedding(x))
        # embedding.shape: (1, N, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs.shape: (1, N, hidden_size)
        predictions = self.fc(outputs)
        # predictions.shape: (1, N, vocab_len)
        # Squeeze ONLY the seq dim. A bare .squeeze() would also drop the
        # batch dim when N == 1, yielding a 1-D tensor and breaking the
        # caller's `output.argmax(1)`.
        predictions = predictions.squeeze(0)
        return predictions, hidden, cell
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    The decoder must expose an `fc` Linear head whose `out_features` is the
    target vocabulary size (as `Decoder` in this file does).
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """source: (src_len, N), target: (tgt_len, N) token ids.

        Returns logits of shape (tgt_len, N, target_vocab_size); row 0 stays
        zero because decoding starts from target[0] (the <sos> tokens).
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the vocab size off the decoder head instead of the
        # module-level `english` Field — removes a hidden global dependency.
        target_vocab_size = self.decoder.fc.out_features
        # Allocate on the input's device rather than the module-level
        # `device` string (which was set to the invalid name "gpu").
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )
        hidden, cell = self.encoder(source)
        # First input to the decoder is the <sos> token row.
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            # output.shape: (N, target_vocab_size)
            best_guess = output.argmax(1)
            # Teacher forcing: with prob `teacher_force_ratio` feed the
            # ground-truth token, otherwise the model's own prediction.
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        return outputs