This repository was archived by the owner on Dec 27, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentiment_wv.py
More file actions
194 lines (173 loc) · 6.76 KB
/
sentiment_wv.py
File metadata and controls
194 lines (173 loc) · 6.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/bin/python2
import os
import codecs
import sys
import numpy as np
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation,\
stem_text
from gensim.models.word2vec import Word2Vec
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers.recurrent import LSTM
from keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
import time
def export(type_data='train'):
    """Load, clean and shuffle one split of the Sentiment140 dataset.

    Args:
        type_data: 'train' or 'test' (case-insensitive); selects which CSV
            file under ``Sentiment140/`` is read.

    Returns:
        A ``(tweets, labels)`` pair of equal-length tuples. Every tweet is a
        list of at least two preprocessed tokens; every label is the first
        CSV field divided by 4. Both tuples are empty when no tweet survives
        the cleaning step.

    Raises:
        ValueError: if ``type_data`` is neither 'train' nor 'test'.
    """
    filenames = {
        'train': 'training.1600000.processed.noemoticon.csv',
        'test': 'testdata.manual.2009.06.14.csv',
    }
    split = type_data.lower()
    if split not in filenames:
        # Previously this fell through and failed later on an unbound
        # `filename`; fail early with a clear message instead.
        raise ValueError(
            "type_data must be 'train' or 'test', got %r" % (type_data,))

    print("Extracting data...")
    # The context manager guarantees the handle is closed even if the read
    # or the decoding fails.
    with codecs.open('Sentiment140/' + filenames[split],
                     encoding='ISO-8859-1') as data_file:
        # The last element after the split is dropped, as before (the text
        # following the final newline).
        rows = data_file.read().split('\n')[:-1]

    # Fields are quoted and comma separated; splitting on the quote and
    # discarding the empty / ',' fragments leaves the field values.
    data = [[field for field in row.split('"') if field not in ('', ',')]
            for row in rows]
    labels = [float(row[0]) / 4 for row in data]
    raw_tweets = [row[-1] for row in data]

    print("Preprocessing data...")
    total = len(raw_tweets)
    tweets = []
    for i, tweet in enumerate(raw_tweets):
        # Drop mentions, hashtags and anything containing a URL.
        new_tweet = ' '.join(
            word for word in tweet.split(' ')
            if word and word[0] not in ('@', '#') and 'http' not in word
        ).strip()
        tokens = preprocess_string(new_tweet)
        if len(tokens) < 2:
            # The full gensim pipeline left too little; retry with only
            # lower-casing, stemming and punctuation stripping.
            tokens = strip_punctuation(
                stem_text(new_tweet.lower())).strip().split()
        tweets.append(tokens)
        sys.stdout.write("\r%d tweet(s) pre-processed out of %d\r" % (
            i + 1, total))
        sys.stdout.flush()

    print("Cleaning data...")
    # Keep only tweets that still have at least two tokens, together with
    # their labels (no intermediate NumPy copy of the corpus is needed).
    data = [(tweet, label) for tweet, label in zip(tweets, labels)
            if len(tweet) >= 2]
    if not data:
        # zip(*[]) cannot be unpacked into two names.
        return ((), ())

    # Shuffle the dataset; `data` is a real list so it can be shuffled
    # in place.
    np.random.shuffle(data)
    tweets, labels = zip(*data)
    return (tweets, labels)
def create_word2vec(tweets):
    """Train a fresh Word2Vec model on `tweets` and save it to disk.

    Args:
        tweets: the tokenised tweets handed to `build_vocab` and `train`.

    Returns:
        The trained model, which is also saved as 'model_word2vec' in the
        current working directory.
    """
    hyperparams = dict(size=32, alpha=0.1, window=2, min_count=0, workers=8,
                       min_alpha=0.01)
    model = Word2Vec(**hyperparams)
    print("Created Word2Vec model\nBuilding vocabulary...")

    model.build_vocab(tweets)
    print("Training...")

    model.train(tweets, total_examples=model.corpus_count, epochs=10)
    print("Trained")

    model.save('model_word2vec')
    print("Model saved")
    return model
def get_word2vec(tweets=None):
    """Return a Word2Vec model, loading a saved one or training a new one.

    If 'model_word2vec' exists in the current directory the user is asked
    whether to load it; any answer other than one of the recognised "no"
    spellings loads it. Otherwise a new model is trained via
    `create_word2vec`, on `tweets` if given or on the output of `export()`
    when `tweets` is empty or None.

    Args:
        tweets: optional tokenised tweets to train on.

    Returns:
        The loaded or newly trained Word2Vec model.
    """
    refusals = ('n', 'no', 'nah', 'nono', 'nahi', 'nein')
    saved_model_present = 'model_word2vec' in os.listdir('.')

    if saved_model_present:
        answer = raw_input(
            'Word2Vec model found. Do you want to load it? (Y/n): ')
        if answer.lower() not in refusals:
            print("Loading model...")
            loaded = Word2Vec.load('./model_word2vec')
            print("Loaded model")
            return loaded

    # No saved model, or the user declined it: train from scratch.
    if not tweets:
        tweets = export()[0]  # only the tweets are needed, not the labels
    return create_word2vec(tweets)
def init_with_wv(tweets=None, labels=None, wv_model=None, type_data='train'):
    """Turn tokenised tweets into a padded array of word vectors.

    Args:
        tweets: optional tokenised tweets; loaded with `export(type_data)`
            when both `tweets` and `labels` are omitted.
        labels: optional labels matching `tweets`.
        wv_model: optional Word2Vec model; obtained with `get_word2vec`
            when omitted (trained on `tweets` only for the 'train' split).
        type_data: 'train' or 'test'. For 'train' every tweet is padded to
            the longest tweet; otherwise a fixed length of 40 is used.

    Returns:
        ``(tweets_wv, labels)`` as NumPy arrays, where `tweets_wv` has shape
        ``(len(tweets), max_tweet_len, wv_model.vector_size)``. Returns None
        (after printing a message) when only one of `tweets` / `labels` is
        supplied.
    """
    if not tweets and not labels:
        tweets, labels = export(type_data)
    elif not (tweets and labels):
        print("One of tweets or labels given, but not the other")
        return

    if wv_model is None:
        # Only the training split may be used to fit a new embedding.
        wv_model = get_word2vec(tweets if type_data == 'train' else None)

    print("Replacing words with word vectors...")
    if type_data == 'train':
        max_tweet_len = max(len(tweet) for tweet in tweets)
    else:
        max_tweet_len = 40  # Empirically obtained :P

    # Membership is tested against the vocab dict itself (O(1)) rather than
    # a list of its keys.
    vocab = wv_model.wv.vocab
    # Padding must have the same width as the word vectors, otherwise the
    # final array is ragged.
    padding = np.zeros(wv_model.vector_size)
    total = len(tweets)

    tweets_wv = []
    # TODO: Replace following loop with tensorflow embedding lookup
    for tweet_num, tweet in enumerate(tweets):
        current_tweet = [wv_model.wv[word] for word in tweet if word in vocab]
        # Truncate over-long tweets (possible with the fixed test length)
        # and pad short ones so every row has exactly max_tweet_len vectors.
        del current_tweet[max_tweet_len:]
        missing = max_tweet_len - len(current_tweet)
        current_tweet.extend([padding] * missing)
        tweets_wv.append(current_tweet)
        sys.stdout.write("\r%d tweet(s) replaced out of %d\r" % (
            tweet_num + 1, total))
        sys.stdout.flush()
    print("\nReplaced words with word vectors")

    del tweets
    tweets_wv = np.array(tweets_wv)
    labels = np.array(labels)
    return (tweets_wv, labels)
def create_nn(max_tweet_len=None):
    """Build and compile the LSTM sentiment classifier.

    Args:
        max_tweet_len: number of time steps per tweet. An error message is
            printed when it is None, but the model is still built.

    Returns:
        A compiled Sequential model: LSTM(128) over inputs of shape
        (max_tweet_len, 32), followed by a single sigmoid unit.
    """
    if max_tweet_len is None:
        print("Error: Please specify max tweet length")

    network = Sequential()
    stack = (
        LSTM(128, input_shape=(max_tweet_len, 32)),
        Dense(1, activation='sigmoid'),
    )
    for layer in stack:
        network.add(layer)

    network.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    print("Created neural network model")
    return network
def get_nn(max_tweet_len=None):
    """Return the neural network, loading a saved one or creating a new one.

    If 'model_nn.h5' exists in the current directory the user is asked
    whether to load it; any answer other than one of the recognised "no"
    spellings loads it. In every other case a new model is built with
    `create_nn(max_tweet_len)`.

    Args:
        max_tweet_len: number of time steps per tweet, forwarded to
            `create_nn` when a new model has to be created.

    Returns:
        The loaded or newly created model.
    """
    if 'model_nn.h5' in os.listdir('.'):
        # The original prompt lacked the space between "load" and "it".
        response = raw_input(
            'Neural network model found. Do you want to load it? (Y/n): ')
        if response.lower() not in ['n', 'no', 'nah', 'nono', 'nahi', 'nein']:
            print("Loading model...")
            nn_model = load_model('model_nn.h5')
            print("Loaded model")
            return nn_model
    # No saved model, or the user declined it. The original fallback passed
    # an undefined name (`vocab_len`) here, which raised NameError.
    return create_nn(max_tweet_len)
def train_nn(tweets=None, labels=None, nn_model=None):
    """Train the network, save it, and print its evaluation on the test set.

    Args:
        tweets: optional word-vector array as returned by `init_with_wv`;
            built with `init_with_wv()` when both `tweets` and `labels` are
            omitted.
        labels: optional labels matching `tweets`.
        nn_model: optional model to train; obtained with `get_nn` when
            omitted.

    Returns:
        None. The trained model is saved as 'model_nn.h5' and the result of
        `evaluate` on the test split is printed. When only one of `tweets` /
        `labels` is supplied a message is printed and nothing is trained.
    """
    # Compare against None explicitly: the inputs are NumPy arrays, whose
    # truth value is ambiguous and raises ValueError.
    if tweets is None and labels is None:
        tweets, labels = init_with_wv()
    elif tweets is None or labels is None:
        print("One of tweets or labels given, but not the other")
        return

    if nn_model is None:
        max_tweet_len = max([len(tweet) for tweet in tweets])
        nn_model = get_nn(max_tweet_len)

    # Callbacks (extra features)
    tb_callback = TensorBoard(log_dir='./Tensorboard/' + str(time.time()))
    early_stop = EarlyStopping(monitor='loss', min_delta=0.1, patience=10)
    lr_reducer = ReduceLROnPlateau(monitor='loss', factor=0.5, min_lr=0.00001,
                                   patience=3, epsilon=0.2)
    nn_model.fit(tweets, labels, epochs=10, batch_size=32,
                 callbacks=[tb_callback, early_stop, lr_reducer],
                 validation_split=0.2)
    nn_model.save('model_nn.h5')
    print("Saved model")

    # Release the training data before the test split is loaded.
    del tweets
    del labels

    # `init_with_wv` returns exactly (tweets, labels); the original called a
    # non-existent `init_with_vocab` and unpacked three values.
    tweets_test, labels_test = init_with_wv(type_data='test')
    print(nn_model.evaluate(tweets_test, labels_test, batch_size=32))
# Script entry point: run the whole pipeline (data export, embedding,
# training, evaluation) with default arguments. There is no __main__ guard,
# so this also runs when the module is imported.
train_nn()