-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
55 lines (41 loc) · 2.42 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import preprocessing, word2vec, pipeline, helpers, cross_validation, plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from matplotlib import pyplot as plt
# End-to-end experiment script: load the positive/negative tweet files,
# tokenize them, train a word2vec model on the tokens, turn each tweet into
# a vector, then compare three scikit-learn classifiers with repeated k-fold
# cross-validation and plot the resulting scores with error bars.

# Value passed to BOTH word2vec.create_train_w2v and word2vec.tweets2vec.
# Presumably the embedding dimension (TODO confirm against word2vec.py);
# it is defined once so the two calls cannot drift apart.
W2V_SIZE = 250
# Third positional argument of word2vec.create_train_w2v. Its meaning is not
# visible from this script (TODO confirm against word2vec.py).
W2V_TRAIN_PARAM = 30
# Forwarded as n= and k= to cross_validation.n_kfold_cross_validation.
CV_N = 5
CV_K = 5

print('Loading the dataset...')
tweets, size_pos, size_neg = preprocessing.load_tweets("./data/train_pos.txt", "./data/train_neg.txt")
pred = preprocessing.predictions(size_pos, size_neg)

print('Tokenizing...')
tokenized_tweets = [preprocessing.tokenize(tweet) for tweet in tweets]

# Disabled alternative workflow (hold-out split / external test file),
# kept for reference only:
#x_train, x_test, y_train, y_test = train_test_split(tokenized_tweets, pred, test_size = 0.2)
#x_test, ids_test = preprocessing.load_test_tweets('./test_data.txt')

print('Training the W2V model...')
model = word2vec.create_train_w2v(tokenized_tweets, W2V_SIZE, W2V_TRAIN_PARAM)

print('Converting tweets to vectors...')
x_train_vec = word2vec.tweets2vec(tokenized_tweets, W2V_SIZE, model)
#x_test_vec = word2vec.tweets2vec(x_test, W2V_SIZE, model)

print('Training classifier...')
# Disabled single-model hold-out evaluation, kept for reference only:
#clf.fit(x_train_vec, y_train)
#print('Accuracy: ', clf.score(x_test_vec, y_test))
#clf = pipeline.pipeline_model()

# One entry per model to compare: (legend label, estimator, extra errorbar
# kwargs). The first entry passes no linestyle so matplotlib's default is
# used, exactly as before.
candidates = [
    ('LogisticRegression', LogisticRegression(solver='lbfgs'), {}),
    ('MLPClassifier',
     MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(64, 4)),
     {'linestyle': '--'}),
    ('LinearSVC', LinearSVC(), {'linestyle': ':'}),
]

# Run every cross-validation before plotting anything, so the blocking
# plt.show() call only happens once all scores are available.
results = []
for label, estimator, line_kwargs in candidates:
    # The two returned sequences are used below as y values and error bars;
    # presumably one mean accuracy / std per repetition (TODO confirm against
    # cross_validation.py).
    cv_scores, cv_stds = cross_validation.n_kfold_cross_validation(
        x_train_vec, pred, estimator, n=CV_N, k=CV_K)
    results.append((label, cv_scores, cv_stds, line_kwargs))

for label, cv_scores, cv_stds, line_kwargs in results:
    plt.errorbar(x=list(range(len(cv_scores))), y=cv_scores, yerr=cv_stds,
                 marker='o', ecolor='orange', label=label, **line_kwargs)
plt.xlabel('K-fold indice')
plt.ylabel('Average accuracy')
plt.title('Mean accuracies of the k-fold cross-validations')
plt.legend()
plt.show()
#plot.plot_cross_validation(cross_val_scores, stds)

# Report the scores of each model, in the same order as they were evaluated.
for _label, cv_scores, _cv_stds, _line_kwargs in results:
    cross_validation.display_info(cv_scores)

# Disabled submission step; needs a fitted estimator `clf` and the test data
# from the disabled workflow above:
#helpers.create_csv_submission(ids_test, clf.predict(x_test), "submission.csv")