# model.py
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Require a GPU and enable memory growth so TensorFlow does not grab
# all GPU memory at startup.
physical_devices = tf.config.experimental.list_physical_devices("GPU")
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
tf.config.experimental.set_memory_growth(physical_devices[0], True)

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Lambda, Embedding, Dropout, Conv1D, Dense, Activation)
from sklearn.model_selection import train_test_split
from sklearn import metrics

from pooling import MaskGlobalMaxPooling1D
from pooling import MaskGlobalAveragePooling1D
from dataset import SimpleTokenizer, find_best_maxlen
from dataset import load_THUCNews_title_label
from dataset import load_weibo_senti_100k
from dataset import load_simplifyweibo_4_moods
from dataset import load_hotel_comment
from textcolor import print_color_text
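# Load the hotel-comment sentiment data and hold out 20% for testing.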
X, y, classes = load_hotel_comment()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=7384672)
num_classes = len(classes)
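# Fit the vocabulary on the training texts only, then map them to id sequences.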
tokenizer = SimpleTokenizer()
tokenizer.fit(X_train)
X_train = tokenizer.transform(X_train)
# Derive the padding length from the training data.
maxlen = find_best_maxlen(X_train)
X_train = sequence.pad_sequences(
    X_train,
    maxlen=maxlen,
    dtype="int32",
    padding="post",
    truncating="post",
    value=0)
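# One-hot encode the integer labels for categorical cross-entropy.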
y_train = tf.keras.utils.to_categorical(y_train)
num_words = len(tokenizer)
embedding_dims = 128
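# Model: embedding -> Conv1D -> masked global max pooling -> dense softmax.
# A boolean mask marks non-padding positions so pooling ignores the padded tail.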
inputs = Input(shape=(maxlen,))
mask = Lambda(lambda x: tf.not_equal(x, 0))(inputs)
x = Embedding(num_words, embedding_dims,
              embeddings_initializer="normal",
              input_length=maxlen,
              mask_zero=True)(inputs)
x = Dropout(0.2)(x)
x = Conv1D(filters=128,
           kernel_size=3,
           padding="same",
           activation="relu",
           strides=1)(x)
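# MaskGlobalMaxPooling1D comes from the local pooling module; it is assumed
# to return the pooled features plus per-position weights (how much each
# timestep contributes to the max), which are visualized below.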
x, w = MaskGlobalMaxPooling1D()(x, mask=mask)
x = Dense(128)(x)
x = Dropout(0.2)(x)
x = Activation("relu")(x)
outputs = Dense(num_classes, activation="softmax")(x)
model = Model(inputs, outputs)
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])
model.summary()
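# Auxiliary model exposing the pooling weights for visualization.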
model_w_outputs = Model(inputs, w)
batch_size = 32
epochs = 2
callbacks = []
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks,
          validation_split=0.2)
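# The sklearn metrics import is otherwise unused; a minimal sketch of scoring
# the held-out split (assumes y_test holds integer class ids, as the
# comparison inside visualization() below suggests):
X_test_ids = sequence.pad_sequences(
    tokenizer.transform(X_test),
    maxlen=maxlen,
    dtype="int32",
    padding="post",
    truncating="post",
    value=0)
y_test_pred = np.argmax(model.predict(X_test_ids), axis=-1)
print("test accuracy:", metrics.accuracy_score(y_test, y_test_pred))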
# Invert the class mapping (name -> id) to recover names from predicted ids.
id_to_classes = {j: i for i, j in classes.items()}
# Show what the max pooling attends to: for each correctly classified test
# sample, print its characters colored by the per-position pooling weights.
def visualization():
    for sample, label in zip(X_test, y_test):
        sample_len = min(len(sample), maxlen)
        x = np.array(tokenizer.transform([sample]))
        x = sequence.pad_sequences(
            x,
            maxlen=maxlen,
            dtype="int32",
            padding="post",
            truncating="post",
            value=0)
        y_pred = model.predict(x)[0]
        y_pred_id = np.argmax(y_pred)
        # skip samples the model misclassifies
        if y_pred_id != label:
            continue
        # per-position pooling weights for this sample
        weights = model_w_outputs.predict(x)[0]
        # print(sample, "=>", id_to_classes[y_pred_id])
        # print(weights.flatten() * len(sample))
        weights = weights.flatten()[:sample_len]
        print_color_text(sample, weights)
        print(" =>", id_to_classes[y_pred_id])
        input()  # press Enter to advance to the next sample

visualization()