Skip to content

Commit a5eeacb

Browse files
committed
added implementation for ada boost, decision tree and neural networks as well as updated rbad support
1 parent 842160a commit a5eeacb

File tree

1 file changed

+286
-18
lines changed

1 file changed

+286
-18
lines changed

dmonscikit/dmonscilearnclassification.py

Lines changed: 286 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from util import str2Bool
2020
import glob
2121
from util import ut2hum
22+
import itertools
2223

2324

2425
class SciClassification:
@@ -64,11 +65,61 @@ def detect(self, method, model, data):
6465
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
6566
dpredict = 0
6667
elif isinstance(smodel, AdaBoostClassifier):
67-
print "TODO" #TODO
68+
print "Detected AdaBoost model"
69+
print "base_estimator -> %s" % smodel.base_estimator
70+
print "n_estimators -> %s" % smodel.n_estimators
71+
print "Learning_rate -> %s" % smodel.learning_rate
72+
print "Algorithm -> %s" % smodel.algorithm
73+
print "Random State -> %s" % smodel.random_state
74+
try:
75+
dpredict = smodel.predict(self.df)
76+
print "AdaBoost Prediction Array -> %s" % str(dpredict)
77+
except Exception as inst:
78+
logger.error('[%s] : [ERROR] Error while fitting AdaBoost model to event with %s and %s',
79+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
80+
sys.exit(1)
6881
elif isinstance(smodel, DecisionTreeClassifier):
69-
print "TODO" #TODO
82+
print "Detected Decision Tree model"
83+
print "Criterion -> %s" % smodel.criterion
84+
print "Splitter -> %s" % smodel.splitter
85+
print "Max_Depth -> %s" % smodel.max_depth
86+
print "Min_sample_split -> %s " % smodel.min_samples_split
87+
print "Min_sample_leaf -> %s " % smodel.min_samples_leaf
88+
print "Min_weight_fraction_leaf -> %s " % smodel.min_weight_fraction_leaf
89+
print "Max_Features -> %s" % smodel.max_features
90+
print "Random_state -> %s " % smodel.random_state
91+
print "Max_leaf_nodes -> %s " % smodel.max_leaf_nodes
92+
print "Min_impurity_split -> %s " % smodel.min_impurity_split
93+
print "Class_weight -> %s " % smodel.class_weight
94+
try:
95+
dpredict = smodel.predict(self.df)
96+
print "Decision Tree Prediction Array -> %s" % str(dpredict)
97+
except Exception as inst:
98+
logger.error('[%s] : [ERROR] Error while fitting Decision Tree model to event with %s and %s',
99+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
100+
inst.args)
101+
sys.exit(1)
102+
70103
elif isinstance(smodel, MLPClassifier):
71-
print "TODO" #TODO
104+
print "Detected Neural Network model"
105+
print "Hidden Layer size -> %s" % str(smodel.hidden_layer_sizes)
106+
print "Activation -> %s" % smodel.activation
107+
print "Solver -> %s" % smodel.solver
108+
print "Alpha -> %s" % smodel.alpha
109+
print "Batch Size -> %s" % smodel.batch_size
110+
print "Learning rate -> %s" % smodel.learning_rate
111+
print "Max Iterations -> %s" % smodel.max_iter
112+
print "Shuffle -> %s" % smodel.shuffle
113+
print "Momentum -> %s" % smodel.momentum
114+
print "Epsilon -> %s" % smodel.epsilon
115+
try:
116+
dpredict = smodel.predict(self.df)
117+
print "MLP Prediction Array -> %s" % str(dpredict)
118+
except Exception as inst:
119+
logger.error('[%s] : [ERROR] Error while fitting MLP model to event with %s and %s',
120+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
121+
inst.args)
122+
sys.exit(1)
72123
else:
73124
logger.error('[%s] : [ERROR] Unsupported model loaded: %s!',
74125
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(smodel))
@@ -99,36 +150,253 @@ def detect(self, method, model, data):
99150
str(anomaliesDict))
100151
return anomaliesDict
101152

102-
def score(self):
103-
return True
153+
def score(self, model, X, y):
154+
return model.score(X, y)
104155

105-
def compare(self):
106-
return True
156+
def compare(self, modelList, X, y):
157+
scores = []
158+
for model in modelList:
159+
scores.append(model.score(X,y))
160+
logger.info('[%s] : [INFO] Best performing model score is -> %s',
161+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), max(scores))
162+
# for a, b in itertools.combinations(modelList, 2):
163+
# a.score(X, y)
164+
# b.score(X, y)
165+
return scores.index(max(scores))
107166

108-
def crossvalid(self):
109-
return True
167+
def crossvalid(self, model, X, y, kfold):
168+
return model_selection.cross_val_score(model, X, y, cv=kfold)
110169

111170
def naiveBayes(self):
112171
return True
113172

114173
def adaBoost(self, settings, data=None, dropna=True):
174+
if "n_estimators" not in settings:
175+
print "Received settings for Ada Boost are %s invalid!" % str(settings)
176+
logger.error('[%s] : [ERROR] Received settings for Ada Boost %s are invalid',
177+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings))
178+
sys.exit(1)
179+
dtallowedSettings = ["n_estimators", "learning_rate"]
180+
for k, v in settings.iteritems():
181+
if k in dtallowedSettings:
182+
logger.info('[%s] : [INFO] Ada Boost %s set to %s',
183+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
184+
print "Ada Boost %s set to %s" % (k, v)
185+
186+
if not isinstance(self.export, str):
187+
mname = 'default'
188+
else:
189+
mname = self.export
115190
df = self.__loadData(data, dropna)
116191
features = df.columns[:-1]
117192
X = df[features]
118193
y = df.iloc[:, -1].values
119194
seed = 7
120-
num_trees = 500
195+
# num_trees = 500
121196
kfold = model_selection.KFold(n_splits=10, random_state=seed)
122197
print kfold
123-
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
124-
results = model_selection.cross_val_score(model, X, y, cv=kfold)
125-
model.fit(X, y)
126-
print results.mean()
127-
print model.score(X, y)
128-
return True
198+
ad = AdaBoostClassifier(n_estimators=settings['n_estimators'], learning_rate=settings['learning_rate'],
199+
random_state=seed)
200+
if self.validratio:
201+
trainSize = 1.0 - self.validratio
202+
print "Ada Boost training to validation ratio set to: %s" % str(self.validratio)
203+
logger.info('[%s] : [INFO] Ada Boost training to validation ratio set to: %s',
204+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(self.validratio))
205+
d_train, d_test, f_train, f_test = self.__dataSplit(X, y, testSize=self.validratio, trainSize=trainSize)
206+
ad.fit(d_train, f_train)
207+
predict = ad.predict(d_train)
208+
print "Prediction for Ada Boost Training:"
209+
print predict
129210

130-
def neuralNet(self):
131-
return True
211+
print "Actual labels of training set:"
212+
print f_train
213+
214+
predProb = ad.predict_proba(d_train)
215+
print "Prediction probabilities for Ada Boost Training:"
216+
print predProb
217+
218+
score = ad.score(d_train, f_train)
219+
print "Ada Boost Training Score: %s" % str(score)
220+
logger.info('[%s] : [INFO] Ada Boost training score: %s',
221+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(score))
222+
223+
feature_imp = list(zip(d_train, ad.feature_importances_))
224+
print "Feature importance Ada Boost Training: "
225+
print list(zip(d_train, ad.feature_importances_))
226+
logger.info('[%s] : [INFO] Ada Boost feature importance: %s',
227+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(feature_imp))
228+
229+
pred_valid = ad.predict(d_test)
230+
print "Ada Boost Validation set prediction: "
231+
print pred_valid
232+
print "Actual values of validation set: "
233+
print d_test
234+
score_valid = ad.score(d_test, f_test)
235+
print "Ada Boost validation set score: %s" % str(score_valid)
236+
logger.info('[%s] : [INFO] Ada Boost validation score: %s',
237+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(score_valid))
238+
else:
239+
ad.fit(X, y)
240+
predict = ad.predict(X)
241+
print "Prediction for Ada Boost Training:"
242+
print predict
243+
244+
print "Actual labels of training set:"
245+
print y
246+
247+
predProb = ad.predict_proba(X)
248+
print "Prediction probabilities for Ada Boost Training:"
249+
print predProb
250+
251+
score = ad.score(X, y)
252+
print "Ada Boost Training Score: %s" % str(score)
253+
254+
fimp = list(zip(X, ad.feature_importances_))
255+
print "Feature importance Ada Boost Training: "
256+
print fimp
257+
dfimp = dict(fimp)
258+
dfimp = pd.DataFrame(dfimp.items(), columns=['Metric', 'Importance'])
259+
sdfimp = dfimp.sort('Importance', ascending=False)
260+
dfimpCsv = 'Feature_Importance_%s.csv' % mname
261+
sdfimp.to_csv(os.path.join(self.modelDir, dfimpCsv))
262+
if self.validation is None:
263+
logger.info('[%s] : [INFO] Validation is set to None',
264+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
265+
# return True
266+
else:
267+
vfile = os.path.join(self.dataDir, self.validation)
268+
logger.info('[%s] : [INFO] Validation data file is set to %s',
269+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(vfile))
270+
if not os.path.isfile(vfile):
271+
print "Validation file %s not found" % vfile
272+
logger.error('[%s] : [ERROR] Validation file %s not found',
273+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(vfile))
274+
else:
275+
df_valid = pd.read_csv(vfile)
276+
if dropna:
277+
df_valid = df_valid.dropna()
278+
features_valid = df_valid.columns[:-1]
279+
X_valid = df_valid[features_valid]
280+
y_valid = df_valid.iloc[:, -1].values
281+
pred_valid = ad.predict(X_valid)
282+
print "Ada Boost Validation set prediction: "
283+
print pred_valid
284+
print "Actual values of validation set: "
285+
print y_valid
286+
score_valid = ad.score(X_valid, y_valid)
287+
print "Ada Boost set score: %s" % str(score_valid)
288+
# return True
289+
self.__serializemodel(ad, 'AdaBoost', mname)
290+
return ad
291+
292+
def neuralNet(self, settings, data=None, dropna=True):
293+
if "activation" not in settings:
294+
print "Received settings for Neural Networks are %s invalid!" % str(settings)
295+
logger.error('[%s] : [ERROR] Received settings for Neural Networks %s are invalid',
296+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(settings))
297+
sys.exit(1)
298+
299+
rfallowedSettings = ["max_iter", "activation", "solver", "batch_size", "learning_rate",
300+
"momentum", "alpha"]
301+
302+
for k, v in settings.iteritems():
303+
if k in rfallowedSettings:
304+
logger.info('[%s] : [INFO] Neural Network %s set to %s',
305+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
306+
print "Neural Network %s set to %s" % (k, v)
307+
308+
if not isinstance(self.export, str):
309+
mname = 'default'
310+
else:
311+
mname = self.export
312+
313+
df = self.__loadData(data, dropna)
314+
features = df.columns[:-1]
315+
X = df[features]
316+
y = df.iloc[:, -1].values
317+
318+
mlp = MLPClassifier(hidden_layer_sizes=(50, 20), max_iter=settings['max_iter'],
319+
activation=settings['activation'],
320+
solver=settings['solver'], batch_size=settings['batch_size'],
321+
learning_rate=settings['learning_rate'], momentum=settings['momentum'],
322+
alpha=settings['alpha'])
323+
324+
if self.validratio:
325+
trainSize = 1.0 - self.validratio
326+
print "Neural Network training to validation ratio set to: %s" % str(self.validratio)
327+
logger.info('[%s] : [INFO] Neural Network training to validation ratio set to: %s',
328+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(self.validratio))
329+
d_train, d_test, f_train, f_test = self.__dataSplit(X, y, testSize=self.validratio, trainSize=trainSize)
330+
mlp.fit(d_train, f_train)
331+
predict = mlp.predict(d_train)
332+
print "Prediction for Neural Network Training:"
333+
print predict
334+
335+
print "Actual labels of training set:"
336+
print f_train
337+
338+
predProb = mlp.predict_proba(d_train)
339+
print "Prediction probabilities for Neural Network Training:"
340+
print predProb
341+
342+
score = mlp.score(d_train, f_train)
343+
print "Neural Network Training Score: %s" % str(score)
344+
logger.info('[%s] : [INFO] Neural Network training score: %s',
345+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(score))
346+
pred_valid = mlp.predict(d_test)
347+
print "Neural Network Validation set prediction: "
348+
print pred_valid
349+
print "Actual values of validation set: "
350+
print d_test
351+
score_valid = mlp.score(d_test, f_test)
352+
print "Neural Network validation set score: %s" % str(score_valid)
353+
logger.info('[%s] : [INFO] Neural Network validation score: %s',
354+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(score_valid))
355+
else:
356+
mlp.fit(X, y)
357+
predict = mlp.predict(X)
358+
print "Prediction for Neural Network Training:"
359+
print predict
360+
361+
print "Actual labels of training set:"
362+
print y
363+
364+
predProb = mlp.predict_proba(X)
365+
print "Prediction probabilities for Neural Network Training:"
366+
print predProb
367+
368+
score = mlp.score(X, y)
369+
print "Neural Network Training Score: %s" % str(score)
370+
371+
if self.validation is None:
372+
logger.info('[%s] : [INFO] Validation is set to None',
373+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
374+
# return True
375+
else:
376+
vfile = os.path.join(self.dataDir, settings['validation'])
377+
logger.info('[%s] : [INFO] Validation data file is set to %s',
378+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(vfile))
379+
if not os.path.isfile(vfile):
380+
print "Validation file %s not found" % vfile
381+
logger.error('[%s] : [ERROR] Validation file %s not found',
382+
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(vfile))
383+
else:
384+
df_valid = pd.read_csv(vfile)
385+
if dropna:
386+
df_valid = df_valid.dropna()
387+
features_valid = df_valid.columns[:-1]
388+
X_valid = df_valid[features_valid]
389+
y_valid = df_valid.iloc[:, -1].values
390+
pred_valid = mlp.predict(X_valid)
391+
print "Neural Network Validation set prediction: "
392+
print pred_valid
393+
print "Actual values of validation set: "
394+
print y_valid
395+
score_valid = mlp.score(X_valid, y_valid)
396+
print "Neural Network validation set score: %s" % str(score_valid)
397+
# return True
398+
self.__serializemodel(mlp, 'NeuralNetwork', mname)
399+
return mlp
132400

133401
def decisionTree(self, settings, data=None, dropna=True):
134402
if "splitter" not in settings:

0 commit comments

Comments
 (0)