|
19 | 19 | from util import str2Bool |
20 | 20 | import glob |
21 | 21 | from util import ut2hum |
| 22 | +import itertools |
22 | 23 |
|
23 | 24 |
|
24 | 25 | class SciClassification: |
@@ -64,11 +65,61 @@ def detect(self, method, model, data): |
64 | 65 | datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args) |
65 | 66 | dpredict = 0 |
66 | 67 | elif isinstance(smodel, AdaBoostClassifier): |
67 | | - print "TODO" #TODO |
| 68 | + print "Detected AdaBoost model" |
| 69 | + print "base_estimator -> %s" % smodel.base_estimator |
| 70 | + print "n_estimators -> %s" % smodel.n_estimators |
| 71 | + print "Learning_rate -> %s" % smodel.learning_rate |
| 72 | + print "Algorithm -> %s" % smodel.algorithm |
| 73 | + print "Random State -> %s" % smodel.random_state |
| 74 | + try: |
| 75 | + dpredict = smodel.predict(self.df) |
| 76 | + print "AdaBoost Prediction Array -> %s" % str(dpredict) |
| 77 | + except Exception as inst: |
| 78 | + logger.error('[%s] : [ERROR] Error while fitting AdaBoost model to event with %s and %s', |
| 79 | + datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args) |
| 80 | + sys.exit(1) |
68 | 81 | elif isinstance(smodel, DecisionTreeClassifier): |
69 | | - print "TODO" #TODO |
| 82 | + print "Detected Decision Tree model" |
| 83 | + print "Criterion -> %s" % smodel.criterion |
| 84 | + print "Spliter -> %s" % smodel.splitter |
| 85 | + print "Max_Depth -> %s" % smodel.max_depth |
| 86 | + print "Min_sample_split -> %s " % smodel.min_samples_split |
| 87 | + print "Min_sample_leaf -> %s " % smodel.min_samples_leaf |
| 88 | + print "Min_weight_fraction_leaf -> %s " % smodel.min_weight_fraction_leaf |
| 89 | + print "Max_Features -> %s" % smodel.max_features |
| 90 | + print "Random_state -> %s " % smodel.random_state |
| 91 | + print "Max_leaf_nodes -> %s " % smodel.max_leaf_nodes |
| 92 | + print "Min_impurity_split -> %s " % smodel.min_impurity_split |
| 93 | + print "Class_weight -> %s " % smodel.class_weight |
| 94 | + try: |
| 95 | + dpredict = smodel.predict(self.df) |
| 96 | + print "Decision Tree Prediction Array -> %s" % str(dpredict) |
| 97 | + except Exception as inst: |
| 98 | + logger.error('[%s] : [ERROR] Error while fitting Decision Tree model to event with %s and %s', |
| 99 | + datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), |
| 100 | + inst.args) |
| 101 | + sys.exit(1) |
| 102 | + |
70 | 103 | elif isinstance(smodel, MLPClassifier): |
71 | | - print "TODO" #TODO |
| 104 | + print "Detected Neural Network model" |
| 105 | + print "Hidden Layer size -> %s" % str(smodel.hidden_layer_sizes) |
| 106 | + print "Activation -> %s" % smodel.activation |
| 107 | + print "Solver -> %s" % smodel.solver |
| 108 | + print "Alpha -> %s" % smodel.alpha |
| 109 | + print "Batch Size -> %s" % smodel.batch_size |
| 110 | + print "Learning rate -> %s" % smodel.learning_rate |
| 111 | + print "Max Iterations -> %s" % smodel.max_iter |
| 112 | + print "Shuffle -> %s" % smodel.shuffle |
| 113 | + print "Momentum -> %s" % smodel.momentum |
| 114 | + print "Epsilon -> %s" % smodel.epsilon |
| 115 | + try: |
| 116 | + dpredict = smodel.predict(self.df) |
| 117 | + print "MLP Prediction Array -> %s" % str(dpredict) |
| 118 | + except Exception as inst: |
| 119 | + logger.error('[%s] : [ERROR] Error while fitting MLP model to event with %s and %s', |
| 120 | + datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), |
| 121 | + inst.args) |
| 122 | + sys.exit(1) |
72 | 123 | else: |
73 | 124 | logger.error('[%s] : [ERROR] Unsuported model loaded: %s!', |
74 | 125 | datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(smodel)) |
@@ -99,36 +150,253 @@ def detect(self, method, model, data): |
99 | 150 | str(anomaliesDict)) |
100 | 151 | return anomaliesDict |
101 | 152 |
|
def score(self, model, X, y):
    """Return the score that ``model`` reports for ``X`` against ``y``.

    Pure delegation to ``model.score``; no instance state is read.

    :param model: object exposing ``score(X, y)``
    :param X: samples handed to ``model.score``
    :param y: expected labels handed to ``model.score``
    :return: whatever ``model.score(X, y)`` returns
    """
    model_score = model.score(X, y)
    return model_score
104 | 155 |
|
def compare(self, modelList, X, y):
    """Score every model on the same data and report which one did best.

    :param modelList: sequence of objects exposing ``score(X, y)``
    :param X: samples handed to each ``score`` call
    :param y: expected labels handed to each ``score`` call
    :return: position in ``modelList`` of the highest scoring model
        (the first one when several share the best score)
    :raises ValueError: if ``modelList`` is empty
    """
    scores = [model.score(X, y) for model in modelList]
    if not scores:
        raise ValueError("compare() requires at least one model")
    best_score = max(scores)
    logger.info('[%s] : [INFO] Best performing model score is -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), best_score)
    # The winning position must be looked up in ``scores``; searching
    # ``modelList`` for a score value never matches a model object.
    return scores.index(best_score)
107 | 166 |
|
def crossvalid(self, model, X, y, kfold):
    """Cross-validate ``model`` on ``X``/``y`` and return the per-fold scores.

    :param model: estimator handed to ``model_selection.cross_val_score``
    :param X: samples
    :param y: expected labels
    :param kfold: value passed as the ``cv`` argument
    :return: result of ``model_selection.cross_val_score``
    """
    fold_scores = model_selection.cross_val_score(model, X, y, cv=kfold)
    return fold_scores
110 | 169 |
|
def naiveBayes(self):
    """Placeholder: performs no training and always returns ``True``."""
    # No model is built here yet; callers only receive a constant flag.
    return True
113 | 172 |
|
def adaBoost(self, settings, data=None, dropna=True):
    """Train an AdaBoost classifier, report its scores and serialize it.

    The last column of the loaded data frame is used as the label and every
    other column as a feature. When ``self.validratio`` is set, the data is
    split into a training and a validation part. Otherwise the model is
    fitted on all rows, its feature importances are written to
    ``Feature_Importance_<name>.csv`` inside ``self.modelDir`` and, when
    ``self.validation`` names an existing file inside ``self.dataDir``, the
    model is scored against that file.

    :param settings: dict of classifier settings; ``n_estimators`` is
        required, ``learning_rate`` is optional (the estimator's own default
        applies when it is absent)
    :param data: forwarded to the data loader
    :param dropna: forwarded to the data loader; also drops incomplete rows
        of the validation file
    :return: the fitted ``AdaBoostClassifier``

    Terminates the process with exit status 1 when ``n_estimators`` is
    missing from ``settings``.
    """
    def _now():
        # Timestamp prefix shared by all log records emitted below.
        return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

    if "n_estimators" not in settings:
        print("Received settings for Ada Boost are %s invalid!" % str(settings))
        logger.error('[%s] : [ERROR] Received settings for Ada Boost %s are invalid',
                     _now(), str(settings))
        sys.exit(1)

    # Only recognised keys reach the estimator, so an optional key that is
    # missing no longer raises KeyError.
    allowed_settings = ("n_estimators", "learning_rate")
    ada_kwargs = {}
    for k, v in settings.items():
        if k in allowed_settings:
            ada_kwargs[k] = v
            logger.info('[%s] : [INFO] Ada Boost %s set to %s', _now(), k, v)
            print("Ada Boost %s set to %s" % (k, v))

    mname = self.export if isinstance(self.export, str) else 'default'

    df = self.__loadData(data, dropna)
    features = df.columns[:-1]
    X = df[features]
    y = df.iloc[:, -1].values
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    print(kfold)
    ad = AdaBoostClassifier(random_state=seed, **ada_kwargs)

    if self.validratio:
        trainSize = 1.0 - self.validratio
        print("Ada Boost training to validation ratio set to: %s" % str(self.validratio))
        logger.info('[%s] : [INFO] Ada Boost training to validation ratio set to: %s',
                    _now(), str(self.validratio))
        d_train, d_test, f_train, f_test = self.__dataSplit(X, y, testSize=self.validratio,
                                                            trainSize=trainSize)
        ad.fit(d_train, f_train)
        predict = ad.predict(d_train)
        print("Prediction for Ada Boost Training:")
        print(predict)

        print("Actual labels of training set:")
        print(f_train)

        predProb = ad.predict_proba(d_train)
        print("Prediction probabilities for Ada Boost Training:")
        print(predProb)

        score = ad.score(d_train, f_train)
        print("Ada Boost Training Score: %s" % str(score))
        logger.info('[%s] : [INFO] Ada Boost training score: %s', _now(), str(score))

        # Iterating the training frame yields its column names.
        feature_imp = list(zip(d_train, ad.feature_importances_))
        print("Feature importance Ada Boost Training: ")
        print(feature_imp)
        logger.info('[%s] : [INFO] Ada Boost feature importance: %s', _now(), str(feature_imp))

        pred_valid = ad.predict(d_test)
        print("Ada Boost Validation set prediction: ")
        print(pred_valid)
        print("Actual values of validation set: ")
        # Show the held-out labels (not the feature frame) next to the predictions.
        print(f_test)
        score_valid = ad.score(d_test, f_test)
        print("Ada Boost validation set score: %s" % str(score_valid))
        logger.info('[%s] : [INFO] Ada Boost validation score: %s', _now(), str(score_valid))
    else:
        ad.fit(X, y)
        predict = ad.predict(X)
        print("Prediction for Ada Boost Training:")
        print(predict)

        print("Actual labels of training set:")
        print(y)

        predProb = ad.predict_proba(X)
        print("Prediction probabilities for Ada Boost Training:")
        print(predProb)

        score = ad.score(X, y)
        print("Ada Boost Training Score: %s" % str(score))

        fimp = list(zip(X, ad.feature_importances_))
        print("Feature importance Ada Boost Training: ")
        print(fimp)
        dfimp = pd.DataFrame(list(dict(fimp).items()), columns=['Metric', 'Importance'])
        # DataFrame.sort was removed from pandas; sort_values replaces it.
        sdfimp = dfimp.sort_values('Importance', ascending=False)
        dfimpCsv = 'Feature_Importance_%s.csv' % mname
        sdfimp.to_csv(os.path.join(self.modelDir, dfimpCsv))

        # NOTE(review): the external validation file is handled only on this
        # "no split ratio" path - confirm this nesting is intended.
        if self.validation is None:
            logger.info('[%s] : [INFO] Validation is set to None', _now())
        else:
            vfile = os.path.join(self.dataDir, self.validation)
            logger.info('[%s] : [INFO] Validation data file is set to %s', _now(), str(vfile))
            if not os.path.isfile(vfile):
                print("Validation file %s not found" % vfile)
                logger.error('[%s] : [ERROR] Validation file %s not found', _now(), str(vfile))
            else:
                df_valid = pd.read_csv(vfile)
                if dropna:
                    df_valid = df_valid.dropna()
                features_valid = df_valid.columns[:-1]
                X_valid = df_valid[features_valid]
                y_valid = df_valid.iloc[:, -1].values
                pred_valid = ad.predict(X_valid)
                print("Ada Boost Validation set prediction: ")
                print(pred_valid)
                print("Actual values of validation set: ")
                print(y_valid)
                score_valid = ad.score(X_valid, y_valid)
                print("Ada Boost set score: %s" % str(score_valid))

    # NOTE(review): the model is stored under the 'DecisionTree' label, which
    # looks copied from the decision tree trainer. Kept as is because the
    # label the model loader expects is not visible here - confirm and rename.
    self.__serializemodel(ad, 'DecisionTree', mname)
    return ad
| 291 | + |
def neuralNet(self, settings, data=None, dropna=True):
    """Train an MLP classifier, report its scores and serialize it.

    The last column of the loaded data frame is used as the label and every
    other column as a feature. The network always uses hidden layers of 50
    and 20 units. When ``self.validratio`` is set, the data is split into a
    training and a validation part. Otherwise the model is fitted on all
    rows and, when ``self.validation`` is not ``None``, scored against a
    validation file inside ``self.dataDir``.

    :param settings: dict of classifier settings; ``activation`` is
        required, ``max_iter``, ``solver``, ``batch_size``,
        ``learning_rate``, ``momentum`` and ``alpha`` are optional (the
        estimator's own defaults apply when absent). An optional
        ``validation`` entry overrides ``self.validation`` as the
        validation file name.
    :param data: forwarded to the data loader
    :param dropna: forwarded to the data loader; also drops incomplete rows
        of the validation file
    :return: the fitted ``MLPClassifier``

    Terminates the process with exit status 1 when ``activation`` is
    missing from ``settings``.
    """
    def _now():
        # Timestamp prefix shared by all log records emitted below.
        return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

    if "activation" not in settings:
        print("Received settings for Neural Networks are %s invalid!" % str(settings))
        logger.error('[%s] : [ERROR] Received settings for Neural Networks %s are invalid',
                     _now(), str(settings))
        sys.exit(1)

    # Only recognised keys reach the estimator, so an optional key that is
    # missing no longer raises KeyError.
    allowed_settings = ("max_iter", "activation", "solver", "batch_size", "learning_rate",
                        "momentum", "alpha")
    mlp_kwargs = {}
    for k, v in settings.items():
        if k in allowed_settings:
            mlp_kwargs[k] = v
            logger.info('[%s] : [INFO] Neural Network %s set to %s', _now(), k, v)
            print("Neural Network %s set to %s" % (k, v))

    mname = self.export if isinstance(self.export, str) else 'default'

    df = self.__loadData(data, dropna)
    features = df.columns[:-1]
    X = df[features]
    y = df.iloc[:, -1].values

    mlp = MLPClassifier(hidden_layer_sizes=(50, 20), **mlp_kwargs)

    if self.validratio:
        trainSize = 1.0 - self.validratio
        print("Neural Network training to validation ratio set to: %s" % str(self.validratio))
        logger.info('[%s] : [INFO] Neural Network training to validation ratio set to: %s',
                    _now(), str(self.validratio))
        d_train, d_test, f_train, f_test = self.__dataSplit(X, y, testSize=self.validratio,
                                                            trainSize=trainSize)
        mlp.fit(d_train, f_train)
        predict = mlp.predict(d_train)
        print("Prediction for Neural Network Training:")
        print(predict)

        print("Actual labels of training set:")
        print(f_train)

        predProb = mlp.predict_proba(d_train)
        print("Prediction probabilities for Neural Network Training:")
        print(predProb)

        score = mlp.score(d_train, f_train)
        print("Neural Network Training Score: %s" % str(score))
        logger.info('[%s] : [INFO] Neural Network training score: %s', _now(), str(score))

        pred_valid = mlp.predict(d_test)
        print("Neural Network Validation set prediction: ")
        print(pred_valid)
        print("Actual values of validation set: ")
        # Show the held-out labels (not the feature frame) next to the predictions.
        print(f_test)
        score_valid = mlp.score(d_test, f_test)
        print("Neural Network validation set score: %s" % str(score_valid))
        logger.info('[%s] : [INFO] Neural Network validation score: %s', _now(), str(score_valid))
    else:
        mlp.fit(X, y)
        predict = mlp.predict(X)
        print("Prediction for Neural Network Training:")
        print(predict)

        print("Actual labels of training set:")
        print(y)

        predProb = mlp.predict_proba(X)
        print("Prediction probabilities for Neural Network Training:")
        print(predProb)

        score = mlp.score(X, y)
        print("Neural Network Training Score: %s" % str(score))

        # NOTE(review): the external validation file is handled only on this
        # "no split ratio" path - confirm this nesting is intended.
        if self.validation is None:
            logger.info('[%s] : [INFO] Validation is set to None', _now())
        else:
            # The gate above tests self.validation, so fall back to it when
            # the settings carry no explicit 'validation' entry.
            vfile = os.path.join(self.dataDir, settings.get('validation', self.validation))
            logger.info('[%s] : [INFO] Validation data file is set to %s', _now(), str(vfile))
            if not os.path.isfile(vfile):
                print("Validation file %s not found" % vfile)
                logger.error('[%s] : [ERROR] Validation file %s not found', _now(), str(vfile))
            else:
                df_valid = pd.read_csv(vfile)
                if dropna:
                    df_valid = df_valid.dropna()
                features_valid = df_valid.columns[:-1]
                X_valid = df_valid[features_valid]
                y_valid = df_valid.iloc[:, -1].values
                pred_valid = mlp.predict(X_valid)
                print("Neural Network Validation set prediction: ")
                print(pred_valid)
                print("Actual values of validation set: ")
                print(y_valid)
                score_valid = mlp.score(X_valid, y_valid)
                print("Neural Network validation set score: %s" % str(score_valid))

    # NOTE(review): the model is stored under the 'RandomForest' label, which
    # looks copied from the random forest trainer. Kept as is because the
    # label the model loader expects is not visible here - confirm and rename.
    self.__serializemodel(mlp, 'RandomForest', mname)
    return mlp
132 | 400 |
|
133 | 401 | def decisionTree(self, settings, data=None, dropna=True): |
134 | 402 | if "splitter" not in settings: |
|
0 commit comments