# ensemble-binary-classification/main.py at master · arezaz/ensemble-binary-classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
from os.path import join as pjoin
import pandas as pd
import pickle
import time
from tqdm import tqdm

from sklearn.model_selection import train_test_split

from utils import gen_data, iterModel, MetaClassifier, Sampling


# End-to-end training script: for each epoch, load the challenge and evaluation
# spreadsheets, resample the training data, tune XGBoost and LightGBM, build the
# meta-classifier, predict on the evaluation set, and write every artifact to a
# fresh timestamped folder under "Sandbox".

EPOCHS = 3  # arbitrary number of times that the flow will work through the entire dataset.

# Loop-invariant configuration, defined once rather than on every epoch.
PATH_TRAIN = pjoin("Data", "dataset-challenge.xlsx")
PATH_EVAL = pjoin("Data", "dataset-evaluation.xlsx")
method = 'SMOTEENN'  # sampling method
max_evals = 15  # max iterations for tuning hyperparameters

for epoch in tqdm(range(EPOCHS)):

    # ------------------------------ I) Load & Process Data ----------------------------------- #

    #    A) Train Set: generate train dataset from challenge data
    # NOTE(review): the data is regenerated on every epoch, as in the original flow;
    # if gen_data is deterministic these two calls could be hoisted out of the loop.
    data_dict = gen_data(PATH_TRAIN, 'train')
    X, y = data_dict['X'], data_dict['y']

    #    B) Test Set: generate test dataset from the challenge data (optional)
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
    #train_dict = {'X_train': X_train, 'y_train':y_train}
    #test_dict = {'X_test': X_test, 'y_test':y_test}

    #    C) Evaluation Set: generate evaluation dataset
    eval_dict = gen_data(PATH_EVAL, 'eval')
    X_eval = eval_dict['X']

    # No held-out test split is used: the full challenge set is the train set.
    # A fresh (empty) test_dict is created per epoch because it is handed to
    # project helpers whose handling of it is not visible here.
    test_dict = {}

    # ------------------------------ II) Sampling Imbalanced Data ----------------------------- #

    X_res, y_res = Sampling(X, y, method=method)
    train_dict = {'X_train': X_res, 'y_train': y_res}

    # --------------------------------- III) Model Training ---------------------------------- #

    # ---- iterate classification algorithms: XGBoost, LightGBM
    XGBoost_iters = iterModel(name='XGBoost', max_evals=max_evals, train_dict=train_dict, test_dict=test_dict)
    LightGBM_iters = iterModel(name='LightGBM', max_evals=max_evals, train_dict=train_dict, test_dict=test_dict)

    # ---- Build a soft-voting ensemble meta-classification model
    Results = MetaClassifier([XGBoost_iters, LightGBM_iters], train_dict, test_dict)

    # ---------------------------------- IV) Make Prediction ---------------------------------- #

    # ---- make predictions on evaluation set
    # 'scenario' is carried through as the row identifier and is not a model
    # input, so it is dropped once and the same feature frame feeds both calls.
    eval_features = X_eval.drop(columns='scenario')
    best_model = Results['BestModel']
    y_pred = best_model.predict(eval_features)
    y_proba = best_model.predict_proba(eval_features)

    # Build the output table without mutating X_eval: keep the identifier column
    # (renamed to the submission name) and attach the predictions to it.
    # Column 1 of predict_proba is stored as the score probability
    # (presumably the positive class -- confirm against the model's classes_).
    prediction_df = X_eval[['scenario']].rename(columns={'scenario': 'dataset_id'}).assign(
        prediction_score=y_pred,
        prediction_score_proba=y_proba[:, 1],
    )

    # ------------------------------------ V) Save Outputs ----------------------------------- #

    # Timestamp (second resolution) that tags this epoch's folder and files.
    timestr = f"{method}{time.strftime('-%Y%m%d-%H%M%S')}"

    PATH_SAVE = pjoin("Sandbox", f"Output-{timestr}")
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(PATH_SAVE, exist_ok=True)

    # ---- A) save evaluation set predictions
    # ------------ 1 - evaluation set prediction with submission format
    filename = f'prediction-{timestr}.csv'
    prediction_df[['dataset_id', 'prediction_score']].to_csv(pjoin(PATH_SAVE, filename), index=False)
    # ------------ 2 - evaluation set prediction also outputting predicted probabilities
    filename = f'prediction_proba-{timestr}.csv'
    prediction_df[['dataset_id', 'prediction_score', 'prediction_score_proba']].to_csv(
        pjoin(PATH_SAVE, filename), index=False
    )

    #  ---- B) train set performance metrics summary
    filename = f'metrics-{timestr}.csv'
    Results['Metrics'].to_csv(pjoin(PATH_SAVE, filename))

    # ---- C) metaclassifier performance on trainset
    filename = f'train_df_pred-{timestr}.csv'
    Results['train_df_pred'].to_csv(pjoin(PATH_SAVE, filename))

    # ---- D) meta-classification model
    filename = f'MetaClassifier-{timestr}.pkl'
    with open(pjoin(PATH_SAVE, filename), 'wb') as model_file:
        pickle.dump(best_model, model_file)