ML.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif
df = pd.read_csv('employee_attrition_dataset.csv')
# --- PREPARATION ---
# Drop the identifier column: it is unique per row and carries no predictive signal
df_prepared = df.drop('Employee_ID', axis=1)
# --- Label Encoding ---
columns_to_encode = ['Attrition', 'Overtime']
label_encoder = LabelEncoder()
print("--- Label Encoding ---")
for col in columns_to_encode:
    original_values = df_prepared[col].unique()
    df_prepared[col] = label_encoder.fit_transform(df_prepared[col])
    encoded_values = df_prepared[col].unique()
    mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    print(f" Encoded '{col}': Original unique values {original_values} -> Encoded unique values {encoded_values}. Mapping: {mapping}")
# Attrition: No=0, Yes=1
# Overtime: No=0, Yes=1
# --- One-Hot Encoding ---
print("\n--- One-Hot Encoding ---")
columns_to_encode_onehot = ['Marital_Status', 'Department']
columns_exist_for_onehot = [col for col in columns_to_encode_onehot if col in df_prepared.columns]
df_prepared = pd.get_dummies(df_prepared, columns=columns_exist_for_onehot, drop_first=False)
print(" One-Hot Encoding applied.")
print("Columns after One-Hot Encoding:", df_prepared.columns.tolist())
# --- START: Undersampling Implementation ---
print("\n--- Performing Random Undersampling ---")
random_seed = 42  # Fixed seed, reused throughout for reproducibility
# Separate majority and minority classes (assuming Attrition 'Yes' is 1 and 'No' is 0)
df_majority = df_prepared[df_prepared.Attrition == 0]
df_minority = df_prepared[df_prepared.Attrition == 1]
print(f"Original dataset shape: {df_prepared.shape}")
print(f"Majority class (Attrition=0) count: {len(df_majority)}")
print(f"Minority class (Attrition=1) count: {len(df_minority)}")
# Undersample the majority class
df_majority_undersampled = df_majority.sample(n=len(df_minority), random_state=random_seed)
# Combine minority class with undersampled majority class
df_undersampled = pd.concat([df_majority_undersampled, df_minority])
# Shuffle the resulting DataFrame
df_undersampled = df_undersampled.sample(frac=1, random_state=random_seed).reset_index(drop=True)
print(f"\nUndersampled dataset shape: {df_undersampled.shape}")
print("Undersampled dataset 'Attrition' distribution:\n", df_undersampled.Attrition.value_counts())
# --- END: Undersampling Implementation ---
# --- Feature Selection (using the undersampled data) ---
print("\n--- Feature Selection ---")
# Define features to drop (same logic as before, but applied to df_undersampled)
low_mi_features_to_drop = [
    'Work_Life_Balance',
    'Work_Environment_Satisfaction',
    'Absenteeism',
    'Gender',
    'Average_Hours_Worked_Per_Week',
    'Job_Role',
]
# We need Attrition for y, so we define features_to_drop for X separately
features_to_drop_for_X = low_mi_features_to_drop  # Job_Role was not one-hot encoded, so it is dropped here if present
# Ensure the columns actually exist in df_undersampled before trying to drop
features_to_drop_for_X = [col for col in features_to_drop_for_X if col in df_undersampled.columns]
print(f"Dropping features to create X: {features_to_drop_for_X}")
# --- Splitting the UNDERSAMPLED data ---
print("\n--- Splitting Undersampled Data ---")
# Use df_undersampled instead of df_prepared
y = df_undersampled['Attrition'] # Label from undersampled data
# Drop 'Attrition' AND the low_mi_features to create X from undersampled data
X = df_undersampled.drop(columns=['Attrition'] + features_to_drop_for_X) # Features from undersampled data
print("Final features used for X:", X.columns.tolist()) # Verify the final features
# Split into test and training
test_set_size = 0.20
# random_seed is already defined above
X_train, X_test, y_train, y_test = train_test_split(
X, # From undersampled data
y, # From undersampled data
test_size=test_set_size,
random_state=random_seed,
stratify=y # Stratify on the now balanced y
)
print("\n--- Verification of Train/Test Split (Undersampled Data) ---")
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
print("\nAttrition distribution in original undersampled y:\n", y.value_counts(normalize=True))
print("\nAttrition distribution in y_train:\n", y_train.value_counts(normalize=True))
print("\nAttrition distribution in y_test:\n", y_test.value_counts(normalize=True))
# --- KNN Model (using undersampled data) ---
if X_train is not None and y_train is not None:
    print("\n--- K-Nearest Neighbors (KNN) Classifier (on Undersampled Data) ---")
    # Scaling matters for distance-based models like KNN; the scaled arrays are
    # computed here so they can be swapped into fit/predict if results are poor
    print("\n--- Scaling Features ---")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled using StandardScaler.")
    k_value = 6
    print(f"Initializing KNN classifier with k={k_value}")
    knn_model = KNeighborsClassifier(n_neighbors=k_value)
    print("Fitting the KNN model to the undersampled training data...")
    # Fit on the unscaled features for now; substitute X_train_scaled / X_test_scaled to compare
    knn_model.fit(X_train, y_train)
    y_pred_knn = knn_model.predict(X_test)
    print("Predictions made on the test set using KNN.")
    print("\n--- Evaluating KNN Model (on Undersampled Data) ---")
    accuracy_knn = accuracy_score(y_test, y_pred_knn)
    print(f"KNN Model Accuracy: {accuracy_knn:.4f}")
    # zero_division=0 avoids warnings if a class is never predicted in the test split
    print("\nKNN Classification Report:\n", classification_report(y_test, y_pred_knn, zero_division=0))
    print("\nKNN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
# --- Naive Bayes Model (using undersampled data) ---
if X_train is not None and y_train is not None:
    print("\n--- Gaussian Naive Bayes Classifier (on Undersampled Data) ---")
    print("Initializing Gaussian Naive Bayes classifier...")
    nb_model = GaussianNB()
    print("Fitting the Naive Bayes model to the undersampled training data...")
    nb_model.fit(X_train, y_train)
    y_pred_nb = nb_model.predict(X_test)
    print("Predictions made on the test set using Naive Bayes.")
    print("\n--- Evaluating Naive Bayes Model (on Undersampled Data) ---")
    accuracy_nb = accuracy_score(y_test, y_pred_nb)
    print(f"Naive Bayes Model Accuracy: {accuracy_nb:.4f}")
    # zero_division=0 avoids warnings if a class is never predicted in the test split
    print("\nNaive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb, zero_division=0))
    print("\nNaive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
# --- END: Model Code ---