# deep_learning.py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# --- Import TensorFlow/Keras for Deep Learning ---
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping # Optional: for early stopping
# --- Load Data ---
df = pd.read_csv('employee_attrition_dataset.csv')
print("Dataset loaded.")
# --- PREPARATION ---
df_prepared = df.drop('Employee_ID', axis=1)
# --- Label Encoding ---
label_encoder_attrition = LabelEncoder()
label_encoder_gender = LabelEncoder()
print("--- Label Encoding ---")
# Encode Attrition (binary target)
df_prepared['Attrition'] = label_encoder_attrition.fit_transform(df_prepared['Attrition'])
mapping_attrition = dict(zip(label_encoder_attrition.classes_, label_encoder_attrition.transform(label_encoder_attrition.classes_)))
print(f" Attrition mapping: {mapping_attrition}")
# Encode Gender
df_prepared['Gender'] = label_encoder_gender.fit_transform(df_prepared['Gender'])
mapping_gender = dict(zip(label_encoder_gender.classes_, label_encoder_gender.transform(label_encoder_gender.classes_)))
print(f" Gender mapping: {mapping_gender}")
# Rename the encoded column to 'Gender_Male' if Male is encoded as 1
if 'Male' in mapping_gender and mapping_gender['Male'] == 1:
    df_prepared.rename(columns={'Gender': 'Gender_Male'}, inplace=True)
    print(" Renamed encoded 'Gender' column to 'Gender_Male'.")
elif 'Female' in mapping_gender and mapping_gender['Female'] == 1:
    # If Female is 1, flip the bits so that Gender_Male has Male=1
    print(" Adjusting 'Gender' encoding to create 'Gender_Male' (Male=1).")
    df_prepared['Gender_Male'] = 1 - df_prepared['Gender']
    df_prepared.drop('Gender', axis=1, inplace=True)
elif 'Gender_Male' not in df_prepared.columns:
    # Handle cases where gender might already be numerical or use different labels
    print("Warning: Could not automatically create 'Gender_Male'. Check Gender encoding.")
    # Fallback: assume an existing binary 'Gender' column represents Male=1
    if len(df_prepared['Gender'].unique()) == 2 and 1 in df_prepared['Gender'].unique():
        print(" Assuming existing 'Gender' column represents Male=1. Renaming.")
        df_prepared.rename(columns={'Gender': 'Gender_Male'}, inplace=True)
    else:
        print(" Could not determine Gender_Male mapping. Dropping 'Gender'.")
        if 'Gender' in df_prepared.columns:
            df_prepared.drop('Gender', axis=1, inplace=True)
# --- One-Hot Encoding ---
print("\n--- One-Hot Encoding ---")
columns_to_encode_onehot = ['Department']  # Job_Role, Marital_Status removed
columns_exist_for_onehot = [col for col in columns_to_encode_onehot if col in df_prepared.columns]
if columns_exist_for_onehot:
    # dtype=int keeps the dummy columns numeric; newer pandas defaults to bool,
    # which the numeric-only feature check further below would otherwise drop
    df_prepared = pd.get_dummies(df_prepared, columns=columns_exist_for_onehot, drop_first=False, prefix='Department', dtype=int)
    print(f" One-Hot Encoding applied to: {columns_exist_for_onehot}")
else:
    print(" No specified columns found for One-Hot Encoding.")
# --- Undersampling Implementation ---
print("\n--- Performing Undersampling ---")
random_seed = 42
tf.random.set_seed(random_seed)  # Set seed for TensorFlow reproducibility
np.random.seed(random_seed)  # Ensure numpy operations are also seeded if needed elsewhere
undersampling_ratio = 1.0  # Modify this value to control the ratio (e.g., 1.0, 1.5, 2.0)
# Separate majority and minority classes (Attrition: No=0, Yes=1)
df_majority = df_prepared[df_prepared.Attrition == 0]
df_minority = df_prepared[df_prepared.Attrition == 1]
minority_size = len(df_minority)
majority_size = len(df_majority)
print(f"Original distribution: Majority={majority_size}, Minority={minority_size}")
print(f"Target undersampling ratio (Majority:Minority): {undersampling_ratio}:1")
if majority_size < minority_size:
    print("Warning: Initial 'majority' class (Attrition=0) is smaller than 'minority' class (Attrition=1). "
          "Undersampling logic might behave unexpectedly or not reduce the majority.")
    # In this unusual case we proceed anyway; with a ratio >= 1 the sampling below
    # keeps every class-0 row, so class 0 is not actually reduced.
    desired_majority_samples = int(minority_size * undersampling_ratio)
    n_samples_majority = min(desired_majority_samples, majority_size)
else:
    # Desired number of majority samples, based on the minority size and the ratio
    desired_majority_samples = int(minority_size * undersampling_ratio)
    # Never sample more majority rows than actually exist
    n_samples_majority = min(desired_majority_samples, majority_size)
print(f"Calculated samples to keep from majority class: {n_samples_majority}")
# Undersample the majority class to the calculated size
df_majority_undersampled = df_majority.sample(n=n_samples_majority, random_state=random_seed)
# Combine the minority class with the undersampled majority class
df_undersampled = pd.concat([df_majority_undersampled, df_minority])
# Shuffle the resulting DataFrame
df_undersampled = df_undersampled.sample(frac=1, random_state=random_seed).reset_index(drop=True)
print("Final dataset 'Attrition' distribution after undersampling:\n", df_undersampled.Attrition.value_counts())
final_minority_count = len(df_undersampled[df_undersampled.Attrition == 1])
if final_minority_count > 0:
    final_ratio = len(df_undersampled[df_undersampled.Attrition == 0]) / final_minority_count
    print(f"Final ratio (Majority/Minority): {final_ratio:.2f}:1")
else:
    print("Final ratio calculation skipped: no minority samples after processing.")
# --- END: Undersampling Implementation ---
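# A minimal alternative sketch: the same random undersampling can be done with the
# optional imbalanced-learn package (not a dependency of this script; the
# sampling_strategy value below is an illustrative assumption, not taken from above):
# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(sampling_strategy=1.0, random_state=random_seed)
# X_res, y_res = rus.fit_resample(df_prepared.drop('Attrition', axis=1), df_prepared['Attrition'])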
# --- START: Feature Selection (Select Specific Features) ---
print("\n--- Feature Selection (Selecting Specific Features) ---")
# Define the list of features we want to use for modeling.
# Make sure the one-hot encoded columns are included correctly
selected_features = [
    # Original features kept
    'Age',
    'Years_Since_Last_Promotion',
    'Work_Life_Balance',
    'Performance_Rating',
    'Training_Hours_Last_Year',
    'Average_Hours_Worked_Per_Week',
    'Absenteeism',
    'Job_Involvement'
]
# Add Gender_Male if it exists
if 'Gender_Male' in df_prepared.columns:
    selected_features.append('Gender_Male')
    print(" Including 'Gender_Male' in selected features.")
elif 'Gender' in df_prepared.columns and len(df_prepared['Gender'].unique()) == 2:
    # Fallback if the earlier renaming failed but a binary 'Gender' column exists
    print(" Warning: 'Gender_Male' not found, but binary 'Gender' exists. Assuming it represents Male=1 and including.")
    df_prepared.rename(columns={'Gender': 'Gender_Male'}, inplace=True)
    selected_features.append('Gender_Male')
# Add one-hot encoded Department columns if they exist
department_cols = [col for col in df_prepared.columns if col.startswith('Department_')]
if department_cols:
    selected_features.extend(department_cols)
    print(f" Including one-hot encoded Department columns: {department_cols}")
# Verify that all selected features actually exist in the dataframe before proceeding
final_selected_features = [col for col in selected_features if col in df_prepared.columns]
missing_features = set(selected_features) - set(final_selected_features)
if missing_features:
    print(f"Warning: The following selected features were not found in the prepared dataframe and will be excluded: {missing_features}")
selected_features = final_selected_features
print(f"Final features used for modeling: {selected_features}")
# --- Define Target (y) and Features (X) from Undersampled Data ---
print("\n--- Defining Target (y) and Features (X) ---")
target_variable = 'Attrition'
print("Use undersampled data for modeling? (y / n)")
ans = input()
if ans.lower() == 'y':
print("Using undersampled data.")
current_df = df_undersampled
else:
print("Using original (prepared) data.")
current_df = df_prepared
# Ensure all selected features exist in the chosen dataframe (current_df)
selected_features = [col for col in selected_features if col in current_df.columns]
print(f"Features available in the chosen dataset: {selected_features}")
if not selected_features:
    raise ValueError("No features selected or available in the chosen dataset. Stopping.")
y = current_df[target_variable] # Target
X = current_df[selected_features] # Features
# --- Train-Test Split ---
print("\n--- Splitting Data into Train/Test Sets ---")
test_set_size = 0.20
# Check whether stratification is possible (needs at least 2 classes, each with >= 2 samples)
if len(np.unique(y)) > 1 and np.min(np.bincount(y)) >= 2:
    stratify_option = y
    print(f" Splitting with stratification based on '{target_variable}'.")
else:
    stratify_option = None
    print(f" Warning: Cannot stratify split. Target variable '{target_variable}' might have only one class or too few samples in one class.")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_set_size,
    random_state=random_seed,
    stratify=stratify_option
)
print(f" Train set shape: X={X_train.shape}, y={y_train.shape}")
print(f" Test set shape: X={X_test.shape}, y={y_test.shape}")
print(f" Test set '{target_variable}' distribution:\n{y_test.value_counts(normalize=True)}")
# --- START: Preprocessing Before Models (Missing Values, Scaling) ---
# 1. Handle Missing Values
print("\n--- Handling Missing Values ---")
# Check that X_train/X_test are not empty
if X_train.empty or X_test.empty:
    raise ValueError("X_train or X_test is empty after feature selection/split.")
# Check that all selected features are numeric
numeric_cols_train = X_train.select_dtypes(include=np.number).columns
if len(numeric_cols_train) != X_train.shape[1]:
    non_numeric_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()
    print(f"Warning: Non-numeric columns detected in X_train after feature selection: {non_numeric_cols}. Attempting to drop them.")
    X_train = X_train.drop(columns=non_numeric_cols)
    X_test = X_test.drop(columns=non_numeric_cols)
    numeric_cols_train = X_train.columns  # Update the list of numeric columns
    print(f" Non-numeric columns dropped. Remaining features: {numeric_cols_train.tolist()}")
if X_train.isnull().sum().sum() > 0:
    print(f" Found {X_train.isnull().sum().sum()} missing values in X_train. Imputing with median.")
    train_medians = X_train[numeric_cols_train].median()
    # Fill NaN in the training set with training medians
    X_train.fillna(train_medians, inplace=True)
    # Fill NaN in the test set with the *training* medians (avoids data leakage)
    X_test.fillna(train_medians, inplace=True)
    print(" Missing values imputed using training set medians.")
else:
    print(" No missing values found in X_train.")
# 2. Scaling Features
print("\n--- Scaling Features ---")
print("Apply scaling? (y / n) (Recommended for KNN, NB, DL)")
ans = input()
if ans.lower() == 'y':
print(" Applying StandardScaler.")
scaler = StandardScaler()
# Fit scaler ONLY on the training data
X_train_scaled = scaler.fit_transform(X_train)
# Transform both training and test data
X_test_scaled = scaler.transform(X_test)
# Convert back to DataFrame for compatibility if needed, though numpy arrays are fine for models
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
else:
print(" Skipping feature scaling.")
X_train_scaled = X_train.copy() # Use copy to avoid potential SettingWithCopyWarning later
X_test_scaled = X_test.copy()
# --- KNN Model ---
if not X_train_scaled.empty and not y_train.empty:
    print(f"\n--- K-Nearest Neighbors (KNN) Classifier (Predicting '{target_variable}') ---")
    k_value = 6
    knn_model = KNeighborsClassifier(n_neighbors=k_value)
    knn_model.fit(X_train_scaled, y_train)  # Train on scaled data
    y_pred_knn = knn_model.predict(X_test_scaled)  # Predict on scaled data
    print("\n--- Evaluating KNN Model ---")
    accuracy_knn = accuracy_score(y_test, y_pred_knn)
    print(f"KNN Model Accuracy: {accuracy_knn:.4f}")
    print("\nKNN Classification Report:\n", classification_report(y_test, y_pred_knn, zero_division=0))
    print("\nKNN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
# --- Naive Bayes Model ---
if not X_train_scaled.empty and not y_train.empty:
    print(f"\n--- Gaussian Naive Bayes Classifier (Predicting '{target_variable}') ---")
    nb_model = GaussianNB()
    nb_model.fit(X_train_scaled, y_train)  # Train on scaled data
    y_pred_nb = nb_model.predict(X_test_scaled)  # Predict on scaled data
    print("\n--- Evaluating Naive Bayes Model ---")
    accuracy_nb = accuracy_score(y_test, y_pred_nb)
    print(f"Naive Bayes Model Accuracy: {accuracy_nb:.4f}")
    print("\nNaive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb, zero_division=0))
    print("\nNaive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
# --- Decision Tree Model ---
# Decision Trees are less sensitive to scaling, but using scaled data is fine
if not X_train_scaled.empty and not y_train.empty:
    print(f"\n--- Decision Tree Classifier (Predicting '{target_variable}') ---")
    # class_weight='balanced' can help if the data is still imbalanced after undersampling
    dt_model = DecisionTreeClassifier(random_state=random_seed, class_weight='balanced')
    dt_model.fit(X_train_scaled, y_train)  # Train on scaled data
    y_pred_dt = dt_model.predict(X_test_scaled)  # Predict on scaled data
    print("\n--- Evaluating Decision Tree Model ---")
    accuracy_dt = accuracy_score(y_test, y_pred_dt)
    print(f"Decision Tree Model Accuracy: {accuracy_dt:.4f}")
    print("\nDecision Tree Classification Report:\n", classification_report(y_test, y_pred_dt, zero_division=0))
    print("\nDecision Tree Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
# --- Random Forest Model ---
# Random Forests are also less sensitive to scaling
if not X_train_scaled.empty and not y_train.empty:
    print(f"\n--- Random Forest Classifier (Predicting '{target_variable}') ---")
    # class_weight='balanced' can help; n_estimators=100 is the sklearn default made explicit
    rf_model = RandomForestClassifier(random_state=random_seed, class_weight='balanced', n_estimators=100)
    rf_model.fit(X_train_scaled, y_train)
    y_pred_rf = rf_model.predict(X_test_scaled)
    print("\n--- Evaluating Random Forest Model ---")
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    print(f"Random Forest Model Accuracy: {accuracy_rf:.4f}")
    print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf, zero_division=0))
    print("\nRandom Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
# --- Deep Learning Model (MLP) ---
if not X_train_scaled.empty and not y_train.empty:
    print(f"\n--- Deep Learning (MLP) Classifier (Predicting '{target_variable}') ---")
    # Define the model architecture
    n_features = X_train_scaled.shape[1]
    dl_model = Sequential([
        Dense(64, activation='relu', input_shape=(n_features,)),  # Input layer + first hidden layer
        Dropout(0.3),  # Dropout for regularization
        Dense(32, activation='relu'),  # Second hidden layer
        Dropout(0.2),  # Dropout
        Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])
    # Compile the model
    dl_model.compile(optimizer='adam',
                     loss='binary_crossentropy',
                     metrics=['accuracy'])
    print("\n--- Training Deep Learning Model ---")
    # Optional: early stopping to prevent overfitting and save time
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    # Train the model
    # Ensure y_train and y_test are numpy arrays for TensorFlow
    y_train_np = np.array(y_train)
    y_test_np = np.array(y_test)
    history = dl_model.fit(X_train_scaled, y_train_np,
                           epochs=100,  # Upper bound on training epochs (early stopping may end sooner)
                           batch_size=32,  # Number of samples per gradient update
                           validation_data=(X_test_scaled, y_test_np),
                           callbacks=[early_stopping],  # Add early stopping
                           verbose=0)  # Set verbose=1 or 2 to see training progress per epoch
    print("Deep Learning Model training complete.")
    print(f"Training stopped after {len(history.history['loss'])} epochs.")
print("\n--- Evaluating Deep Learning Model ---")
# Evaluate the model on the test set
loss, accuracy_dl = dl_model.evaluate(X_test_scaled, y_test_np, verbose=0)
print(f"Deep Learning Model Accuracy: {accuracy_dl:.4f}")
print(f"Deep Learning Model Loss: {loss:.4f}")
# Make predictions (outputs probabilities)
y_pred_prob_dl = dl_model.predict(X_test_scaled)
# Convert probabilities to binary predictions (0 or 1) using a 0.5 threshold
y_pred_dl = (y_pred_prob_dl > 0.5).astype("int32").flatten() # Use flatten() to make it 1D array
print("\nDeep Learning Classification Report:\n", classification_report(y_test, y_pred_dl, zero_division=0))
print("\nDeep Learning Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dl))
# --- END: Model Code ---
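# A short closing summary (an addition to the original flow): collect whichever model
# accuracies were computed above and print them side by side, best first.
model_accuracies = {name: acc for name, acc in [
    ('KNN', globals().get('accuracy_knn')),
    ('Naive Bayes', globals().get('accuracy_nb')),
    ('Decision Tree', globals().get('accuracy_dt')),
    ('Random Forest', globals().get('accuracy_rf')),
    ('Deep Learning (MLP)', globals().get('accuracy_dl')),
] if acc is not None}
print("\n--- Model Accuracy Summary ---")
for name, acc in sorted(model_accuracies.items(), key=lambda kv: kv[1], reverse=True):
    print(f"  {name}: {acc:.4f}")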
print("\n--- Script Finished ---")