diff --git a/AttritionForecast_Analysis_and_Prediction_S.ipynb b/AttritionForecast_Analysis_and_Prediction_S.ipynb new file mode 100644 index 0000000..114f06c --- /dev/null +++ b/AttritionForecast_Analysis_and_Prediction_S.ipynb @@ -0,0 +1,2566 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "id": "uJK0CMj1paWT" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n" + ] + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv('/content/WA_Fn-UseC_-HR-Employee-Attrition.csv')\n" + ], + "metadata": { + "id": "Hx96ztmAqA_G" + }, + "execution_count": 51, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 582 + }, + "id": "hs2Fk056qEhB", + "outputId": "0d2295a5-87fa-478b-9d68-3a46da2c10f6" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Age Attrition BusinessTravel DailyRate Department \\\n", + "0 41 Yes Travel_Rarely 1102 Sales \n", + "1 49 No Travel_Frequently 279 Research & Development \n", + "2 37 Yes Travel_Rarely 1373 Research & Development \n", + "3 33 No Travel_Frequently 1392 Research & Development \n", + "4 27 No Travel_Rarely 591 Research & Development \n", + "... ... ... ... ... ... \n", + "1465 36 No Travel_Frequently 884 Research & Development \n", + "1466 39 No Travel_Rarely 613 Research & Development \n", + "1467 27 No Travel_Rarely 155 Research & Development \n", + "1468 49 No Travel_Frequently 1023 Sales \n", + "1469 34 No Travel_Rarely 628 Research & Development \n", + "\n", + " DistanceFromHome Education EducationField EmployeeCount \\\n", + "0 1 2 Life Sciences 1 \n", + "1 8 1 Life Sciences 1 \n", + "2 2 2 Other 1 \n", + "3 3 4 Life Sciences 1 \n", + "4 2 1 Medical 1 \n", + "... ... ... ... ... \n", + "1465 23 2 Medical 1 \n", + "1466 6 1 Medical 1 \n", + "1467 4 3 Life Sciences 1 \n", + "1468 2 3 Medical 1 \n", + "1469 8 3 Medical 1 \n", + "\n", + " EmployeeNumber ... RelationshipSatisfaction StandardHours \\\n", + "0 1 ... 1 80 \n", + "1 2 ... 4 80 \n", + "2 4 ... 2 80 \n", + "3 5 ... 3 80 \n", + "4 7 ... 4 80 \n", + "... ... ... ... ... \n", + "1465 2061 ... 3 80 \n", + "1466 2062 ... 1 80 \n", + "1467 2064 ... 2 80 \n", + "1468 2065 ... 4 80 \n", + "1469 2068 ... 1 80 \n", + "\n", + " StockOptionLevel TotalWorkingYears TrainingTimesLastYear \\\n", + "0 0 8 0 \n", + "1 1 10 3 \n", + "2 0 7 3 \n", + "3 0 8 3 \n", + "4 1 6 3 \n", + "... ... ... ... \n", + "1465 1 17 3 \n", + "1466 1 9 5 \n", + "1467 1 6 0 \n", + "1468 0 17 3 \n", + "1469 0 6 3 \n", + "\n", + " WorkLifeBalance YearsAtCompany YearsInCurrentRole \\\n", + "0 1 6 4 \n", + "1 3 10 7 \n", + "2 3 0 0 \n", + "3 3 8 7 \n", + "4 3 2 2 \n", + "... ... ... ... \n", + "1465 3 5 2 \n", + "1466 3 7 7 \n", + "1467 3 6 2 \n", + "1468 2 9 6 \n", + "1469 4 4 3 \n", + "\n", + " YearsSinceLastPromotion YearsWithCurrManager \n", + "0 0 5 \n", + "1 1 7 \n", + "2 0 0 \n", + "3 3 0 \n", + "4 2 2 \n", + "... ... ... \n", + "1465 0 3 \n", + "1466 1 7 \n", + "1467 0 3 \n", + "1468 0 8 \n", + "1469 1 2 \n", + "\n", + "[1470 rows x 35 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAttritionBusinessTravelDailyRateDepartmentDistanceFromHomeEducationEducationFieldEmployeeCountEmployeeNumber...RelationshipSatisfactionStandardHoursStockOptionLevelTotalWorkingYearsTrainingTimesLastYearWorkLifeBalanceYearsAtCompanyYearsInCurrentRoleYearsSinceLastPromotionYearsWithCurrManager
041YesTravel_Rarely1102Sales12Life Sciences11...18008016405
149NoTravel_Frequently279Research & Development81Life Sciences12...4801103310717
237YesTravel_Rarely1373Research & Development22Other14...28007330000
333NoTravel_Frequently1392Research & Development34Life Sciences15...38008338730
427NoTravel_Rarely591Research & Development21Medical17...48016332222
..................................................................
146536NoTravel_Frequently884Research & Development232Medical12061...380117335203
146639NoTravel_Rarely613Research & Development61Medical12062...18019537717
146727NoTravel_Rarely155Research & Development43Life Sciences12064...28016036203
146849NoTravel_Frequently1023Sales23Medical12065...480017329608
146934NoTravel_Rarely628Research & Development83Medical12068...18006344312
\n", + "

1470 rows × 35 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 52 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Display the first few rows\n", + "print(df.head())\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rFS_iManqFvJ", + "outputId": "4c9601db-19ef-4c72-ba3f-534c2b821ca2" + }, + "execution_count": 53, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Age Attrition BusinessTravel DailyRate Department \\\n", + "0 41 Yes Travel_Rarely 1102 Sales \n", + "1 49 No Travel_Frequently 279 Research & Development \n", + "2 37 Yes Travel_Rarely 1373 Research & Development \n", + "3 33 No Travel_Frequently 1392 Research & Development \n", + "4 27 No Travel_Rarely 591 Research & Development \n", + "\n", + " DistanceFromHome Education EducationField EmployeeCount EmployeeNumber \\\n", + "0 1 2 Life Sciences 1 1 \n", + "1 8 1 Life Sciences 1 2 \n", + "2 2 2 Other 1 4 \n", + "3 3 4 Life Sciences 1 5 \n", + "4 2 1 Medical 1 7 \n", + "\n", + " ... RelationshipSatisfaction StandardHours StockOptionLevel \\\n", + "0 ... 1 80 0 \n", + "1 ... 4 80 1 \n", + "2 ... 2 80 0 \n", + "3 ... 3 80 0 \n", + "4 ... 4 80 1 \n", + "\n", + " TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany \\\n", + "0 8 0 1 6 \n", + "1 10 3 3 10 \n", + "2 7 3 3 0 \n", + "3 8 3 3 8 \n", + "4 6 3 3 2 \n", + "\n", + " YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager \n", + "0 4 0 5 \n", + "1 7 1 7 \n", + "2 0 0 0 \n", + "3 7 3 0 \n", + "4 2 2 2 \n", + "\n", + "[5 rows x 35 columns]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Get a concise summary of the DataFrame\n", + "print(df.info())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "v7HM4SqVqYmO", + "outputId": "10bd1f21-30b6-4460-e5ba-832ac5554a7c" + }, + "execution_count": 54, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 1470 entries, 0 to 1469\n", + "Data columns (total 35 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Age 1470 non-null int64 \n", + " 1 Attrition 1470 non-null object\n", + " 2 BusinessTravel 1470 non-null object\n", + " 3 DailyRate 1470 non-null int64 \n", + " 4 Department 1470 non-null object\n", + " 5 DistanceFromHome 1470 non-null int64 \n", + " 6 Education 1470 non-null int64 \n", + " 7 EducationField 1470 non-null object\n", + " 8 EmployeeCount 1470 non-null int64 \n", + " 9 EmployeeNumber 1470 non-null int64 \n", + " 10 EnvironmentSatisfaction 1470 non-null int64 \n", + " 11 Gender 1470 non-null object\n", + " 12 HourlyRate 1470 non-null int64 \n", + " 13 JobInvolvement 1470 non-null int64 \n", + " 14 JobLevel 1470 non-null int64 \n", + " 15 JobRole 1470 non-null object\n", + " 16 JobSatisfaction 1470 non-null int64 \n", + " 17 MaritalStatus 1470 non-null object\n", + " 18 MonthlyIncome 1470 non-null int64 \n", + " 19 MonthlyRate 1470 non-null int64 \n", + " 20 NumCompaniesWorked 1470 non-null int64 \n", + " 21 Over18 1470 non-null object\n", + " 22 OverTime 1470 non-null object\n", + " 23 PercentSalaryHike 1470 non-null int64 \n", + " 24 PerformanceRating 1470 non-null int64 \n", + " 25 RelationshipSatisfaction 1470 non-null int64 \n", + " 26 StandardHours 1470 non-null int64 \n", + " 27 StockOptionLevel 1470 non-null int64 \n", + " 28 TotalWorkingYears 1470 non-null int64 \n", + " 29 TrainingTimesLastYear 1470 non-null int64 \n", + " 30 WorkLifeBalance 1470 non-null int64 \n", + " 31 YearsAtCompany 1470 non-null int64 \n", + " 32 YearsInCurrentRole 1470 non-null int64 \n", + " 33 YearsSinceLastPromotion 1470 non-null int64 \n", + " 34 YearsWithCurrManager 1470 non-null int64 \n", + "dtypes: int64(26), object(9)\n", + "memory usage: 402.1+ KB\n", + "None\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Get statistical summary of numeric columns\n", + "print(df.describe())\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Tboi56PUqlsQ", + "outputId": "46700d18-89ad-4652-d6fc-b59f162626ee" + }, + "execution_count": 55, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Age DailyRate DistanceFromHome Education EmployeeCount \\\n", + "count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 \n", + "mean 36.923810 802.485714 9.192517 2.912925 1.0 \n", + "std 9.135373 403.509100 8.106864 1.024165 0.0 \n", + "min 18.000000 102.000000 1.000000 1.000000 1.0 \n", + "25% 30.000000 465.000000 2.000000 2.000000 1.0 \n", + "50% 36.000000 802.000000 7.000000 3.000000 1.0 \n", + "75% 43.000000 1157.000000 14.000000 4.000000 1.0 \n", + "max 60.000000 1499.000000 29.000000 5.000000 1.0 \n", + "\n", + " EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement \\\n", + "count 1470.000000 1470.000000 1470.000000 1470.000000 \n", + "mean 1024.865306 2.721769 65.891156 2.729932 \n", + "std 602.024335 1.093082 20.329428 0.711561 \n", + "min 1.000000 1.000000 30.000000 1.000000 \n", + "25% 491.250000 2.000000 48.000000 2.000000 \n", + "50% 1020.500000 3.000000 66.000000 3.000000 \n", + "75% 1555.750000 4.000000 83.750000 3.000000 \n", + "max 2068.000000 4.000000 100.000000 4.000000 \n", + "\n", + " JobLevel ... RelationshipSatisfaction StandardHours \\\n", + "count 1470.000000 ... 1470.000000 1470.0 \n", + "mean 2.063946 ... 2.712245 80.0 \n", + "std 1.106940 ... 1.081209 0.0 \n", + "min 1.000000 ... 1.000000 80.0 \n", + "25% 1.000000 ... 2.000000 80.0 \n", + "50% 2.000000 ... 3.000000 80.0 \n", + "75% 3.000000 ... 4.000000 80.0 \n", + "max 5.000000 ... 4.000000 80.0 \n", + "\n", + " StockOptionLevel TotalWorkingYears TrainingTimesLastYear \\\n", + "count 1470.000000 1470.000000 1470.000000 \n", + "mean 0.793878 11.279592 2.799320 \n", + "std 0.852077 7.780782 1.289271 \n", + "min 0.000000 0.000000 0.000000 \n", + "25% 0.000000 6.000000 2.000000 \n", + "50% 1.000000 10.000000 3.000000 \n", + "75% 1.000000 15.000000 3.000000 \n", + "max 3.000000 40.000000 6.000000 \n", + "\n", + " WorkLifeBalance YearsAtCompany YearsInCurrentRole \\\n", + "count 1470.000000 1470.000000 1470.000000 \n", + "mean 2.761224 7.008163 4.229252 \n", + "std 0.706476 6.126525 3.623137 \n", + "min 1.000000 0.000000 0.000000 \n", + "25% 2.000000 3.000000 2.000000 \n", + "50% 3.000000 5.000000 3.000000 \n", + "75% 3.000000 9.000000 7.000000 \n", + "max 4.000000 40.000000 18.000000 \n", + "\n", + " YearsSinceLastPromotion YearsWithCurrManager \n", + "count 1470.000000 1470.000000 \n", + "mean 2.187755 4.123129 \n", + "std 3.222430 3.568136 \n", + "min 0.000000 0.000000 \n", + "25% 0.000000 2.000000 \n", + "50% 1.000000 3.000000 \n", + "75% 3.000000 7.000000 \n", + "max 15.000000 17.000000 \n", + "\n", + "[8 rows x 26 columns]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Check for missing values\n", + "print(df.isnull().sum())\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Y_5UzgeCqo94", + "outputId": "771d9205-4a5d-499c-d0d0-6fd7c41d9518" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Age 0\n", + "Attrition 0\n", + "BusinessTravel 0\n", + "DailyRate 0\n", + "Department 0\n", + "DistanceFromHome 0\n", + "Education 0\n", + "EducationField 0\n", + "EmployeeCount 0\n", + "EmployeeNumber 0\n", + "EnvironmentSatisfaction 0\n", + "Gender 0\n", + "HourlyRate 0\n", + "JobInvolvement 0\n", + "JobLevel 0\n", + "JobRole 0\n", + "JobSatisfaction 0\n", + "MaritalStatus 0\n", + "MonthlyIncome 0\n", + "MonthlyRate 0\n", + "NumCompaniesWorked 0\n", + "Over18 0\n", + "OverTime 0\n", + "PercentSalaryHike 0\n", + "PerformanceRating 0\n", + "RelationshipSatisfaction 0\n", + "StandardHours 0\n", + "StockOptionLevel 0\n", + "TotalWorkingYears 0\n", + "TrainingTimesLastYear 0\n", + "WorkLifeBalance 0\n", + "YearsAtCompany 0\n", + "YearsInCurrentRole 0\n", + "YearsSinceLastPromotion 0\n", + "YearsWithCurrManager 0\n", + "dtype: int64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Check for duplicate rows\n", + "print(df.duplicated().sum())\n", + "\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RkIeqiEzqv_4", + "outputId": "39de62c9-4b31-4cff-eb91-4b93629a0309" + }, + "execution_count": 57, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Get the list of categorical columns\n", + "categorical_columns = df.select_dtypes(include=['object']).columns\n", + "print(categorical_columns)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oIQIYAVBq9LC", + "outputId": "2759fa27-ddd1-45e6-c7a8-abbbba710598" + }, + "execution_count": 58, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',\n", + " 'JobRole', 'MaritalStatus', 'Over18', 'OverTime'],\n", + " dtype='object')\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# One-Hot Encoding for categorical variables\n", + "df_encoded = pd.get_dummies(df, columns=categorical_columns)\n", + "\n", + "le = LabelEncoder()\n", + "for column in categorical_columns:\n", + " df[column] = le.fit_transform(df[column])\n" + ], + "metadata": { + "id": "QWmv7na9rE5H" + }, + "execution_count": 59, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 443 + }, + "id": "ZqTVp1sBrgFn", + "outputId": "1bd132c5-5ef1-4fd6-94ea-676c8dcd529e" + }, + "execution_count": 60, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Age Attrition BusinessTravel DailyRate Department DistanceFromHome \\\n", + "0 41 1 2 1102 2 1 \n", + "1 49 0 1 279 1 8 \n", + "2 37 1 2 1373 1 2 \n", + "3 33 0 1 1392 1 3 \n", + "4 27 0 2 591 1 2 \n", + "... ... ... ... ... ... ... \n", + "1465 36 0 1 884 1 23 \n", + "1466 39 0 2 613 1 6 \n", + "1467 27 0 2 155 1 4 \n", + "1468 49 0 1 1023 2 2 \n", + "1469 34 0 2 628 1 8 \n", + "\n", + " Education EducationField EmployeeCount EmployeeNumber ... \\\n", + "0 2 1 1 1 ... \n", + "1 1 1 1 2 ... \n", + "2 2 4 1 4 ... \n", + "3 4 1 1 5 ... \n", + "4 1 3 1 7 ... \n", + "... ... ... ... ... ... \n", + "1465 2 3 1 2061 ... \n", + "1466 1 3 1 2062 ... \n", + "1467 3 1 1 2064 ... \n", + "1468 3 3 1 2065 ... \n", + "1469 3 3 1 2068 ... \n", + "\n", + " RelationshipSatisfaction StandardHours StockOptionLevel \\\n", + "0 1 80 0 \n", + "1 4 80 1 \n", + "2 2 80 0 \n", + "3 3 80 0 \n", + "4 4 80 1 \n", + "... ... ... ... \n", + "1465 3 80 1 \n", + "1466 1 80 1 \n", + "1467 2 80 1 \n", + "1468 4 80 0 \n", + "1469 1 80 0 \n", + "\n", + " TotalWorkingYears TrainingTimesLastYear WorkLifeBalance \\\n", + "0 8 0 1 \n", + "1 10 3 3 \n", + "2 7 3 3 \n", + "3 8 3 3 \n", + "4 6 3 3 \n", + "... ... ... ... \n", + "1465 17 3 3 \n", + "1466 9 5 3 \n", + "1467 6 0 3 \n", + "1468 17 3 2 \n", + "1469 6 3 4 \n", + "\n", + " YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion \\\n", + "0 6 4 0 \n", + "1 10 7 1 \n", + "2 0 0 0 \n", + "3 8 7 3 \n", + "4 2 2 2 \n", + "... ... ... ... \n", + "1465 5 2 0 \n", + "1466 7 7 1 \n", + "1467 6 2 0 \n", + "1468 9 6 0 \n", + "1469 4 3 1 \n", + "\n", + " YearsWithCurrManager \n", + "0 5 \n", + "1 7 \n", + "2 0 \n", + "3 0 \n", + "4 2 \n", + "... ... \n", + "1465 3 \n", + "1466 7 \n", + "1467 3 \n", + "1468 8 \n", + "1469 2 \n", + "\n", + "[1470 rows x 35 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeAttritionBusinessTravelDailyRateDepartmentDistanceFromHomeEducationEducationFieldEmployeeCountEmployeeNumber...RelationshipSatisfactionStandardHoursStockOptionLevelTotalWorkingYearsTrainingTimesLastYearWorkLifeBalanceYearsAtCompanyYearsInCurrentRoleYearsSinceLastPromotionYearsWithCurrManager
041121102212111...18008016405
14901279181112...4801103310717
237121373122414...28007330000
333011392134115...38008338730
42702591121317...48016332222
..................................................................
146536018841232312061...380117335203
14663902613161312062...18019537717
14672702155143112064...28016036203
146849011023223312065...480017329608
14693402628183312068...18006344312
\n", + "

1470 rows × 35 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 60 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Separate features (X) and target variable (y)\n", + "X = df.drop(columns=[target])\n", + "y = df[target]\n" + ], + "metadata": { + "id": "dLSaqkZ0rvJy" + }, + "execution_count": 61, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 443 + }, + "id": "o6T5dvKAsbzp", + "outputId": "54523b5a-2605-497a-b182-795eceaadb00" + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Age BusinessTravel DailyRate Department DistanceFromHome Education \\\n", + "0 41 2 1102 2 1 2 \n", + "1 49 1 279 1 8 1 \n", + "2 37 2 1373 1 2 2 \n", + "3 33 1 1392 1 3 4 \n", + "4 27 2 591 1 2 1 \n", + "... ... ... ... ... ... ... \n", + "1465 36 1 884 1 23 2 \n", + "1466 39 2 613 1 6 1 \n", + "1467 27 2 155 1 4 3 \n", + "1468 49 1 1023 2 2 3 \n", + "1469 34 2 628 1 8 3 \n", + "\n", + " EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction \\\n", + "0 1 1 1 2 \n", + "1 1 1 2 3 \n", + "2 4 1 4 4 \n", + "3 1 1 5 4 \n", + "4 3 1 7 1 \n", + "... ... ... ... ... \n", + "1465 3 1 2061 3 \n", + "1466 3 1 2062 4 \n", + "1467 1 1 2064 2 \n", + "1468 3 1 2065 4 \n", + "1469 3 1 2068 2 \n", + "\n", + " ... RelationshipSatisfaction StandardHours StockOptionLevel \\\n", + "0 ... 1 80 0 \n", + "1 ... 4 80 1 \n", + "2 ... 2 80 0 \n", + "3 ... 3 80 0 \n", + "4 ... 4 80 1 \n", + "... ... ... ... ... \n", + "1465 ... 3 80 1 \n", + "1466 ... 1 80 1 \n", + "1467 ... 2 80 1 \n", + "1468 ... 4 80 0 \n", + "1469 ... 1 80 0 \n", + "\n", + " TotalWorkingYears TrainingTimesLastYear WorkLifeBalance \\\n", + "0 8 0 1 \n", + "1 10 3 3 \n", + "2 7 3 3 \n", + "3 8 3 3 \n", + "4 6 3 3 \n", + "... ... ... ... \n", + "1465 17 3 3 \n", + "1466 9 5 3 \n", + "1467 6 0 3 \n", + "1468 17 3 2 \n", + "1469 6 3 4 \n", + "\n", + " YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion \\\n", + "0 6 4 0 \n", + "1 10 7 1 \n", + "2 0 0 0 \n", + "3 8 7 3 \n", + "4 2 2 2 \n", + "... ... ... ... \n", + "1465 5 2 0 \n", + "1466 7 7 1 \n", + "1467 6 2 0 \n", + "1468 9 6 0 \n", + "1469 4 3 1 \n", + "\n", + " YearsWithCurrManager \n", + "0 5 \n", + "1 7 \n", + "2 0 \n", + "3 0 \n", + "4 2 \n", + "... ... \n", + "1465 3 \n", + "1466 7 \n", + "1467 3 \n", + "1468 8 \n", + "1469 2 \n", + "\n", + "[1470 rows x 34 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeBusinessTravelDailyRateDepartmentDistanceFromHomeEducationEducationFieldEmployeeCountEmployeeNumberEnvironmentSatisfaction...RelationshipSatisfactionStandardHoursStockOptionLevelTotalWorkingYearsTrainingTimesLastYearWorkLifeBalanceYearsAtCompanyYearsInCurrentRoleYearsSinceLastPromotionYearsWithCurrManager
041211022121112...18008016405
14912791811123...4801103310717
237213731224144...28007330000
333113921341154...38008338730
42725911213171...48016332222
..................................................................
146536188412323120613...380117335203
14663926131613120624...18019537717
14672721551431120642...28016036203
146849110232233120654...480017329608
14693426281833120682...18006344312
\n", + "

1470 rows × 34 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "X" + } + }, + "metadata": {}, + "execution_count": 62 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Split the data into training and test sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" + ], + "metadata": { + "id": "uX2VwSc5tHFQ" + }, + "execution_count": 63, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n", + "\n", + "# Logistic Regression\n", + "log_reg = LogisticRegression(max_iter=1000)\n", + "log_reg.fit(X_train, y_train)\n", + "y_pred_log_reg = log_reg.predict(X_test)\n", + "print(\"Logistic Regression Confusion Matrix:\")\n", + "print(confusion_matrix(y_test, y_pred_log_reg))\n", + "print(\"Logistic Regression Accuracy:\")\n", + "print(accuracy_score(y_test, y_pred_log_reg))\n", + "print(\"Logistic Regression Classification Report:\")\n", + "print(classification_report(y_test, y_pred_log_reg))\n", + "\n", + "# Random Forest Classifier\n", + "rf_clf = RandomForestClassifier()\n", + "rf_clf.fit(X_train, y_train)\n", + "y_pred_rf_clf = rf_clf.predict(X_test)\n", + "print(\"Random Forest Confusion Matrix:\")\n", + "print(confusion_matrix(y_test, y_pred_rf_clf))\n", + "print(\"Random Forest Accuracy:\")\n", + "print(accuracy_score(y_test, y_pred_rf_clf))\n", + "print(\"Random Forest Classification Report:\")\n", + "print(classification_report(y_test, y_pred_rf_clf))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "atK0_s3Otbv3", + "outputId": "a855434b-357e-477c-c1b4-0c2696522135" + }, + "execution_count": 64, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Logistic Regression Confusion Matrix:\n", + "[[249 6]\n", + " [ 36 3]]\n", + "Logistic Regression Accuracy:\n", + "0.8571428571428571\n", + "Logistic Regression Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.87 0.98 0.92 255\n", + " 1 0.33 0.08 0.12 39\n", + "\n", + " accuracy 0.86 294\n", + " macro avg 0.60 0.53 0.52 294\n", + "weighted avg 0.80 0.86 0.82 294\n", + "\n", + "Random Forest Confusion Matrix:\n", + "[[253 2]\n", + " [ 35 4]]\n", + "Random Forest Accuracy:\n", + "0.8741496598639455\n", + "Random Forest Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.88 0.99 0.93 255\n", + " 1 0.67 0.10 0.18 39\n", + "\n", + " accuracy 0.87 294\n", + " macro avg 0.77 0.55 0.55 294\n", + "weighted avg 0.85 0.87 0.83 294\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "fiRgIW3Kt4sC" + }, + "execution_count": 64, + "outputs": [] + } + ] +} \ No newline at end of file