|
9 | 9 | "\n", |
10 | 10 | "This notebook demonstrates key concepts and tools for training ensemble models.\n", |
11 | 11 | "\n", |
| 12 | + "1. Baseline models\n", |
| 13 | + " - Logistic regression\n", |
| 14 | + " - Decision tree\n", |
| 15 | + "2. Parallel ensembles\n", |
| 16 | + " - Voting ensemble\n", |
| 17 | + " - Bagging ensemble\n", |
| 18 | + " - Random forest\n", |
| 19 | + "3. Serial (sequential) ensembles\n", |
| 20 | + " - AdaBoost\n", |
| 21 | + " - Gradient boosting\n", |
| 22 | + " - Stacking ensemble\n", |
| 23 | + "4. Model comparison\n", |
| 24 | + " - Score comparison\n", |
| 25 | + " - Confusion matrix comparison\n", |
| 26 | + "5. Model metric optimization\n", |
| 27 | + " - ROC_AUC optimized thresholds\n", |
| 28 | + " - F1 optimized thresholds\n", |
| 29 | + "\n", |
12 | 30 | "## Notebook setup\n",
13 | 31 | "\n", |
14 | 32 | "### Imports" |
|
21 | 39 | "metadata": {}, |
22 | 40 | "outputs": [], |
23 | 41 | "source": [ |
24 | | - "import pandas as pd\n", |
25 | | - "# import numpy as np\n", |
26 | 42 | "import matplotlib.pyplot as plt\n", |
| 43 | + "import pandas as pd\n", |
27 | 44 | "\n", |
28 | 45 | "from sklearn.datasets import make_classification\n", |
29 | | - "from sklearn.model_selection import train_test_split, cross_validate, TunedThresholdClassifierCV\n", |
30 | 46 | "from sklearn.linear_model import LogisticRegression\n", |
31 | | - "from sklearn.tree import DecisionTreeClassifier\n", |
| 47 | + "from sklearn.metrics import ConfusionMatrixDisplay\n", |
| 48 | + "from sklearn.model_selection import train_test_split, cross_validate, TunedThresholdClassifierCV\n", |
32 | 49 | "from sklearn.svm import SVC\n", |
| 50 | + "from sklearn.tree import DecisionTreeClassifier\n", |
| 51 | + "\n", |
| 52 | + "# Ensemble models\n", |
33 | 53 | "from sklearn.ensemble import (\n", |
34 | | - " VotingClassifier,\n", |
35 | | - " BaggingClassifier,\n", |
36 | | - " RandomForestClassifier,\n", |
37 | 54 | " AdaBoostClassifier,\n", |
| 55 | + " BaggingClassifier,\n", |
38 | 56 | " GradientBoostingClassifier,\n", |
39 | | - " StackingClassifier\n", |
40 | | - ")\n", |
41 | | - "from sklearn.metrics import (\n", |
42 | | - " # roc_auc_score,\n", |
43 | | - " # f1_score,\n", |
44 | | - " # confusion_matrix,\n", |
45 | | - " ConfusionMatrixDisplay\n", |
| 57 | + " RandomForestClassifier,\n", |
| 58 | + " StackingClassifier,\n", |
| 59 | + " VotingClassifier,\n", |
46 | 60 | ")" |
47 | 61 | ] |
48 | 62 | }, |
|
602 | 616 | "id": "78cb3b9b", |
603 | 617 | "metadata": {}, |
604 | 618 | "source": [ |
605 | | - "### 3.2. Gradient Boosting\n", |
| 619 | + "### 3.2. Gradient boosting\n", |
606 | 620 | "\n", |
607 | 621 | "Builds models sequentially where each new model is trained to predict the residual errors of the previous ensemble, using gradient descent to minimize a loss function.\n", |
608 | 622 | "\n", |
|
1016 | 1030 | "id": "5e23b03a", |
1017 | 1031 | "metadata": {}, |
1018 | 1032 | "source": [ |
1019 | | - "## 5. Model Metric Optimization" |
| 1033 | + "## 5. Model metric optimization" |
1020 | 1034 | ] |
1021 | 1035 | }, |
1022 | 1036 | { |
|
1034 | 1048 | "id": "fa9ee276", |
1035 | 1049 | "metadata": {}, |
1036 | 1050 | "source": [ |
1037 | | - "### 5.1. ROC_AUC Optimized Thresholds" |
| 1051 | + "### 5.1. ROC_AUC optimized thresholds" |
1038 | 1052 | ] |
1039 | 1053 | }, |
1040 | 1054 | { |
|
1196 | 1210 | "id": "783e5fef", |
1197 | 1211 | "metadata": {}, |
1198 | 1212 | "source": [ |
1199 | | - "### 5.2. F1 Optimized Thresholds" |
| 1213 | + "### 5.2. F1 optimized thresholds" |
1200 | 1214 | ] |
1201 | 1215 | }, |
1202 | 1216 | { |
|
0 commit comments