-
Notifications
You must be signed in to change notification settings - Fork 8k
/
Copy pathpipeline.py
63 lines (61 loc) · 2.1 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# for encoding categorical variables
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
from feature_engine.imputation import (
AddMissingIndicator,
CategoricalImputer,
MeanMedianImputer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from classification_model.config.core import config
from classification_model.processing import features as pp
# Full Titanic classification pipeline: imputation, cabin-letter extraction,
# categorical encoding, scaling, and finally a logistic-regression model.
# Every variable list is read from ``config.model_config``.
titanic_pipe = Pipeline(
    steps=[
        # ---------------- imputation ----------------
        # Categorical variables: fill missing values with the string "missing".
        (
            "categorical_imputation",
            CategoricalImputer(
                variables=config.model_config.categorical_vars,
                imputation_method="missing",
            ),
        ),
        # Numerical variables: add a missing-value indicator first ...
        (
            "missing_indicator",
            AddMissingIndicator(
                variables=config.model_config.numerical_vars,
            ),
        ),
        # ... then impute those same numerical variables with the median.
        (
            "median_imputation",
            MeanMedianImputer(
                variables=config.model_config.numerical_vars,
                imputation_method="median",
            ),
        ),
        # ---------------- feature extraction ----------------
        # Extract the letter from the cabin variable(s).
        (
            "extract_letter",
            pp.ExtractLetterTransformer(
                variables=config.model_config.cabin,
            ),
        ),
        # ---------------- categorical encoding ----------------
        # Categories present in less than 5% of the observations (tol=0.05)
        # are grouped into a single category called 'Rare'.
        (
            "rare_label_encoder",
            RareLabelEncoder(
                variables=config.model_config.categorical_vars,
                tol=0.05,
                n_categories=1,
            ),
        ),
        # One-hot encode the categorical variables into k-1 columns.
        (
            "categorical_encoder",
            OneHotEncoder(
                variables=config.model_config.categorical_vars,
                drop_last=True,
            ),
        ),
        # ---------------- scaling and model ----------------
        # Scale the features before they reach the classifier.
        (
            "scaler",
            StandardScaler(),
        ),
        # Final estimator: logistic regression with a fixed seed.
        (
            "Logit",
            LogisticRegression(
                random_state=0,
                C=0.0005,
            ),
        ),
    ],
)