-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
107 lines (88 loc) · 3.39 KB
/
train.py
File metadata and controls
107 lines (88 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import argparse
import time

import mlflow
import pandas as pd
from dotenv import load_dotenv
from mlflow.models.signature import infer_signature
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# MLflow reads MLFLOW_TRACKING_URI automatically from the environment,
# so loading the .env file is all the tracking configuration needed.
load_dotenv()

DATA_URL = (
    "https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/ibm_hr_attrition.xlsx"
)
EXPERIMENT_NAME = "ibm_attrition_detector"
TARGET_COL = "Attrition"


def _parse_args(argv=None):
    """Parse the random-forest hyperparameters from the command line.

    ``type=int`` replaces the original's manual ``int()`` casts and gives a
    clear argparse error on non-numeric input; the defaults are
    scikit-learn's own, so the script now also runs with no arguments.
    """
    parser = argparse.ArgumentParser(description="Train the IBM attrition classifier.")
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--min_samples_split", type=int, default=2)
    return parser.parse_args(argv)


def _load_dataset():
    """Download the IBM HR dataset and return the features/target pair.

    The target is binarised: ``"No"`` -> 0, anything else -> 1.
    """
    df = pd.read_excel(DATA_URL, index_col=0)
    X = df.loc[:, df.columns != TARGET_COL]
    y = df.loc[:, TARGET_COL].apply(lambda label: 0 if label == "No" else 1)
    return X, y


def _build_pipeline(X_train, n_estimators, min_samples_split):
    """Build the preprocessing + random-forest pipeline.

    String ("object") columns are one-hot encoded (first level dropped to
    avoid collinearity); every other column is standard-scaled.
    """
    categorical_features = X_train.select_dtypes("object").columns
    numerical_features = X_train.columns[~X_train.columns.isin(categorical_features)]
    preprocessor = ColumnTransformer(
        transformers=[
            (
                "categorical_transformer",
                OneHotEncoder(drop="first", handle_unknown="error", sparse_output=False),
                categorical_features,
            ),
            ("numerical_transformer", StandardScaler(), numerical_features),
        ]
    )
    return Pipeline(
        steps=[
            ("Preprocessing", preprocessor),
            # Renamed from the original "Regressor": the estimator is a
            # classifier (note this also renames the autologged step params).
            (
                "Classifier",
                RandomForestClassifier(
                    n_estimators=n_estimators, min_samples_split=min_samples_split
                ),
            ),
        ],
        verbose=True,
    )


def main():
    """Train the model, evaluate on the held-out split and log to MLflow."""
    mlflow.set_experiment(EXPERIMENT_NAME)
    print("training model...")
    start_time = time.time()
    mlflow.sklearn.autolog(log_models=False)  # the model is logged explicitly below

    args = _parse_args()
    X, y = _load_dataset()
    # random_state makes runs reproducible; stratify preserves the class
    # balance of the imbalanced attrition target in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    model = _build_pipeline(X_train, args.n_estimators, args.min_samples_split)

    # start_run() creates the run inside the active experiment directly,
    # replacing the original's redundant MlflowClient().create_run() +
    # start_run(run_id=...) sequence.
    with mlflow.start_run():
        model.fit(X_train, y_train)
        predictions = model.predict(X_train)
        # The original held out a test split but never used it; log its
        # accuracy so the experiment actually measures generalisation.
        mlflow.log_metric("test_accuracy", model.score(X_test, y_test))
        # Log the model separately to keep flexibility on its setup.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="ibm_attrition_detector",
            registered_model_name="ibm_attrition_detector_RF",
            signature=infer_signature(X_train, predictions),
        )

    print("...Done!")
    print(f"---Total training time: {time.time() - start_time}")


if __name__ == "__main__":
    main()