forked from DataTalksClub/mlops-zoomcamp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathduration-prediction.py
More file actions
122 lines (84 loc) · 3.38 KB
/
duration-prediction.py
File metadata and controls
122 lines (84 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
# coding: utf-8
import pickle
from pathlib import Path
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
import mlflow
# Route MLflow logging to the tracking server at localhost:5000 and select
# the "nyc-taxi-experiment" experiment. Both calls run at import time.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("nyc-taxi-experiment")
# Local directory that train_model writes the pickled preprocessor into;
# created at import time if it does not already exist.
models_folder = Path('models')
models_folder.mkdir(exist_ok=True)
def read_dataframe(year, month):
    """Download one month of green-taxi trip data and prepare it for modelling.

    Args:
        year: Year of the dataset to fetch.
        month: Month of the dataset to fetch; zero-padded to two digits in
            the file name.

    Returns:
        A DataFrame restricted to trips lasting between 1 and 60 minutes
        (inclusive), with an added ``duration`` column in minutes, the
        ``PULocationID``/``DOLocationID`` columns cast to ``str``, and a
        combined ``PU_DO`` column of the form ``"<pickup>_<dropoff>"``.
    """
    url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_{year}-{month:02d}.parquet'
    df = pd.read_parquet(url)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Copy the filtered rows so the column assignments below write to an
    # independent frame rather than to a slice of the downloaded one
    # (this is what triggers pandas' SettingWithCopyWarning otherwise).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df
def create_X(df, dv=None):
    """Turn the model's input columns into a feature matrix.

    The ``PU_DO`` and ``trip_distance`` columns of ``df`` are converted to a
    list of per-row dicts and passed through a vectorizer.

    Args:
        df: DataFrame containing the ``PU_DO`` and ``trip_distance`` columns.
        dv: Vectorizer to reuse. When given, only its ``transform`` is
            called; when ``None``, a new ``DictVectorizer(sparse=True)`` is
            created and fitted on ``df``.

    Returns:
        A ``(X, dv)`` tuple: the feature matrix and the vectorizer that
        produced it (the one passed in, or the newly fitted one).
    """
    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')

    # Reuse the caller's vectorizer as-is when one is supplied.
    if dv is not None:
        return dv.transform(records), dv

    fitted = DictVectorizer(sparse=True)
    return fitted.fit_transform(records), fitted
def train_model(X_train, y_train, X_val, y_val, dv):
    """Train an XGBoost regressor and record everything in a new MLflow run.

    Inside a single MLflow run this logs the hyper-parameters, the RMSE on
    the validation set, the pickled preprocessor and the trained booster.

    Args:
        X_train: Training feature matrix, passed to ``xgb.DMatrix``.
        y_train: Training targets.
        X_val: Validation feature matrix, passed to ``xgb.DMatrix``.
        y_val: Validation targets.
        dv: Preprocessor object; pickled to ``models/preprocessor.b`` and
            logged as an artifact.

    Returns:
        The ID of the MLflow run that was created.
    """
    with mlflow.start_run() as run:
        train = xgb.DMatrix(X_train, label=y_train)
        valid = xgb.DMatrix(X_val, label=y_val)

        best_params = {
            'learning_rate': 0.09585355369315604,
            'max_depth': 30,
            'min_child_weight': 1.060597050922164,
            # 'reg:linear' is the deprecated alias of this objective.
            'objective': 'reg:squarederror',
            'reg_alpha': 0.018060244040060163,
            'reg_lambda': 0.011658731377413597,
            'seed': 42
        }

        mlflow.log_params(best_params)

        # NOTE(review): early_stopping_rounds (50) is larger than
        # num_boost_round (30), so early stopping can never trigger with
        # these values -- confirm whether that is intended.
        booster = xgb.train(
            params=best_params,
            dtrain=train,
            num_boost_round=30,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

        # Write the preprocessor to disk first so it can be logged as a file
        # artifact.
        with open("models/preprocessor.b", "wb") as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

        return run.info.run_id
def run(year, month):
    """Train on one month of data and validate on the month that follows.

    Args:
        year: Year of the training data.
        month: Month of the training data.

    Returns:
        The MLflow run ID returned by ``train_model`` (also printed).
    """
    train_df = read_dataframe(year=year, month=month)

    # The validation period is the next calendar month; anything that is not
    # below 12 rolls over to January of the following year.
    if month < 12:
        val_year, val_month = year, month + 1
    else:
        val_year, val_month = year + 1, 1
    val_df = read_dataframe(year=val_year, month=val_month)

    # Fit the vectorizer on the training data, then reuse it for validation.
    X_train, dv = create_X(train_df)
    X_val = create_X(val_df, dv)[0]

    label = 'duration'
    run_id = train_model(
        X_train,
        train_df[label].values,
        X_val,
        val_df[label].values,
        dv,
    )
    print(f"MLflow run_id: {run_id}")
    return run_id
if __name__ == "__main__":
    import argparse

    # Command-line entry point: both --year and --month are required ints.
    cli = argparse.ArgumentParser(description='Train a model to predict taxi trip duration.')
    for flag, help_text in (
        ('--year', 'Year of the data to train on'),
        ('--month', 'Month of the data to train on'),
    ):
        cli.add_argument(flag, type=int, required=True, help=help_text)
    options = cli.parse_args()

    run_id = run(year=options.year, month=options.month)

    # Save the run ID to run_id.txt in the current working directory.
    Path("run_id.txt").write_text(run_id)