Skip to content

Commit 39609ae

Browse files
jotaylo and bjcmit authored
Proposal: split train.py into train.py and train_aml.py (#219)
This change splits train.py into two files. The new train.py is standalone and has no references to AzureML. It defines three functions: split_data, which splits a dataframe into test/train data; train_model, which takes the test/train data and a parameter object and trains the model; and get_model_metrics, which evaluates metrics about the model. The script can be run locally, in which case it loads a dataset from a file. The second file, train_aml.py, contains reasonably general AzureML logic. It reads data from a dataset, then calls the split_data function from train.py. It loads input parameters from a config file and logs them, then calls train_model from train.py. It then uploads the model and logs any metrics returned by get_model_metrics. The hope with these changes is to demonstrate a simple interface for integrating an existing ML script with MLOpsPython, as well as to provide an example of how the core ML functionality can be invoked in multiple ways for development purposes. Co-authored-by: Bryan J Smith <[email protected]>
1 parent 5966d3d commit 39609ae

File tree

6 files changed

+799
-155
lines changed

6 files changed

+799
-155
lines changed

.pipelines/diabetes_regression-variables-template.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ variables:
77
value: diabetes_regression
88
# The path to the model training script under SOURCES_DIR_TRAIN
99
- name: TRAIN_SCRIPT_PATH
10-
value: training/train.py
10+
value: training/train_aml.py
1111
# The path to the model evaluation script under SOURCES_DIR_TRAIN
1212
- name: EVALUATE_SCRIPT_PATH
1313
value: evaluate/evaluate_model.py
Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,32 @@
11
import numpy as np
2-
from azureml.core.run import Run
3-
from unittest.mock import Mock
4-
from diabetes_regression.training.train import train_model
2+
from diabetes_regression.training.train import train_model, get_model_metrics
53

64

75
def test_train_model():
    """Check that train_model fits a Ridge model with the given alpha.

    The model is trained on a tiny one-feature dataset and its predictions
    for two inputs are compared against known reference values.
    """
    X_train = np.array([1, 2, 3, 4, 5, 6]).reshape(-1, 1)
    y_train = np.array([10, 9, 8, 8, 6, 5])
    data = {"train": {"X": X_train, "y": y_train}}

    reg_model = train_model(data, {"alpha": 1.2})

    preds = reg_model.predict([[1], [2]])
    # Predictions are floats produced by a numerical solver, so compare with
    # a tolerance rather than bit-for-bit equality.
    np.testing.assert_almost_equal(
        preds, [9.93939393939394, 9.03030303030303])
14+
15+
16+
def test_get_model_metrics():
17+
18+
class MockModel:
19+
20+
@staticmethod
21+
def predict(data):
22+
return ([8.12121212, 7.21212121])
23+
1024
X_test = np.array([3, 4]).reshape(-1, 1)
1125
y_test = np.array([8, 7])
12-
data = {"train": {"X": X_train, "y": y_train},
13-
"test": {"X": X_test, "y": y_test}}
26+
data = {"test": {"X": X_test, "y": y_test}}
1427

15-
run = Mock(Run)
16-
reg = train_model(run, data, alpha=1.2)
28+
metrics = get_model_metrics(MockModel(), data)
1729

18-
_, call2 = run.log.call_args_list
19-
nameValue, descriptionDict = call2
20-
name, value = nameValue
21-
description = descriptionDict['description']
22-
assert (name == 'mse')
23-
np.testing.assert_almost_equal(value, 0.029843893480257067)
24-
assert (description == 'Mean squared error metric')
25-
26-
preds = reg.predict([[1], [2]])
27-
np.testing.assert_equal(preds, [9.93939393939394, 9.03030303030303])
30+
assert 'mse' in metrics
31+
mse = metrics['mse']
32+
np.testing.assert_almost_equal(mse, 0.029843893480257067)

diabetes_regression/training/train.py

Lines changed: 37 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -23,161 +23,61 @@
2323
ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
2424
POSSIBILITY OF SUCH DAMAGE.
2525
"""
26-
from azureml.core.run import Run
26+
2727
import os
28-
import argparse
28+
import pandas as pd
2929
from sklearn.linear_model import Ridge
3030
from sklearn.metrics import mean_squared_error
3131
from sklearn.model_selection import train_test_split
32-
import joblib
33-
import json
34-
from azureml.core import Dataset, Datastore, Workspace
35-
36-
37-
def register_dataset(
38-
aml_workspace: Workspace,
39-
dataset_name: str,
40-
datastore_name: str,
41-
file_path: str
42-
) -> Dataset:
43-
datastore = Datastore.get(aml_workspace, datastore_name)
44-
dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
45-
dataset = dataset.register(workspace=aml_workspace,
46-
name=dataset_name,
47-
create_new_version=True)
48-
49-
return dataset
50-
51-
52-
def train_model(run, data, alpha):
53-
run.log("alpha", alpha)
54-
run.parent.log("alpha", alpha)
55-
reg = Ridge(alpha=alpha)
56-
reg.fit(data["train"]["X"], data["train"]["y"])
57-
preds = reg.predict(data["test"]["X"])
58-
run.log("mse", mean_squared_error(
59-
preds, data["test"]["y"]), description="Mean squared error metric")
60-
run.parent.log("mse", mean_squared_error(
61-
preds, data["test"]["y"]), description="Mean squared error metric")
62-
return reg
63-
6432

65-
def main():
66-
print("Running train.py")
6733

68-
parser = argparse.ArgumentParser("train")
69-
70-
parser.add_argument(
71-
"--model_name",
72-
type=str,
73-
help="Name of the Model",
74-
default="sklearn_regression_model.pkl",
75-
)
76-
77-
parser.add_argument(
78-
"--step_output",
79-
type=str,
80-
help=("output for passing data to next step")
81-
)
82-
83-
parser.add_argument(
84-
"--dataset_version",
85-
type=str,
86-
help=("dataset version")
87-
)
88-
89-
parser.add_argument(
90-
"--data_file_path",
91-
type=str,
92-
help=("data file path, if specified,\
93-
a new version of the dataset will be registered")
94-
)
95-
96-
parser.add_argument(
97-
"--caller_run_id",
98-
type=str,
99-
help=("caller run id, for example ADF pipeline run id")
100-
)
101-
102-
parser.add_argument(
103-
"--dataset_name",
104-
type=str,
105-
help=("Dataset name. Dataset must be passed by name\
106-
to always get the desired dataset version\
107-
rather than the one used while the pipeline creation")
108-
)
109-
110-
args = parser.parse_args()
111-
112-
print("Argument [model_name]: %s" % args.model_name)
113-
print("Argument [step_output]: %s" % args.step_output)
114-
print("Argument [dataset_version]: %s" % args.dataset_version)
115-
print("Argument [data_file_path]: %s" % args.data_file_path)
116-
print("Argument [caller_run_id]: %s" % args.caller_run_id)
117-
print("Argument [dataset_name]: %s" % args.dataset_name)
118-
119-
model_name = args.model_name
120-
step_output_path = args.step_output
121-
dataset_version = args.dataset_version
122-
data_file_path = args.data_file_path
123-
dataset_name = args.dataset_name
124-
125-
print("Getting training parameters")
126-
127-
with open("config.json") as f:
128-
pars = json.load(f)
129-
try:
130-
alpha = pars["training"]["alpha"]
131-
except KeyError:
132-
alpha = 0.5
133-
134-
print("Parameter alpha: %s" % alpha)
135-
136-
run = Run.get_context()
137-
138-
# Get the dataset
139-
if (dataset_name):
140-
if (data_file_path == 'none'):
141-
dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version) # NOQA: E402, E501
142-
else:
143-
dataset = register_dataset(run.experiment.workspace,
144-
dataset_name,
145-
os.environ.get("DATASTORE_NAME"),
146-
data_file_path)
147-
else:
148-
e = ("No dataset provided")
149-
print(e)
150-
raise Exception(e)
151-
152-
# Link dataset to the step run so it is trackable in the UI
153-
run.input_datasets['training_data'] = dataset
154-
run.parent.tag("dataset_id", value=dataset.id)
155-
156-
df = dataset.to_pandas_dataframe()
34+
# Split the dataframe into test and train data
35+
def split_data(df, *, target_col='Y', test_size=0.2, random_state=0):
    """Split a dataframe into train and test feature/target arrays.

    Args:
        df: DataFrame holding the feature columns plus the target column.
        target_col: Name of the target column; every other column is used
            as a feature. Defaults to 'Y'.
        test_size: Passed through to train_test_split. Defaults to 0.2.
        random_state: Seed passed through to train_test_split so the split
            is reproducible. Defaults to 0.

    Returns:
        A dict of the form
        {"train": {"X": ..., "y": ...}, "test": {"X": ..., "y": ...}}.
    """
    X = df.drop(target_col, axis=1).values
    y = df[target_col].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    data = {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}
    return data
44+
45+
46+
# Train the model, return the model
47+
def train_model(data, ridge_args=None):
    """Fit a Ridge regression model on the training split.

    Args:
        data: Dict providing data["train"]["X"] and data["train"]["y"].
        ridge_args: Keyword arguments for the Ridge constructor, for
            example {"alpha": 0.5}. None or an empty dict means Ridge's
            own defaults are used.

    Returns:
        The fitted Ridge model.
    """
    # Tolerate a missing parameter object instead of failing on `**None`.
    reg_model = Ridge(**(ridge_args or {}))
    reg_model.fit(data["train"]["X"], data["train"]["y"])
    return reg_model
51+
52+
53+
# Evaluate the metrics for the model
54+
def get_model_metrics(model, data):
    """Evaluate a trained model on the test split.

    Args:
        model: Any object exposing predict(X).
        data: Dict providing data["test"]["X"] and data["test"]["y"].

    Returns:
        A dict mapping metric name to value; currently only "mse".
    """
    y_true = data["test"]["y"]
    preds = model.predict(data["test"]["X"])
    # scikit-learn documents the order as (y_true, y_pred); the value is the
    # same either way for MSE, but keeping the order avoids surprises if an
    # asymmetric metric or sample weights are added later.
    mse = mean_squared_error(y_true, preds)
    metrics = {"mse": mse}
    return metrics
59+
60+
61+
def main():
62+
print("Running train.py")
16463

165-
reg = train_model(run, data, alpha)
64+
# Define training parameters
65+
ridge_args = {"alpha": 0.5}
16666

167-
# Pass model file to next step
168-
os.makedirs(step_output_path, exist_ok=True)
169-
model_output_path = os.path.join(step_output_path, model_name)
170-
joblib.dump(value=reg, filename=model_output_path)
67+
# Load the training data as dataframe
68+
data_dir = "data"
69+
data_file = os.path.join(data_dir, 'diabetes.csv')
70+
train_df = pd.read_csv(data_file)
17171

172-
# Also upload model file to run outputs for history
173-
os.makedirs('outputs', exist_ok=True)
174-
output_path = os.path.join('outputs', model_name)
175-
joblib.dump(value=reg, filename=output_path)
72+
data = split_data(train_df)
17673

177-
run.tag("run_type", value="train")
178-
print(f"tags now present for run: {run.tags}")
74+
# Train the model
75+
model = train_model(data, ridge_args)
17976

180-
run.complete()
77+
# Log the metrics for the model
78+
metrics = get_model_metrics(model, data)
79+
for (k, v) in metrics.items():
80+
print(f"{k}: {v}")
18181

18282

18383
if __name__ == '__main__':

0 commit comments

Comments
 (0)