 ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 """
-from azureml.core.run import Run
+
 import os
-import argparse
+import pandas as pd
 from sklearn.linear_model import Ridge
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import train_test_split
-import joblib
-import json
-from azureml.core import Dataset, Datastore, Workspace
-
-
-def register_dataset(
-    aml_workspace: Workspace,
-    dataset_name: str,
-    datastore_name: str,
-    file_path: str
-) -> Dataset:
-    datastore = Datastore.get(aml_workspace, datastore_name)
-    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
-    dataset = dataset.register(workspace=aml_workspace,
-                               name=dataset_name,
-                               create_new_version=True)
-
-    return dataset
-
-
-def train_model(run, data, alpha):
-    run.log("alpha", alpha)
-    run.parent.log("alpha", alpha)
-    reg = Ridge(alpha=alpha)
-    reg.fit(data["train"]["X"], data["train"]["y"])
-    preds = reg.predict(data["test"]["X"])
-    run.log("mse", mean_squared_error(
-        preds, data["test"]["y"]), description="Mean squared error metric")
-    run.parent.log("mse", mean_squared_error(
-        preds, data["test"]["y"]), description="Mean squared error metric")
-    return reg
-
 
-def main():
-    print("Running train.py")
 
-    parser = argparse.ArgumentParser("train")
-
-    parser.add_argument(
-        "--model_name",
-        type=str,
-        help="Name of the Model",
-        default="sklearn_regression_model.pkl",
-    )
-
-    parser.add_argument(
-        "--step_output",
-        type=str,
-        help=("output for passing data to next step")
-    )
-
-    parser.add_argument(
-        "--dataset_version",
-        type=str,
-        help=("dataset version")
-    )
-
-    parser.add_argument(
-        "--data_file_path",
-        type=str,
-        help=("data file path, if specified,\
-               a new version of the dataset will be registered")
-    )
-
-    parser.add_argument(
-        "--caller_run_id",
-        type=str,
-        help=("caller run id, for example ADF pipeline run id")
-    )
-
-    parser.add_argument(
-        "--dataset_name",
-        type=str,
-        help=("Dataset name. Dataset must be passed by name\
-              to always get the desired dataset version\
-              rather than the one used while the pipeline creation")
-    )
-
-    args = parser.parse_args()
-
-    print("Argument [model_name]: %s" % args.model_name)
-    print("Argument [step_output]: %s" % args.step_output)
-    print("Argument [dataset_version]: %s" % args.dataset_version)
-    print("Argument [data_file_path]: %s" % args.data_file_path)
-    print("Argument [caller_run_id]: %s" % args.caller_run_id)
-    print("Argument [dataset_name]: %s" % args.dataset_name)
-
-    model_name = args.model_name
-    step_output_path = args.step_output
-    dataset_version = args.dataset_version
-    data_file_path = args.data_file_path
-    dataset_name = args.dataset_name
-
-    print("Getting training parameters")
-
-    with open("config.json") as f:
-        pars = json.load(f)
-    try:
-        alpha = pars["training"]["alpha"]
-    except KeyError:
-        alpha = 0.5
-
-    print("Parameter alpha: %s" % alpha)
-
-    run = Run.get_context()
-
-    # Get the dataset
-    if (dataset_name):
-        if (data_file_path == 'none'):
-            dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version)  # NOQA: E402, E501
-        else:
-            dataset = register_dataset(run.experiment.workspace,
-                                       dataset_name,
-                                       os.environ.get("DATASTORE_NAME"),
-                                       data_file_path)
-    else:
-        e = ("No dataset provided")
-        print(e)
-        raise Exception(e)
-
-    # Link dataset to the step run so it is trackable in the UI
-    run.input_datasets['training_data'] = dataset
-    run.parent.tag("dataset_id", value=dataset.id)
-
-    df = dataset.to_pandas_dataframe()
+# Split the dataframe into test and train data
+def split_data(df):
     X = df.drop('Y', axis=1).values
     y = df['Y'].values
 
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.2, random_state=0)
     data = {"train": {"X": X_train, "y": y_train},
             "test": {"X": X_test, "y": y_test}}
+    return data
+
+
+# Train the model, return the model
+def train_model(data, ridge_args):
+    reg_model = Ridge(**ridge_args)
+    reg_model.fit(data["train"]["X"], data["train"]["y"])
+    return reg_model
+
+
+# Evaluate the metrics for the model
+def get_model_metrics(model, data):
+    preds = model.predict(data["test"]["X"])
+    mse = mean_squared_error(preds, data["test"]["y"])
+    metrics = {"mse": mse}
+    return metrics
+
+
+def main():
+    print("Running train.py")
 
-    reg = train_model(run, data, alpha)
+    # Define training parameters
+    ridge_args = {"alpha": 0.5}
 
-    # Pass model file to next step
-    os.makedirs(step_output_path, exist_ok=True)
-    model_output_path = os.path.join(step_output_path, model_name)
-    joblib.dump(value=reg, filename=model_output_path)
+    # Load the training data as dataframe
+    data_dir = "data"
+    data_file = os.path.join(data_dir, 'diabetes.csv')
+    train_df = pd.read_csv(data_file)
 
-    # Also upload model file to run outputs for history
-    os.makedirs('outputs', exist_ok=True)
-    output_path = os.path.join('outputs', model_name)
-    joblib.dump(value=reg, filename=output_path)
+    data = split_data(train_df)
 
-    run.tag("run_type", value="train")
-    print(f"tags now present for run: {run.tags}")
+    # Train the model
+    model = train_model(data, ridge_args)
 
-    run.complete()
+    # Log the metrics for the model
+    metrics = get_model_metrics(model, data)
+    for (k, v) in metrics.items():
+        print(f"{k}: {v}")
 
 
 if __name__ == '__main__':
     main()
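
Note on the refactor: split_data, train_model, and get_model_metrics now take plain dataframes and dicts instead of an Azure ML Run, so they can be exercised in an ordinary unit test with no workspace attached. A minimal sketch of such a test, assuming train.py is importable from the test's directory; the test function and the synthetic dataframe (with the 'Y' target column the script expects) are illustrative, not part of this commit:

import numpy as np
import pandas as pd
from train import split_data, train_model, get_model_metrics


def test_train_and_score():
    # Hypothetical synthetic data with the 'Y' target column split_data expects
    rng = np.random.RandomState(0)
    df = pd.DataFrame(rng.rand(50, 3), columns=['A', 'B', 'Y'])

    data = split_data(df)                      # 80/20 train/test split
    model = train_model(data, {"alpha": 0.5})  # Ridge(**ridge_args)
    metrics = get_model_metrics(model, data)   # {"mse": ...}

    # Mean squared error on held-out data is always non-negative
    assert metrics["mse"] >= 0.0

One plausible reading of why logging was pulled out of train_model: the pure functions stay framework-free, and whatever pipeline step calls them can log the returned metrics dict itself.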