-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
90 lines (69 loc) · 2.49 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""Evaluation script for measuring mean squared error."""
import argparse
import json
import logging
import pathlib
import pickle
import tarfile
import boto3
import numpy as np
import pandas as pd
import xgboost
from sklearn.metrics import hamming_loss
# Root directory for this job's files: the test data, the model archive and
# the evaluation report are all read from / written to sub-directories of it.
# For a local run, point it at a scratch directory instead (e.g. "temp").
base_dir = "/opt/ml/processing"

# Root logger: attach a default stream handler (stderr) and emit records at
# INFO level and above.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input-test", type=str, required=True)
args = parser.parse_args()
input_test = args.input_test
bucket_test = input_test.split("/")[2]
prefix_test = "/".join(input_test.split("/")[3:])
print(input_test)
logger.debug("Downloading data.")
s3 = boto3.resource("s3")
s3client = boto3.client("s3")
response_columns = s3client.list_objects_v2(
Bucket=bucket_test,
Prefix=prefix_test,
)
key_test = response_columns['Contents'][0]['Key']
pathlib.Path(f"{base_dir}/test").mkdir(parents=True, exist_ok=True)
logger.info("Downloading data from bucket: %s, key: %s", bucket_test, key_test)
file_test = f"{base_dir}/test/test.csv"
s3.Bucket(bucket_test).download_file(key_test, file_test)
logger.debug("Reading test data.")
df = pd.read_csv(file_test)
y_test = df.iloc[:, 0].to_numpy()
df.drop(df.columns[0], axis=1, inplace=True)
X_test = xgboost.DMatrix(df.values)
logger.debug("Loading model.")
model_path = f"{base_dir}/model/model.tar.gz"
with tarfile.open(model_path) as tar:
tar.extractall(path=".")
# model = pickle.load(open("xgboost-model", "rb"))
model = xgboost.Booster()
model.load_model('xgboost-model')
logger.info("Performing predictions against test data.")
predictions = model.predict(X_test)
num_class = len(np.unique(y_test))
num_samples = y_test.shape[0]
predictions = predictions.reshape(num_samples, num_class)
pred_label = np.argmax(predictions, axis=1)
hloss = hamming_loss(y_test, pred_label)
report_dict = {
"regression_metrics": {
"hamming_loss": {
"value": hloss,
},
},
}
output_dir = f"{base_dir}/evaluation"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
logger.info("Writing out evaluation report with hamming loss (%f)", hloss)
evaluation_path = f"{output_dir}/evaluation.json"
with open(evaluation_path, "w") as f:
f.write(json.dumps(report_dict))