# MelbourneHousingPrediction
# Python v3.6 (not compatible with 2.7)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib  # joblib was removed from sklearn.externals in scikit-learn 0.23; import it directly
from datetime import datetime
# Download the data if needed: https://www.kaggle.com/anthonypino/melbourne-housing-market
# Run this script from the directory containing the CSV, or point read_csv at its location
# (e.g. ~/Downloads if you downloaded it from Kaggle)
df = pd.read_csv('Melbourne_housing_FULL.csv')
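# Optional sanity check (commented out): confirm the raw row/column counts before cleaning
# print(df.shape)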
# Drop the statistically insignificant columns, columns whose data are better captured
# by other columns, and columns with significant missing values
# ('Lattitude' and 'Longtitude' are misspelled this way in the source CSV itself)
df.drop(columns=['Address', 'Method', 'SellerG', 'Date', 'Postcode',
                 'Lattitude', 'Longtitude', 'Regionname', 'Propertycount'],
        inplace=True)
# If any row is missing data in any remaining column, drop it
# It is important that this runs after the insignificant columns are removed,
# so rows are not discarded over missing values in columns we never intended to use
df.dropna(axis=0, how='any', inplace=True)
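# An alternative (not used here): impute missing values instead of dropping rows,
# e.g. with sklearn.impute.SimpleImputer, keeping more training data at the cost of some noise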
# Create features_df, which will serve as our independent-variable matrix
# Convert the categorical columns below to numeric indicator columns via one-hot encoding with pandas
features_df = pd.get_dummies(df, columns=['Suburb', 'CouncilArea', 'Type'])
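# e.g. a 'Type' value of 'h' expands to indicator columns Type_h=1, Type_t=0, Type_u=0;
# get_dummies creates one 0/1 column per distinct category value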
# We're deleting this column from features_df because it will serve as our dependent variable, y
del features_df['Price']
# Helper (commented out): print a zero-filled template of every post-encoding column,
# handy when hand-building a single input row for prediction
# cols = features_df.columns.tolist()
# print("property_to_value = [")
# for item in cols:
#     print("\t0,  # " + item)
# print("]")
# Create X and y
X = features_df.values  # DataFrame.as_matrix() was removed in pandas 1.0; .values is the replacement
y = df['Price'].values
# Split the dataset 70/30 (random_state=0 makes the split reproducible)
startTimeS = datetime.now()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("\nSplit Time Taken:", datetime.now() - startTimeS)
# Select the algorithm (gradient boosting) and configure its hyperparameters
startTime = datetime.now()
model = ensemble.GradientBoostingRegressor(
    n_estimators=500,     # number of decision trees; more generally improves accuracy but slows training
    learning_rate=0.1,    # how strongly each additional tree influences the overall prediction (keep low)
    max_depth=6,          # maximum depth of each tree (keep relatively low; None expands until all leaves are pure)
    min_samples_split=4,  # minimum number of samples required to split an internal node
    min_samples_leaf=6,   # minimum number of samples required at each leaf node
    max_features=0.6,     # fraction of features considered when searching for the best split
    loss='huber'          # huber is robust to outliers and anomalies
    # other options: least squares ('ls') and least absolute deviation ('lad'), renamed
    # 'squared_error' and 'absolute_error' in scikit-learn 1.0+, and quantile regression ('quantile')
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
)
print("Model Creation Time Taken:", datetime.now() - startTime)
# Fit the model to the training data produced by train_test_split
startTime1 = datetime.now()
model.fit(X_train, y_train)
print("Model Fit Time Taken:", datetime.now() - startTime1)
# Export the trained model as a pickle file
startTime2 = datetime.now()
joblib.dump(model, 'house_trained_model.pkl')
print("Export Time Taken:", datetime.now() - startTime2)
print("\n" + "-" * 50 + "\n")
# Evaluate the results to see whether the algorithm and hyperparameters are sufficient
startTime3 = datetime.now()
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training - Mean Abs Error: %.2f" % train_mae)
print("Training Time taken:", datetime.now() - startTime3)
startTime3 = datetime.now()
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("\nTesting - Mean Abs Error: %.2f" % test_mae)
print("Testing Time taken:", datetime.now() - startTime3)
print ("\n")