# MelbourneHousingPrediction
# Python v3.6 (not compatible with 2.7)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib  # joblib was removed from sklearn.externals in scikit-learn 0.23; import it directly
from datetime import datetime
# Download the data if needed: https://www.kaggle.com/anthonypino/melbourne-housing-market
# Run this script from the directory containing the CSV, or point read_csv at its location
# (e.g. ~/Downloads if you downloaded it from Kaggle)
df = pd.read_csv('Melbourne_housing_FULL.csv')
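# Optional sanity check (commented out): confirm the raw row/column counts before cleaning
# print(df.shape)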
# Drop the statistically insignificant columns, columns whose data are better captured
# by other columns, and columns with significant missing values
# ('Lattitude' and 'Longtitude' are misspelled this way in the source CSV itself)
df.drop(columns=['Address', 'Method', 'SellerG', 'Date', 'Postcode',
                 'Lattitude', 'Longtitude', 'Regionname', 'Propertycount'],
        inplace=True)
# If any row is missing data in any remaining column, drop it
# It is important that this runs after the insignificant columns are removed,
# so rows are not discarded over missing values in columns we never intended to use
df.dropna(axis=0, how='any', inplace=True)
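# An alternative (not used here): impute missing values instead of dropping rows,
# e.g. with sklearn.impute.SimpleImputer, keeping more training data at the cost of some noise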
# Create features_df, which will serve as our independent-variable matrix
# Convert the categorical columns below to numeric indicator columns via one-hot encoding with pandas
features_df = pd.get_dummies(df, columns=['Suburb', 'CouncilArea', 'Type'])
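# e.g. a 'Type' value of 'h' expands to indicator columns Type_h=1, Type_t=0, Type_u=0;
# get_dummies creates one 0/1 column per distinct category value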
# We're deleting this column from features_df because it will serve as our dependent variable, y
del features_df['Price']
# Helper (commented out): print a zero-filled template of every post-encoding column,
# handy when hand-building a single input row for prediction
# cols = features_df.columns.tolist()
# print("property_to_value = [")
# for item in cols:
#     print("\t0,  # " + item)
# print("]")
# Create X and y
X = features_df.values  # DataFrame.as_matrix() was removed in pandas 1.0; .values is the replacement
y = df['Price'].values
# Split the dataset 70/30 (random_state=0 makes the split reproducible)
startTimeS = datetime.now()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("\nSplit Time Taken:", datetime.now() - startTimeS)
# Select the algorithm (gradient boosting) and configure its hyperparameters
startTime = datetime.now()
model = ensemble.GradientBoostingRegressor(
    n_estimators=500,     # number of decision trees; more generally improves accuracy but slows training
    learning_rate=0.1,    # how strongly each additional tree influences the overall prediction (keep low)
    max_depth=6,          # maximum depth of each tree (keep relatively low; None expands until all leaves are pure)
    min_samples_split=4,  # minimum number of samples required to split an internal node
    min_samples_leaf=6,   # minimum number of samples required at each leaf node
    max_features=0.6,     # fraction of features considered when searching for the best split
    loss='huber'          # huber is robust to outliers and anomalies
    # other options: least squares ('ls') and least absolute deviation ('lad'), renamed
    # 'squared_error' and 'absolute_error' in scikit-learn 1.0+, and quantile regression ('quantile')
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
)
print("Model Creation Time Taken:", datetime.now() - startTime)
# Fit the model to the training data produced by train_test_split
startTime1 = datetime.now()
model.fit(X_train, y_train)
print("Model Fit Time Taken:", datetime.now() - startTime1)
# Export the trained model as a pickle file
startTime2 = datetime.now()
joblib.dump(model, 'house_trained_model.pkl')
print("Export Time Taken:", datetime.now() - startTime2)
print("\n" + "-" * 50 + "\n")
# Evaluate the results to see whether the algorithm and hyperparameters are sufficient
startTime3 = datetime.now()
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training - Mean Abs Error: %.2f" % train_mae)
print("Training Time taken:", datetime.now() - startTime3)
startTime3 = datetime.now()
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("\nTesting - Mean Abs Error: %.2f" % test_mae)
print("Testing Time taken:", datetime.now() - startTime3)
print ("\n")