import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hashlib
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from pandas.plotting import scatter_matrix
# !!! make sure to explore all of these modules and their functions !!!
# Fetch and download the data:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(housing_url, housing_path):
if not os.path.isdir(housing_path):
os.makedirs(housing_path)
tgz_path = os.path.join(housing_path, "housing.tgz")
urllib.request.urlretrieve(housing_url, tgz_path)
housing_tgz = tarfile.open(tgz_path)
housing_tgz.extractall(path=housing_path)
housing_tgz.close()
# Load the housing data from the extracted CSV file into a pandas DataFrame
def load_housing_data(housing_path):
csv_path = os.path.join(housing_path, "housing.csv")
return pd.read_csv(csv_path)
fetch_housing_data(HOUSING_URL, HOUSING_PATH)
housing = load_housing_data(HOUSING_PATH)
# print(housing.head(10)) # View the first 10 rows of the housing data
# print(housing.info())
# print(housing["ocean_proximity"].value_counts())
# print(housing.describe())
#
# housing.hist(bins=50, figsize=(9,9))
# plt.show()
np.random.seed(42)
def split_train_test(data, test_ratio):
"""This function splits the data set from the .csv file into a trining set and a
test set (which should never
be looked at!) according to a training-to-test set ratio"""
shuffled_indices = np.random.permutation(len(data))
test_set_size = int(len(data) * test_ratio)
test_indices = shuffled_indices[:test_set_size]
train_indices = shuffled_indices[test_set_size:]
return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(housing, 0.2)
print('Training set size: ', len(train_set))
print('Test set size: ', len(test_set))
def test_set_check(identifier, test_ratio, hash):
return hash(np.int64(identifier)).digest()[-1]<256*test_ratio
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
ids = data[id_column]
in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
return data.loc[~in_test_set], data.loc[in_test_set]
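# Hashing each instance's identifier keeps the split stable across runs and across
# dataset refreshes: an instance's hash never changes, so it always lands on the
# same side of the train/test split (unlike a purely random shuffle).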
housing_with_id = housing.reset_index() # add an index column to the data
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')
housing_with_id["id"] = housing["longitude"]*1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'id')
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5, 5.0, inplace=True)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]
housing["income_cat"].value_counts() / len(housing)
def income_cat_proportions(data):
return data["income_cat"].value_counts() / len(data)
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
compare_props = pd.DataFrame({
"Overall": income_cat_proportions(housing),
"Stratified": income_cat_proportions(strat_test_set),
"Random": income_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
for set_ in (strat_train_set, strat_test_set):
set_.drop("income_cat", axis=1, inplace=True)
# Visualising data to gain some insights: Create a copy of the training data
housing = strat_train_set.copy()
# Create a scatter plot of the longitude and latitude information
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="population", figsize=(10,7),
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
# plt.show()
# Find correlations by computing the standard correlation coefficient (Pearson's r)
corr_matrix = housing.corr()
print(corr_matrix["median_house_value"].sort_values(ascending=False))
# Plot the correlations between selected attributes using the scatter_matrix function from pandas
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age", "households"]
scatter_matrix(housing[attributes], figsize = (12,8))
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
# plt.show()
# Create attribute combinations to reveal additional insights
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]
corr_matrix = housing.corr()
print(corr_matrix["median_house_value"].sort_values(ascending=False))
# Data preparation...
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
# To handle missing values in a feature, there are three options:
# 1) Get rid of the corresponding districts
# 2) Discard the whole attribute
# 3) Set the missing value to a 'filler' value (e.g. zero or the median for that feature)
# NB options 1 and 2 below return modified copies and do not change `housing` in place
housing.dropna(subset=["total_bedrooms"]) # Option 1
housing.drop("total_bedrooms", axis=1) # Option 2
median = housing["total_bedrooms"].median() # Option 3
housing["total_bedrooms"].fillna(median, inplace=True) # Saves the median value when option 3 is used
# The Scikit-Learn Imputer class can be used to fill missing values
imputer = Imputer(strategy="median")
# NB Imputer only works on numerical values, so create a copy of the data without the text attribute ocean_proximity
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num) # now fit the imputer instance to the training data
X = imputer.transform(housing_num)  # Transform the training set by replacing missing values with the learned medians (NB the result is a plain NumPy array)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=list(housing.index.values))  # convert the NumPy array back into a pandas DataFrame
# housing_tr.head()
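# Note: on scikit-learn >= 0.20 the Imputer class above is deprecated in favour of
# SimpleImputer; a minimal equivalent sketch (assuming the newer API) would be:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy="median")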
# Encode ocean_proximity labels to numerical values using LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
print(housing_cat_encoded)
print(encoder.classes_)
# Encode ocean_proximity labels to numerical values using OneHot
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1)) # Stored as a SciPy sparse matrix
print(housing_cat_1hot)
housing_cat_1hot.toarray() # converts to a NumPy array
# Both transformations (text -> integer -> one-hot vector) can be applied in one shot with LabelBinarizer:
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat) # Returns a dense NumPy array
# The following class is a transformer that adds the combined attributes created above: rooms_per_household, population_per_household, bedrooms_per_room
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
"""This class is used as a transformer that add the combined attributes: rooms_per_household, population_per_household, bedrooms_per_room"""
def __init__(self, add_bedrooms_per_room = True):
self.add_bedrooms_per_room = add_bedrooms_per_room
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
rooms_per_household = X[:, rooms_ix]/X[:, household_ix]
population_per_household = X[:, population_ix]/X[:,household_ix]
if self.add_bedrooms_per_room:
bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
else:
return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+["rooms_per_household", "population_per_household"])
# Pipeline for the transformations of the numerical attributes of the data
num_pipeline = Pipeline([
('imputer', Imputer(strategy='median')),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler())
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
# Create a transformer class to handle pandas DataFrames (since there is nothing in Scikit-Learn to handle this directly)
class DataFrameSelector(BaseEstimator, TransformerMixin):
"""a transformer class to handle Pandas DataFrames (since there is nothin in Scikitlean to handle this)
DataFrameSelector will trans data by selecting the desired attributes, dropping the rest, and then converting the resulting DataFrame into a NumPy array."""
def __init__(self, attribute_names):
self.attribute_names = attribute_names
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.attribute_names].values
# Another pipeline, this time for the categorical rather than the numerical attributes. It selects only the categorical attributes with DataFrameSelector and then binarizes them with LabelBinarizer
num_attribs = list(housing_num) # Collect the numerical attributes
cat_attribs = ["ocean_proximity"] # Collect the categorical attributes
# First create the pipeline for only numerical data:
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])
# Now, a second pipeline for the categorical data:
# cat_pipeline = Pipeline([
# ('selector', DataFrameSelector(cat_attribs))
# ('label_binarizer', LabelBinarizer()),
# ]) # Wrong: missing comma after the selector step
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('label_binarizer', LabelBinarizer()),
]) # Correct
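# NB with recent scikit-learn versions a LabelBinarizer inside a Pipeline can raise a
# TypeError, because the pipeline calls fit_transform(X, y) while LabelBinarizer only
# accepts a single argument. A minimal wrapper sketch (an assumption, not part of the
# original script) that could replace the 'label_binarizer' step in that case:
class PipelineFriendlyLabelBinarizer(BaseEstimator, TransformerMixin):
    """Wraps LabelBinarizer so it accepts the (X, y) call signature used by Pipeline."""
    def __init__(self):
        self.encoder = LabelBinarizer()
    def fit(self, X, y=None):
        self.encoder.fit(X)
        return self
    def transform(self, X, y=None):
        return self.encoder.transform(X)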
# Use FeatureUnion to join both of these pipelines into a single pipeline
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
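# On scikit-learn >= 0.20 the same numerical/categorical split can be expressed with
# ColumnTransformer instead of FeatureUnion + DataFrameSelector (a minimal sketch,
# assuming the newer API; not used by the rest of this script):
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# full_pipeline = ColumnTransformer([
#     ("num", num_pipeline, num_attribs),   # num_pipeline without the DataFrameSelector step
#     ("cat", OneHotEncoder(), cat_attribs),
# ])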
# Now we have the finished, fully pre-processed data:
housing_prepared = full_pipeline.fit_transform(housing)
# print(housing_prepared)
# print(housing_prepared.shape)
# -------------------------------------------------------------------
# Select and train the learning model
# -------------------------------------------------------------------
# Use the LinearRegression model from sklearn, trained on the prepared
# housing data from the pipeline and the respective housing labels:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_prepared_data = full_pipeline.transform(some_data)
# print("Predictions: ", lin_reg.predict(some_prepared_data))
# print("Labels: ", list(some_labels))
predic = lin_reg.predict(some_prepared_data)
labs = list(some_labels)
print("Predictions: ", " ", "Labels: ")
for pred, lab in zip(predic, labs):
    print(pred, " ", lab)
# Measure the model's RMSE to gauge prediction accuracy
lin_reg_housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, lin_reg_housing_predictions) # compute the mean squared error
lin_rmse = np.sqrt(lin_mse) # compute the square root of the mse to get the RMSE
print("Linear Regression root mean squared error: ", lin_rmse)
# DecisionTreeRegressor regressor model:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
tree_housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, tree_housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("Decision tree regressor root mean squared error: ", tree_rmse)
# Time to evaluate the model using cross-validation...
# The following section of code uses the Scikit-Learn cross-validation features.
# It performs K-fold cross-validation: it splits the training set into 10 distinct subsets (folds),
# then trains and evaluates the decision tree model 10 times, picking a different fold for evaluation each time.
# The result is an array containing the 10 evaluation scores.
# NB the Scikit-Learn cross-validation methods expect a utility function (greater is better) rather than a cost function (lower is better), hence the negative MSE scoring below!
def display_scores(scores):
"""This function is used to display the scores from the Scikit-Learn K-fold cross validation process."""
print("Scores: ")
for i in scores:
print(i)
print("Mean: ", scores.mean())
print("Standard deviation: ", scores.std())
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
print("Linear Regression model: ")
display_scores(lin_rmse_scores)
tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)
print("Decision Tree model: ")
display_scores(tree_rmse_scores)
# RandomForestRegressor model:
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, forest_housing_predictions)
forest_rmse = np.sqrt(forest_mse)
print("Random Forest model root mean squared error: ", forest_rmse)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
print("Random forest model: ")
display_scores(forest_rmse_scores)
# -------------------------------------------------------------------
# Fine-tune the learning model
# -------------------------------------------------------------------
# Fiddling with the model's hyperparameters manually is very tedious work...
# Instead, we use Scikit-Learn's GridSearchCV to search the hyperparameter space:
# GridSearchCV is told which hyperparameters to experiment with and which values to try,
# and it then evaluates every possible combination of those values using cross-validation.
# This section of code runs GridSearchCV for the RandomForestRegressor model
param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]
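# The first dict tries 3 x 4 = 12 combinations, the second 2 x 3 = 6, so GridSearchCV
# evaluates 18 combinations in total; with cv=5 that is 18 x 5 = 90 training rounds.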
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
print("Best hyperparameters: ", grid_search.best_params_)
# Obtain the best estimator directly:
print(grid_search.best_estimator_)
# Result:
# RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
# max_features=6, max_leaf_nodes=None, min_impurity_split=1e-07,
# min_samples_leaf=1, min_samples_split=2,
# min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
# oob_score=False, random_state=None, verbose=0, warm_start=False)
# Obtain the evaluation scores for the CV methods:
cv_res = grid_search.cv_results_
for mean_score, params in zip(cv_res["mean_test_score"], cv_res["params"]):
print(np.sqrt(-mean_score), params)
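# When the hyperparameter space is large, RandomizedSearchCV is a cheaper alternative
# to an exhaustive grid search: it samples a fixed number of random combinations.
# A minimal sketch (the distributions below are illustrative assumptions, not values
# used elsewhere in this script):
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint
# param_dist = {'n_estimators': randint(3, 50), 'max_features': randint(2, 8)}
# rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_dist, n_iter=10,
#                                 cv=5, scoring='neg_mean_squared_error', random_state=42)
# rnd_search.fit(housing_prepared, housing_labels)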
# Look under the hood: observe the relative importance of each feature/attribute to help improve the accuracy of predictions
# Extract the weights of each of the features
feature_importance = grid_search.best_estimator_.feature_importances_
# Create a list of all of the features/attributes
extra_attribs = ["rooms_per_household", "population_per_household", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted_features = sorted(zip(feature_importance, attributes), reverse=True)  # sort by importance, highest first
# Print the features/attributes alongside their weights
# print("Feature importance", " ", "Attribute")
# for attrib in sorted_features:
# print(attrib)
# -------------------------------------------------------------------
# Evaluate the system on the test-set
# -------------------------------------------------------------------
# Establish the final model!
final_model = grid_search.best_estimator_
x_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
x_test_prepared = full_pipeline.transform(x_test)
final_predictions = final_model.predict(x_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
percent_improvement = ((forest_rmse - final_rmse)/forest_rmse)*100
print("-------------------------------------------------------------------")
print("Root mean squared error of final model: ", final_rmse)
print("Percentage improement in RMSE: ", percent_improvement, "%")
print("-------------------------------------------------------------------")