house_pricing_prediction/transform_full.py at master · strachan/house_pricing_prediction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import numpy as np
from scipy import stats
import pandas as pd


# Get the original dataframe and transform the features
# according to the rules defined
def transform_dataset(house_prices):

	y = np.log(house_prices['SalePrice'])
	x = house_prices.drop(columns=['SalePrice'])

	categorical_columns = ['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig',
							'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
							'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
							'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure',
							'BsmtFinType1', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
							'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'PavedDrive',
							'SaleType', 'SaleCondition', 'Utilities']

	x[categorical_columns] = x[categorical_columns].astype(str)

	continuous_columns = x.drop(columns=categorical_columns).columns

	for column in continuous_columns:
		k, p = stats.normaltest(x[column])
		skewness = stats.skew(x[column])
		if p < 0.05 and skewness > .70:
			x[column] = np.log(x[column] + 1)
		x[column + 'Square'] = x[column]**2
		x[column + 'Cubic'] = x[column]**3

	x = pd.get_dummies(x, drop_first=True)

	x['LotGrBsmtArea'] = x['GrLivArea'] * x['LotArea'] * x['TotalBsmtSF']
	x['Q406'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2006))
	x['Q407'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2007))
	x['Q408'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2008))
	x['Q409'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2009))
	x['Q410'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2010))

	return x, y


# Get the original dataframe and transform the features
# according to the rules defined
def transform_dataset_test(house_prices):

	x = house_prices

	categorical_columns = ['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig',
							'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
							'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
							'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure',
							'BsmtFinType1', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
							'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'PavedDrive',
							'SaleType', 'SaleCondition', 'Utilities']

	x[categorical_columns] = x[categorical_columns].astype(str)

	continuous_columns = x.drop(columns=categorical_columns).columns

	for column in continuous_columns:
		k, p = stats.normaltest(x[column])
		skewness = stats.skew(x[column])
		if p < 0.05 and skewness > .70:
			x[column] = np.log(x[column] + 1)
		x[column + 'Square'] = x[column]**2
		x[column + 'Cubic'] = x[column]**3

	x['LotGrBsmtArea'] = x['GrLivArea'] * x['LotArea'] * x['TotalBsmtSF']
	x['Q406'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2006))
	x['Q407'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2007))
	x['Q408'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2008))
	x['Q409'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2009))
	x['Q410'] = ((x['MoSold'] == 10) | (x['MoSold'] == 11) | (x['MoSold'] == 12) & (x['YrSold'] == 2010))

	x = pd.get_dummies(x, drop_first=True)

	return x