-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathcalc_categories.py
More file actions
111 lines (103 loc) · 4.95 KB
/
calc_categories.py
File metadata and controls
111 lines (103 loc) · 4.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
__author__ = 'Bryan Gregory'
__email__ = 'bryan.gregory1@gmail.com'
__date__ = '08-18-2013'
'''
Extract categories and calculate averages, then dump it to a csv for import into the main analysis
'''
import numpy as np
import pandas as pd
from datetime import datetime
def calc_categories(dfTrn_All, top_n=421, file_path="Data/top_categories.csv"):
    """Write per-category review counts and average review stars to a CSV.

    Every category occurrence found in ``dfTrn_All['bus_categories']`` is
    counted, the ``top_n`` most frequent categories are kept, and for each
    kept category the ``rev_stars`` of every row that lists it are averaged.

    Args:
        dfTrn_All: DataFrame with a ``bus_categories`` column whose cells are
            iterables of category names, and a ``rev_stars`` column holding
            the star rating of the same row.
        top_n: Number of most frequent categories to keep. The default of 421
            is the original hard-coded cutoff (the original author's note
            describes it as a 15-review minimum for their training set).
        file_path: Destination CSV path.

    Returns:
        None. The result is written to ``file_path`` with the category name
        as the unnamed index and the columns ``cat_count`` and
        ``cat_avg_stars``.
    """
    # Pull both columns out once. Pairing them positionally below keeps each
    # row's stars attached to its own categories even when the frame's index
    # is not the default 0..n-1 range.
    category_lists = dfTrn_All['bus_categories'].tolist()
    review_stars = dfTrn_All['rev_stars'].tolist()

    # Flatten the nested category lists and count every category occurrence.
    # dtype=object keeps an empty input from triggering dtype inference.
    all_categories = [cat for cats in category_lists for cat in cats]
    counts = pd.Series(all_categories, dtype=object).value_counts().head(top_n)

    # Accumulate stars in a plain dict: O(1) membership tests, and no chained
    # DataFrame assignment (which may write to a temporary copy).
    star_totals = dict.fromkeys(counts.index, 0.0)
    for cats, stars in zip(category_lists, review_stars):
        for cat in cats:
            if cat in star_totals:
                star_totals[cat] += stars

    dfTopCats = pd.DataFrame({'cat_count': counts})
    # Keep the index unnamed so the CSV header's first cell stays empty.
    dfTopCats.index.name = None

    # Divide in float64: float16 cannot represent every integer above 2048,
    # which would silently skew the averages of popular categories.
    totals = np.array([star_totals[cat] for cat in dfTopCats.index],
                      dtype=np.float64)
    dfTopCats['cat_avg_stars'] = (
        totals / dfTopCats['cat_count'].values.astype(np.float64)
    )

    dfTopCats.to_csv(file_path)
    return
#--Alternative method: derive category averages from each business's overall star average (a population mean, so it should be better than the sample mean used above)--#
def calc_categories_using_bus_avg(dfAllBus,
                                  file_path="Data/top_categories_bus_avg.csv"):
    """Write per-category business counts and average business stars to a CSV.

    Only businesses with ``bus_review_count`` greater than 3 and
    ``bus_stars`` greater than 0 are considered. Each remaining business adds
    its ``bus_stars`` once to every category it lists. Categories listed by
    fewer than 3 of those businesses are dropped from the output.

    Args:
        dfAllBus: DataFrame with the columns ``bus_review_count``,
            ``bus_stars`` and ``bus_categories`` (cells are iterables of
            category names).
        file_path: Destination CSV path.

    Returns:
        None. The result is written to ``file_path`` with ``category`` as the
        index and the columns ``cat_tot_count`` and ``cat_avg_stars``, in
        order of each category's first appearance.
    """
    # Keep businesses with more than 3 reviews and a positive star rating.
    kept = dfAllBus[dfAllBus['bus_review_count'] > 3]
    kept = kept[kept['bus_stars'] > 0]

    # Accumulate per-category totals in plain dicts. Dicts preserve insertion
    # order, so categories stay in first-seen order, and this avoids chained
    # DataFrame assignment (which may write to a temporary copy).
    star_totals = {}
    bus_counts = {}
    category_lists = kept['bus_categories'].tolist()
    bus_stars = kept['bus_stars'].tolist()
    for cats, stars in zip(category_lists, bus_stars):
        for cat in cats:
            star_totals[cat] = star_totals.get(cat, 0.0) + stars
            bus_counts[cat] = bus_counts.get(cat, 0.0) + 1

    categories = list(bus_counts)
    # float64 throughout: float16 cannot represent every integer above 2048,
    # which would silently skew the averages of common categories.
    count_values = np.array([bus_counts[cat] for cat in categories],
                            dtype=np.float64)
    total_values = np.array([star_totals[cat] for cat in categories],
                            dtype=np.float64)
    dfTopCats = pd.DataFrame(
        {
            'cat_tot_count': count_values,
            'cat_avg_stars': total_values / count_values,
        },
        index=pd.Index(categories, name='category'),
        columns=['cat_tot_count', 'cat_avg_stars'],
    )

    # Drop categories backed by fewer than 3 businesses.
    dfTopCats = dfTopCats[dfTopCats['cat_tot_count'] > 2]

    dfTopCats.to_csv(file_path)
    return
#--Latest method: treat all of a business's categories together as one grouped category when creating the feature (so [Restaurants, Mexican] is one category, not two as in the methods above)--#
def calc_group_categories(dfAllBus, file_path="Data/top_categories_grouped.csv"):
    """Write grouped-category business counts and average stars to a CSV.

    Each distinct value of ``bus_categories`` is treated as one category.
    Only businesses with ``bus_review_count`` greater than 4 and
    ``bus_stars`` greater than 0 are considered.

    Args:
        dfAllBus: DataFrame with the columns ``bus_review_count``,
            ``bus_stars`` and ``bus_categories``. The category cells must be
            hashable and support ``len()``; presumably one string per
            business -- TODO confirm against the caller.
        file_path: Destination CSV path.

    Returns:
        None. The result is written to ``file_path`` with the grouped
        category as the unnamed index and the columns ``cat_counts`` and
        ``cat_avg_stars``. Empty category values are counted but contribute
        no stars, so their average is 0.0.
    """
    # Keep businesses with at least 5 reviews and a positive star rating.
    kept = dfAllBus[dfAllBus['bus_review_count'] > 4]
    kept = kept[kept['bus_stars'] > 0]

    # One row per distinct grouped category, most frequent first.
    counts = kept['bus_categories'].value_counts()

    # Accumulate stars in a plain dict instead of chained DataFrame
    # assignment, which may write to a temporary copy.
    star_totals = dict.fromkeys(counts.index, 0.0)
    grouped_cats = kept['bus_categories'].tolist()
    bus_stars = kept['bus_stars'].tolist()
    for cat, stars in zip(grouped_cats, bus_stars):
        if len(cat) > 0:
            star_totals[cat] += stars

    dfTopCats = pd.DataFrame({'cat_counts': counts})
    # Newer pandas names the value_counts index after the source column;
    # clear it so the CSV header's first cell stays empty.
    dfTopCats.index.name = None

    # Divide in float64: float16 cannot represent every integer above 2048,
    # which would silently skew the averages of common categories.
    totals = np.array([star_totals[cat] for cat in dfTopCats.index],
                      dtype=np.float64)
    dfTopCats['cat_avg_stars'] = (
        totals / dfTopCats['cat_counts'].values.astype(np.float64)
    )

    dfTopCats.to_csv(file_path)