-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathBaselineRecommendations.py
166 lines (125 loc) · 6.47 KB
/
BaselineRecommendations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import math
import itertools
import time
from MatrixOperations import convert_coo_to_csc_and_csr
from scipy import sparse
class BaselineRecommendations:
    """Baseline rating predictor: global mean + movie bias + user bias.

    The training and test ratings are sparse matrices whose rows are indexed
    by movie and whose columns are indexed by user.  A rating is predicted as
    ``global_mean + movie_centered[movie] + user_centered[user]`` and the
    quality of the predictor is reported as the RMSE over the test matrix.

    The code runs unchanged on Python 2.7 and Python 3.
    """

    def __init__(self, dataset):
        """Load the training/test matrices of ``dataset`` and reset all state.

        :param dataset: dataset name; the matrices are read from
            ``matrices/<dataset>_training.npz`` and
            ``matrices/<dataset>_test.npz``.
        """
        # Load the sparse matrices from their files
        self.training_filepath = 'matrices/{}_training.npz'.format(dataset)
        self.testing_filepath = 'matrices/{}_test.npz'.format(dataset)
        self.training_matrix_coo = self.load_sparse_matrix(self.training_filepath)
        self.test_matrix_coo = self.load_sparse_matrix(self.testing_filepath)
        # CSR/CSC views are only built by run_baseline()
        self.training_matrix_csr = None
        self.test_matrix_csr = None
        self.training_matrix_csc = None
        self.test_matrix_csc = None
        # (movie, user) -> predicted rating for every test entry
        self.baseline_rating = {}
        # index -> (mean rating of that movie/user) - global mean
        self.movie_centered = {}
        self.user_centered = {}
        self.global_mean = 0.0

    def load_sparse_matrix(self, file_name):
        """Return the sparse matrix stored in the ``.npz`` file ``file_name``."""
        return sparse.load_npz(file_name)

    def calculate_baseline_RMSE(self):
        """Predict every test rating and return the RMSE of the predictions.

        Each prediction is also stored in ``self.baseline_rating`` under the
        key ``(movie, user)``.  Requires ``global_mean``, ``movie_centered``
        and ``user_centered`` to be populated first.  Assumes the test matrix
        holds at least one rating.
        """
        test = self.test_matrix_coo
        # izip keeps the iteration lazy on Python 2; Python 3's zip already is
        lazy_zip = getattr(itertools, 'izip', zip)
        summed_error = 0.0
        # Loop through each entry in the test dataset
        for movie, user, true_rating in lazy_zip(test.row, test.col, test.data):
            # A movie/user without a known bias falls back to the global mean
            movie_baseline = self.movie_centered.get(movie, 0.0)
            user_baseline = self.user_centered.get(user, 0.0)
            estimated_rating = movie_baseline + user_baseline + self.global_mean
            self.baseline_rating[(movie, user)] = estimated_rating
            # Accumulate the squared error between prediction and truth
            summed_error += self.calculate_error_test(estimated_rating, true_rating)
        # Mean of the squared errors over all test entries, then the root
        return math.sqrt(float(summed_error) / test.nnz)

    def calculate_error_test(self, estimated_rating, true_rating):
        """Return the squared error between a prediction and the true rating."""
        return math.pow(true_rating - estimated_rating, 2)

    def calculate_global_baseline_rating(self):
        """Set ``self.global_mean`` to the mean of all stored training ratings.

        Assumes the training matrix holds at least one rating.
        """
        ratings = self.training_matrix_coo
        # One vectorised float64 sum instead of a Python-level loop; this is
        # faster and cannot overflow a narrow integer rating dtype.
        total = float(ratings.data.sum(dtype=float))
        self.global_mean = total / ratings.nnz

    def _store_centered_means(self, sums, counts, centered):
        """Fill ``centered`` with ``index -> mean - global_mean``.

        :param sums: per-index rating sums (row or column vector).
        :param counts: per-index number of stored ratings.
        :param centered: dictionary updated in place.

        Every index, including 0, receives an entry.  An index whose ratings
        sum to zero is treated as unrated and gets a zero offset.
        """
        # .flat yields scalars whether the sums arrive as a row or a column
        flat_sums = sums.flat
        for index, count in enumerate(counts):
            total = flat_sums[index]
            if total != 0:
                centered[index] = float(total) / float(count) - self.global_mean
            else:
                centered[index] = 0.0

    def calculate_relative_mean_movie_rating(self):
        """Populate ``self.movie_centered`` from the CSR training matrix."""
        # Sum and count of the ratings of each movie (one movie per row)
        movie_sums = self.training_matrix_csr.sum(axis=1)
        movie_rating_counts = self.training_matrix_csr.getnnz(axis=1)
        self._store_centered_means(movie_sums, movie_rating_counts, self.movie_centered)

    def calculate_mean_user_rating(self):
        """Populate ``self.user_centered`` from the CSC training matrix."""
        # Sum and count of the ratings of each user (one user per column)
        user_sums = self.training_matrix_csc.sum(axis=0)
        user_rating_counts = self.training_matrix_csc.getnnz(axis=0)
        self._store_centered_means(user_sums, user_rating_counts, self.user_centered)

    def _timed(self, label, step):
        """Run ``step()``, print ``label`` followed by the elapsed seconds,
        and return whatever ``step`` returned."""
        start = time.time()
        result = step()
        end = time.time()
        print(label + str((end - start)))
        return result

    def calculate_baseline_error(self):
        """Run every stage of the baseline, timing each, and return the RMSE."""
        self._timed("Time to calculate global movie mean: ",
                    self.calculate_global_baseline_rating)
        self._timed("Time to calculate mean movie ratings: ",
                    self.calculate_relative_mean_movie_rating)
        self._timed("Time to calculate mean user ratings: ",
                    self.calculate_mean_user_rating)
        return self._timed("Time to calculate RMSE: ", self.calculate_baseline_RMSE)

    def run_baseline(self):
        """Build the CSC/CSR forms of both matrices, then compute and print
        the baseline RMSE."""
        self.training_matrix_csc, self.training_matrix_csr = convert_coo_to_csc_and_csr(self.training_matrix_coo)
        self.test_matrix_csc, self.test_matrix_csr = convert_coo_to_csc_and_csr(self.test_matrix_coo)
        print("Finished converting to csc and csr")
        rmse = self.calculate_baseline_error()
        print("RMSE Baseline: " + str(rmse))
if __name__ == '__main__':
    # Run the baseline estimate on each dataset in turn.  The constructor
    # takes the dataset NAME and loads 'matrices/<name>_training.npz' and
    # 'matrices/<name>_test.npz' itself, so nothing is loaded here.
    for dataset in ('random', 'arbitrary'):
        label = dataset.capitalize()
        start_time = time.time()
        print("Running Baseline Estimate on {} Dataset".format(label))
        baseline = BaselineRecommendations(dataset)
        baseline.run_baseline()
        # The elapsed time covers loading, conversion and evaluation
        print("Baseline Estimate on {} Dataset done in {} seconds".format(label, time.time() - start_time))