-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecommend-products.py
More file actions
159 lines (114 loc) · 3.81 KB
/
recommend-products.py
File metadata and controls
159 lines (114 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas as pd
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import pandas.plotting
from surprise import SVD, Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import cross_validate
import csv
import time
import sys
# -- start the wall-clock timer used for the execution-time report
initial = time.time()
# -- load the CSV named by the first command-line argument
df = pd.read_csv(sys.argv[1], encoding='iso-8859-1')
# -- filter data: discard, in place, every row whose resource_type is above 1
rows_to_discard = df.index[df.resource_type > 1]
df.drop(index=rows_to_discard, inplace=True)
# -- categorization of data
def categoriza(s):
    """Return the age-bracket id (1-11) for the age ``s``.

    Brackets are contiguous; each one includes its upper bound:

        1: <=14    2: 15-18   3: 19-21   4: 22-26   5: 27-30   6: 31-35
        7: 36-40   8: 41-50   9: 51-60  10: 61-80  11: >=81

    A fractional age falls into the bracket whose upper bound is the first
    one not below it (e.g. 14.5 -> 2). A NaN age (missing value) yields
    ``None``.

    NOTE(review): the previous if/elif chain tested ``15 <= s <= 18`` twice,
    so id 4 could never be returned and ages 27-30 matched no branch at all
    (result ``None``). The table below assumes the intended brackets were
    4 = 22-26 and 5 = 27-30; confirm against the age_range ids used by the
    profile dictionary before relying on ids 4 and 5.
    """
    # NaN is the only value that is not equal to itself; every comparison
    # against it is False, so it must be handled before the bound scan.
    if s != s:
        return None
    # Inclusive upper bound of brackets 1..10; anything above the last bound
    # belongs to the final, open-ended bracket.
    upper_bounds = (14, 18, 21, 26, 30, 35, 40, 50, 60, 80)
    for category, upper in enumerate(upper_bounds, start=1):
        if s <= upper:
            return category
    return len(upper_bounds) + 1
# -- tag every row with the id of the age bracket its age falls into
df['age_range'] = df['age'].apply(categoriza)
# -- load the profile dictionary
df_profile = pd.read_csv('profile.csv')
# -- join the dictionary onto the filtered data; no key is given, so pandas
# -- joins on the columns the two frames have in common
df = pd.merge(df_profile, df)
# -- discard, in one pass, every column that is not used further down
unused_columns = [
    'browser',
    'date_time',
    'device_type',
    'os',
    'country',
    'utm_campaign',
    'utm_medium',
    'utm_source',
    'utm_term',
    'ip_addr',
    'region',
    'age_range',
    'gender',
    'age',
    'resource_type',
]
df.drop(columns=unused_columns, inplace=True)
# -- every remaining row counts as a purchase: rating 1
df['rating'] = 1
# -- distinct store ids present in the data
list_store = df.store_id.unique()
# -- output csv, written to the path given as the second command-line argument
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' emit a blank line after each row.
with open(sys.argv[2], 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # -- beginning of machine learning: each store is modelled on its own rows
    for store_id in list_store:
        print(store_id)
        # -- rows of this store only
        df_result = df[df['store_id'] == store_id]
        # -- 1 bought and 0 not purchased
        reader = Reader(rating_scale=(0, 1))
        # -- load the filtered dataframe as (user, item, rating) triples
        data = Dataset.load_from_df(
            df_result[['id_profile', 'resource_id', 'rating']],
            reader=reader,
        )
        # -- basic nearest-neighbours algorithm, cosine similarity with
        # -- user_based disabled; 5-fold RMSE/MAE report printed by verbose
        algo_kNN = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
        cross_validate(algo_kNN, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
        # -- matrix factorization algorithm, same 5-fold report
        algo_SVD = SVD()
        cross_validate(algo_SVD, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
        # -- final model: fit SVD on every rating of the store, then score
        # -- the anti-testset built from that same trainset
        trainset = data.build_full_trainset()
        testset = trainset.build_anti_testset()
        algo = SVD()
        algo.fit(trainset)
        prediction = algo.test(testset)
        # -- maximum number of products kept per user
        n = 100
        # -- group the (item, estimate) pairs by user
        top_n = defaultdict(list)
        for uid, iid, _r_ui, est, _details in prediction:
            top_n[uid].append((iid, est))
        # -- one output row per user: store id, user id, and the list of the
        # -- n item ids with the highest estimates, best first
        for uid, user_ratings in top_n.items():
            ranked = sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]
            writer.writerow([store_id, uid, [iid for iid, _ in ranked]])
# -- report the total wall-clock time since `initial` was taken
endtime = time.time()
print("execution time: ", endtime - initial)