ecomplus-ml-products/recommend-products.py at master · ecomclub/ecomplus-ml-products · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas as pd
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import pandas.plotting
from surprise import SVD, Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import cross_validate
import csv
import time
import sys

# -- start timing the whole run

initial = time.time()

# -- validate the command line: argv[1] is the input csv, argv[2] the output csv

if len(sys.argv) < 3:
    sys.exit("usage: {} <input_csv> <output_csv>".format(sys.argv[0]))

# -- open the input csv for analysis

df = pd.read_csv(sys.argv[1], encoding='iso-8859-1')

# -- filter the data: discard every row whose resource_type is greater than 1

df.drop(df[df.resource_type > 1].index, inplace=True)

# -- categorization of data

def categoriza(s):
    """Return the numeric age-range id for the age ``s``.

    Inclusive ranges and their ids:

        <= 14 -> 1     15-18 -> 2     19-21 -> 3     22-26 -> 5
        31-35 -> 6     36-40 -> 7     41-50 -> 8     (50, 60] -> 9
        61-80 -> 10    >= 81 -> 11

    Returns ``None`` when ``s`` falls in none of the ranges above: ages
    27-30, fractional ages lying between two ranges, and NaN.
    """
    if s <= 14:
        return 1
    if 15 <= s <= 18:
        return 2
    if 19 <= s <= 21:
        return 3
    # NOTE(review): id 4 is never produced. The original code guarded it with
    # a second, unreachable copy of the 15-18 test, and ages 27-30 have no id
    # at all. Presumably one of the ranges was mistyped -- confirm the
    # intended mapping against profile.csv before assigning these ages.
    if 22 <= s <= 26:
        return 5
    if 31 <= s <= 35:
        return 6
    if 36 <= s <= 40:
        return 7
    if 41 <= s <= 50:
        return 8
    # 50 itself is already claimed by the previous range.
    if 50 < s <= 60:
        return 9
    if 61 <= s <= 80:
        return 10
    if s >= 81:
        return 11
    return None

# -- assign the age-range id to every row

df['age_range'] = df['age'].apply(categoriza)

# -- read the profile dictionary

df_profile = pd.read_csv('profile.csv')

# -- merge the dictionary with the filtered data
# -- (no `on=` is given, so pandas joins on the columns both frames share)

df = pd.merge(df_profile, df)

# -- drop unnecessary columns (one pass instead of one call per column)

df.drop(
    [
        'browser',
        'date_time',
        'device_type',
        'os',
        'country',
        'utm_campaign',
        'utm_medium',
        'utm_source',
        'utm_term',
        'ip_addr',
        'region',
        'age_range',
        'gender',
        'age',
        'resource_type',
    ],
    axis=1,
    inplace=True,
)

# -- set the rating to 1 (purchased) on every row

df['rating'] = 1

# -- distinct store ids; recommendations are computed per store

list_store = df.store_id.unique()

# -- output csv
# newline='' hands line-ending control to the csv module, as its
# documentation requires; without it some platforms emit a blank row after
# every record.

with open(sys.argv[2], 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # -- 1 bought and 0 not purchased
    reader = Reader(rating_scale=(0, 1))

    # -- maximum number of products written per profile
    n = 100

    # -- beginning of machine learning: one independent model per store
    for store_id in list_store:
        print(store_id)

        # -- rows belonging to this store only
        df_result = df[df['store_id'] == store_id]

        # -- load the filtered dataframe as (user, item, rating) triples
        data = Dataset.load_from_df(
            df_result[['id_profile', 'resource_id', 'rating']],
            reader=reader,
        )

        # -- item-based nearest-neighbours algorithm; cross-validation
        # -- metrics are only printed (verbose), the result is not kept
        algo_kNN = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
        cross_validate(algo_kNN, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

        # -- matrix factorization algorithm, evaluated the same way
        algo_SVD = SVD()
        cross_validate(algo_SVD, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

        # -- fit a fresh SVD on all ratings, then predict the anti-testset
        # -- built from that trainset
        trainset = data.build_full_trainset()
        testset = trainset.build_anti_testset()
        algo = SVD()
        algo.fit(trainset)
        prediction = algo.test(testset)

        # -- group the (product, estimate) pairs by profile
        top_n = defaultdict(list)
        for uid, iid, _, est, _ in prediction:
            top_n[uid].append((iid, est))

        # -- write, per profile, the n products with the highest estimate
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            writer.writerow([store_id, uid, [iid for (iid, _) in user_ratings[:n]]])

# -- report the total wall-clock time since `initial` was taken
endtime = time.time()
elapsed = endtime - initial
print("execution time: ", elapsed)