-
Notifications
You must be signed in to change notification settings - Fork 350
/
Copy pathprocess_amazon.py
106 lines (93 loc) · 3.55 KB
/
process_amazon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import random
import pandas as pd
print('Start reading data...')
title = ['UserID', 'BookID', 'Time']

# Both interaction files are headerless, comma-separated and parsed the same
# way, so the parser options are spelled out once and reused.
read_options = dict(
    sep=',',
    header=None,
    names=title,
    engine='python',
    encoding='ISO-8859-1',
)

print('Reading train data...')
train = pd.read_table('AmazonBooksData/book_train.txt', **read_options)

print('Reading test data...')
test = pd.read_table('AmazonBooksData/book_test.txt', **read_options)
# 1~367982 is the range of book id
MAX_BOOK_ID = 367982
# set the max sequence length to 50
MAX_HIST_LEN = 50
# for each positive sample, random generate 4 negative samples
NEG_PER_POS = 4


def _sample_negative(seen, max_book_id=MAX_BOOK_ID):
    """Draw a random book id in [1, max_book_id] that is not in *seen*.

    Draws are repeated until one falls outside *seen*, so *seen* must not
    cover the whole id range or this never returns.
    """
    while True:
        candidate = random.randint(1, max_book_id)
        if candidate not in seen:
            return candidate


def _build_samples(interactions,
                   max_hist_len=MAX_HIST_LEN,
                   neg_per_pos=NEG_PER_POS,
                   max_book_id=MAX_BOOK_ID):
    """Turn per-user interaction rows into labelled next-item samples.

    For every user and every position ``i >= 1`` in that user's book list,
    emit one positive tuple ``(user, history, book, 1)`` followed by
    ``neg_per_pos`` negative tuples ``(user, history, random_book, 0)``.
    ``history`` is the '|'-joined ids of the (at most ``max_hist_len``)
    books that precede position ``i``.

    The book order within a user is the row order of *interactions*; the
    'Time' column is not consulted here.
    NOTE(review): this presumes the input files are already time-ordered per
    user - confirm against the data source.

    Args:
        interactions: DataFrame with at least 'UserID' and 'BookID' columns.
        max_hist_len: Maximum number of preceding books kept in the history.
        neg_per_pos: Number of negative samples emitted per positive sample.
        max_book_id: Upper bound (inclusive) of the book id range to sample.

    Returns:
        A list of ``(user_id, history_str, book_id, label)`` tuples, grouped
        by user and not shuffled.
    """
    samples = []
    for user_id, group in interactions.groupby('UserID'):
        pos_list = group['BookID'].tolist()
        # Set membership keeps rejection sampling O(1) per draw instead of
        # scanning the user's whole list for every candidate.
        seen = set(pos_list)
        # Position 0 has no preceding history, so samples start at 1.
        for i in range(1, len(pos_list)):
            history = pos_list[:i][-max_hist_len:]
            hist_str = '|'.join(map(str, history))
            samples.append((user_id, hist_str, pos_list[i], 1))
            for _ in range(neg_per_pos):
                negative = _sample_negative(seen, max_book_id)
                samples.append((user_id, hist_str, negative, 0))
    return samples


print('Start processing train data...')
train_set = _build_samples(train)
random.shuffle(train_set)

print('Start processing test data...')
test_set = _build_samples(test)
random.shuffle(test_set)
# Materialise both sample lists as frames before any file is touched.
train_set_df = pd.DataFrame(train_set)
test_set_df = pd.DataFrame(test_set)

# Tab-separated, no index column, no header row. mode='a' appends, so an
# output file left over from an earlier run is extended rather than replaced.
tsv_options = dict(index=False, sep='\t', mode='a', header=False)

print('Start writing amazon_train_data...')
train_set_df.to_csv(r'amazon_train_data', **tsv_options)

print('Start writing amazon_test_data...')
test_set_df.to_csv(r'amazon_test_data', **tsv_options)
print('Negative Sampling')

# Distinct book ids seen in each split, then de-duplicated across splits.
per_split_books = [
    frame[['BookID']].drop_duplicates() for frame in (train, test)
]
negative_book = pd.concat(per_split_books).drop_duplicates()

# Constant column of ones, aligned on the same index and column label as the
# id frame, which becomes the weight column of the output table.
unit_weights = pd.DataFrame(
    1, index=negative_book.index, columns=negative_book.columns)

# Output layout is (id, weight, feature); the book id is reused as the feature.
negative_book_data = pd.concat(
    [negative_book, unit_weights, negative_book], axis=1)

# The header strings replace the three column labels in the written file.
negative_book_data.to_csv(
    r'negative_book_data',
    index=False,
    sep='\t',
    mode='a',
    header=['id:int64', 'weight:float', 'feature:string'],
)
print('Done.')