-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLocalSensitivityHashing.py
More file actions
99 lines (82 loc) · 3.8 KB
/
LocalSensitivityHashing.py
File metadata and controls
99 lines (82 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from turtle import pos
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest
from utils import clean_tweets
def preprocess(text):
    """Normalize raw text into lowercase word tokens ("shingles").

    Strips every character that is neither a word character nor
    whitespace, lowercases the remainder, and splits on whitespace.
    Returns a list of tokens (possibly empty).
    """
    stripped = re.sub(r'[^\w\s]', '', text)
    return stripped.lower().split()
# Load the tweet corpus and pre-clean the raw text once at import time.
# NOTE(review): relies on a 'text' column being present in tweets.csv.
db = pd.read_csv('tweets.csv')
# clean_tweets is already a one-argument callable, so pass it to apply
# directly instead of wrapping it in a redundant lambda.
db['cleaned_tweets'] = db['text'].apply(clean_tweets)
def get_forest(df, perms):
    """Build an indexed MinHash LSH forest over df['cleaned_tweets'].

    Each tweet is tokenized with preprocess(), hashed into a MinHash of
    `perms` permutations, and inserted into the forest keyed by its
    positional index (matching df.iloc). The forest is indexed before
    being returned so it is immediately queryable.
    """
    t0 = time.time()
    forest = MinHashLSHForest(num_perm=perms)
    # Hash and insert each tweet in one pass; insertion order (and thus
    # the integer keys) matches the row order of the dataframe.
    for row_idx, tweet in enumerate(df['cleaned_tweets']):
        mh = MinHash(num_perm=perms)
        for token in preprocess(tweet):
            mh.update(token.encode('utf8'))
        forest.add(row_idx, mh)
    # An un-indexed forest cannot be queried.
    forest.index()
    print('It took %s seconds to build forest.' %(time.time()-t0))
    return forest
def predict(text, df, permutations, num_results, forest):
    """Return the sentiments of the tweets most similar to `text`.

    MinHashes the query text with the same number of permutations used
    to build `forest`, asks the forest for up to `num_results` nearest
    neighbours, and returns the matching rows' 'sentiment' column as a
    pandas Series. Returns None when the query yields no matches.
    """
    query_hash = MinHash(num_perm=permutations)
    for token in preprocess(text):
        query_hash.update(token.encode('utf8'))
    neighbour_ids = np.array(forest.query(query_hash, num_results))
    # An empty hit list means nothing to look up in the dataframe.
    if len(neighbour_ids) == 0:
        return None
    return df.iloc[neighbour_ids]['sentiment']
def print_results(keyword):
    """Estimate sentiment probabilities for `keyword` via LSH neighbours.

    Builds the forest over the module-level `db`, retrieves the nearest
    tweets to `keyword`, and returns the fraction of neighbours labelled
    positive, negative, and neutral, in that order:
    [p_positive, p_negative, p_neutral].

    Returns None when the forest query produces no matches (the original
    crashed with TypeError on len(None) in that case).
    """
    permutations = 128
    # Number of nearest neighbours used to estimate the distribution.
    num_recommendations = 10
    forest = get_forest(db, permutations)
    # Query once: predict is deterministic for a fixed forest, so the
    # original's four identical calls returned the same Series each time.
    sentiments = predict(keyword, db, permutations, num_recommendations, forest)
    if sentiments is None:
        return None
    total = len(sentiments)
    # value_counts tallies all three labels in a single pass; missing
    # labels default to 0. Returns plain floats instead of the original's
    # one-element Series (numerically equivalent in arithmetic contexts).
    counts = sentiments.value_counts()
    return [counts.get('positive', 0) / total,
            counts.get('negative', 0) / total,
            counts.get('neutral', 0) / total]