-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathdataset.py
More file actions
81 lines (68 loc) · 2.07 KB
/
dataset.py
File metadata and controls
81 lines (68 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from multiprocessing import *
import praw
from praw.handlers import MultiprocessHandler
from mongo import *
from pymongo import *
import datetime
import sys
import requests
def getSubredditUsers(subreddit):
    """
    Queue the not-yet-known commenters of a subreddit for processing.

    Fetches up to 250 recent comments from the given subreddit and passes
    every author that is not already returned by allUsers() to
    tempBulkInsert() as a {'user': name} record. Each author is queued at
    most once per call.

    subreddit -- subreddit name handed to reddit.get_subreddit().

    Returns whatever tempBulkInsert() returns.
    """
    client = MongoClient()
    reddit = praw.Reddit(user_agent="kNN Subreddit Recommendation Engine", handler=MultiprocessHandler())
    comments = reddit.get_subreddit(subreddit).get_comments(limit=250)

    # Names that must not be queued again: users already stored, plus (as the
    # loop runs) authors queued earlier in this same call.
    currentUsers = allUsers(client)
    if currentUsers:
        seen = set(user['username'] for user in currentUsers)
    else:
        seen = set()

    users = []
    for comment in comments:
        author = comment.author
        # Comments whose account was deleted carry no author object
        # (PRAW reports it as None); there is nobody to queue for those.
        if author is None:
            continue
        name = author.name
        if name in seen:
            continue
        seen.add(name)
        users.append({'user': name})
    return tempBulkInsert(users, client)
def getComments(username):
    """
    Record the subreddits a user has commented in.

    Walks up to 250 of the user's comments, calls insertSub() once for each
    distinct subreddit seen, and finally stores the user together with the
    list of subreddit names via insertUser().

    username -- reddit account name handed to reddit.get_redditor().

    Returns the result of insertUser(), or None when reddit answered with an
    HTTP error (the error is printed and the user is skipped).
    """
    client = MongoClient()
    try:
        reddit = praw.Reddit(user_agent="kNN Subreddit Recommendation Engine", handler=MultiprocessHandler())
        user = reddit.get_redditor(username)
        subs = []
        for comment in user.get_comments(limit=250):
            name = comment.subreddit.display_name
            if name not in subs:
                subs.append(name)
                insertSub(name, client)
        return insertUser(username, subs, client)
    except requests.exceptions.HTTPError as e:
        # Best effort: one failing user must not abort the whole pool.map run.
        print(e)
        return None
#def updateSubs():
def getSubreddits():
    """
    Return the list of subreddit names whose commenters should be crawled.

    For now this is just the single aggregate listing 'all'.
    """
    # TODO (from the original notes): eventually get all subreddits with over
    # 10,000 users and go from there, e.g.
    #   reddit = praw.Reddit(user_agent="kNN Subreddit Recommendation", handler=MultiprocessHandler())
    #   return subreddits(MongoClient())
    targets = ['all']
    return targets
def cron(user):
    """
    Refresh a stored user's subreddit list if it is at least a day old.

    user -- mapping with an 'updated' datetime and a 'username' string.
            NOTE(review): 'updated' is compared against utcnow(), so it is
            presumably a naive UTC timestamp -- confirm against insertUser().

    Returns the result of getComments() when a refresh was performed,
    otherwise None.
    """
    age = abs(datetime.datetime.utcnow() - user['updated'])
    if age.days >= 1:
        # The original passed an undefined name here; the account name lives
        # on the record itself.
        return getComments(user['username'])
    return None
def main():
    """
    Run the full crawl: collect new commenters, then index each one.

    A worker pool first maps getSubredditUsers() over the subreddits from
    getSubreddits(), then maps getComments() over every user queued in the
    temporary user list. Ctrl-C terminates the workers and exits.
    """
    # Create the pool before entering the try block so the interrupt handler
    # below can always rely on `pool` being bound.
    pool = Pool(processes=(cpu_count() * 6))
    try:
        subs = getSubreddits()
        pool.map(getSubredditUsers, subs)
        users = [user['user'] for user in tempUserList(MongoClient())]
        pool.map(getComments, users)
        pool.close()
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit()
    # Wait for the worker processes to exit after a normal run.
    pool.join()
# Script entry point. Original author's note: this is a test run and is
# expected to fail.
if __name__ == "__main__":
    main()