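# getPageData2.py (from ronilp/Finding-Influencers-in-Social-Networks)
#
# Fetches each Facebook page's basic metadata (name, about, bio,
# category) and its ten most recent post messages through the Graph API
# (v2.3) using a pool of worker threads, stores the results in MongoDB,
# and finally writes each fetched document out to a text file.
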
from utilities import url, access_token, getAppendString
from database import getPageCollection, getPageDataCollection
from Queue import Queue
import threading
import time
import requests

class fetchingPageData(threading.Thread):
    # Worker thread: the Pythonic way to share work is to pull page ids
    # from a queue rather than coordinate through global variables.
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.collection = getPageDataCollection()

    def getdata(self, data, string):
        # Safe lookup: return data[string] if the key exists, else None.
        if string in data:
            return data[string]
        return None
    def run(self):
        while True:
            fbid = self.queue.get()
            try:
                # Graph API v2.3 call; the JSON response is shaped like
                # {'name': ..., 'about': ...,
                #  'posts': {'data': [{'message': ...}, ...]}}
                rurl = url + '/v2.3/' + fbid
                response = requests.get(rurl, params={
                    'access_token': access_token,
                    'fields': 'about,category,bio,name,posts.limit(10){message}',
                })
                data = response.json()
                if 'posts' not in data:
                    data['posts'] = {'data': []}
                document = {
                    '_id': fbid,
                    'name': self.getdata(data, 'name'),
                    'about': self.getdata(data, 'about'),
                    'category': self.getdata(data, 'category'),
                    'bio': self.getdata(data, 'bio'),
                    'posts': [self.getdata(msg, 'message') for msg in data['posts']['data']],
                }
                # getAppendString (from utilities) prepares the document
                # for storage before it is inserted.
                self.collection.insert(getAppendString(document))
            except Exception:
                # On any failure (rate limiting, connection errors, bad
                # JSON) re-queue the id, back off briefly, and log it.
                self.queue.put(fbid)
                time.sleep(2)
                print fbid
            self.queue.task_done()

def getPageData():
    pageCollection = getPageCollection()
    pageDataCollection = getPageDataCollection()
    queue = Queue()
    # Enqueue every page whose data has not been fetched yet, printing a
    # running count as progress output.
    index = 1
    for page in pageCollection.find():
        if not pageDataCollection.find_one({'_id': page['_id']}):
            queue.put(page['_id'])
        index += 1
        print index
    # Start 200 daemon workers; queue.join() blocks until every queued id
    # has been marked done, and the daemon flag lets the process exit
    # without joining the threads themselves.
    for i in range(200):
        t = fetchingPageData(queue)
        t.setDaemon(True)
        t.start()
    queue.join()
    # Dump each fetched document to a text file named after the page id
    # (assumes getAppendString produced a document with a 'data' field).
    for doc in pageDataCollection.find():
        f = open('data/' + doc['_id'] + '.txt', 'w')
        f.write(doc['data'])
        f.close()

if __name__ == '__main__':
    start = time.time()
    print "Working...."
    getPageData()
    end = time.time()
    # print end - start
    print 'Done'
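
# Usage (Python 2, as implied by the Queue module and print statements):
#   python getPageData2.py
#
# Assumed but not shown here: utilities.py supplies the Graph API base
# `url`, an `access_token`, and getAppendString(); database.py exposes
# the two pymongo collections; and a data/ directory already exists for
# the output files.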