-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path0_crawlReddit.py
More file actions
63 lines (51 loc) · 1.59 KB
/
Copy path0_crawlReddit.py
File metadata and controls
63 lines (51 loc) · 1.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import praw
import os
import json
import time
# NOTE(review): `number` and `sleep_time` are not referenced anywhere in the
# visible script; they are kept so nothing that relies on them breaks.
number = 100
sleep_time = 60

# NOTE(review): these credentials are committed to source control. They can be
# overridden through the environment; the literals remain only as a fallback so
# the script keeps working unchanged. Consider rotating them and dropping the
# fallback.
client_id = os.environ.get("REDDIT_CLIENT_ID", "sHtLL5NaCEJNuw")
secret = os.environ.get("REDDIT_CLIENT_SECRET", "_7lRXMZxnSKM1s-afRxPoC2JOnqOZA")
agent = "NER Knowledge Graph Project 0.1"
subreddit = "news"

# Output locations (trailing slash kept so paths match the original layout).
RAW_DIR = "data/raw/reddit/"
AGGREGATE_DIR = "data_json_new/"

# Placeholder stored when a submission has no author object.
DELETED_AUTHOR = "[deleted]"


def submission_to_entry(submission):
    """Build the JSON-serializable record for one submission.

    Args:
        submission: Object exposing ``id``, ``title``, ``ups``,
            ``upvote_ratio``, ``created_utc``, ``author``, ``domain``,
            ``url`` and ``permalink`` attributes.

    Returns:
        A dict with the keys ``id``, ``title``, ``ups``, ``upvote_ratio``,
        ``date``, ``author``, ``domain``, ``ex_url`` and ``re_url``. Numeric
        fields are stored as strings and single quotes are stripped from the
        title.
    """
    # `author` is None when the posting account no longer exists; calling
    # `.name` on it unconditionally used to abort the whole crawl.
    author = submission.author
    author_name = author.name if author is not None else DELETED_AUTHOR
    return {
        "id": submission.id,
        "title": submission.title.replace("'", ""),
        "ups": str(submission.ups),
        "upvote_ratio": str(submission.upvote_ratio),
        "date": str(submission.created_utc),
        "author": author_name,
        "domain": submission.domain,
        "ex_url": submission.url,
        "re_url": "http://reddit.com" + submission.permalink,
    }


def write_json(path, payload):
    """Serialize ``payload`` as JSON into ``path``.

    The parent directory is created when missing, and the file handle is
    closed even if writing fails.

    Args:
        path: Destination file path.
        payload: Any object accepted by ``json.dumps``.
    """
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "w") as handle:
        handle.write(json.dumps(payload))


def main():
    """Crawl the newest submissions of ``subreddit`` and dump them as JSON.

    Each submission is written to its own file under ``RAW_DIR``; once the
    listing is exhausted, all records are written together to a timestamped
    file under ``AGGREGATE_DIR``.
    """
    reddit = praw.Reddit(client_id=client_id, client_secret=secret, user_agent=agent)
    listing = reddit.subreddit(subreddit).new(limit=None)  # params={"after" : "t3_m2i5wv"}
    # con_search = r.subreddit(subreddit).search('timestamp=1616588297..1615438502', subreddit=subreddit, syntax="cloudsearch")

    content = []
    for cur_sub, submission in enumerate(listing):
        print(cur_sub)
        entry = submission_to_entry(submission)
        # pprint.pprint(vars(submission))
        print(entry)
        write_json(RAW_DIR + str(entry["id"]) + ".json", entry)
        content.append(entry)

    # The aggregate file name carries the crawl time and the record count.
    timestr = time.strftime("%Y_%m_%d-%H_%M")
    write_json(AGGREGATE_DIR + timestr + "_" + str(len(content)) + ".json", content)


if __name__ == "__main__":
    main()