main.py
import definitions as d
import grab_imgur_links_2 as gil2
import reading_comprehension_data as rcd
# Step 0: Create a list of all links on the subreddit:
# Step 1.1: Compare against the existing list to see whether the associated data has already been scraped.
# Do this by checking against the pages that have already been downloaded (a rough sketch follows below).
# Step 1.2: Iterate through each non-archived page, grabbing the whole page's information.
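# A minimal sketch of Step 1.1, not part of definitions.py: it assumes each downloaded
# submission file is a JSON array of Pushshift-style objects with an "id" field, and that
# the files match the glob pattern below - both of those details are assumptions.
import glob
import json

def already_scraped_ids(pattern="output_submission*.json"):
    """Return the set of submission ids that already exist in downloaded JSON files."""
    seen = set()
    for path in glob.glob(pattern):
        with open(path, encoding="utf-8") as fh:
            for post in json.load(fh):
                seen.add(post.get("id"))
    return seen
# Usage (hypothetical): skip any subreddit link whose id is already in already_scraped_ids().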
#d.grab_data_json("HFY", "submission", after_time=0, f_name="output_submission2", sleep_time=2, query_count=100) # Downloads subreddit into 50mb JSON files.
# d.grab_data_json("HFY", "comment", after_time=0, f_name="output_comment3", sleep_time=1, query_count=100)
# Grabbing sister-communities:
"""
humansarespaceorcs
humansarespacebards
humansarespaceferrets
RedditSerials
TurningtoWords
Hunter_or_Huntress
NatureofPredators
NatureOfPredatorsNSFW (?)
Sexyspacebabes
GATEhouse
bubblewriters
"""
# d.grab_data_json("humansarespaceorcs", "submission", after_time=0, f_name="HASO_Submissions01", sleep_time=2, query_count=100)
# d.grab_data_json("humansarespacebards", "submission", after_time=0, f_name="HASB_Submissions01", sleep_time=2, query_count=100)
#d.grab_data_json("humansarespaceferrets", "submission", after_time=0, f_name="HASF_Submissions01", sleep_time=2, query_count=100)
#d.grab_data_json("Hunter_or_Huntress", "submission", after_time=0, f_name="HoH_Submissions01", sleep_time=2, query_count=100)
#d.grab_data_json("NatureofPredators", "submission", after_time=0, f_name="NoP_Submissions01", sleep_time=2, query_count=100)
#d.grab_data_json("NatureOfPredatorsNSFW", "submission", after_time=0, f_name="NoPNSFW_Submissions01", sleep_time=2, query_count=100)
#d.grab_data_json("Sexyspacebabes", "submission", after_time=0, f_name="SSB_Submissions01", sleep_time=2, query_count=100)
#d.grab_data_json("GATEhouse", "submission", after_time=0, f_name="GATEhouse_Submissions01", sleep_time=2, query_count=100)
# ... after the initial dataset is created, get the most recent timestamp and then only grab posts that are newer... unless you also want to record new revisions of older posts.
# To grab new post data:
# 1a. Scan through all .json files in directory that are submission/comment files
# 1b. Find the newest .json file
# 1c. Find the most recently grabbed post/comment
# 1d. Scan from the most recent time to current.
# 1e. As posts can be made in the same second - with one being grabbed but not the other - you have to start at that exact same time,
#     meaning you may get duplicates (in fact you *will* get at least one duplicate).
# 1f. For every new post, check to see if you already have it in the database.
# 1g. If you do, check to see if the content (or likes/metadata) has been updated.
# 1h. If it has been, add a new revision-history entry for that doc... (?)
# 1i. Else, just create a new document as normal...
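# A rough sketch of steps 1a-1e, reusing the glob/json imports and the file-layout
# assumptions from the Step 1.1 sketch above; the field name "created_utc" and the
# incremental call shown afterwards are assumptions, not confirmed behaviour of definitions.py.
def newest_created_utc(pattern="output_submission*.json"):
    """Steps 1a-1c: scan every downloaded submission file and return the newest created_utc."""
    newest = 0
    for path in glob.glob(pattern):
        with open(path, encoding="utf-8") as fh:
            for post in json.load(fh):
                newest = max(newest, int(post.get("created_utc", 0)))
    return newest
# Steps 1d/1e (hypothetical call): restart from the exact newest timestamp, accepting that
# at least one duplicate will come back:
# d.grab_data_json("HFY", "submission", after_time=newest_created_utc(),
#                  f_name="output_submission_incremental", sleep_time=2, query_count=100)
# Step 1f: filter the returned posts against already_scraped_ids() before storing them.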
# Active step: download Imgur-hosted images referenced in the scraped HFY submission JSON.
gil2.main_save_imgur_images(f_name="HFY_images", json_title='submission2')
# rcd.create_corpus(skip_compilation=True)