extract_urls.py
import os
import json
import argparse

from url_utils import get_decompresser, is_bad_url

parser = argparse.ArgumentParser()
parser.add_argument('--psdir', type=str, default='pushshift_dumps')
parser.add_argument('--outdir', type=str, default='url_dumps')
parser.add_argument('--year_start', type=int, default=2018)
parser.add_argument('--year_end', type=int, default=2018)
parser.add_argument('--single_file', type=str, default=None)
parser.add_argument('--min_karma', type=int, default=3)
args = parser.parse_args()
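# Example invocations (a sketch; the dump filename below is hypothetical):
#   python extract_urls.py --psdir pushshift_dumps --outdir url_dumps \
#       --year_start 2017 --year_end 2018 --min_karma 3
#   python extract_urls.py --single_file RS_2018-01.xz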
# For processing many pushshift dumps at once: queue every file in --psdir
# whose name mentions a year in [--year_start, --year_end].
filenames = []
if args.single_file is None:
    years = [str(year) for year in range(args.year_start, args.year_end + 1)]
    for fn in os.listdir(args.psdir):
        for year in years:
            if year in fn:
                filenames.append(fn)
                break  # don't queue the same file twice if it matches two years
    filenames = sorted(filenames)
    print('Processing the following files:', filenames, '\n')
else:
    # args.single_file overrides the year range
    filenames.append(args.single_file)
# make the output directory if needed
os.makedirs(args.outdir, exist_ok=True)
# extract all good links and save them, one URL per line, per input file
for fn in filenames:
    hit_count = 0
    total_count = 0
    error_count = 0

    path = os.path.join(args.psdir, fn)
    decompress = get_decompresser(fn)

    print('Processing', fn + '...')
    with decompress(path, "r") as psfile:
        out_path = os.path.join(args.outdir, fn + '.goodlinks.txt')
        with open(out_path, 'w') as outfile:
            for line in psfile:
                # skip the occasional malformed or mis-encoded record
                try:
                    j = json.loads(line)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    total_count += 1
                    error_count += 1
                    continue
                # only keep links that pass the URL filter, meet the karma
                # threshold, and are not marked NSFW
                if (not is_bad_url(j['url'])) and \
                        (j['score'] >= args.min_karma) and \
                        (not j['over_18']):
                    # save the good url
                    outfile.write(j['url'] + '\n')
                    # simple logging
                    hit_count += 1
                    if hit_count % 10000 == 0:
                        percent_saved = (hit_count / float(total_count)) * 100
                        print('-- Links saved ({}), Links processed ({}), Percent saved ({}%), Errors ({})'
                              .format(hit_count, total_count, int(percent_saved), error_count))
                        # flush the output every now and then
                        outfile.flush()
                total_count += 1
    # per-file summary (guard against an empty input file)
    percent_saved = (hit_count / float(max(total_count, 1))) * 100
    print('-- Links saved ({}), Links processed ({}), Percent saved ({}%), Errors ({})'
          .format(hit_count, total_count, int(percent_saved), error_count), '\n')

print('Done!')
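# If url_utils is unavailable, minimal stand-ins for the two helpers used
# above could look like the sketch below (an assumption for illustration,
# not the repo's actual implementation):
#
#   import bz2, lzma
#
#   def get_decompresser(filename):
#       # pick an opener by extension; Pushshift dumps are commonly .bz2 or .xz
#       if filename.endswith('.bz2'):
#           return bz2.open
#       if filename.endswith('.xz'):
#           return lzma.open
#       return open
#
#   def is_bad_url(url):
#       # placeholder filter: the real one rejects far more than reddit links
#       return url.startswith(('http://www.reddit.com', 'https://www.reddit.com'))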