# grab_imgur_links_2.py
import definitions as d
import process_submissions as p
import os
import urllib.request
from urllib.error import HTTPError
import requests
import re
import time
import hashlib
def process_imgur_links(imgur_image_links, text_links, url_id_list, subreddit_list, post_title_list,
author_list, t_created_list, t_accessed_list, nsfw_list,
img_folder_name="Images3"):
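    """
    Downloads every imgur link in imgur_image_links and writes a metadata .txt file for each download.

    All the argument lists are parallel (one entry per link). Duplicate links are removed first,
    keeping only the oldest occurrence. Direct image links are downloaded straight away;
    album/gallery links are expanded via the blog, embed and general page layouts, with
    multi-image galleries saved into their own sub-folder of img_folder_name.
    """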
# TODO instead of numbers, use the imgur-link-hash and an md5 of the file as the image-name
# TODO find which file-hashes are the "this content doesn't exist anymore" and don't save them.
# This would allow new images to be added as more links are scraped over time.
# tries to make an image folder to put all the images into
make_folder(img_folder_name)
    # report the link count, then remove duplicates before downloading:
print(f"Total Links found: {len(imgur_image_links)}")
# Deletes duplicates in a list; only leaving the oldest (furthest down in the list)
# e.g. [1, 2, 2, 3, 1, 2] -> [3, 1, 2].
# TODO make a dedicated function for this later.
# TODO this has the HUGE issue of removing older versions of a link - could shuffle list.
# A bad solution to that is to go through the list in-order but shift the deleted index by the number of deletions
deleted_count = 0
for i in range(len(imgur_image_links)):
if imgur_image_links.count(imgur_image_links[i - deleted_count]) > 1:
imgur_image_links.pop(i - deleted_count)
text_links.pop(i - deleted_count)
url_id_list.pop(i - deleted_count)
subreddit_list.pop(i - deleted_count)
post_title_list.pop(i - deleted_count)
author_list.pop(i - deleted_count)
            t_created_list.pop(i - deleted_count)
            t_accessed_list.pop(i - deleted_count)
            nsfw_list.pop(i - deleted_count)
deleted_count += 1
print(f"Total Links found after deleting duplicates: {len(imgur_image_links)}")
# return imgur_image_links # TODO this is temporary to compare which links each function finds
    # TODO This doesn't actually do what I want; I want the saved numbering to count up from 1 while the index counts down from the end of the list (e.g. 999) toward 1.
for i in range(len(imgur_image_links)):
# print(imgur_image_links)
imgur_link_ext = "." + imgur_image_links[i].split(".")[-1]
print(f"{i}/{len(imgur_image_links)}", imgur_image_links[i], imgur_link_ext)
# if it's a single image or direct link, download
if imgur_link_ext in [".jpg", ".jpeg", ".png", ".gif", ".mp4"]:
imgur_hash = imgur_image_links[i].split(".")[-2].split("/")[-1]
save_img_and_create_metadata(img_folder_name, imgur_hash, imgur_image_links[i], text_links[i], imgur_image_links[i], url_id_list[i],
subreddit_list[i], post_title_list[i], author_list[i], t_created_list[i],
t_accessed_list[i], nsfw_list[i])
# the link is in the form of a gallery:
else:
# get image links from blog-gallery, embed-gallery, and general-gallery:
blog_gallery_image_links = grab_imgur_blog_links(imgur_image_links[i])
embed_gallery_image_links = grab_imgur_embed_links(imgur_image_links[i])
general_gallery_image_links = grab_imgur_general_links(imgur_image_links[i])
#print("v1", blog_gallery_image_links)
#print("v2", embed_gallery_image_links)
#print("v3", general_gallery_image_links)
# Combines all the link-lists into a single list and removes duplicates (and some h-variants)
combined_img_links = combine_lists_and_remove_dupes([blog_gallery_image_links, embed_gallery_image_links,
general_gallery_image_links])
                # print(blog_gallery_image_links, embed_gallery_image_links, general_gallery_image_links)
                # print(combined_img_links)
# print("A", combined_img_links)
# if there is more than one file, make a folder - else just place at the lowest level
gallery_hash = imgur_image_links[i].split("/")[-1]
if len(combined_img_links) > 1: # more than one file in folder
# make a folder for the images
gallery_folder_path = f"{img_folder_name}/{gallery_hash}"
make_folder(gallery_folder_path)
for j in range(len(combined_img_links)):
# grab and save image
img_data, img_data_ext = direct_imgur_image_link_download(combined_img_links[j])
imgur_hash = combined_img_links[j].split(".")[-2].split("/")[-1]
file_hash = data_file_hash(img_data)
with open(f'{gallery_folder_path}/{imgur_hash}-{file_hash}{img_data_ext}', 'wb') as handler:
handler.write(img_data)
time.sleep(3)
# create a single metadata file for all the images
create_img_metadata_file(img_folder_name, gallery_hash, "0", combined_img_links[j], text_links[i], imgur_image_links[i], url_id_list[i], subreddit_list[i],
post_title_list[i], author_list[i], t_created_list[i],
t_accessed_list[i], nsfw_list[i])
elif len(combined_img_links) == 1:
save_img_and_create_metadata(img_folder_name, gallery_hash, combined_img_links[0], text_links[i], imgur_image_links[i], url_id_list[i],
subreddit_list[i], post_title_list[i], author_list[i], t_created_list[i],
t_accessed_list[i], nsfw_list[i])
time.sleep(10)
def save_img_and_create_metadata(path, imgur_hash, imgur_image_link, text_link, reddit_link, url_id_list, subreddit_list, post_title_list,
author_list, t_created_list, t_accessed_list, nsfw_list):
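    """
    Downloads a single imgur image, saves it to {path}/{imgur_hash}-{md5 of the file}{ext},
    and writes the matching metadata file via create_img_metadata_file().
    """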
img_data, img_data_ext = direct_imgur_image_link_download(imgur_image_link)
file_hash = data_file_hash(img_data)
# create the image-file and save image:
with open(f'{path}/{imgur_hash}-{file_hash}{img_data_ext}', 'wb') as handler:
handler.write(img_data)
# create the meta-data file and save metadata:
create_img_metadata_file(path, imgur_hash, file_hash, imgur_image_link, text_link, reddit_link, url_id_list, subreddit_list,
post_title_list, author_list, t_created_list,
t_accessed_list, nsfw_list)
def preprocess_data_for_imgur_links(preview, links, text, url_ids, subreddits, titles, authors, created, accessed, nsfw):
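    """
    Filters the scraped reddit link data down to imgur links only.

    Every link that matches the imgur URL pattern is appended to processed_links, and the
    per-post metadata (link text, post id, subreddit, title, author, created/accessed times,
    NSFW flag) is appended to parallel output lists. The preview argument is currently unused
    (see the TODO below). Returns all nine parallel lists.
    """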
    # goal here is to go through all the links and keep only the ones that are imgur links
processed_links, processed_text_links = [], []
processed_url_id_list, processed_subreddit_list, processed_post_title_list, processed_author_list = [], [], [], []
processed_t_created_list, processed_t_accessed_list, processed_nsfw_list = [], [], []
for i in range(len(links)):
#if len(preview[i]) > 0:
# print(f"Preview Image:{preview[i]}")
# TODO not implemented scraping these yet; they're embedded into reddit.
for j in range(len(links[i])):
            # make sure the link is an imgur link
            imgur_link_matches = re.findall(r'(https?://(www\.)?(i\.|m\.)?(stack\.)?imgur\.com//?(a/|gallery/)?(r/[a-zA-Z0-9]+/)?[a-zA-Z0-9]+(\.(jpg|jpeg|png|gif|mp4))?)',
                                            links[i][j])
            if len(imgur_link_matches) > 0:
                # print("A", imgur_link_matches)
                # print("B", imgur_link_matches[0][0])
                processed_links.append(imgur_link_matches[0][0])
processed_text_links.append(text[i][j])
processed_url_id_list.append(url_ids[i])
processed_subreddit_list.append(subreddits[i])
processed_post_title_list.append(titles[i])
processed_author_list.append(authors[i])
processed_t_created_list.append(created[i])
processed_t_accessed_list.append(accessed[i])
processed_nsfw_list.append(nsfw[i])
return processed_links, processed_text_links, processed_url_id_list, processed_subreddit_list, \
processed_post_title_list, processed_author_list, processed_t_created_list, processed_t_accessed_list, \
processed_nsfw_list
def combine_lists_and_remove_dupes(list_array):
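    """
    Flattens the given lists of image links into one list with exact duplicates removed, then
    drops thumbnail ('h'-suffixed hash) variants whose full-size counterpart is also present.
    e.g. .../abc123h.jpg is dropped when .../abc123.jpg is in the list; an h-variant with no
    full-size counterpart in the list is kept.
    """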
combined_list = []
for i in range(len(list_array)):
for j in range(len(list_array[i])):
if list_array[i][j] not in combined_list:
combined_list.append(list_array[i][j])
# NOTE this also removes h-variants *IF* (and only if) there is a non-h-variant in the list
# This will NOT remove h-variants that don't have a non-h-variant in the list.
# that has to be done in the main function - or at least somewhere where files are being read.
# Getting the hashes of all the imgur images
imgur_img_hash_list = []
for i in range(len(combined_list)):
if combined_list[i].count('.') >= 2:
imgur_img_hash_list.append(combined_list[i].split(".")[-2])
else:
imgur_img_hash_list.append(combined_list[i].split("/")[-1])
#print("Hash List", imgur_img_hash_list)
# if a trailing-h-hash with the h removed is already in the list, remove
for j in range(len(combined_list) - 1, -1, -1):
if imgur_img_hash_list[j][-1] == 'h':
# print(hash_list[j], combined_link_list[j], hash_list[j][0:-1])
if imgur_img_hash_list[j][0:-1] in imgur_img_hash_list:
# print(f"removing {hash_list[j]}")
imgur_img_hash_list.pop(j)
combined_list.pop(j)
#print("Reduced Combined List", combined_list)
return combined_list
def grab_imgur_blog_links(url):
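    """
    Fetches the gallery's '/layout/blog' page and returns the i.imgur.com image links
    found in its meta 'content' attributes.
    """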
url = url + "/layout/blog"
html_data = str(html_grab_withfiveohfour_error_wrap(url))
    # the alternation is kept inside a non-capturing group so every extension (not just .jpg) is matched
    search_string = r'content="(https://i\.imgur\.com/[a-zA-Z0-9]+\.(?:jpg|jpeg|png|gif|mp4|jpg\?1|png\?1))"'
raw_image_links = re.findall(search_string, html_data)
return raw_image_links
def grab_imgur_embed_links(url):
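    """
    Fetches the gallery's '/embed?pub=tru' page, pulls the image hashes and extensions out of
    the embedded JSON, and rebuilds the direct i.imgur.com links from them.
    """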
url = url + "/embed?pub=tru"
html_data = str(html_grab_withfiveohfour_error_wrap(url))
hash_search_string = '"hash":"([a-zA-Z0-9]+)"'
    exts_search_string = r'"ext":"\.(jpg|jpeg|png|gif|mp4|jpg\?1|png\?1)"'
hash_list = re.findall(hash_search_string, html_data)
exts_list = re.findall(exts_search_string, html_data)
raw_image_links = []
for i in range(len(hash_list)):
# print("test string", f"https://i.imgur.com/{hash_list[i]}.{exts_list[i]}")
raw_image_links.append(f"https://i.imgur.com/{hash_list[i]}.{exts_list[i]}")
return raw_image_links
def grab_imgur_general_links(url):
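    """
    Fetches the album/gallery page itself and returns the i.imgur.com image links found in
    its meta 'content' attributes.
    """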
html_data = str(html_grab_withfiveohfour_error_wrap(url))
    # same non-capturing-group fix as in grab_imgur_blog_links so every extension is matched
    search_string = r'content="(https://i\.imgur\.com/[a-zA-Z0-9]+\.(?:jpg|jpeg|png|gif|mp4|jpg\?1|png\?1))"'
raw_image_links = re.findall(search_string, html_data)
return raw_image_links
def html_grab_withfiveohfour_error_wrap(url):
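    """
    Fetches the given URL with urllib and returns the raw HTML, retrying on 503/504 responses.
    Returns an empty string on a 404; any other HTTPError is re-raised.
    """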
    # Named after the most frequent HTTP error seen here (504 Gateway Timeout), but it also retries 503s.
error = 504
while error == 504 or error == 503:
try:
response = urllib.request.urlopen(url)
html_data = response.read()
response.close()
return html_data
        except HTTPError as err:
            if err.code == 404:
                print(f"404 Error! {err}")
                return ""
            elif err.code == 504 or err.code == 503:
                error = err.code
                time.sleep(5)  # brief pause so the gateway isn't hammered before retrying
            else:
                raise
def create_img_metadata_file(path, imgur_hash, file_hash, imgur_image_link, text_link, reddit_link, url_ids, subreddit, title, author,
t_created, t_access, nsfw_state, encoding="utf-8"):
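    """
    Writes the metadata file {path}/{imgur_hash}-{file_hash}.txt, one field per line:
    post id, link text, reddit link, imgur image link, imgur hash, file hash, subreddit,
    post title, author, created timestamp, accessed timestamp, NSFW flag.
    """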
with open(f"{path}/{imgur_hash}-{file_hash}.txt", 'w', encoding=encoding) as f:
f.write(f"{url_ids}\n")
f.write(f"{text_link}\n")
f.write(f"{reddit_link}\n")
f.write(f"{imgur_image_link}\n")
f.write(f"{imgur_hash}\n")
f.write(f"{file_hash}\n")
f.write(f"{subreddit}\n")
f.write(f"{title}\n")
f.write(f"{author}\n")
f.write(f"{t_created}\n")
f.write(f"{t_access}\n")
f.write(f"{nsfw_state}\n")
def direct_imgur_image_link_download(link):
    """
    Downloads a direct imgur image link and returns (image bytes, file extension).

    If the link looks like a thumbnail ('h'-suffixed hash), the full-size (non-h) image is
    tried first; don't know what processing was done before, so this is kept robust.
    The link must end in a known image extension.

    :param link: direct imgur link to the image download
    :return: tuple of (raw image data as bytes, extension string such as ".jpg")
    """
# Gets the image-type extension e.g., .png, .jpg, .jpeg, .gif, .gifv, .mp4
link_ext = "." + link.split(".")[-1]
# Don't know why I'm getting a ".jpg?1" ext.
# This may be obsolete if I've found the root cause elsewhere.
# if extension is .jpg?1, replace with .jpg.
if link_ext == ".jpg?1":
link_ext = ".jpg"
link = link.replace(".jpg?1", ".jpg")
if link_ext == ".png?1":
link_ext = ".png"
link = link.replace(".png?1", ".png")
# confirming that the extension is a valid type
if link_ext not in (".jpg", ".jpeg", ".png", ".gif", ".mp4"):
raise Exception(f"Direct Imgur link has wrong extension!\n Link found: {link}")
    # Time to open the file.
    # Error 504 is a gateway timeout: the upstream server didn't respond in time.
    # If a 503/504 is hit, wait a bit and retry; it's not an error in the code.
    # Note: requests does not raise on HTTP error status codes, so check response.status_code.
    img_data = None
    if link.split(".")[-2][-1] == 'h':  # a trailing 'h' on the hash marks a thumbnail variant
        # make the non-h (full-size) link by dropping that trailing 'h'
        h_pos = link.rfind('h')
        nonhlink = link[0:h_pos] + link[h_pos + 1:]
        # print("TESTING", link, nonhlink)
        error = 504
        while error == 504 or error == 503:
            response = requests.get(nonhlink, timeout=20)
            error = response.status_code
            if error == 200:
                img_data = response.content
            elif error == 504 or error == 503:
                print(f"HTTP {error} on non-h: Retrying... ")
                time.sleep(5)
            else:
                # TODO Handle 404 errors here more gracefully? For now fall back to the h-image below.
                # TODO Make an exception here for if my internet dies... Error 421?
                print(f"HTTP {error} on non-h: NOT BREAKING; trying h-img")
            response.close()
    if img_data is None:
        error = 504
        while error == 504 or error == 503:
            response = requests.get(link, timeout=20)
            error = response.status_code
            if error == 200:
                img_data = response.content
            elif error == 504 or error == 503:
                print(f"HTTP {error}: Retrying... ")
                time.sleep(5)
            else:
                # TODO Handle 404 errors more gracefully? Just raising them for now. Will likely change later.
                response.close()
                raise Exception(f"HTTP {error} while downloading {link}")
            response.close()
    return img_data, link_ext
def make_folder(folder_name):
    """
    Tries to create a folder if it doesn't already exist; if it exists, it does nothing.

    :param folder_name: The name of the folder being created
    :return: None
    """
    try:
        os.makedirs(folder_name)
    except FileExistsError:  # the folder already exists (WinError 183 on Windows)
        print(f"Tried creating folder; folder {folder_name} already exists; continuing.")
def main_save_imgur_images(f_name="Images4", json_title='submission2'):
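    """
    Entry point: loads the scraped submission data from json_title via process_submissions,
    filters it down to imgur links, and downloads everything into the folder f_name.
    """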
post_ids, subreddits, titles, author, t_created, _, t_accessed, _, _, likes, _, nsfw, previews, \
_, _, _, _, _, links, link_texts = p.process_submission_json_data(json_title)
imgur_image_links, text_links, url_id_list, subreddit_list, title_list, author_list, t_created_list, t_accessed_list, \
nsfw_list = preprocess_data_for_imgur_links(previews, links, link_texts, post_ids, subreddits, titles, author,
t_created, t_accessed, nsfw)
process_imgur_links(imgur_image_links, text_links, url_id_list, subreddit_list, title_list,
author_list, t_created_list, t_accessed_list, nsfw_list,
img_folder_name=f_name)
def data_file_hash(data):
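    """Returns the hex MD5 digest of data; non-bytes input is converted via str() first."""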
    if not isinstance(data, bytes):
        data = str(data).encode()
    return hashlib.md5(data).hexdigest()
# main_save_imgur_images(f_name="HFY_images", json_title='submission2')
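# A minimal usage sketch (commented out; assumes the 'submission2' JSON produced by the
# process_submissions scraper is present):
# if __name__ == "__main__":
#     main_save_imgur_images(f_name="Images4", json_title='submission2')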