# source of original code:
# https://github.com/WuLC/GoogleImagesDownloader
# as described in the first method of the blog post
# http://wulc.me/2017/09/23/Google%20%E5%9B%BE%E7%89%87%E7%88%AC%E8%99%AB/
#
# the original code has been modified as follows:
# - capped the image download quantity
# - removed supplementary keywords (simplified input)
# - randomized the sample of image links
# - removed the logger
# - simplified the exception handling
# - used f-string formatting for readability
# - adjusted the comments
#
# to better understand re.findall('src="(.*?)"', page_content), see
# https://docs.python.org/3/library/re.html
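# for example:
#   re.findall('src="(.*?)"', '<img src="a.jpg"> <img src="b.png">')
# returns ['a.jpg', 'b.png']: with one capture group, findall returns a list
# of the group's matches, and the non-greedy (.*?) stops at the next quote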
#
# we pass a User-Agent header because Google refuses requests without one
# (the request would otherwise be forbidden/denied), so the user agent goes
# into the headers when making the request
#
# note that regular expressions are suboptimal for parsing HTML
import os
import random
import re
import urllib.error
import urllib.request
from urllib.parse import quote

from user_agent import generate_user_agent  # third-party package providing generate_user_agent()

def download_page(url):
    """Download the raw content of the page (HTML)."""
    try:
        # the User-Agent header is required, or Google denies the request
        headers = {
            'User-Agent': generate_user_agent(),
            'Referer': 'https://www.google.com',
        }
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        # decode the bytes; str(resp.read()) would keep the b'...' literal
        # and escape sequences in the markup
        return resp.read().decode('utf-8', errors='ignore')
    except Exception as e:
        print(f'Error while downloading page {url}: {e}')
        return None

def parse_page(url, image_quantity):
    """Parse the page and return a random sample (of at most image_quantity) of image links."""
    page_content = download_page(url)
    if not page_content:
        return []
    link_list = re.findall('src="(.*?)"', page_content)
    # deduplicate, and convert back to a list because random.sample
    # no longer accepts sets (removed in Python 3.11)
    unique_links = list(set(link_list))
    # cap the sample size so random.sample cannot raise ValueError
    return random.sample(unique_links, min(image_quantity, len(unique_links)))

def download_images(main_keyword, image_quantity):
    """Download up to image_quantity images for the main keyword into the download directory."""
    # create a directory for the main keyword
    img_dir = './images/' + main_keyword + '/'
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    url = 'https://www.google.com/search?q=' + quote(main_keyword) + '&source=lnms&tbm=isch'
    image_links = set(parse_page(url, image_quantity))
    #print("Start downloading...")
    count = 0
    for link in image_links:
        if not link.startswith('http'):
            continue  # the regex also captures relative paths and base64 data URIs
        try:
            req = urllib.request.Request(link, headers={'User-Agent': generate_user_agent()})
            response = urllib.request.urlopen(req)
            data = response.read()
            file_path = img_dir + f'{count + 1}.jpg'
            with open(file_path, 'wb') as wf:
                wf.write(data)
            #print(f'{main_keyword}/{count + 1}.jpg Download Complete')
            count += 1
        except Exception as e:
            print(f'Error while downloading image {link}: {e}')
    #print(f'Finished downloading {count} image(s)')
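
# example usage: download_images('grass green', 10) saves up to 10 images
# into ./images/grass green/ as 1.jpg, 2.jpg, ...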

def testprogram():
    try:
        image_quantity = int(input("How many images do you want to download? (limit: 50)\n"))
        if image_quantity > 50:
            print("The limit is 50 images at a time\n")
            return testprogram()
    except ValueError:
        print("Please enter a valid base-10 integer\n")
        return testprogram()
    main_keyword = input("Enter the search keyword for the images (e.g. grass green)\n")
    download_images(main_keyword, image_quantity)
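
# assumed entry point (not shown in the original snippet): run the
# interactive prompt when the script is executed directly
if __name__ == '__main__':
    testprogram()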