Skip to content

Commit cadb842

Browse files
committed
fixing errors and improving
1 parent 91552a9 commit cadb842

File tree

5 files changed

+13
-7
lines changed

5 files changed

+13
-7
lines changed

ImgFinder.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ def __init__(self, page_url):
99
self.page_url = page_url
1010
self.base_url = urlres.netloc
1111
self.folder = functions.get_folder_name(urlres.netloc)
12-
self.path = urlres.path
12+
self.path = urlres.path.replace("/", "_")
13+
self.scheme = urlres.scheme
1314
self.src = set()
1415
HTMLParser.__init__(self)
1516

@@ -22,7 +23,7 @@ def handle_starttag(self, tag, attrs):
2223
if tag == 'img':
2324
for (attr, value) in attrs:
2425
if attr == 'src':
25-
fullUrl = urllib.parse.urljoin(self.base_url, value)
26+
fullUrl = urllib.parse.urljoin(self.scheme + "://" + self.base_url, value)
2627
self.src.add(fullUrl)
2728
else:
2829
continue
@@ -46,7 +47,7 @@ def save_to_file(self) -> object:
4647
Save the image sources waiting to be downloaded to a queue file, so they are available the next time the program runs.
4748
:rtype: object
4849
"""
49-
file_name = self.folder_path() + self.path + '.txt'
50+
file_name = self.folder_path() + "/" + self.path + '.txt'
5051
with open(file_name, 'w') as f:
5152
for line in sorted(self.src):
5253
f.write(line + '\n')

__pycache__/ImgFinder.cpython-36.pyc

70 Bytes
Binary file not shown.

__pycache__/functions.cpython-36.pyc

199 Bytes
Binary file not shown.

functions.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def create_project_folder(page_url: object) -> object:
3232
os.makedirs("storage/" + base_url)
3333

3434

35+
# Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36
3536
def html_string(page_url: object) -> object:
3637
"""
3738
Fetch html from url and return as Html String
@@ -40,7 +41,9 @@ def html_string(page_url: object) -> object:
4041
"""
4142
html_string = ''
4243
try:
43-
response = urllib.request.urlopen(page_url)
44+
request = urllib.request.Request(page_url, headers={
45+
"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
46+
response = urllib.request.urlopen(request)
4447
if 'text/html' in response.getheader('Content-Type'):
4548
html_bytes = response.read()
4649
html_string = html_bytes.decode("utf-8")
@@ -59,5 +62,7 @@ def get_folder_name(base_url) -> object:
5962
parts = base_url.split(".")
6063
if len(parts) == 3:
6164
return parts[1]
65+
elif len(parts) == 2:
66+
return parts[0]
6267
else:
63-
return parts.join("-")
68+
return "-".join(parts)

main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
import Download
66

77
if __name__ == '__main__':
8-
PAGE_URL = 'http://www.fdfashionbd.com/gallarey'
8+
PAGE_URL = 'https://gopostie.com/how-it-works'
99
# Create the project folder into storage folder
1010
create_project_folder(PAGE_URL)
1111
# Find images source and save it to project folder
1212
finder = ImgFinder.ImgFinder(PAGE_URL)
1313
finder.feed(html_string(PAGE_URL))
1414
file_name = finder.save_to_file()
15-
# start downloading images
15+
#start downloading images
1616
down = Download.Download(file_name, finder.folder_path())
1717
down.start()

0 commit comments

Comments
 (0)