webCrawler.py
# Author's note -
# This script was developed by Mithilesh Tipkari (https://github.com/mithileshtipkari).
# It is free to use, modify, and share, but the author's name should be credited when the script is shared.
__author__ = 'Mithilesh Tipkari'
__version__ = '1.0'

import os
import argparse
import traceback
import time
import random
import shutil
from urllib import request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

to_be_visited = []     # currently unused
visited_links = {}     # URL -> True once that page has been crawled
downloaded_links = []  # file links already saved to disk, used to avoid duplicate downloads


def fetch_n_crawl_urls(url, path, max_level, file_types):
    if not os.path.exists(path):
        os.makedirs(path)
    if url.startswith("http"):
        site_name = url.split('//')[1].replace('www.', '')
    else:
        site_name = url.split('.')[0]
    print('---------------------------------------------------------------------')
    print(site_name)
    print('---------------------------------------------------------------------')
    dump_path = path + '/' + site_name
    if not os.path.exists(dump_path):
        os.makedirs(dump_path)
    visited_links[url] = False
    crawler(url, url, dump_path, file_types, 1, max_level)
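
# A minimal sketch of calling fetch_n_crawl_urls directly instead of via the CLI;
# the URL and output directory below are illustrative placeholders, not values from the script:
#   fetch_n_crawl_urls('https://example.com', './downloads', 2, 'pdf,txt')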


def crawler(main_url, local_url, path, file_types, current_level, max_level):
    """
    Crawl the given URL up to the given depth and download files of the
    specified types.
    :param main_url: base URL of the website
    :param local_url: URL to be crawled
    :param path: dumping location for downloaded files
    :param file_types: comma-separated types of files to be downloaded
    :param current_level: current depth of the web page visit
    :param max_level: maximum depth up to which links are visited
    """
    print('crawler called')
    file_type_list = file_types.split(',')
    try:
        if not visited_links.get(local_url, False):
            print('not visited -' + local_url)
            if current_level > max_level:
                print("don't cross your level-", current_level, ', returning')
                return
            visited_links[local_url] = True
            time.sleep(random.randint(1, 5))  # sleep so as not to overload the server
            print('visiting--', local_url)
            print('level--', current_level)
            html = request.urlopen(local_url)
            if html.status == 200:
                soup = BeautifulSoup(html, "lxml")
                print('getting links')
                links = soup.find_all('a')
                fetched_links = 0
                for link in links:
                    n_url = link.get('href')
                    if n_url is None:
                        continue
                    if any(n_url.endswith(file_ext) for file_ext in file_type_list):
                        # The link points to one of the requested file types: download it.
                        print('lets download--', n_url)
                        if main_url.startswith('http') and n_url not in downloaded_links:
                            time.sleep(random.randint(1, 4))  # wait before accessing the URL
                            file_url = urljoin(local_url, n_url)  # handles relative and absolute hrefs
                            res = request.urlopen(file_url)
                            downloaded_links.append(n_url)
                            visited_links[n_url] = True
                            print(n_url.split('/')[-1], 'status-', res.status, 'downloading level-', current_level)
                            with open(path + '/' + n_url.split('/')[-1], 'wb') as f:
                                shutil.copyfileobj(res, f)
                            fetched_links += 1
                    else:
                        # An ordinary page link: follow it one level deeper, staying on the same site.
                        next_url = urljoin(local_url, n_url)
                        if next_url.startswith(main_url):
                            crawler(main_url, next_url, path, file_types, current_level + 1, max_level)
                print("Total files downloaded - ", fetched_links)
    except Exception:
        print('Exception in Crawler-', traceback.format_exc())


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('website_url', help='URL from which you want to download files')
    parser.add_argument('path', help='Path to which you want to dump downloaded files')
    parser.add_argument('--level', help='Depth of recursively visiting pages from a link, default = 3', type=int, default=3)
    parser.add_argument('--file_types', help='Comma-separated file types to download, such as pdf,txt; default is pdf', default='pdf')
    args = parser.parse_args()
    print('url-', args.website_url)
    print('path-', args.path)
    print('level-', args.level)
    print('filetypes-', args.file_types)
    fetch_n_crawl_urls(args.website_url, args.path, args.level, args.file_types)
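
# Usage sketch (the URL and output directory are placeholders, not part of the original script):
#   python webCrawler.py https://example.com ./downloads --level 2 --file_types pdf,txt
# This crawls https://example.com two levels deep and saves any linked .pdf or .txt files
# under ./downloads/example.com/.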