spider.py
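"""Minimal recursive site crawler: starting from a seed URL, it follows
internal links and writes the discovered paths to site.txt."""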
import sys
import urllib.error
import urllib.parse
import urllib.request

import bs4 as bs

class Spider:
    def __init__(self, url):
        self.parsedURL = urllib.parse.urlparse(url)
        # Crawl state
        self.pages = set()        # every path already visited (or attempted)
        self.valid_pages = set()  # paths that returned a successful response
        self.query_pages = 0      # pages reached through a query string
        self.crawl_init(self.parsedURL.path.strip('/'))

    def crawl_init(self, initPage):
        self.crawl_page(initPage)
        self.write_to_text_file()
        print('Crawl Finished!')

    # Crawl a single page, then recurse into its internal links
    def crawl_page(self, path):
        sys.stdout.write('Crawled: {0}\r'.format(len(self.pages)))
        sys.stdout.flush()
        # Mark the path as crawled before fetching so it is never revisited
        self.pages.add(path)
        url = self.parsedURL.scheme + '://' + self.parsedURL.netloc + '/' + path
        try:
            response = urllib.request.urlopen(url)
        except urllib.error.HTTPError as e:
            print('Error Code:', e.code)
            return
        except urllib.error.URLError as e:
            print('Error:', e.reason)
            return
        else:
            self.valid_pages.add(path)

        sauce = response.read()
        soup = bs.BeautifulSoup(sauce, 'lxml')

        # Loop through every link on the page
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None:
                continue  # skip <a> tags without an href attribute
            parsedHref = urllib.parse.urlparse(href)
            # Follow only relative (or protocol-relative, same-host) links
            if parsedHref.netloc in ('', self.parsedURL.netloc) and parsedHref.scheme == '':
                stripHref = parsedHref.path.strip('/')
                # Preserve query parameters, if any
                if parsedHref.query:
                    scanned = stripHref + '/?' + parsedHref.query
                else:
                    scanned = stripHref
                # Recurse only into paths not already seen
                if scanned not in self.pages:
                    if parsedHref.query:
                        self.query_pages += 1
                    self.crawl_page(scanned)

    # Export the crawl results to a text file
    def write_to_text_file(self):
        with open('site.txt', 'w') as out_file:
            out_file.write(self.parsedURL.netloc + ' Crawl Export\n')
            out_file.write('Total Pages: ' + str(len(self.valid_pages)) + '\n')
            out_file.write('Query Pages: ' + str(self.query_pages) + '\n\n')
            for page in sorted(self.pages):
                out_file.write(self.parsedURL.netloc + '/' + page + '\n')
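

# Minimal usage sketch (an assumption, not part of the original file):
# the start URL below is hypothetical; use a site you are permitted to crawl.
# Note that crawl_page recurses once per discovered page, so very large
# sites can exceed Python's default recursion limit (about 1000 frames).
if __name__ == '__main__':
    Spider('https://example.com/')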