-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspider.py
More file actions
68 lines (54 loc) · 1.87 KB
/
spider.py
File metadata and controls
68 lines (54 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#################################
# CodesInTheShell.........
#
#################################
# Simple single-level web crawler: fetches the target URL, collects all
# relative ("./...") links on the landing page (capped at 100), then fetches
# each of those pages once and collects any further relative links found there.
# Finally prints the de-duplicated, sorted list of discovered pages.
import requests
from bs4 import BeautifulSoup

# Printing a simple banner
print("""
#############################################
# This program is a part of walusiki project.
# Feel free to use and modify the code.
#
# by: CITS
#############################################
""")

# url varies depending on your target
url = 'http://192.168.142.128/mutillidae/'
# Adding a header for the request to become a bit anonymous
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}
links_list = []      # first-level relative links found on the landing page
crawled_links = []   # every relative link discovered (landing page + one level deep)

print("Program running...")
print("Crawling " + url + " Domain")

# Browse the target website.
# FIX: added a timeout so an unreachable host fails instead of hanging forever.
r = requests.get(url, headers=head, timeout=10)
# Parse the response
soup = BeautifulSoup(r.content, "html.parser")
# Find all links on the response page
links = soup.find_all("a")

# Filtering the links on the target domain only (relative "./" links), capped at 100.
for link in links:
    href = link.get("href")
    # FIX: href is None for <a> tags without an href attribute; the original
    # `'./' in link.get("href")` raised TypeError here with no guard.
    if href and './' in href and len(links_list) < 100:
        links_list.append(href)
        crawled_links.append(href)

# Crawl each first-level link for further links and add them to crawled_links.
for l in links_list:
    new_link = url + l[2:]  # drop the leading "./" and join onto the base url
    # FIX: reuse the same anonymizing headers as the first request (the
    # original silently dropped them here), and apply the same timeout.
    new_r = requests.get(new_link, headers=head, timeout=10)
    new_soup = BeautifulSoup(new_r.content, "html.parser")
    links_per_page = new_soup.find_all("a")
    for li in links_per_page:
        href = li.get("href")
        # FIX: the original tested `li not in crawled_links`, comparing a
        # bs4 Tag object against a list of href STRINGS — it never matched,
        # so duplicates were always appended. Compare the href string itself.
        # The None-guard also replaces the broad try/except TypeError.
        if href and './' in href and href not in crawled_links:
            crawled_links.append(href)

# Removing duplicated links.
# FIX: the original bound a plain (unordered) set to a name claiming it was
# sorted; actually sort it so the report is deterministic and readable.
sorted_crawled_links = sorted(set(crawled_links))
print("\n")
print("There are " + str(len(sorted_crawled_links)) + " links on " + url + " domain.")
print("Here are the pages: \n")
# Printing
for c in sorted_crawled_links:
    print(c)