-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspider.py
More file actions
68 lines (54 loc) · 1.87 KB
/
spider.py
File metadata and controls
68 lines (54 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#################################
# CodesInTheShell.........
#
#################################
# Simple single-level web crawler: fetches the target URL, collects all
# relative ("./...") links on the landing page (capped at 100), then fetches
# each of those pages once and collects any further relative links found there.
# Finally prints the de-duplicated, sorted list of discovered pages.
import requests
from bs4 import BeautifulSoup

# Printing a simple banner
print("""
#############################################
# This program is a part of walusiki project.
# Feel free to use and modify the code.
#
# by: CITS
#############################################
""")

# url varies depending on your target
url = 'http://192.168.142.128/mutillidae/'
# Adding a header for the request to become a bit anonymous
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}
links_list = []      # first-level relative links found on the landing page
crawled_links = []   # every relative link discovered (landing page + one level deep)

print("Program running...")
print("Crawling " + url + " Domain")

# Browse the target website.
# FIX: added a timeout so an unreachable host fails instead of hanging forever.
r = requests.get(url, headers=head, timeout=10)
# Parse the response
soup = BeautifulSoup(r.content, "html.parser")
# Find all links on the response page
links = soup.find_all("a")

# Filtering the links on the target domain only (relative "./" links), capped at 100.
for link in links:
    href = link.get("href")
    # FIX: href is None for <a> tags without an href attribute; the original
    # `'./' in link.get("href")` raised TypeError here with no guard.
    if href and './' in href and len(links_list) < 100:
        links_list.append(href)
        crawled_links.append(href)

# Crawl each first-level link for further links and add them to crawled_links.
for l in links_list:
    new_link = url + l[2:]  # drop the leading "./" and join onto the base url
    # FIX: reuse the same anonymizing headers as the first request (the
    # original silently dropped them here), and apply the same timeout.
    new_r = requests.get(new_link, headers=head, timeout=10)
    new_soup = BeautifulSoup(new_r.content, "html.parser")
    links_per_page = new_soup.find_all("a")
    for li in links_per_page:
        href = li.get("href")
        # FIX: the original tested `li not in crawled_links`, comparing a
        # bs4 Tag object against a list of href STRINGS — it never matched,
        # so duplicates were always appended. Compare the href string itself.
        # The None-guard also replaces the broad try/except TypeError.
        if href and './' in href and href not in crawled_links:
            crawled_links.append(href)

# Removing duplicated links.
# FIX: the original bound a plain (unordered) set to a name claiming it was
# sorted; actually sort it so the report is deterministic and readable.
sorted_crawled_links = sorted(set(crawled_links))
print("\n")
print("There are " + str(len(sorted_crawled_links)) + " links on " + url + " domain.")
print("Here are the pages: \n")
# Printing
for c in sorted_crawled_links:
    print(c)