-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathjustdail.py
158 lines (127 loc) · 4.02 KB
/
justdail.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Command-line interface.  Parsed at import time so `args` is available to the
# scraping loop at the bottom of the file.
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--url', help='Enter the url you want to scrape')
parser.add_argument('--file', help='Specify the file name', default="export.csv")
args = parser.parse_args()
# NOTE(review): imports placed after the argparse block — unconventional but
# harmless; `requests` and `os` appear unused in this file (verify before removing).
from bs4 import BeautifulSoup
import urllib
import urllib.request
import requests
import csv
import os
import time
def innerHTML(element):
    """Return *element*'s inner markup serialized as an HTML string."""
    markup = element.decode_contents(formatter="html")
    return markup
def get_name(body):
    """Return the business name from the listing's title-bearing anchor."""
    anchor = body.find('span', {'class': 'jcn'}).a
    return anchor.attrs['title']
# JustDial obfuscates phone numbers by rendering each character as an icon;
# this table maps the icon CSS class to the character it displays.  Digits are
# ints and punctuation is strings, matching the original mixed value types.
# Built once at module load instead of on every which_digit() call.
_ICON_CHAR_MAP = {
    'icon-ji': 9,
    'icon-dc': '+',
    'icon-fe': '(',
    'icon-hg': ')',
    'icon-ba': '-',
    'icon-lk': 8,
    'icon-nm': 7,
    'icon-po': 6,
    'icon-rq': 5,
    'icon-ts': 4,
    'icon-vu': 3,
    'icon-wx': 2,
    'icon-yz': 1,
    'icon-acb': 0,
}


def which_digit(html):
    """Translate one icon class name to the character it renders.

    Returns an int digit or punctuation string; '' for unknown classes.
    """
    return _ICON_CHAR_MAP.get(html, '')
def get_phone_number(body):
    """Best-effort extraction of a listing's phone number.

    First decodes the icon-obfuscated number from the second child of the
    contact-info paragraph; then, if the listing carries a 'data-href' HTML
    fragment with a WhatsApp link, prefers the last 10 characters of that
    link's href.  Returns "No Number!" when nothing could be decoded.
    """
    phone_no = "No Number!"
    try:
        contact = body.find('p', {'class': 'contact-info'})
        for index, item in enumerate(contact, start=1):
            # Only the second child holds the obfuscated number markup.
            if index == 2:
                phone_no = ''
                try:
                    for element in item.find_all(class_=True):
                        classes = list(element["class"])
                        # The second class on each icon encodes the character.
                        phone_no += str(which_digit(classes[1]))
                except Exception:
                    # Obfuscation markup varies; skip pieces we cannot decode.
                    pass
    except Exception:
        # No contact-info paragraph or unexpected markup — fall through
        # to the data-href path below.
        pass
    # Fix: original did body['data-href'], which raises an uncaught KeyError
    # when the attribute is absent; .get() keeps this best-effort.
    fragment = body.get('data-href') or ''
    soup = BeautifulSoup(fragment, 'html.parser')
    for a in soup.find_all('a', {"id": "whatsapptriggeer"}):
        phone_no = str(a['href'][-10:])
    return phone_no
def get_rating(body):
    """Accumulate the listing's star rating from its star-icon classes."""
    stars = body.find('span', {'class': 'star_m'})
    if stars is None:
        return 0.0
    total = 0.0
    for star in stars:
        # e.g. class 's35' contributes 35/10 = 3.5 to the rating.
        total += float(star['class'][0][1:]) / 10
    return total
def get_rating_count(body):
    """Return the rating count as a string of digits only."""
    raw = body.find('span', {'class': 'rt_count'}).string
    # Drop every non-digit character from the raw text.
    return ''.join(filter(str.isdigit, raw))
def get_address(body):
    """Return the listing's address text with surrounding whitespace removed."""
    span = body.find('span', {'class': 'mrehover'})
    return span.text.strip()
def get_location(body):
    """Return "lat, long" parsed from the map link's onclick, or None if absent."""
    map_link = body.find('a', {'class': 'rsmap'})
    if map_link is None:
        return None
    # onclick arguments: positions 3 and 4 carry the quoted coordinates.
    parts = map_link['onclick'].split(",")
    latitude = parts[3].strip().replace("'", "")
    longitude = parts[4].strip().replace("'", "")
    return latitude + ", " + longitude
# ---- Main scrape loop: fetch result pages and write one CSV row per listing ----
page_number = 1
service_count = 1
fields = ['Name', 'Phone', 'Rating', 'Rating Count', 'Address', 'Location']
# `with` guarantees the file is closed even if a request raises;
# newline='' stops the csv module from doubling line endings on Windows.
with open(args.file, 'w', newline='') as out_file:
    csvwriter = csv.DictWriter(out_file, delimiter=',', fieldnames=fields)
    # Header row intentionally not written (preserves original output format):
    # csvwriter.writeheader()
    while page_number <= 50:  # stop after 50 result pages
        time.sleep(2)  # throttle requests
        url = "%s/page-%s" % (args.url, page_number)
        print(url)
        # Browser-like User-Agent: the site rejects the default Python one.
        req = urllib.request.Request(
            url,
            headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"})
        page = urllib.request.urlopen(req)
        soup = BeautifulSoup(page.read(), "html.parser")
        # One <li class="cntanr"> per listing on the page.
        services = soup.find_all('li', {'class': 'cntanr'})
        for service_html in services:
            # Parse HTML to fetch data for one listing.
            dict_service = {}
            name = get_name(service_html)
            print(name)
            phone = get_phone_number(service_html)
            rating = get_rating(service_html)
            count = get_rating_count(service_html)
            address = get_address(service_html)
            location = get_location(service_html)
            if name is not None:
                dict_service['Name'] = name
            if phone is not None:
                print('getting phone number')
                # Leading apostrophe keeps spreadsheets from mangling the number.
                dict_service['Phone'] = '\'' + phone
            if rating is not None:
                dict_service['Rating'] = rating
            if count is not None:
                dict_service['Rating Count'] = count
            if address is not None:
                dict_service['Address'] = address
            if location is not None:
                # BUG FIX: original stored this under 'Address', clobbering the
                # address and leaving the 'Location' column permanently empty.
                dict_service['Location'] = location
            # Write row to CSV (missing keys default to empty cells).
            csvwriter.writerow(dict_service)
            print("#" + str(service_count) + " ", dict_service)
            service_count += 1
        page_number += 1