
Commit 1e20ac0

Merge pull request #22 from Zubdata/pr-21
Add Fields, Fix Bugs, Update Output and Adjust Documentation
2 parents f294fd0 + d48d2e9 commit 1e20ac0

8 files changed: +139 additions, −53 deletions

.gitignore

Lines changed: 4 additions & 0 deletions

```diff
@@ -106,3 +106,7 @@ Thumbs.db
 .pdm.toml
 .pdm.lock
 .pdm.cache/
+
+
+# output folder
+output/
```

CHANGELOG.md

Lines changed: 11 additions & 0 deletions

```diff
@@ -2,6 +2,17 @@
 
 All notable changes to this project will be documented in this file.
 
+## [3.1.0] - 2024-04-26
+### Added
+- Added additional data fields
+
+### Fixed
+- Fixed extraction of `Total Reviews` and `Website` data fields
+
+### Changed
+- Updated logic of parsing data fields
+- Changed output data file name
+
 ## [3.0.0] - 2024-04-26
 
 ### Fixed
```

Google map scraper/scraper/communicator.py

Lines changed: 10 additions & 1 deletion

```diff
@@ -3,6 +3,7 @@
 class Communicator:
 
     __frontend_object = None
+    __backend_object = None
 
     @classmethod
     def show_message(cls, message):
@@ -32,4 +33,12 @@ def end_processing(cls):
 
     @classmethod
     def get_output_format(cls):
-        return cls.__frontend_object.outputFormatValue
+        return cls.__frontend_object.outputFormatValue
+
+    @classmethod
+    def set_backend_object(cls, backend_object):
+        cls.__backend_object = backend_object
+
+    @classmethod
+    def get_search_query(cls):
+        return cls.__backend_object.searchquery
```
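The two new classmethods complete a small mediator: the backend registers itself once, and downstream components such as `DataSaver` can then read the search query without importing the backend directly. A minimal, self-contained sketch of the pattern (`FakeBackend` and the query string are hypothetical stand-ins, not code from this repository):

```python
# Minimal sketch of the mediator pattern added in this commit.
# FakeBackend and the query string are hypothetical stand-ins.

class Communicator:
    __backend_object = None

    @classmethod
    def set_backend_object(cls, backend_object):
        # Called once by the backend after it is constructed.
        cls.__backend_object = backend_object

    @classmethod
    def get_search_query(cls):
        # Any component can now read the query without a backend import.
        return cls.__backend_object.searchquery


class FakeBackend:
    def __init__(self, searchquery):
        self.searchquery = searchquery


Communicator.set_backend_object(FakeBackend("coffee shops in Lahore"))
print(Communicator.get_search_query())  # -> coffee shops in Lahore
```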

Google map scraper/scraper/datasaver.py

Lines changed: 7 additions & 4 deletions

```diff
@@ -9,7 +9,6 @@
 import os
 from .error_codes import ERROR_CODES
 
-
 class DataSaver:
     def __init__(self) -> None:
         self.outputFormat = Communicator.get_output_format()
@@ -27,21 +26,25 @@ def save(self, datalist):
         dataFrame = pd.DataFrame(datalist)
         totalRecords = dataFrame.shape[0]
 
-        filename = "/gms output"
+        searchQuery = Communicator.get_search_query()
+        filename = f"{searchQuery} - GMS output"
 
         if self.outputFormat == "excel":
             extension = ".xlsx"
         elif self.outputFormat == "csv":
             extension = ".csv"
         elif self.outputFormat == "json":
             extension = ".json"
-
+
+        # Create the output directory if it does not exist
+        if not os.path.exists(OUTPUT_PATH):
+            os.makedirs(OUTPUT_PATH)
         joinedPath = OUTPUT_PATH + filename + extension
 
         if os.path.exists(joinedPath):
             index = 1
             while True:
-                filename = f"/gms output{index}"
+                filename = f"{searchQuery} - GMS output ({index})"
 
                 joinedPath = OUTPUT_PATH + filename + extension
 
```
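Together, these changes name output files after the search query and dodge collisions by appending an index. The hunk cuts off before the loop's increment and exit, so the following is a sketch of the intended behavior rather than the exact repository code; the query, the extension, and the `exist_ok=True` shortcut are assumptions:

```python
import os

# Sketch of the new naming scheme (hypothetical query and extension).
OUTPUT_PATH = "output/"
searchQuery = "restaurants in Vehari"
extension = ".csv"

# Equivalent to the explicit existence check in the diff, in one call.
os.makedirs(OUTPUT_PATH, exist_ok=True)

filename = f"{searchQuery} - GMS output"
joinedPath = OUTPUT_PATH + filename + extension

# On a name clash, append an incrementing index until the path is free.
index = 1
while os.path.exists(joinedPath):
    filename = f"{searchQuery} - GMS output ({index})"
    joinedPath = OUTPUT_PATH + filename + extension
    index += 1

print(joinedPath)  # e.g. output/restaurants in Vehari - GMS output.csv
```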
Google map scraper/scraper/parser.py

Lines changed: 73 additions & 43 deletions

```diff
@@ -1,20 +1,21 @@
 from bs4 import BeautifulSoup
+from selenium import webdriver
 from .error_codes import ERROR_CODES
 from .communicator import Communicator
 from .datasaver import DataSaver
 from .base import Base
 from .common import Common
 
-
 class Parser(Base):
 
     def __init__(self, driver) -> None:
         self.driver = driver
         self.finalData = []
         self.comparing_tool_tips = {
-            "location": """Copy address""",
-            "phone": """Copy phone number""",
-            "website": """Open website""",
+            "location": "Copy address",
+            "phone": "Copy phone number",
+            "website": "Open website",
+            "booking": "Open booking link",
         }
 
     def init_data_saver(self):
@@ -31,75 +32,105 @@ def parse(self):
         infoSheet = self.driver.execute_script(
             """return document.querySelector("[role='main']")""")
         try:
-            """If that information sheet is founded in try block, this block will run and find the contact details"""
-            rating, totalReviews, address, websiteUrl, phone = (
-                None,
-                None,
-                None,
-                None,
-                None,
-            )  # by default they will be none
+            # Initialize data points
+            rating, totalReviews, address, websiteUrl, phone, hours, category, gmapsUrl, bookingLink, businessStatus = (
+                None, None, None, None, None, None, None, None, None, None
+            )
 
             html = infoSheet.get_attribute("outerHTML")
-            soup = BeautifulSoup(
-                html, "html.parser"
-            )  # soup of information sheet of that place
+            soup = BeautifulSoup(html, "html.parser")
 
+            # Extract rating
             try:
                 rating = soup.find("span", class_="ceNzKf").get("aria-label")
-
-            except:  # if a business does not has rating
+                rating = rating.replace("stars", "").strip()
+            except:
                 rating = None
 
+            # Extract total reviews
             try:
                 totalReviews = list(soup.find("div", class_="F7nice").children)
-                totalReviews = totalReviews[1].get_text(
-                    strip=True
-                )
-
+                totalReviews = totalReviews[1].get_text(strip=True)
             except:
                 totalReviews = None
 
-            name = soup.select_one(selector=".tAiQdd h1.DUwDvf").text.strip()
+            # Extract name
+            try:
+                name = soup.select_one(".tAiQdd h1.DUwDvf").text.strip()
+            except:
+                name = None
 
+            # Extract address, website, phone, and appointment link
             allInfoBars = soup.find_all("button", class_="CsEnBe")
-
             for infoBar in allInfoBars:
                 data_tooltip = infoBar.get("data-tooltip")
-                text = infoBar.find('div', class_='rogA2c').text
-
-                """Below three conditons are used to comapre fetched links to compare them with
-                those links that we have write in start"""
+                text = infoBar.find('div', class_='rogA2c').text.strip()
 
                 if data_tooltip == self.comparing_tool_tips["location"]:
-                    address = text.strip()
-
-                elif data_tooltip == self.comparing_tool_tips["website"]:
-                    try:
-                        websiteUrl = infoBar.parent.get("href")
-                    except:
-                        websiteUrl = None
-
+                    address = text
+
                 elif data_tooltip == self.comparing_tool_tips["phone"]:
                     phone = text.strip()
+
+            # Extract website URL
+            try:
+                websiteTag = soup.find("a", {"aria-label": lambda x: x and "Website:" in x})
+                if websiteTag:
+                    websiteUrl = websiteTag.get("href")
+            except:
+                websiteUrl = None
 
-                else:
-                    pass
+            # Extract booking link
+            try:
+                bookingTag = soup.find("a", {"aria-label": lambda x: x and "Open booking link" in x})
+                if bookingTag:
+                    bookingLink = bookingTag.get("href")
+            except:
+                bookingLink = None
+
+            # Extract hours of operation
+            try:
+                hours = soup.find("div", class_="t39EBf").get_text(strip=True)
+            except:
+                hours = None
+
+            # Extract category
+            try:
+                category = soup.find("button", class_="DkEaL").text.strip()
+            except:
+                category = None
+
+            # Extract Google Maps URL
+            try:
+                gmapsUrl = self.driver.current_url
+            except:
+                gmapsUrl = None
+
+
+            # Extract business status
+            try:
+                businessStatus = soup.find("span", class_="ZDu9vd").findChildren("span", recursive=False)[0].get_text(strip=True)
+            except:
+                businessStatus = None
 
             data = {
+                "Category": category,
                 "Name": name,
                 "Phone": phone,
-                "Address": address,
+                "Google Maps URL": gmapsUrl,
                 "Website": websiteUrl,
+                "Business Status": businessStatus,
+                "Address": address,
                 "Total Reviews": totalReviews,
+                "Booking Links": bookingLink,
                 "Rating": rating,
+                "Hours": hours,
             }
 
             self.finalData.append(data)
 
-        except Exception as e:  # some resuts have no information , so we dont want them in our pretty cleaned list
-            Communicator.show_error_message(f"Error occured while parsing a location. Error is: {str(e)}.", ERROR_CODES['ERR_WHILE_PARSING_DETAILS'])
-
+        except Exception as e:
+            Communicator.show_error_message(f"Error occurred while parsing a location. Error is: {str(e)}", ERROR_CODES['ERR_WHILE_PARSING_DETAILS'])
 
     def main(self, allResultsLinks):
         Communicator.show_message("Scrolling is done. Now going to scrape each location")
@@ -113,9 +144,8 @@ def main(self, allResultsLinks):
             self.parse()
 
         except Exception as e:
-            Communicator.show_message(f"Error occured while parsing the locations. Error: {str(e)}")
+            Communicator.show_message(f"Error occurred while parsing the locations. Error: {str(e)}")
 
         finally:
             self.init_data_saver()
             self.data_saver.save(datalist=self.finalData)
-
```
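The `Website` and booking-link fields are now located by matching `aria-label` text with a callable filter instead of walking up from the tooltip button, which is the fix behind the `Website` changelog entry. A self-contained illustration of that BeautifulSoup idiom, using made-up HTML:

```python
from bs4 import BeautifulSoup

# Made-up HTML standing in for the Google Maps info sheet.
html = """
<a aria-label="Website: example.com" href="https://example.com">site</a>
<a href="https://no-label.example">unlabeled</a>
"""
soup = BeautifulSoup(html, "html.parser")

# A callable attribute filter: bs4 calls it with each tag's aria-label
# value, or with None when the tag has no aria-label at all.
websiteTag = soup.find("a", {"aria-label": lambda x: x and "Website:" in x})
print(websiteTag.get("href") if websiteTag else None)  # -> https://example.com
```

The `x and "Website:" in x` guard matters: BeautifulSoup calls the filter with `None` for tags that lack the attribute, and `"Website:" in None` would raise a `TypeError`.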

Google map scraper/scraper/scraper.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -35,7 +35,10 @@ def __init__(self, searchquery, outputformat, healdessmode):
 
         self.init_driver()
         self.scroller = Scroller(driver=self.driver)
+        self.init_communicator()
 
+    def init_communicator(self):
+        Communicator.set_backend_object(self)
 
 
     def init_driver(self):
```
Lines changed: 2 additions & 2 deletions

```diff
@@ -1,9 +1,9 @@
 """
 These are settings of the scraper. To see thier details, please visit:
-https://zubdata.com/tools/google-maps-scraper/
+https://zubdata.com/docs/google-maps-scraper/getting-started/settings/
 """
 
 
-OUTPUT_PATH = "."
+OUTPUT_PATH = "output/"
 
 DRIVER_EXECUTABLE_PATH = None
```
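The trailing slash in the new `OUTPUT_PATH` is load-bearing: `datasaver.py` builds paths by plain string concatenation (`OUTPUT_PATH + filename + extension`), which is also why the old default filename began with a `/` while `OUTPUT_PATH` was `"."`. A quick illustration with a hypothetical filename:

```python
# The saver concatenates rather than using os.path.join, so the
# trailing slash on OUTPUT_PATH is required. Filename is hypothetical.
OUTPUT_PATH = "output/"
filename = "pizza - GMS output"
extension = ".json"

print(OUTPUT_PATH + filename + extension)  # output/pizza - GMS output.json
print("output" + filename + extension)     # outputpizza - GMS output.json (broken)
```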

README.md

Lines changed: 29 additions & 3 deletions

````diff
@@ -1,6 +1,6 @@
 # Zubdata - Google Maps Scraper
 
-## Version: 3.0.0
+## Version: 3.1.0
 
 ## Note:
 **Our all scrapers are working, if you find any issue or bug please open an issue with the detail of issue. We will try to resolve it quickly for you.**
@@ -13,9 +13,35 @@ Documentation can be found at this [link](https://zubdata.com/docs/google-maps-s
 ## Features
 
 - User-friendly graphical interface for easy navigation and interaction. 😊
-- Scrapes various data from Google Maps, such as business names, addresses, phone number, website, ratings, and total reviews.
+- Scrapes various data from Google Maps, such as:
+  - **Category**
+  - **Name**
+  - **Phone Number**
+  - **Google Maps URL**
+  - **Website**
+  - **Address**
+  - **Total Reviews**
+  - **Rating**
+  - **Business Status**
+  - **Booking Links**
+  - **Hours**
 - Fast and efficient 🚀
 
+## Sample Data
+{
+    "Category":"Restaurant",
+    "Name":"Veh\u0101ri Village",
+    "Phone":"0300 0020103",
+    "Google Maps URL":"https:\/\/www.google.com\/maps\/place\/Veh%C4%81ri+Village\/data=!4m7!3m6!1s0x393cc006c30226c7:0xb71394954cfc0b22!8m2!3d30.0558272!4d72.3348188!16s%2Fg%2F11cmp6z14g!19sChIJxyYCwwbAPDkRIgv8TJWUE7c?authuser=0&hl=en&rclk=1",
+    "Website":"https:\/\/www.facebook.com\/VehariVillage\/",
+    "Business Status":"Open\u22c5 Closes 1\u202fam",
+    "Address":"The Royal Gardens Society, Khanewal Vehari Rd, Vehari, Punjab",
+    "Total Reviews":"(347)",
+    "Booking Links":null,
+    "Rating":"4.2 stars ",
+    "Hours":"Friday9\u202fam\u20131\u202famSaturday9\u202fam\u20131\u202famSunday9\u202fam\u201312\u202famMonday9\u202fam\u20131\u202famTuesday9\u202fam\u20131\u202famWednesday9\u202fam\u20131\u202famThursday9\u202fam\u20131\u202famSuggest new hours"
+}
+
 ## Getting Started
 
 To get started with the Google Maps Scraper, follow these steps:
@@ -35,7 +61,7 @@ To get started with the Google Maps Scraper, follow these steps:
 python "Google map scraper\starter.py" start
 ```
 
-`For further helping docs please visit our` [documentation](https://zubdata.com/docs/google-maps-scraper/getting-started/installation/) `page`
+`For further helping docs please visit our` [documentation](https://zubdata.com/docs/google-maps-scraper) `page`
 
 ## Contributing
 
````