
Commit 1e20ac0

Merge pull request #22 from Zubdata/pr-21
Add Fields, Fix Bugs, Update Output and Adjust Documentation
2 parents f294fd0 + d48d2e9 commit 1e20ac0

8 files changed: +139 additions, −53 deletions

.gitignore

Lines changed: 4 additions & 0 deletions

```diff
@@ -106,3 +106,7 @@ Thumbs.db
 .pdm.toml
 .pdm.lock
 .pdm.cache/
+
+
+# output folder
+output/
```

CHANGELOG.md

Lines changed: 11 additions & 0 deletions

```diff
@@ -2,6 +2,17 @@
 
 All notable changes to this project will be documented in this file.
 
+## [3.1.0] - 2024-04-26
+### Added
+- Added additional data fields
+
+### Fixed
+- Fixed extraction of `Total Reviews` and `Website` data fields
+
+### Changed
+- Updated logic of parsing data fields
+- Changed output data file name
+
 ## [3.0.0] - 2024-04-26
 
 ### Fixed
```

Google map scraper/scraper/communicator.py

Lines changed: 10 additions & 1 deletion

```diff
@@ -3,6 +3,7 @@
 class Communicator:
 
     __frontend_object = None
+    __backend_object = None
 
     @classmethod
     def show_message(cls, message):
@@ -32,4 +33,12 @@ def end_processing(cls):
 
     @classmethod
     def get_output_format(cls):
-        return cls.__frontend_object.outputFormatValue
+        return cls.__frontend_object.outputFormatValue
+
+    @classmethod
+    def set_backend_object(cls, backend_object):
+        cls.__backend_object = backend_object
+
+    @classmethod
+    def get_search_query(cls):
+        return cls.__backend_object.searchquery
```
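The two new classmethods complete a small mediator: the backend registers itself once, and downstream components such as `DataSaver` can then read the search query without importing the backend directly. A minimal, self-contained sketch of the pattern (`FakeBackend` and the query string are hypothetical stand-ins, not code from this repository):

```python
# Minimal sketch of the mediator pattern added in this commit.
# FakeBackend and the query string are hypothetical stand-ins.

class Communicator:
    __backend_object = None

    @classmethod
    def set_backend_object(cls, backend_object):
        # Called once by the backend after it is constructed.
        cls.__backend_object = backend_object

    @classmethod
    def get_search_query(cls):
        # Any component can now read the query without a backend import.
        return cls.__backend_object.searchquery


class FakeBackend:
    def __init__(self, searchquery):
        self.searchquery = searchquery


Communicator.set_backend_object(FakeBackend("coffee shops in Lahore"))
print(Communicator.get_search_query())  # -> coffee shops in Lahore
```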

Google map scraper/scraper/datasaver.py

Lines changed: 7 additions & 4 deletions

```diff
@@ -9,7 +9,6 @@
 import os
 from .error_codes import ERROR_CODES
 
-
 class DataSaver:
     def __init__(self) -> None:
         self.outputFormat = Communicator.get_output_format()
@@ -27,21 +26,25 @@ def save(self, datalist):
         dataFrame = pd.DataFrame(datalist)
         totalRecords = dataFrame.shape[0]
 
-        filename = "/gms output"
+        searchQuery = Communicator.get_search_query()
+        filename = f"{searchQuery} - GMS output"
 
         if self.outputFormat == "excel":
             extension = ".xlsx"
         elif self.outputFormat == "csv":
             extension = ".csv"
         elif self.outputFormat == "json":
             extension = ".json"
-
+
+        # Create the output directory if it does not exist
+        if not os.path.exists(OUTPUT_PATH):
+            os.makedirs(OUTPUT_PATH)
         joinedPath = OUTPUT_PATH + filename + extension
 
         if os.path.exists(joinedPath):
             index = 1
             while True:
-                filename = f"/gms output{index}"
+                filename = f"{searchQuery} - GMS output ({index})"
 
                 joinedPath = OUTPUT_PATH + filename + extension
 
```
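Together, these changes name output files after the search query and dodge collisions by appending an index. The hunk cuts off before the loop's increment and exit, so the following is a sketch of the intended behavior rather than the exact repository code; the query, the extension, and the `exist_ok=True` shortcut are assumptions:

```python
import os

# Sketch of the new naming scheme (hypothetical query and extension).
OUTPUT_PATH = "output/"
searchQuery = "restaurants in Vehari"
extension = ".csv"

# Equivalent to the explicit existence check in the diff, in one call.
os.makedirs(OUTPUT_PATH, exist_ok=True)

filename = f"{searchQuery} - GMS output"
joinedPath = OUTPUT_PATH + filename + extension

# On a name clash, append an incrementing index until the path is free.
index = 1
while os.path.exists(joinedPath):
    filename = f"{searchQuery} - GMS output ({index})"
    joinedPath = OUTPUT_PATH + filename + extension
    index += 1

print(joinedPath)  # e.g. output/restaurants in Vehari - GMS output.csv
```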
Google map scraper/scraper/parser.py

Lines changed: 73 additions & 43 deletions

```diff
@@ -1,20 +1,21 @@
 from bs4 import BeautifulSoup
+from selenium import webdriver
 from .error_codes import ERROR_CODES
 from .communicator import Communicator
 from .datasaver import DataSaver
 from .base import Base
 from .common import Common
 
-
 class Parser(Base):
 
     def __init__(self, driver) -> None:
         self.driver = driver
         self.finalData = []
         self.comparing_tool_tips = {
-            "location": """Copy address""",
-            "phone": """Copy phone number""",
-            "website": """Open website""",
+            "location": "Copy address",
+            "phone": "Copy phone number",
+            "website": "Open website",
+            "booking": "Open booking link",
         }
 
     def init_data_saver(self):
@@ -31,75 +32,105 @@ def parse(self):
         infoSheet = self.driver.execute_script(
             """return document.querySelector("[role='main']")""")
         try:
-            """If that information sheet is founded in try block, this block will run and find the contact details"""
-            rating, totalReviews, address, websiteUrl, phone = (
-                None,
-                None,
-                None,
-                None,
-                None,
-            )  # by default they will be none
+            # Initialize data points
+            rating, totalReviews, address, websiteUrl, phone, hours, category, gmapsUrl, bookingLink, businessStatus = (
+                None, None, None, None, None, None, None, None, None, None
+            )
 
             html = infoSheet.get_attribute("outerHTML")
-            soup = BeautifulSoup(
-                html, "html.parser"
-            )  # soup of information sheet of that place
+            soup = BeautifulSoup(html, "html.parser")
 
+            # Extract rating
             try:
                 rating = soup.find("span", class_="ceNzKf").get("aria-label")
-
-            except:  # if a business does not has rating
+                rating = rating.replace("stars", "").strip()
+            except:
                 rating = None
 
+            # Extract total reviews
             try:
                 totalReviews = list(soup.find("div", class_="F7nice").children)
-                totalReviews = totalReviews[1].get_text(
-                    strip=True
-                )
-
+                totalReviews = totalReviews[1].get_text(strip=True)
             except:
                 totalReviews = None
 
-            name = soup.select_one(selector=".tAiQdd h1.DUwDvf").text.strip()
+            # Extract name
+            try:
+                name = soup.select_one(".tAiQdd h1.DUwDvf").text.strip()
+            except:
+                name = None
 
+            # Extract address, website, phone, and appointment link
             allInfoBars = soup.find_all("button", class_="CsEnBe")
-
             for infoBar in allInfoBars:
                 data_tooltip = infoBar.get("data-tooltip")
-                text = infoBar.find('div', class_='rogA2c').text
-
-                """Below three conditons are used to comapre fetched links to compare them with
-                those links that we have write in start"""
+                text = infoBar.find('div', class_='rogA2c').text.strip()
 
                 if data_tooltip == self.comparing_tool_tips["location"]:
-                    address = text.strip()
-
-                elif data_tooltip == self.comparing_tool_tips["website"]:
-                    try:
-                        websiteUrl = infoBar.parent.get("href")
-                    except:
-                        websiteUrl = None
-
+                    address = text
+
                 elif data_tooltip == self.comparing_tool_tips["phone"]:
                     phone = text.strip()
+
+            # Extract website URL
+            try:
+                websiteTag = soup.find("a", {"aria-label": lambda x: x and "Website:" in x})
+                if websiteTag:
+                    websiteUrl = websiteTag.get("href")
+            except:
+                websiteUrl = None
 
-                else:
-                    pass
+            # Extract booking link
+            try:
+                bookingTag = soup.find("a", {"aria-label": lambda x: x and "Open booking link" in x})
+                if bookingTag:
+                    bookingLink = bookingTag.get("href")
+            except:
+                bookingLink = None
+
+            # Extract hours of operation
+            try:
+                hours = soup.find("div", class_="t39EBf").get_text(strip=True)
+            except:
+                hours = None
+
+            # Extract category
+            try:
+                category = soup.find("button", class_="DkEaL").text.strip()
+            except:
+                category = None
+
+            # Extract Google Maps URL
+            try:
+                gmapsUrl = self.driver.current_url
+            except:
+                gmapsUrl = None
+
+
+            # Extract business status
+            try:
+                businessStatus = soup.find("span", class_="ZDu9vd").findChildren("span", recursive=False)[0].get_text(strip=True)
+            except:
+                businessStatus = None
 
             data = {
+                "Category": category,
                 "Name": name,
                 "Phone": phone,
-                "Address": address,
+                "Google Maps URL": gmapsUrl,
                 "Website": websiteUrl,
+                "Business Status": businessStatus,
+                "Address": address,
                 "Total Reviews": totalReviews,
+                "Booking Links": bookingLink,
                 "Rating": rating,
+                "Hours": hours,
             }
 
             self.finalData.append(data)
 
-        except Exception as e:  # some resuts have no information , so we dont want them in our pretty cleaned list
-            Communicator.show_error_message(f"Error occured while parsing a location. Error is: {str(e)}.", ERROR_CODES['ERR_WHILE_PARSING_DETAILS'])
-
+        except Exception as e:
+            Communicator.show_error_message(f"Error occurred while parsing a location. Error is: {str(e)}", ERROR_CODES['ERR_WHILE_PARSING_DETAILS'])
 
     def main(self, allResultsLinks):
         Communicator.show_message("Scrolling is done. Now going to scrape each location")
@@ -113,9 +144,8 @@ def main(self, allResultsLinks):
             self.parse()
 
         except Exception as e:
-            Communicator.show_message(f"Error occured while parsing the locations. Error: {str(e)}")
+            Communicator.show_message(f"Error occurred while parsing the locations. Error: {str(e)}")
 
         finally:
             self.init_data_saver()
             self.data_saver.save(datalist=self.finalData)
-
```
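The `Website` and booking-link fields are now located by matching `aria-label` text with a callable filter instead of walking up from the tooltip button, which is the fix behind the `Website` changelog entry. A self-contained illustration of that BeautifulSoup idiom, using made-up HTML:

```python
from bs4 import BeautifulSoup

# Made-up HTML standing in for the Google Maps info sheet.
html = """
<a aria-label="Website: example.com" href="https://example.com">site</a>
<a href="https://no-label.example">unlabeled</a>
"""
soup = BeautifulSoup(html, "html.parser")

# A callable attribute filter: bs4 calls it with each tag's aria-label
# value, or with None when the tag has no aria-label at all.
websiteTag = soup.find("a", {"aria-label": lambda x: x and "Website:" in x})
print(websiteTag.get("href") if websiteTag else None)  # -> https://example.com
```

The `x and "Website:" in x` guard matters: BeautifulSoup calls the filter with `None` for tags that lack the attribute, and `"Website:" in None` would raise a `TypeError`.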

Google map scraper/scraper/scraper.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -35,7 +35,10 @@ def __init__(self, searchquery, outputformat, healdessmode):
 
         self.init_driver()
         self.scroller = Scroller(driver=self.driver)
+        self.init_communicator()
 
+    def init_communicator(self):
+        Communicator.set_backend_object(self)
 
 
     def init_driver(self):
```
Lines changed: 2 additions & 2 deletions

```diff
@@ -1,9 +1,9 @@
 """
 These are settings of the scraper. To see thier details, please visit:
-https://zubdata.com/tools/google-maps-scraper/
+https://zubdata.com/docs/google-maps-scraper/getting-started/settings/
 """
 
 
-OUTPUT_PATH = "."
+OUTPUT_PATH = "output/"
 
 DRIVER_EXECUTABLE_PATH = None
```
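The trailing slash in the new `OUTPUT_PATH` is load-bearing: `datasaver.py` builds paths by plain string concatenation (`OUTPUT_PATH + filename + extension`), which is also why the old default filename began with a `/` while `OUTPUT_PATH` was `"."`. A quick illustration with a hypothetical filename:

```python
# The saver concatenates rather than using os.path.join, so the
# trailing slash on OUTPUT_PATH is required. Filename is hypothetical.
OUTPUT_PATH = "output/"
filename = "pizza - GMS output"
extension = ".json"

print(OUTPUT_PATH + filename + extension)  # output/pizza - GMS output.json
print("output" + filename + extension)     # outputpizza - GMS output.json (broken)
```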

README.md

Lines changed: 29 additions & 3 deletions

````diff
@@ -1,6 +1,6 @@
 # Zubdata - Google Maps Scraper
 
-## Version: 3.0.0
+## Version: 3.1.0
 
 ## Note:
 **Our all scrapers are working, if you find any issue or bug please open an issue with the detail of issue. We will try to resolve it quickly for you.**
@@ -13,9 +13,35 @@ Documentation can be found at this [link](https://zubdata.com/docs/google-maps-s
 ## Features
 
 - User-friendly graphical interface for easy navigation and interaction. 😊
-- Scrapes various data from Google Maps, such as business names, addresses, phone number, website, ratings, and total reviews.
+- Scrapes various data from Google Maps, such as:
+  - **Category**
+  - **Name**
+  - **Phone Number**
+  - **Google Maps URL**
+  - **Website**
+  - **Address**
+  - **Total Reviews**
+  - **Rating**
+  - **Business Status**
+  - **Booking Links**
+  - **Hours**
 - Fast and efficient 🚀
 
+## Sample Data
+{
+    "Category":"Restaurant",
+    "Name":"Veh\u0101ri Village",
+    "Phone":"0300 0020103",
+    "Google Maps URL":"https:\/\/www.google.com\/maps\/place\/Veh%C4%81ri+Village\/data=!4m7!3m6!1s0x393cc006c30226c7:0xb71394954cfc0b22!8m2!3d30.0558272!4d72.3348188!16s%2Fg%2F11cmp6z14g!19sChIJxyYCwwbAPDkRIgv8TJWUE7c?authuser=0&hl=en&rclk=1",
+    "Website":"https:\/\/www.facebook.com\/VehariVillage\/",
+    "Business Status":"Open\u22c5 Closes 1\u202fam",
+    "Address":"The Royal Gardens Society, Khanewal Vehari Rd, Vehari, Punjab",
+    "Total Reviews":"(347)",
+    "Booking Links":null,
+    "Rating":"4.2 stars ",
+    "Hours":"Friday9\u202fam\u20131\u202famSaturday9\u202fam\u20131\u202famSunday9\u202fam\u201312\u202famMonday9\u202fam\u20131\u202famTuesday9\u202fam\u20131\u202famWednesday9\u202fam\u20131\u202famThursday9\u202fam\u20131\u202famSuggest new hours"
+}
+
 ## Getting Started
 
 To get started with the Google Maps Scraper, follow these steps:
@@ -35,7 +61,7 @@ To get started with the Google Maps Scraper, follow these steps:
 python "Google map scraper\starter.py" start
 ```
 
-`For further helping docs please visit our` [documentation](https://zubdata.com/docs/google-maps-scraper/getting-started/installation/) `page`
+`For further helping docs please visit our` [documentation](https://zubdata.com/docs/google-maps-scraper) `page`
 
 ## Contributing
 
````