
Commit cdfdc29

Author: Adriano Sanges (committed)
Enhance web scraping reliability and performance
- Implement robust request handling with a retry strategy and random user agents
- Add session management to improve request reliability
- Introduce random delays between requests to prevent server blocking
- Improve error handling and logging for network requests
- Refactor parse_page() and parse_listing() to use more resilient scraping techniques
1 parent c1bf1c6 commit cdfdc29
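
The retry strategy mentioned in the first bullet relies on urllib3's exponential backoff. A minimal sketch of the delay schedule that the new create_session() configures, assuming urllib3 2.x semantics where the n-th retry waits backoff_factor * 2 ** (n - 1) seconds (1.x skips the first wait):

# Sketch only: reproduces the schedule implied by
# Retry(total=3, backoff_factor=1) under urllib3 2.x semantics.
backoff_factor = 1
total = 3

for n in range(1, total + 1):
    delay = backoff_factor * 2 ** (n - 1)
    print(f"retry {n}: wait ~{delay}s before re-requesting")
# -> retry 1: ~1s, retry 2: ~2s, retry 3: ~4s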

File tree

1 file changed: 112 additions, 65 deletions

real-estate-etl/scraper.py

Lines changed: 112 additions & 65 deletions
@@ -1,19 +1,42 @@
 import re
 import logging
 import requests
+import random
+import time
 from bs4 import BeautifulSoup
 from typing import Optional, Dict, List
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry

+# List of diverse user agents
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/121.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Mobile/15E148 Safari/604.1'
+]

-def parse_price(price_raw: Optional[str]) -> Optional[int]:
-    if not price_raw:
-        return None
-    price_cleaned = re.sub(r'[^\d]', '', price_raw)
-    return int(price_cleaned) if price_cleaned else None
-
-def parse_page(url: str) -> Dict[str, Optional[any]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+def create_session() -> requests.Session:
+    """Create a session with retry strategy and random user agent."""
+    session = requests.Session()
+
+    # Configure retry strategy
+    retry_strategy = Retry(
+        total=3,  # number of retries
+        backoff_factor=1,  # wait 1, 2, 4 seconds between retries
+        status_forcelist=[429, 500, 502, 503, 504],  # status codes to retry on
+    )
+
+    # Mount the adapter with retry strategy
+    adapter = HTTPAdapter(max_retries=retry_strategy)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+
+    # Set random user agent and other headers
+    session.headers.update({
+        'User-Agent': random.choice(USER_AGENTS),
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
         'Accept-Language': 'en-US,en;q=0.9',
         'Accept-Encoding': 'gzip, deflate, br',
@@ -23,69 +46,93 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
         'Sec-Fetch-Mode': 'navigate',
         'Sec-Fetch-Site': 'none',
         'Sec-Fetch-User': '?1',
-    }
+    })
+
+    return session
+
+def parse_price(price_raw: Optional[str]) -> Optional[int]:
+    if not price_raw:
+        return None
+    price_cleaned = re.sub(r'[^\d]', '', price_raw)
+    return int(price_cleaned) if price_cleaned else None
+
+def parse_page(url: str, session: requests.Session) -> Dict[str, Optional[any]]:
     logging.debug("Parsing page: %s", url)
-    response = requests.get(url, headers=headers)
-    soup = BeautifulSoup(response.text, 'html.parser')

-    listings = soup.select('section.re-layoutContentCenter')
-    for listing in listings:
-        city = listing.select_one('div.re-title__content span.re-blockTitle__location')
-        neighbourhood = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(2)')
-        road = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(3)')
-        price_raw = listing.select_one('div.re-overview__price span')
-        price = parse_price(price_raw.text if price_raw else None)
-
-        square_meters_match = re.search(r'(\d+)\s?m²', soup.text)
-        square_meters = int(square_meters_match.group(1)) if square_meters_match else None
+    try:
+        response = session.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')

-        floor_match = re.search(r'Piano\s(\d+)', soup.text)
-        floor = int(floor_match.group(1)) if floor_match else None
+        listings = soup.select('section.re-layoutContentCenter')
+        for listing in listings:
+            city = listing.select_one('div.re-title__content span.re-blockTitle__location')
+            neighbourhood = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(2)')
+            road = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(3)')
+            price_raw = listing.select_one('div.re-overview__price span')
+            price = parse_price(price_raw.text if price_raw else None)
+
+            square_meters_match = re.search(r'(\d+)\s?m²', soup.text)
+            square_meters = int(square_meters_match.group(1)) if square_meters_match else None
+
+            floor_match = re.search(r'Piano\s(\d+)', soup.text)
+            floor = int(floor_match.group(1)) if floor_match else None

-        # Find the feature item related to parking/garage
-        garage_feature = listing.find('dt', class_='re-featuresItem__title', string="Box, posti auto")
+            # Find the feature item related to parking/garage
+            garage_feature = listing.find('dt', class_='re-featuresItem__title', string="Box, posti auto")

-        if garage_feature:
-            # Get the associated description (dd)
-            garage_description = garage_feature.find_next('dd', class_='re-featuresItem__description')
-            garage_info = garage_description.get_text(strip=True) if garage_description else None
-        else:
-            garage_info = None
-
-        data = {
-            "url": url,
-            "title": soup.title.string if soup.title else None,
-            "content": [p.text.strip() for p in soup.find_all('p')],
-            "price": price,
-            "city": city.text.strip() if city else None,
-            "neighbourhood": neighbourhood.text.strip() if neighbourhood else None,
-            "road": road.text.strip() if road else None,
-            "square_meters": square_meters,
-            "floor": floor,
-            "garage_info": garage_info,
-        }
-        return data
+            if garage_feature:
+                garage_description = garage_feature.find_next('dd', class_='re-featuresItem__description')
+                garage_info = garage_description.get_text(strip=True) if garage_description else None
+            else:
+                garage_info = None
+
+            data = {
+                "url": url,
+                "title": soup.title.string if soup.title else None,
+                "content": [p.text.strip() for p in soup.find_all('p')],
+                "price": price,
+                "city": city.text.strip() if city else None,
+                "neighbourhood": neighbourhood.text.strip() if neighbourhood else None,
+                "road": road.text.strip() if road else None,
+                "square_meters": square_meters,
+                "floor": floor,
+                "garage_info": garage_info,
+            }
+            return data
+
+    except requests.RequestException as e:
+        logging.error(f"Error parsing page {url}: {str(e)}")
+        return None

 def parse_listing(url: str) -> List[Dict[str, Optional[any]]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.9',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'none',
-        'Sec-Fetch-User': '?1',
-    }
     logging.debug("Fetching main listing page: %s", url)
-    response = requests.get(url, headers=headers)
-    soup = BeautifulSoup(response.text, 'html.parser')
+
+    session = create_session()
     data_list = []
-    links = soup.select('a.in-listingCardTitle')
-    for link in links:
-        absolute_url = requests.compat.urljoin(url, link['href'])
-        logging.debug("Following link: %s", absolute_url)
-        data_list.append(parse_page(absolute_url))
+
+    try:
+        response = session.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        links = soup.select('a.in-listingCardTitle')
+
+        for link in links:
+            absolute_url = requests.compat.urljoin(url, link['href'])
+            logging.debug("Following link: %s", absolute_url)
+
+            # Add a random delay between requests (1-3 seconds)
+            time.sleep(random.uniform(1, 3))
+
+            # Get a new session with a different user agent for each request
+            page_session = create_session()
+
+            result = parse_page(absolute_url, page_session)
+            if result:
+                data_list.append(result)
+
+    except requests.RequestException as e:
+        logging.error(f"Error fetching listing page {url}: {str(e)}")
+
     return data_list
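
Usage note: a minimal driver for the refactored entry point could look like the sketch below; the search URL and logging setup are illustrative assumptions, not part of the commit.

import json
import logging

from scraper import parse_listing  # the module changed in this commit

logging.basicConfig(level=logging.DEBUG)

# Hypothetical search-results URL; any page exposing 'a.in-listingCardTitle'
# links is handled the same way.
results = parse_listing("https://www.example.com/vendita-case/milano/")

# Pages that still fail after retries are skipped by parse_listing(), so
# every entry is a fully parsed listing dict of JSON-safe values.
print(json.dumps(results, ensure_ascii=False, indent=2))

One trade-off worth noting: creating a fresh session per detail page rotates the user agent but gives up connection reuse; if rotation were unnecessary, reusing the outer session would keep the keep-alive benefit.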
