
Commit cdfdc29

Author: Adriano Sanges (committed)
Enhance web scraping reliability and performance
- Implement robust request handling with a retry strategy and random user agents
- Add session management to improve request reliability
- Introduce random delays between requests to prevent server blocking
- Improve error handling and logging for network requests
- Refactor parse_page() and parse_listing() to use more resilient scraping techniques
1 parent c1bf1c6 commit cdfdc29
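
The retry strategy mentioned in the first bullet relies on urllib3's exponential backoff. A minimal sketch of the delay schedule that the new create_session() configures, assuming urllib3 2.x semantics where the n-th retry waits backoff_factor * 2 ** (n - 1) seconds (1.x skips the first wait):

# Sketch only: reproduces the schedule implied by
# Retry(total=3, backoff_factor=1) under urllib3 2.x semantics.
backoff_factor = 1
total = 3

for n in range(1, total + 1):
    delay = backoff_factor * 2 ** (n - 1)
    print(f"retry {n}: wait ~{delay}s before re-requesting")
# -> retry 1: ~1s, retry 2: ~2s, retry 3: ~4s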

File tree

1 file changed: 112 additions, 65 deletions

real-estate-etl/scraper.py

Lines changed: 112 additions & 65 deletions
@@ -1,19 +1,42 @@
 import re
 import logging
 import requests
+import random
+import time
 from bs4 import BeautifulSoup
 from typing import Optional, Dict, List
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry

+# List of diverse user agents
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/121.0.0.0 Safari/537.36',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Mobile/15E148 Safari/604.1'
+]

-def parse_price(price_raw: Optional[str]) -> Optional[int]:
-    if not price_raw:
-        return None
-    price_cleaned = re.sub(r'[^\d]', '', price_raw)
-    return int(price_cleaned) if price_cleaned else None
-
-def parse_page(url: str) -> Dict[str, Optional[any]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+def create_session() -> requests.Session:
+    """Create a session with retry strategy and random user agent."""
+    session = requests.Session()
+
+    # Configure retry strategy
+    retry_strategy = Retry(
+        total=3,  # number of retries
+        backoff_factor=1,  # wait 1, 2, 4 seconds between retries
+        status_forcelist=[429, 500, 502, 503, 504],  # status codes to retry on
+    )
+
+    # Mount the adapter with retry strategy
+    adapter = HTTPAdapter(max_retries=retry_strategy)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+
+    # Set random user agent and other headers
+    session.headers.update({
+        'User-Agent': random.choice(USER_AGENTS),
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
         'Accept-Language': 'en-US,en;q=0.9',
         'Accept-Encoding': 'gzip, deflate, br',
@@ -23,69 +46,93 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
         'Sec-Fetch-Mode': 'navigate',
         'Sec-Fetch-Site': 'none',
         'Sec-Fetch-User': '?1',
-    }
+    })
+
+    return session
+
+def parse_price(price_raw: Optional[str]) -> Optional[int]:
+    if not price_raw:
+        return None
+    price_cleaned = re.sub(r'[^\d]', '', price_raw)
+    return int(price_cleaned) if price_cleaned else None
+
+def parse_page(url: str, session: requests.Session) -> Dict[str, Optional[any]]:
     logging.debug("Parsing page: %s", url)
-    response = requests.get(url, headers=headers)
-    soup = BeautifulSoup(response.text, 'html.parser')

-    listings = soup.select('section.re-layoutContentCenter')
-    for listing in listings:
-        city = listing.select_one('div.re-title__content span.re-blockTitle__location')
-        neighbourhood = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(2)')
-        road = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(3)')
-        price_raw = listing.select_one('div.re-overview__price span')
-        price = parse_price(price_raw.text if price_raw else None)
-
-        square_meters_match = re.search(r'(\d+)\s?m²', soup.text)
-        square_meters = int(square_meters_match.group(1)) if square_meters_match else None
+    try:
+        response = session.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')

-        floor_match = re.search(r'Piano\s(\d+)', soup.text)
-        floor = int(floor_match.group(1)) if floor_match else None
+        listings = soup.select('section.re-layoutContentCenter')
+        for listing in listings:
+            city = listing.select_one('div.re-title__content span.re-blockTitle__location')
+            neighbourhood = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(2)')
+            road = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(3)')
+            price_raw = listing.select_one('div.re-overview__price span')
+            price = parse_price(price_raw.text if price_raw else None)
+
+            square_meters_match = re.search(r'(\d+)\s?m²', soup.text)
+            square_meters = int(square_meters_match.group(1)) if square_meters_match else None
+
+            floor_match = re.search(r'Piano\s(\d+)', soup.text)
+            floor = int(floor_match.group(1)) if floor_match else None

-        # Find the feature item related to parking/garage
-        garage_feature = listing.find('dt', class_='re-featuresItem__title', string="Box, posti auto")
+            # Find the feature item related to parking/garage
+            garage_feature = listing.find('dt', class_='re-featuresItem__title', string="Box, posti auto")

-        if garage_feature:
-            # Get the associated description (dd)
-            garage_description = garage_feature.find_next('dd', class_='re-featuresItem__description')
-            garage_info = garage_description.get_text(strip=True) if garage_description else None
-        else:
-            garage_info = None
-
-        data = {
-            "url": url,
-            "title": soup.title.string if soup.title else None,
-            "content": [p.text.strip() for p in soup.find_all('p')],
-            "price": price,
-            "city": city.text.strip() if city else None,
-            "neighbourhood": neighbourhood.text.strip() if neighbourhood else None,
-            "road": road.text.strip() if road else None,
-            "square_meters": square_meters,
-            "floor": floor,
-            "garage_info": garage_info,
-        }
-        return data
+            if garage_feature:
+                garage_description = garage_feature.find_next('dd', class_='re-featuresItem__description')
+                garage_info = garage_description.get_text(strip=True) if garage_description else None
+            else:
+                garage_info = None
+
+            data = {
+                "url": url,
+                "title": soup.title.string if soup.title else None,
+                "content": [p.text.strip() for p in soup.find_all('p')],
+                "price": price,
+                "city": city.text.strip() if city else None,
+                "neighbourhood": neighbourhood.text.strip() if neighbourhood else None,
+                "road": road.text.strip() if road else None,
+                "square_meters": square_meters,
+                "floor": floor,
+                "garage_info": garage_info,
+            }
+            return data
+
+    except requests.RequestException as e:
+        logging.error(f"Error parsing page {url}: {str(e)}")
+        return None

 def parse_listing(url: str) -> List[Dict[str, Optional[any]]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.9',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'none',
-        'Sec-Fetch-User': '?1',
-    }
     logging.debug("Fetching main listing page: %s", url)
-    response = requests.get(url, headers=headers)
-    soup = BeautifulSoup(response.text, 'html.parser')
+
+    session = create_session()
     data_list = []
-    links = soup.select('a.in-listingCardTitle')
-    for link in links:
-        absolute_url = requests.compat.urljoin(url, link['href'])
-        logging.debug("Following link: %s", absolute_url)
-        data_list.append(parse_page(absolute_url))
+
+    try:
+        response = session.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        links = soup.select('a.in-listingCardTitle')
+
+        for link in links:
+            absolute_url = requests.compat.urljoin(url, link['href'])
+            logging.debug("Following link: %s", absolute_url)
+
+            # Add a random delay between requests (1-3 seconds)
+            time.sleep(random.uniform(1, 3))
+
+            # Get a new session with a different user agent for each request
+            page_session = create_session()
+
+            result = parse_page(absolute_url, page_session)
+            if result:
+                data_list.append(result)
+
+    except requests.RequestException as e:
+        logging.error(f"Error fetching listing page {url}: {str(e)}")
+
     return data_list
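
Usage note: a minimal driver for the refactored entry point could look like the sketch below; the search URL and logging setup are illustrative assumptions, not part of the commit.

import json
import logging

from scraper import parse_listing  # the module changed in this commit

logging.basicConfig(level=logging.DEBUG)

# Hypothetical search-results URL; any page exposing 'a.in-listingCardTitle'
# links is handled the same way.
results = parse_listing("https://www.example.com/vendita-case/milano/")

# Pages that still fail after retries are skipped by parse_listing(), so
# every entry is a fully parsed listing dict of JSON-safe values.
print(json.dumps(results, ensure_ascii=False, indent=2))

One trade-off worth noting: creating a fresh session per detail page rotates the user agent but gives up connection reuse; if rotation were unnecessary, reusing the outer session would keep the keep-alive benefit.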
