import re
import logging
import random
import time

import requests
from bs4 import BeautifulSoup
from typing import Any, Dict, List, Optional
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Pool of realistic desktop and mobile user agents; create_session() picks
# one at random so successive sessions do not all present the same browser.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Mobile/15E148 Safari/604.1',
]

def create_session() -> requests.Session:
    """Create a session with a retry strategy and a random user agent."""
    session = requests.Session()

    # Retry transient failures up to 3 times, backing off 1, 2, then 4
    # seconds, on rate limiting and server-side errors.
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    # Mount the retrying adapter for both HTTP and HTTPS.
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Set a random user agent and browser-like headers.
    session.headers.update({
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
    })

    return session

def parse_price(price_raw: Optional[str]) -> Optional[int]:
    """Strip everything but digits from a raw price string."""
    if not price_raw:
        return None
    price_cleaned = re.sub(r'[^\d]', '', price_raw)
    return int(price_cleaned) if price_cleaned else None
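
# Quick sanity check for parse_price; the sample strings below are assumed
# price formats, not values scraped from the site:
#
#   parse_price("€ 250.000")           -> 250000
#   parse_price("Prezzo su richiesta") -> None  (no digits survive the regex)
#   parse_price(None)                  -> None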

def parse_page(url: str, session: requests.Session) -> Optional[Dict[str, Any]]:
    logging.debug("Parsing page: %s", url)

    try:
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        listings = soup.select('section.re-layoutContentCenter')
        for listing in listings:
            city = listing.select_one('div.re-title__content span.re-blockTitle__location')
            neighbourhood = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(2)')
            road = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(3)')
            price_raw = listing.select_one('div.re-overview__price span')
            price = parse_price(price_raw.text if price_raw else None)

            # Surface area, e.g. "95 m²".
            square_meters_match = re.search(r'(\d+)\s?m²', soup.text)
            square_meters = int(square_meters_match.group(1)) if square_meters_match else None

            # Floor number; "Piano" is Italian for "floor".
            floor_match = re.search(r'Piano\s(\d+)', soup.text)
            floor = int(floor_match.group(1)) if floor_match else None

            # Find the feature item related to parking/garage.
            garage_feature = listing.find('dt', class_='re-featuresItem__title', string="Box, posti auto")

            if garage_feature:
                # Get the associated description (dd).
                garage_description = garage_feature.find_next('dd', class_='re-featuresItem__description')
                garage_info = garage_description.get_text(strip=True) if garage_description else None
            else:
                garage_info = None

            data = {
                "url": url,
                "title": soup.title.string if soup.title else None,
                "content": [p.text.strip() for p in soup.find_all('p')],
                "price": price,
                "city": city.text.strip() if city else None,
                "neighbourhood": neighbourhood.text.strip() if neighbourhood else None,
                "road": road.text.strip() if road else None,
                "square_meters": square_meters,
                "floor": floor,
                "garage_info": garage_info,
            }
            # A detail page carries a single listing section, so return as
            # soon as the first one has been parsed.
            return data

        # No listing section found on the page.
        return None

    except requests.RequestException as e:
        logging.error("Error parsing page %s: %s", url, e)
        return None
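
# For reference, the dt/dd pair that parse_page's garage lookup expects looks
# roughly like this (assumed markup, reconstructed from the selectors above;
# the dd text is an invented example):
#
#   <dt class="re-featuresItem__title">Box, posti auto</dt>
#   <dd class="re-featuresItem__description">1 in box privato</dd>
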
def parse_listing(url: str) -> List[Dict[str, Any]]:
    logging.debug("Fetching main listing page: %s", url)

    session = create_session()
    data_list = []

    try:
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = soup.select('a.in-listingCardTitle')

        for link in links:
            absolute_url = requests.compat.urljoin(url, link['href'])
            logging.debug("Following link: %s", absolute_url)

            # Add a random delay between requests (1-3 seconds).
            time.sleep(random.uniform(1, 3))

            # Use a fresh session, and therefore a fresh random user agent,
            # for every detail page.
            page_session = create_session()

            result = parse_page(absolute_url, page_session)
            if result:
                data_list.append(result)

    except requests.RequestException as e:
        logging.error("Error fetching listing page %s: %s", url, e)

    return data_list
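
# Minimal usage sketch. The search URL is a placeholder assumption: substitute
# any results page whose listing cards use the 'a.in-listingCardTitle'
# selector that parse_listing expects.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    results = parse_listing("https://example.com/search-results")  # placeholder URL
    for item in results:
        print(item["price"], item["city"], item["road"])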