55from typing import Optional , Dict , List
66
77
def parse_price(price_str: Optional[str]) -> Optional[int]:
    """Return the whole-euro amount found in a listing price string.

    The text is read using the Italian number convention: ``.`` groups
    thousands and ``,`` introduces the decimal part, so ``"€ 1.200.000,50"``
    yields ``1200000``. Any decimal part is discarded (truncated, not
    rounded).

    Args:
        price_str: Raw price text, possibly with a currency symbol or
            surrounding words; ``None`` or an empty string is accepted.

    Returns:
        The integer part of the first number found, or ``None`` when the
        input is empty or contains no digits.
    """
    if not price_str:
        return None
    # Dropping the thousands separators makes the integer part a single
    # contiguous digit run; the decimal comma (or any other character)
    # then terminates that run, so no explicit decimal handling is needed.
    without_grouping = price_str.replace('.', '')
    # re.search skips any leading non-digit text such as the euro sign.
    price_match = re.search(r'\d+', without_grouping)
    return int(price_match.group(0)) if price_match else None
13+
14+
1315
1416def parse_page (url : str ) -> Dict [str , Optional [any ]]:
1517 headers = {
@@ -26,45 +28,63 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
2628 }
2729 logging .debug ("Parsing page: %s" , url )
2830 response = requests .get (url , headers = headers )
31+
2932 soup = BeautifulSoup (response .text , 'html.parser' )
3033
31- listings = soup .select ('section.re -layoutContentCenter' )
34+ listings = soup .select ('section.ld -layoutContentCenter' )
3235 for listing in listings :
33- city = listing .select_one ('div.re-title__content span.re-blockTitle__location' )
34- neighbourhood = listing .select_one ('div.re-title__content span.re-blockTitle__location:nth-of-type(2)' )
35- road = listing .select_one ('div.re-title__content span.re-blockTitle__location:nth-of-type(3)' )
36- price_raw = listing .select_one ('div.re-overview__price span' )
37- price = parse_price (price_raw .text if price_raw else None )
36+ # Extract title, city, neighborhood, road
37+ title = soup .find ('meta' , property = 'og:title' )
38+ city , neighbourhood , road = None , None , None
39+ if title :
40+ location_parts = title ["content" ].split ('|' )[0 ].split (', ' )
41+ location_parts = [part .strip () for part in location_parts ]
42+ if len (location_parts ) >= 3 :
43+ road , neighbourhood , city = location_parts [:3 ]
44+ elif len (location_parts ) == 2 :
45+ neighbourhood , city = location_parts [:2 ]
46+ elif len (location_parts ) == 1 :
47+ city = location_parts [0 ]
48+
49+ # Remove unwanted "Appartamento" prefix if present in the road name
50+ if road and road .lower ().startswith ("appartamento" ):
51+ road = road .replace ("Appartamento" , "" ).strip ()
3852
53+ # Extract price from the correct div
54+ price_span = soup .select_one ('div.ld-overview__price span' )
55+ price = parse_price (price_span .text if price_span else None )
56+ # Extract square meters
3957 square_meters_match = re .search (r'(\d+)\s?m²' , soup .text )
4058 square_meters = int (square_meters_match .group (1 )) if square_meters_match else None
4159
60+ # Extract floor
4261 floor_match = re .search (r'Piano\s(\d+)' , soup .text )
4362 floor = int (floor_match .group (1 )) if floor_match else None
44-
45- # Find the feature item related to parking/ garage
46- garage_feature = listing . find ( 'dt' , class_ = 're-featuresItem__title' , string = "Box, posti auto" )
47-
63+
64+ # Extract garage info
65+ garage_info = None
66+ garage_feature = soup . find ( 'dt' , string = re . compile ( r'Box|Posti auto' , re . IGNORECASE ))
4867 if garage_feature :
49- # Get the associated description (dd)
50- garage_description = garage_feature .find_next ('dd' , class_ = 're-featuresItem__description' )
68+ garage_description = garage_feature .find_next ('dd' )
5169 garage_info = garage_description .get_text (strip = True ) if garage_description else None
52- else :
53- garage_info = None
70+
71+ # Extract content description
72+ description_paragraphs = [p .text .strip () for p in soup .find_all ('p' )]
5473
5574 data = {
5675 "url" : url ,
5776 "title" : soup .title .string if soup .title else None ,
58- "content" : [ p . text . strip () for p in soup . find_all ( 'p' )] ,
77+ "content" : description_paragraphs ,
5978 "price" : price ,
60- "city" : city . text . strip () if city else None ,
61- "neighbourhood" : neighbourhood . text . strip () if neighbourhood else None ,
62- "road" : road . text . strip () if road else None ,
79+ "city" : city ,
80+ "neighbourhood" : neighbourhood ,
81+ "road" : road ,
6382 "square_meters" : square_meters ,
6483 "floor" : floor ,
6584 "garage_info" : garage_info ,
6685 }
67- return data
86+ return data
87+
6888
6989def parse_listing (url : str ) -> List [Dict [str , Optional [any ]]]:
7090 headers = {
0 commit comments