Commit d59a8ff

Author: Adriano Sanges (committed)
Refactor database and scraper logic for improved data handling
- Simplify SQL queries in database.py to use NOT IN for filtering new properties
- Enhance price parsing logic in scraper.py to handle various formats and improve robustness
- Update property extraction logic to correctly parse city, neighbourhood, road, and garage information
- Clean up code by removing unnecessary comments and ensuring consistent variable usage
1 parent c1bf1c6 commit d59a8ff

2 files changed: +45 −32 lines

real-estate-etl/database.py

Lines changed: 2 additions & 9 deletions
@@ -4,7 +4,6 @@
 
 
 def create_properties_table(con: duckdb.DuckDBPyConnection) -> None:
-    # Create the table 'properties'
     con.execute("""
         CREATE TABLE IF NOT EXISTS main.properties (
             url VARCHAR PRIMARY KEY,
@@ -23,24 +22,18 @@ def create_properties_table(con: duckdb.DuckDBPyConnection) -> None:
 
 
 def get_new_properties(con: duckdb.DuckDBPyConnection) -> pl.DataFrame:
-    # Use a LEFT JOIN to filter out rows that already exist in 'properties'
     new_rows_df = con.execute("""
         SELECT nd.*
         FROM new_data nd
-        LEFT JOIN main.properties p ON nd.url = p.url
-        WHERE p.url IS NULL
+        WHERE nd.url NOT IN (SELECT url FROM main.properties)
     """).pl()
-
-    print(new_rows_df)
     return new_rows_df
 
 
 def insert_new_properties(con: duckdb.DuckDBPyConnection) -> None:
-    # Insert the new rows into the 'properties' table
     con.execute("""
         INSERT INTO properties (url, title, content, price, city, neighbourhood, road, square_meters, floor, garage_info)
         SELECT nd.url, nd.title, nd.content, nd.price, nd.city, nd.neighbourhood, nd.road, nd.square_meters, nd.floor, nd.garage_info
         FROM new_data nd
-        LEFT JOIN main.properties p ON nd.url = p.url
-        WHERE p.url IS NULL
+        WHERE nd.url NOT IN (SELECT url FROM main.properties)
    """)

real-estate-etl/scraper.py

Lines changed: 43 additions & 23 deletions
@@ -5,11 +5,13 @@
 from typing import Optional, Dict, List
 
 
-def parse_price(price_raw: Optional[str]) -> Optional[int]:
-    if not price_raw:
+def parse_price(price_str):
+    if not price_str:
         return None
-    price_cleaned = re.sub(r'[^\d]', '', price_raw)
-    return int(price_cleaned) if price_cleaned else None
+    price_match = re.search(r'\d+[.,]?\d*', price_str.replace('\u20ac', '').replace('.', '').replace(',', '.'))
+    return int(price_match.group(0).split('.')[0]) if price_match else None
+
+
 
 def parse_page(url: str) -> Dict[str, Optional[any]]:
     headers = {
@@ -26,45 +28,63 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
     }
     logging.debug("Parsing page: %s", url)
     response = requests.get(url, headers=headers)
+
     soup = BeautifulSoup(response.text, 'html.parser')
 
-    listings = soup.select('section.re-layoutContentCenter')
+    listings = soup.select('section.ld-layoutContentCenter')
     for listing in listings:
-        city = listing.select_one('div.re-title__content span.re-blockTitle__location')
-        neighbourhood = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(2)')
-        road = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(3)')
-        price_raw = listing.select_one('div.re-overview__price span')
-        price = parse_price(price_raw.text if price_raw else None)
+        # Extract title, city, neighborhood, road
+        title = soup.find('meta', property='og:title')
+        city, neighbourhood, road = None, None, None
+        if title:
+            location_parts = title["content"].split('|')[0].split(', ')
+            location_parts = [part.strip() for part in location_parts]
+            if len(location_parts) >= 3:
+                road, neighbourhood, city = location_parts[:3]
+            elif len(location_parts) == 2:
+                neighbourhood, city = location_parts[:2]
+            elif len(location_parts) == 1:
+                city = location_parts[0]
+
+        # Remove unwanted "Appartamento" prefix if present in the road name
+        if road and road.lower().startswith("appartamento"):
+            road = road.replace("Appartamento", "").strip()
 
+        # Extract price from the correct div
+        price_span = soup.select_one('div.ld-overview__price span')
+        price = parse_price(price_span.text if price_span else None)
+        # Extract square meters
         square_meters_match = re.search(r'(\d+)\s?m²', soup.text)
         square_meters = int(square_meters_match.group(1)) if square_meters_match else None
 
+        # Extract floor
         floor_match = re.search(r'Piano\s(\d+)', soup.text)
         floor = int(floor_match.group(1)) if floor_match else None
-
-        # Find the feature item related to parking/garage
-        garage_feature = listing.find('dt', class_='re-featuresItem__title', string="Box, posti auto")
-
+
+        # Extract garage info
+        garage_info = None
+        garage_feature = soup.find('dt', string=re.compile(r'Box|Posti auto', re.IGNORECASE))
         if garage_feature:
-            # Get the associated description (dd)
-            garage_description = garage_feature.find_next('dd', class_='re-featuresItem__description')
+            garage_description = garage_feature.find_next('dd')
             garage_info = garage_description.get_text(strip=True) if garage_description else None
-        else:
-            garage_info = None
+
+        # Extract content description
+        description_paragraphs = [p.text.strip() for p in soup.find_all('p')]
 
         data = {
             "url": url,
             "title": soup.title.string if soup.title else None,
-            "content": [p.text.strip() for p in soup.find_all('p')],
+            "content": description_paragraphs,
             "price": price,
-            "city": city.text.strip() if city else None,
-            "neighbourhood": neighbourhood.text.strip() if neighbourhood else None,
-            "road": road.text.strip() if road else None,
+            "city": city,
+            "neighbourhood": neighbourhood,
+            "road": road,
             "square_meters": square_meters,
             "floor": floor,
             "garage_info": garage_info,
         }
-    return data
+        return data
+
 
 def parse_listing(url: str) -> List[Dict[str, Optional[any]]]:
     headers = {
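
The reworked parse_price strips the euro sign and thousands-separator dots, converts a decimal comma to a dot, and keeps only the integer part of the first number it finds. A quick check of the committed function against a few Italian-formatted price strings (the sample inputs are invented for illustration):

import re

def parse_price(price_str):
    if not price_str:
        return None
    price_match = re.search(r'\d+[.,]?\d*', price_str.replace('\u20ac', '').replace('.', '').replace(',', '.'))
    return int(price_match.group(0).split('.')[0]) if price_match else None

print(parse_price('€ 1.250.000'))          # 1250000 -- thousands dots stripped
print(parse_price('1.250,50 €'))           # 1250 -- decimal comma becomes '.', integer part kept
print(parse_price('Prezzo su richiesta'))  # None -- no digits to match
print(parse_price(None))                   # None -- guarded by the falsy check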

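The location extraction now reads the page's og:title meta tag instead of the old re-* location spans. A minimal sketch of the same parsing against a stub page; the title layout 'road, neighbourhood, city | site name' is an assumption about what the target listing site emits, and the stub HTML is invented:

from bs4 import BeautifulSoup

# Stub page whose og:title mimics the assumed 'road, neighbourhood, city | site' pattern
html = '<html><head><meta property="og:title" content="Appartamento Via Roma 10, Centro, Milano | Example"></head></html>'
soup = BeautifulSoup(html, 'html.parser')

title = soup.find('meta', property='og:title')
city, neighbourhood, road = None, None, None
if title:
    # Drop the '| site name' suffix, then split the location on commas
    location_parts = [part.strip() for part in title["content"].split('|')[0].split(', ')]
    if len(location_parts) >= 3:
        road, neighbourhood, city = location_parts[:3]
    elif len(location_parts) == 2:
        neighbourhood, city = location_parts[:2]
    elif len(location_parts) == 1:
        city = location_parts[0]

# Same prefix cleanup as the committed code
if road and road.lower().startswith("appartamento"):
    road = road.replace("Appartamento", "").strip()

print(road, neighbourhood, city)  # Via Roma 10 Centro Milano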