Commit d59a8ff

Author: Adriano Sanges (committed)
Refactor database and scraper logic for improved data handling
- Simplify SQL queries in database.py to use NOT IN for filtering new properties
- Enhance price parsing logic in scraper.py to handle various formats and improve robustness
- Update property extraction logic to correctly parse city, neighbourhood, road, and garage information
- Clean up code by removing unnecessary comments and ensuring consistent variable usage
1 parent c1bf1c6 commit d59a8ff

2 files changed: +45 −32 lines

real-estate-etl/database.py

Lines changed: 2 additions & 9 deletions
@@ -4,7 +4,6 @@
 
 
 def create_properties_table(con: duckdb.DuckDBPyConnection) -> None:
-    # Create the table 'properties'
     con.execute("""
         CREATE TABLE IF NOT EXISTS main.properties (
             url VARCHAR PRIMARY KEY,
@@ -23,24 +22,18 @@ def create_properties_table(con: duckdb.DuckDBPyConnection) -> None:
 
 
 def get_new_properties(con: duckdb.DuckDBPyConnection) -> pl.DataFrame:
-    # Use a LEFT JOIN to filter out rows that already exist in 'properties'
     new_rows_df = con.execute("""
         SELECT nd.*
         FROM new_data nd
-        LEFT JOIN main.properties p ON nd.url = p.url
-        WHERE p.url IS NULL
+        WHERE nd.url NOT IN (SELECT url FROM main.properties)
     """).pl()
-
-    print(new_rows_df)
     return new_rows_df
 
 
 def insert_new_properties(con: duckdb.DuckDBPyConnection) -> None:
-    # Insert the new rows into the 'properties' table
     con.execute("""
         INSERT INTO properties (url, title, content, price, city, neighbourhood, road, square_meters, floor, garage_info)
         SELECT nd.url, nd.title, nd.content, nd.price, nd.city, nd.neighbourhood, nd.road, nd.square_meters, nd.floor, nd.garage_info
         FROM new_data nd
-        LEFT JOIN main.properties p ON nd.url = p.url
-        WHERE p.url IS NULL
+        WHERE nd.url NOT IN (SELECT url FROM main.properties)
    """)

real-estate-etl/scraper.py

Lines changed: 43 additions & 23 deletions
@@ -5,11 +5,13 @@
 from typing import Optional, Dict, List
 
 
-def parse_price(price_raw: Optional[str]) -> Optional[int]:
-    if not price_raw:
+def parse_price(price_str):
+    if not price_str:
         return None
-    price_cleaned = re.sub(r'[^\d]', '', price_raw)
-    return int(price_cleaned) if price_cleaned else None
+    price_match = re.search(r'\d+[.,]?\d*', price_str.replace('\u20ac', '').replace('.', '').replace(',', '.'))
+    return int(price_match.group(0).split('.')[0]) if price_match else None
+
+
 
 def parse_page(url: str) -> Dict[str, Optional[any]]:
     headers = {
@@ -26,45 +28,63 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
     }
     logging.debug("Parsing page: %s", url)
     response = requests.get(url, headers=headers)
+
     soup = BeautifulSoup(response.text, 'html.parser')
 
-    listings = soup.select('section.re-layoutContentCenter')
+    listings = soup.select('section.ld-layoutContentCenter')
     for listing in listings:
-        city = listing.select_one('div.re-title__content span.re-blockTitle__location')
-        neighbourhood = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(2)')
-        road = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(3)')
-        price_raw = listing.select_one('div.re-overview__price span')
-        price = parse_price(price_raw.text if price_raw else None)
+        # Extract title, city, neighborhood, road
+        title = soup.find('meta', property='og:title')
+        city, neighbourhood, road = None, None, None
+        if title:
+            location_parts = title["content"].split('|')[0].split(', ')
+            location_parts = [part.strip() for part in location_parts]
+            if len(location_parts) >= 3:
+                road, neighbourhood, city = location_parts[:3]
+            elif len(location_parts) == 2:
+                neighbourhood, city = location_parts[:2]
+            elif len(location_parts) == 1:
+                city = location_parts[0]
+
+        # Remove unwanted "Appartamento" prefix if present in the road name
+        if road and road.lower().startswith("appartamento"):
+            road = road.replace("Appartamento", "").strip()
 
+        # Extract price from the correct div
+        price_span = soup.select_one('div.ld-overview__price span')
+        price = parse_price(price_span.text if price_span else None)
+        # Extract square meters
         square_meters_match = re.search(r'(\d+)\s?m²', soup.text)
         square_meters = int(square_meters_match.group(1)) if square_meters_match else None
 
+        # Extract floor
         floor_match = re.search(r'Piano\s(\d+)', soup.text)
         floor = int(floor_match.group(1)) if floor_match else None
-
-        # Find the feature item related to parking/garage
-        garage_feature = listing.find('dt', class_='re-featuresItem__title', string="Box, posti auto")
-
+
+        # Extract garage info
+        garage_info = None
+        garage_feature = soup.find('dt', string=re.compile(r'Box|Posti auto', re.IGNORECASE))
         if garage_feature:
-            # Get the associated description (dd)
-            garage_description = garage_feature.find_next('dd', class_='re-featuresItem__description')
+            garage_description = garage_feature.find_next('dd')
             garage_info = garage_description.get_text(strip=True) if garage_description else None
-        else:
-            garage_info = None
+
+        # Extract content description
+        description_paragraphs = [p.text.strip() for p in soup.find_all('p')]
 
         data = {
             "url": url,
             "title": soup.title.string if soup.title else None,
-            "content": [p.text.strip() for p in soup.find_all('p')],
+            "content": description_paragraphs,
             "price": price,
-            "city": city.text.strip() if city else None,
-            "neighbourhood": neighbourhood.text.strip() if neighbourhood else None,
-            "road": road.text.strip() if road else None,
+            "city": city,
+            "neighbourhood": neighbourhood,
+            "road": road,
             "square_meters": square_meters,
             "floor": floor,
             "garage_info": garage_info,
         }
-    return data
+        return data
+
 
 def parse_listing(url: str) -> List[Dict[str, Optional[any]]]:
     headers = {
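
The reworked parse_price strips the euro sign and thousands-separator dots, converts a decimal comma to a dot, and keeps only the integer part of the first number it finds. A quick check of the committed function against a few Italian-formatted price strings (the sample inputs are invented for illustration):

import re

def parse_price(price_str):
    if not price_str:
        return None
    price_match = re.search(r'\d+[.,]?\d*', price_str.replace('\u20ac', '').replace('.', '').replace(',', '.'))
    return int(price_match.group(0).split('.')[0]) if price_match else None

print(parse_price('€ 1.250.000'))          # 1250000 -- thousands dots stripped
print(parse_price('1.250,50 €'))           # 1250 -- decimal comma becomes '.', integer part kept
print(parse_price('Prezzo su richiesta'))  # None -- no digits to match
print(parse_price(None))                   # None -- guarded by the falsy check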

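The location extraction now reads the page's og:title meta tag instead of the old re-* location spans. A minimal sketch of the same parsing against a stub page; the title layout 'road, neighbourhood, city | site name' is an assumption about what the target listing site emits, and the stub HTML is invented:

from bs4 import BeautifulSoup

# Stub page whose og:title mimics the assumed 'road, neighbourhood, city | site' pattern
html = '<html><head><meta property="og:title" content="Appartamento Via Roma 10, Centro, Milano | Example"></head></html>'
soup = BeautifulSoup(html, 'html.parser')

title = soup.find('meta', property='og:title')
city, neighbourhood, road = None, None, None
if title:
    # Drop the '| site name' suffix, then split the location on commas
    location_parts = [part.strip() for part in title["content"].split('|')[0].split(', ')]
    if len(location_parts) >= 3:
        road, neighbourhood, city = location_parts[:3]
    elif len(location_parts) == 2:
        neighbourhood, city = location_parts[:2]
    elif len(location_parts) == 1:
        city = location_parts[0]

# Same prefix cleanup as the committed code
if road and road.lower().startswith("appartamento"):
    road = road.replace("Appartamento", "").strip()

print(road, neighbourhood, city)  # Via Roma 10 Centro Milano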