Skip to content

Commit 0e64ec0

Browse files
author
Adriano Sanges
committed
Refactor and type-annotate real estate ETL scripts
- Add type hints to database.py, scan_properties.py, scraper.py, and telegram_api.py
- Improve error handling for environment variable checks
- Restructure scan_properties.py with a main() function
- Update method signatures to include type annotations
- Enhance code readability and type safety
1 parent bc0f5f4 commit 0e64ec0

File tree

4 files changed

+46
-34
lines changed

4 files changed

+46
-34
lines changed

real-estate-etl/database.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
import logging
2+
import polars as pl
3+
import duckdb
24

3-
4-
def clean_properties(con) -> None:
5+
def clean_properties(con: duckdb.DuckDBPyConnection) -> None:
56
logging.debug("Starting property cleaning")
67
create_table_query = """
78
CREATE TABLE IF NOT EXISTS main.cleaned_properties (
@@ -55,6 +56,6 @@ def clean_properties(con) -> None:
5556
con.sql("DELETE FROM main.properties;")
5657

5758

58-
def get_new_properties(con) -> None:
59+
def get_new_properties(con: duckdb.DuckDBPyConnection) -> pl.DataFrame:
5960
df = con.sql("SELECT * FROM main.new_properties;").pl()
6061
return df

real-estate-etl/scan_properties.py

Lines changed: 23 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,44 +1,46 @@
1+
from typing import List, Dict, Any
12
from bs4 import BeautifulSoup
2-
import polars
3+
import polars as pl
34
import duckdb
45
from scraper import parse_listing
56
from database import clean_properties, get_new_properties
67
from dotenv import load_dotenv
78
import os
89
from telegram_api import send_message, format_property_message
910

10-
11-
12-
if __name__ == "__main__":
13-
14-
11+
def main() -> None:
12+
"""Main function to scrape properties and send messages."""
1513
load_dotenv()
1614

17-
url = os.getenv("scrape_url")
18-
warehouse_name = os.getenv("warehouse_name")
19-
motherduck_token = os.getenv("motherduck_token")
15+
url: str = os.getenv("scrape_url")
16+
warehouse_name: str = os.getenv("warehouse_name")
17+
motherduck_token: str = os.getenv("motherduck_token")
2018

21-
data = parse_listing(url)
19+
if not url or not warehouse_name or not motherduck_token:
20+
raise ValueError("Environment variables for URL, warehouse name, or token are not set.")
2221

23-
polars_df = polars.DataFrame(data)
22+
data: List[Dict[str, Any]] = parse_listing(url)
2423

25-
con = duckdb.connect(f"md:{warehouse_name}?motherduck_token={motherduck_token}")
24+
polars_df: pl.DataFrame = pl.DataFrame(data)
25+
26+
con: duckdb.DuckDBPyConnection = duckdb.connect(f"md:{warehouse_name}?motherduck_token={motherduck_token}")
2627

2728
con.sql("create table if not exists main.properties as select * from polars_df")
2829

2930
clean_properties(con)
3031

31-
new_properties = get_new_properties(con)
32-
# Iterate over the DataFrame and format each property
33-
messages = [format_property_message(row) for row in new_properties.iter_rows(named=True)]
32+
new_properties: pl.DataFrame = get_new_properties(con)
33+
34+
# Format and send messages
35+
messages: List[str] = [format_property_message(row) for row in new_properties.to_dicts()]
3436

35-
# Send messages in chunks of two
37+
# Send messages in chunks of two
3638
for i in range(0, len(messages), 2):
37-
# Get the current chunk of two messages
38-
message_chunk = messages[i:i+2]
39-
# Join the two messages with a separator
40-
full_message = "\n\n".join(message_chunk)
41-
# Send the combined message
39+
message_chunk: List[str] = messages[i:i+2]
40+
full_message: str = "\n\n".join(message_chunk)
4241
send_message(full_message)
4342

43+
if __name__ == "__main__":
44+
main()
45+
4446

real-estate-etl/scraper.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,16 @@
22
import logging
33
import requests
44
from bs4 import BeautifulSoup
5+
from typing import Optional, Dict, List
56

67

7-
def parse_price(price_raw):
8+
def parse_price(price_raw: Optional[str]) -> Optional[int]:
89
if not price_raw:
910
return None
1011
price_cleaned = re.sub(r'[^\d]', '', price_raw)
1112
return int(price_cleaned) if price_cleaned else None
1213

13-
def parse_page(url):
14+
def parse_page(url: str) -> Dict[str, Optional[any]]:
1415
logging.debug("Parsing page: %s", url)
1516
response = requests.get(url)
1617
soup = BeautifulSoup(response.text, 'html.parser')
@@ -30,7 +31,7 @@ def parse_page(url):
3031
floor_match = re.search(r'Piano\s(\d+)', soup.text)
3132
floor = int(floor_match.group(1)) if floor_match else None
3233

33-
# Find the feature item related to parking/garage
34+
# Find the feature item related to parking/garage
3435
garage_feature = listing.find('dt', class_='re-featuresItem__title', string="Box, posti auto")
3536

3637
if garage_feature:
@@ -55,7 +56,7 @@ def parse_page(url):
5556

5657
return data
5758

58-
def parse_listing(url):
59+
def parse_listing(url: str) -> List[Dict[str, Optional[any]]]:
5960
logging.debug("Fetching main listing page: %s", url)
6061
response = requests.get(url)
6162
soup = BeautifulSoup(response.text, 'html.parser')

real-estate-etl/telegram_api.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,25 @@
11
import telegram
22
import os
3-
3+
from typing import Dict, Any
44
from dotenv import load_dotenv
55

66
load_dotenv()
77

8-
telegram_bot_api_key = os.getenv('telegram_bot_api_key')
8+
TELEGRAM_BOT_API_KEY: str = os.getenv("telegram_bot_api_key")
9+
10+
if TELEGRAM_BOT_API_KEY is None:
11+
raise ValueError("TELEGRAM_BOT_API_KEY is not set. Please check your environment variables.")
12+
13+
bot: telegram.Bot = telegram.Bot(TELEGRAM_BOT_API_KEY)
14+
915
chat_id = os.getenv('chat_id')
1016
chat_tag = os.getenv('chat_tag')
1117

12-
bot = telegram.Bot(telegram_bot_api_key)
1318

1419

1520
# Function to format the message
16-
def format_property_message(row):
21+
def format_property_message(row: Dict[str, Any]) -> str:
22+
"""Format a property message for sending via Telegram."""
1723
return (
1824
f"🏠 **{row['title']}**\n"
1925
f"📍 Location: {row['city']}, {row['neighbourhood']}, {row['road']}\n"
@@ -25,7 +31,9 @@ def format_property_message(row):
2531
)
2632

2733

28-
def send_message(message):
29-
34+
def send_message(message: str) -> None:
35+
"""Send a message using the Telegram bot."""
36+
if chat_id is None:
37+
raise ValueError("TELEGRAM_CHAT_ID is not set. Please check your environment variables.")
3038
bot.sendMessage(chat_id=chat_id, text=message, parse_mode=telegram.ParseMode.MARKDOWN)
3139

0 commit comments

Comments
 (0)