Skip to content

Commit 161b7f4

Browse files
author
Adriano Sanges
committed
Refactor database.py to simplify table creation and property insertion logic
- Replace complex table creation and insertion queries with more focused methods
- Create a new `create_properties_table` function for table initialization
- Modify `get_new_properties` to use LEFT JOIN for filtering new properties
- Add `insert_new_properties` method to handle property insertion
- Simplify database operations and improve code readability
1 parent 408c9ff commit 161b7f4

File tree

4 files changed

+156
-60
lines changed

4 files changed

+156
-60
lines changed

real-estate-etl/database.py

Lines changed: 33 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,60 +2,39 @@
22
import polars as pl
33
import duckdb
44

5-
def clean_properties(con: duckdb.DuckDBPyConnection) -> None:
    """Copy unseen rows from ``main.properties`` into the cleaned tables, then empty it.

    This is the pre-refactor implementation shown on the removed (``-``) side
    of the diff. It runs four steps on ``con``, in this order:

    1. Create ``main.cleaned_properties`` if missing and (re)create an empty
       ``main.new_properties``.
    2. Insert into ``main.new_properties`` every ``main.properties`` row whose
       ``url`` is absent from ``main.cleaned_properties``.
    3. Insert the same rows into ``main.cleaned_properties``.
    4. Delete all rows from ``main.properties``.

    Both tables declare a ``content`` column, but neither INSERT populates it.

    Args:
        con: Open DuckDB connection that already has a ``main.properties`` table.
    """
    logging.debug("Starting property cleaning")

    schema_sql = """
        CREATE TABLE IF NOT EXISTS main.cleaned_properties (
            url TEXT PRIMARY KEY,
            title TEXT,
            content TEXT,
            price INTEGER,
            city TEXT,
            neighbourhood TEXT,
            road TEXT,
            square_meters INTEGER,
            floor TEXT,
            garage_info TEXT
        );
        CREATE OR REPLACE TABLE main.new_properties (
            url TEXT PRIMARY KEY,
            title TEXT,
            content TEXT,
            price INTEGER,
            city TEXT,
            neighbourhood TEXT,
            road TEXT,
            square_meters INTEGER,
            floor TEXT,
            garage_info TEXT
        );
    """
    stage_new_sql = """
        INSERT INTO main.new_properties (url, title, price, city, neighbourhood, road, square_meters, floor, garage_info)
        SELECT url, title, price, city, neighbourhood, road, square_meters, floor, garage_info
        FROM main.properties
        WHERE NOT EXISTS (
            SELECT 1
            FROM main.cleaned_properties
            WHERE main.cleaned_properties.url = main.properties.url
        )
    """
    archive_sql = """
        INSERT INTO main.cleaned_properties (url, title, price, city, neighbourhood, road, square_meters, floor, garage_info)
        SELECT url, title, price, city, neighbourhood, road, square_meters, floor, garage_info
        FROM main.properties
        WHERE NOT EXISTS (
            SELECT 1
            FROM main.cleaned_properties
            WHERE main.cleaned_properties.url = main.properties.url
        );
    """

    con.sql(schema_sql)
    # Order matters: new_properties must be filled before cleaned_properties
    # absorbs the same rows, because both filters test against cleaned_properties.
    for statement in (stage_new_sql, archive_sql, "DELETE FROM main.properties;"):
        con.sql(statement)
5+
6+
def create_properties_table(con: duckdb.DuckDBPyConnection) -> None:
    """Ensure the ``main.properties`` table exists on the given connection.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so an existing table is
    left untouched. ``url`` is the primary key.

    Args:
        con: Open DuckDB connection on which the DDL is executed.
    """
    properties_ddl = """
        CREATE TABLE IF NOT EXISTS main.properties (
            url VARCHAR PRIMARY KEY,
            title VARCHAR,
            content VARCHAR[], -- storing list[str] as an array of VARCHAR
            price BIGINT,
            road VARCHAR,
            square_meters BIGINT,
            floor BIGINT,
            garage_info VARCHAR
        )
    """
    con.execute(properties_ddl)
5720

5821

5922
def get_new_properties(con: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return the rows of ``new_data`` whose ``url`` is not yet in ``main.properties``.

    The query anti-joins ``new_data`` against ``main.properties`` on ``url``:
    the LEFT JOIN keeps every ``new_data`` row, and ``p.url IS NULL`` retains
    only the rows that found no match.

    Args:
        con: Open DuckDB connection on which a relation named ``new_data`` can
            be resolved (the query reads it by name) and on which
            ``main.properties`` exists.

    Returns:
        A polars DataFrame containing the unmatched ``new_data`` rows.
    """
    # Use a LEFT JOIN to filter out rows that already exist in 'properties'
    new_rows_df = con.execute("""
        SELECT nd.*
        FROM new_data nd
        LEFT JOIN main.properties p ON nd.url = p.url
        WHERE p.url IS NULL
    """).pl()
    # Hand the result back; without this the function implicitly returns None
    # despite its declared DataFrame return type.
    return new_rows_df
30+
31+
32+
def insert_new_properties(con: duckdb.DuckDBPyConnection) -> None:
    """Insert into ``main.properties`` the ``new_data`` rows whose ``url`` is not stored yet.

    Uses the same LEFT JOIN / ``IS NULL`` anti-join as ``get_new_properties``,
    so rows whose ``url`` already exists in the table are skipped.

    Args:
        con: Open DuckDB connection on which a relation named ``new_data`` can
            be resolved and on which ``main.properties`` exists.
    """
    # The target is schema-qualified so it names the table exactly as the join
    # below (and the table-creation statement) does.
    # NOTE(review): `SELECT nd.*` maps columns by position, so this assumes
    # new_data's column order and count match main.properties — TODO confirm.
    con.execute("""
        INSERT INTO main.properties
        SELECT nd.*
        FROM new_data nd
        LEFT JOIN main.properties p ON nd.url = p.url
        WHERE p.url IS NULL
    """)

real-estate-etl/pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ requires-python = ">=3.11"
77
dependencies = [
88
"beautifulsoup4>=4.12.3",
99
"duckdb>=1.1.3",
10+
"numpy>=2.2.2",
11+
"pandas>=2.2.3",
1012
"polars>=1.21.0",
1113
"pyarrow>=19.0.0",
1214
"python-dotenv>=1.0.1",

real-estate-etl/scan_properties.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import polars as pl
33
import duckdb
44
from scraper import parse_listing
5-
from database import clean_properties, get_new_properties
5+
from database import create_properties_table, get_new_properties, insert_new_properties
66
from dotenv import load_dotenv
77
import os
88
from telegram_api import send_message, format_property_message
@@ -19,18 +19,24 @@
1919
motherduck_token: str = os.getenv("motherduck_token")
2020

2121
data: list[dict] = parse_listing(url)
22-
2322
polars_df: pl.DataFrame = pl.DataFrame(data)
2423

2524
con: duckdb.DuckDBPyConnection = duckdb.connect(f"md:{warehouse_name}?motherduck_token={motherduck_token}")
25+
26+
create_properties_table(con)
2627

27-
con.sql("create table if not exists main.properties as select * from polars_df")
28+
con.register("new_data", polars_df)
29+
30+
new_rows_df: pl.DataFrame = get_new_properties(con)
31+
2832

29-
clean_properties(con)
33+
insert_new_properties(con)
3034

31-
new_properties: pl.DataFrame = get_new_properties(con)
3235
# Iterate over the DataFrame and format each property
33-
messages: list[str] = [format_property_message(row) for row in new_properties.iter_rows(named=True)]
36+
if new_rows_df is not None:
37+
messages: list[str] = [format_property_message(row) for row in new_rows_df.iter_rows(named=True)]
38+
else:
39+
messages = []
3440

3541
# Send messages in chunks of two
3642
for i in range(0, len(messages), 2):

0 commit comments

Comments
 (0)