Commit c1bf1c6

Author: Adriano Sanges
Add headers to web scraping requests to improve request reliability
- Add comprehensive headers to requests in scraper.py to mimic browser requests
- Prevent potential blocking by web servers when scraping real estate listings
- Update both parse_page() and parse_listing() functions with consistent headers
1 parent 3e48b5e commit c1bf1c6
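
For context: unless overridden, requests sends a User-Agent of the form python-requests/<version>, which many sites treat as a bot signal and block or throttle. A minimal sketch of what the headers= argument changes, using the httpbin.org echo service (not part of this repo) to show what a server actually receives:

import requests

# Default fingerprint: requests identifies itself, e.g. "python-requests/2.31.0"
default = requests.get("https://httpbin.org/headers", timeout=10)
print(default.json()["headers"]["User-Agent"])

# Browser-like headers, as this commit adds, replace that fingerprint
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
spoofed = requests.get("https://httpbin.org/headers", headers=headers, timeout=10)
print(spoofed.json()["headers"]["User-Agent"])  # prints the Chrome UA above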

2 files changed: +26 −5

real-estate-etl/scan_properties.py

Lines changed: 0 additions & 1 deletion
@@ -24,7 +24,6 @@
 con: duckdb.DuckDBPyConnection = duckdb.connect(f"md:{warehouse_name}?motherduck_token={motherduck_token}")
 
 create_properties_table(con)
-
 con.register("new_data", polars_df)
 
 new_rows_df: pl.DataFrame = get_new_properties(con)

real-estate-etl/scraper.py

Lines changed: 26 additions & 4 deletions
@@ -12,12 +12,23 @@ def parse_price(price_raw: Optional[str]) -> Optional[int]:
     return int(price_cleaned) if price_cleaned else None
 
 def parse_page(url: str) -> Dict[str, Optional[any]]:
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.9',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'none',
+        'Sec-Fetch-User': '?1',
+    }
     logging.debug("Parsing page: %s", url)
-    response = requests.get(url)
+    response = requests.get(url, headers=headers)
     soup = BeautifulSoup(response.text, 'html.parser')
 
     listings = soup.select('section.re-layoutContentCenter')
-
     for listing in listings:
         city = listing.select_one('div.re-title__content span.re-blockTitle__location')
         neighbourhood = listing.select_one('div.re-title__content span.re-blockTitle__location:nth-of-type(2)')
@@ -53,12 +64,23 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
         "floor": floor,
         "garage_info": garage_info,
     }
-
     return data
 
 def parse_listing(url: str) -> List[Dict[str, Optional[any]]]:
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.9',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'none',
+        'Sec-Fetch-User': '?1',
+    }
     logging.debug("Fetching main listing page: %s", url)
-    response = requests.get(url)
+    response = requests.get(url, headers=headers)
     soup = BeautifulSoup(response.text, 'html.parser')
     data_list = []
     links = soup.select('a.in-listingCardTitle')
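
Since the headers dict now appears verbatim in both parse_page() and parse_listing(), one possible follow-up (a sketch only, not part of this commit; BROWSER_HEADERS and fetch are hypothetical names) is to hoist it into a module-level constant attached to a shared requests.Session, which also reuses the underlying connection across requests. One caveat: advertising 'br' in Accept-Encoding assumes a Brotli decoder (the brotli package) is installed, since requests only transparently decompresses Brotli responses when one is available.

import requests

# Hypothetical refactor: one shared headers constant instead of two copies
BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

session = requests.Session()
session.headers.update(BROWSER_HEADERS)  # sent with every request on this session

def fetch(url: str) -> str:
    # timeout guards against hangs; raise_for_status() surfaces HTTP errors
    # instead of silently parsing an error page
    response = session.get(url, timeout=10)
    response.raise_for_status()
    return response.text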
