Commit 47c38d3

Author: Adriano Sanges (committed)
Refactor scraper.py to improve modularity and readability
- Introduce get_opener() function to handle proxy and SSL setup for web requests
- Create get_html_content() function to streamline HTML content retrieval
- Update parse_page() to utilize the new functions for cleaner code structure
- Remove redundant headers from parse_listing() and ensure consistent use of get_html_content()
1 parent 1d94444 commit 47c38d3
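For orientation, the two helpers this commit introduces look roughly like the sketch below once the hunks further down are applied. The import block is an assumption inferred from the names used in the diff (the diff itself only shows an inline `import gzip`), and the build_opener call is split over several lines here purely for readability:

import gzip
import os
import ssl
import urllib.request


def get_opener():
    # Proxy URL comes from the environment; certificate verification is deliberately disabled.
    proxy_url = os.getenv('proxy_url')
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http': proxy_url, 'https': proxy_url}),
        urllib.request.HTTPSHandler(context=ctx),
    )
    # Browser-like request headers; the full list from the diff is elided here.
    opener.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'),
    ]
    return opener


def get_html_content(url: str) -> str:
    # Fetch a page through the configured opener, decompressing gzip responses before decoding.
    opener = get_opener()
    response = opener.open(url)
    if response.info().get('Content-Encoding') == 'gzip':
        html_content = gzip.decompress(response.read()).decode('utf-8', errors='replace')
    else:
        html_content = response.read().decode('utf-8', errors='replace')
    return html_content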

File tree

1 file changed: +15 -33 lines changed


real-estate-etl/scraper.py

Lines changed: 15 additions & 33 deletions
@@ -14,25 +14,12 @@ def parse_price(price_str):
     price_match = re.search(r'\d+[.,]?\d*', price_str.replace('\u20ac', '').replace('.', '').replace(',', '.'))
     return int(price_match.group(0).split('.')[0]) if price_match else None
 
-
-
-def parse_page(url: str) -> Dict[str, Optional[any]]:
-    logging.debug("Parsing page: %s", url)
-
+def get_opener():
     proxy_url = os.getenv('proxy_url')
-
-    # Create a context that doesn't verify certificates
     ctx = ssl.create_default_context()
     ctx.check_hostname = False
     ctx.verify_mode = ssl.CERT_NONE
-
-    opener = urllib.request.build_opener(
-        urllib.request.ProxyHandler({
-            'http': proxy_url,
-            'https': proxy_url
-        }),
-        urllib.request.HTTPSHandler(context=ctx)  # Add the SSL context
-    )
+    opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxy_url, 'https': proxy_url}), urllib.request.HTTPSHandler(context=ctx))
     opener.addheaders = [
         ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'),
         ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'),
@@ -45,15 +32,23 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
         ('Sec-Fetch-Site', 'none'),
         ('Sec-Fetch-User', '?1'),
     ]
-
+    return opener
+
+def get_html_content(url: str) -> str:
+    opener = get_opener()
     response = opener.open(url)
-
-    # Check if response is gzipped
     if response.info().get('Content-Encoding') == 'gzip':
         import gzip
         html_content = gzip.decompress(response.read()).decode('utf-8', errors='replace')
     else:
         html_content = response.read().decode('utf-8', errors='replace')
+    return html_content
+
+
+def parse_page(url: str) -> Dict[str, Optional[any]]:
+    logging.debug("Parsing page: %s", url)
+
+    html_content = get_html_content(url)
 
     soup = BeautifulSoup(html_content, 'html.parser')
 
@@ -113,22 +108,9 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
 
 
 def parse_listing(url: str) -> List[Dict[str, Optional[any]]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.9',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'none',
-        'Sec-Fetch-User': '?1',
-    }
     logging.debug("Fetching main listing page: %s", url)
-    response = requests.get(url, headers=headers)
-    print(response.text)
-    soup = BeautifulSoup(response.text, 'html.parser')
+    html_content = get_html_content(url)
+    soup = BeautifulSoup(html_content, 'html.parser')
     data_list = []
     links = soup.select('a.in-listingCardTitle')
     for link in links:
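For a quick smoke test of the consolidated fetch path, something along these lines should work; the module name 'scraper', the listing URL, and the proxy value are placeholders rather than anything taken from this commit:

# Hypothetical driver; the module name 'scraper', the URL, and the proxy value are placeholders.
import logging
import os

import scraper

logging.basicConfig(level=logging.DEBUG)

# get_opener() reads the proxy from this environment variable.
os.environ.setdefault('proxy_url', 'http://127.0.0.1:8080')

rows = scraper.parse_listing('https://example.com/listings')  # fetches pages via get_html_content()
for row in rows:
    print(row)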
