Commit a40b76a

Adriano Sanges committed
Add proxy support and SSL handling in scraper.py
- Introduce proxy configuration in scraper.py so page requests can be routed through a configurable proxy
- Implement an SSL context that skips certificate verification so HTTPS requests succeed through the proxy
- Update the GitHub Actions workflow to expose the proxy URL as an environment variable
- Decompress gzipped responses when retrieving HTML content
1 parent d59a8ff · commit a40b76a

File tree: 2 files changed, +43 -15 lines changed

.github/workflows/run-project.yml

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,7 @@ jobs:
           echo "telegram_bot_api_key=${{ secrets.telegram_bot_api_key }}" >> .env
           echo "chat_id=${{ secrets.chat_id }}" >> .env
           echo "chat_tag=${{ secrets.chat_tag }}" >> .env
+          echo "proxy_url=${{ secrets.proxy_url }}" >> .env
       - name: Run Python Script
         env:
           warehouse_name: ${{ secrets.warehouse_name }}
@@ -44,6 +45,7 @@ jobs:
           telegram_bot_api_key: ${{ secrets.telegram_bot_api_key }}
           chat_id: ${{ secrets.chat_id }}
           chat_tag: ${{ secrets.chat_tag }}
+          proxy_url: ${{ secrets.proxy_url }}
         run: |
           cd real-estate-etl
           uv run scan_properties.py
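
The secret reaches the script two ways: it is written into .env and also exported directly through the step's env: block, so os.getenv('proxy_url') resolves during the Actions run even if .env is never loaded. For local runs that rely only on the .env file, a minimal loading sketch, assuming python-dotenv is installed (the repository's actual .env loading code is not shown in this diff):

# Sketch: resolving proxy_url outside GitHub Actions via the .env file.
# Assumes python-dotenv; the repo's real loading code is not part of this diff.
import os

from dotenv import load_dotenv

load_dotenv()  # copies key=value pairs from .env into os.environ
proxy_url = os.getenv('proxy_url')
if not proxy_url:
    raise RuntimeError('proxy_url is not set; check .env or the workflow secrets')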

real-estate-etl/scraper.py

Lines changed: 41 additions & 15 deletions
@@ -3,7 +3,10 @@
 import requests
 from bs4 import BeautifulSoup
 from typing import Optional, Dict, List
-
+import urllib.request
+import ssl
+import gzip
+import os
 
 def parse_price(price_str):
     if not price_str:
@@ -14,22 +17,45 @@ def parse_price(price_str):
 
 
 def parse_page(url: str) -> Dict[str, Optional[any]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.9',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'none',
-        'Sec-Fetch-User': '?1',
-    }
     logging.debug("Parsing page: %s", url)
-    response = requests.get(url, headers=headers)
+
+    proxy_url = os.getenv('proxy_url')
 
-    soup = BeautifulSoup(response.text, 'html.parser')
+    # Create a context that doesn't verify certificates
+    ctx = ssl.create_default_context()
+    ctx.check_hostname = False
+    ctx.verify_mode = ssl.CERT_NONE
+
+    opener = urllib.request.build_opener(
+        urllib.request.ProxyHandler({
+            'http': proxy_url,
+            'https': proxy_url
+        }),
+        urllib.request.HTTPSHandler(context=ctx)  # Add the SSL context
+    )
+    opener.addheaders = [
+        ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'),
+        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'),
+        ('Accept-Language', 'en-US,en;q=0.9'),
+        ('Accept-Encoding', 'gzip, deflate, br'),
+        ('Connection', 'keep-alive'),
+        ('Upgrade-Insecure-Requests', '1'),
+        ('Sec-Fetch-Dest', 'document'),
+        ('Sec-Fetch-Mode', 'navigate'),
+        ('Sec-Fetch-Site', 'none'),
+        ('Sec-Fetch-User', '?1'),
+    ]
+
+    response = opener.open(url)
+
+    # Check if response is gzipped
+    if response.info().get('Content-Encoding') == 'gzip':
+        import gzip
+        html_content = gzip.decompress(response.read()).decode('utf-8', errors='replace')
+    else:
+        html_content = response.read().decode('utf-8', errors='replace')
+
+    soup = BeautifulSoup(html_content, 'html.parser')
 
     listings = soup.select('section.ld-layoutContentCenter')
     for listing in listings:
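
Since scraper.py already imports requests, the same proxied, unverified fetch could be expressed with requests directly: it takes the proxy mapping via proxies=, disables certificate checks with verify=False, and decodes gzip and deflate bodies transparently (brotli too, when the brotli package is installed), which would make the manual Content-Encoding branch unnecessary. A hedged alternative sketch, not the committed implementation; the example URL and timeout are illustrative:

# Sketch: the same fetch via requests (already imported by scraper.py).
import os

import requests
import urllib3

# verify=False raises InsecureRequestWarning on every call; silence it once
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

proxy_url = os.getenv('proxy_url')
url = 'https://example.com/listing'  # placeholder, not from the commit

response = requests.get(
    url,
    headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/121.0.0.0 Safari/537.36'},
    proxies={'http': proxy_url, 'https': proxy_url},  # same mapping as ProxyHandler
    verify=False,  # mirrors ctx.verify_mode = ssl.CERT_NONE above
    timeout=30,    # illustrative; the commit sets no timeout
)
html_content = response.text  # compressed bodies are already decoded here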
