33import requests
44from bs4 import BeautifulSoup
55from typing import Optional , Dict , List
6-
6+ import urllib .request
7+ import ssl
8+ import gzip
9+ import os
710
811def parse_price (price_str ):
912 if not price_str :
@@ -14,22 +17,45 @@ def parse_price(price_str):
1417
1518
1619def parse_page (url : str ) -> Dict [str , Optional [any ]]:
17- headers = {
18- 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' ,
19- 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8' ,
20- 'Accept-Language' : 'en-US,en;q=0.9' ,
21- 'Accept-Encoding' : 'gzip, deflate, br' ,
22- 'Connection' : 'keep-alive' ,
23- 'Upgrade-Insecure-Requests' : '1' ,
24- 'Sec-Fetch-Dest' : 'document' ,
25- 'Sec-Fetch-Mode' : 'navigate' ,
26- 'Sec-Fetch-Site' : 'none' ,
27- 'Sec-Fetch-User' : '?1' ,
28- }
2920 logging .debug ("Parsing page: %s" , url )
30- response = requests .get (url , headers = headers )
21+
22+ proxy_url = os .getenv ('proxy_url' )
3123
32- soup = BeautifulSoup (response .text , 'html.parser' )
24+ # Create a context that doesn't verify certificates
25+ ctx = ssl .create_default_context ()
26+ ctx .check_hostname = False
27+ ctx .verify_mode = ssl .CERT_NONE
28+
29+ opener = urllib .request .build_opener (
30+ urllib .request .ProxyHandler ({
31+ 'http' : proxy_url ,
32+ 'https' : proxy_url
33+ }),
34+ urllib .request .HTTPSHandler (context = ctx ) # Add the SSL context
35+ )
36+ opener .addheaders = [
37+ ('User-Agent' , 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' ),
38+ ('Accept' , 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8' ),
39+ ('Accept-Language' , 'en-US,en;q=0.9' ),
40+ ('Accept-Encoding' , 'gzip, deflate, br' ),
41+ ('Connection' , 'keep-alive' ),
42+ ('Upgrade-Insecure-Requests' , '1' ),
43+ ('Sec-Fetch-Dest' , 'document' ),
44+ ('Sec-Fetch-Mode' , 'navigate' ),
45+ ('Sec-Fetch-Site' , 'none' ),
46+ ('Sec-Fetch-User' , '?1' ),
47+ ]
48+
49+ response = opener .open (url )
50+
51+ # Check if response is gzipped
52+ if response .info ().get ('Content-Encoding' ) == 'gzip' :
53+ import gzip
54+ html_content = gzip .decompress (response .read ()).decode ('utf-8' , errors = 'replace' )
55+ else :
56+ html_content = response .read ().decode ('utf-8' , errors = 'replace' )
57+
58+ soup = BeautifulSoup (html_content , 'html.parser' )
3359
3460 listings = soup .select ('section.ld-layoutContentCenter' )
3561 for listing in listings :