@@ -14,25 +14,12 @@ def parse_price(price_str):
     price_match = re.search(r'\d+[.,]?\d*', price_str.replace('\u20ac ', '').replace('.', '').replace(',', '.'))
     return int(price_match.group(0).split('.')[0]) if price_match else None
 
-
-
-def parse_page(url: str) -> Dict[str, Optional[any]]:
-    logging.debug("Parsing page: %s", url)
-
+def get_opener():
     proxy_url = os.getenv('proxy_url')
-
-    # Create a context that doesn't verify certificates
     ctx = ssl.create_default_context()
     ctx.check_hostname = False
     ctx.verify_mode = ssl.CERT_NONE
-
-    opener = urllib.request.build_opener(
-        urllib.request.ProxyHandler({
-            'http': proxy_url,
-            'https': proxy_url
-        }),
-        urllib.request.HTTPSHandler(context=ctx)  # Add the SSL context
-    )
+    opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxy_url, 'https': proxy_url}), urllib.request.HTTPSHandler(context=ctx))
     opener.addheaders = [
         ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'),
         ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'),
@@ -45,15 +32,23 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
         ('Sec-Fetch-Site', 'none'),
         ('Sec-Fetch-User', '?1'),
     ]
-
+    return opener
+
+def get_html_content(url: str) -> str:
+    opener = get_opener()
     response = opener.open(url)
-
-    # Check if response is gzipped
     if response.info().get('Content-Encoding') == 'gzip':
         import gzip
         html_content = gzip.decompress(response.read()).decode('utf-8', errors='replace')
     else:
         html_content = response.read().decode('utf-8', errors='replace')
+    return html_content
+
+
+def parse_page(url: str) -> Dict[str, Optional[any]]:
+    logging.debug("Parsing page: %s", url)
+
+    html_content = get_html_content(url)
 
     soup = BeautifulSoup(html_content, 'html.parser')
 
@@ -113,22 +108,9 @@ def parse_page(url: str) -> Dict[str, Optional[any]]:
 
 
 def parse_listing(url: str) -> List[Dict[str, Optional[any]]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.9',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'none',
-        'Sec-Fetch-User': '?1',
-    }
     logging.debug("Fetching main listing page: %s", url)
-    response = requests.get(url, headers=headers)
-    print(response.text)
-    soup = BeautifulSoup(response.text, 'html.parser')
+    html_content = get_html_content(url)
+    soup = BeautifulSoup(html_content, 'html.parser')
     data_list = []
     links = soup.select('a.in-listingCardTitle')
     for link in links:
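
A minimal usage sketch for the refactored helpers, assuming the module's existing imports and that the 'proxy_url' environment variable points at a reachable proxy (the proxy address and listing URL below are placeholders):

import os

os.environ['proxy_url'] = 'http://127.0.0.1:8080'  # placeholder proxy; get_opener() reads this via os.getenv

# get_html_content() opens the URL through the proxied, certificate-ignoring opener
# and transparently gunzips the body before decoding it.
html = get_html_content('https://example.com/listing')

# parse_listing() now reuses the same fetch path instead of a separate requests.get() call,
# so the listing page and the detail pages go through the proxy with identical headers.
rows = parse_listing('https://example.com/listing')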