1212import csv
1313import logging
1414import random
15- import re
1615import time
1716import traceback
17+ import re
18+ import requests
1819import json
1920from datetime import datetime , timedelta , timezone
2021from pathlib import Path
2627from apps .events .models import Events
2728from services .openai_service import extract_events_from_caption , generate_embedding
2829from services .storage_service import upload_image_from_url
30+ from zyte_setup import setup_zyte
31+ from logging_config import logger
2932from utils .embedding_utils import find_similar_events
3033
3134USER_AGENTS = [
3639 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36" ,
3740]
3841
39- LOG_DIR = Path ("logs" )
40- LOG_DIR .mkdir (exist_ok = True )
41- LOG_FILE = LOG_DIR / "scraping.log"
42-
43- logging .getLogger ("urllib3" ).setLevel (logging .WARNING )
44- logging .getLogger ("requests" ).setLevel (logging .WARNING )
45- logging .basicConfig (
46- level = logging .DEBUG ,
47- format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" ,
48- handlers = [
49- logging .StreamHandler (sys .stdout ),
50- logging .FileHandler (LOG_FILE , encoding = "utf-8" ),
51- ],
52- )
53- logger = logging .getLogger (__name__ )
54-
55- MAX_POSTS = 50
42+ MAX_POSTS = int (os .getenv ("MAX_POSTS" , "100" ))
5643MAX_CONSEC_OLD_POSTS = 10
5744CUTOFF_DAYS = 2
5845
@@ -429,10 +416,53 @@ def process_recent_feed(
429416 logger .info (f"Added { events_added } event(s) to Supabase" )
430417
431418
def test_zyte_proxy(country="CA"):
    """Route ``requests`` traffic through the Zyte proxy and smoke-test it.

    Despite the ``test_`` prefix this is not a unit test: it has lasting
    side effects that the rest of the process relies on.

    Side effects:
        * Sets the ``https_proxy`` environment variable to ``ZYTE_PROXY``.
        * Replaces ``requests.Session.request`` with a wrapper that forces
          the ``Zyte-Geolocation`` header, the ``verify`` value returned by
          ``setup_zyte()`` and the Zyte ``proxies`` mapping on every
          request, and applies a 60 second timeout when the caller did not
          supply one.

    After patching, one request is sent to ``https://ipapi.co/json/`` and
    the reported IP / country / city are logged at debug level. A failure
    of that probe is logged and swallowed; the patch stays installed.

    Args:
        country: Value sent in the ``Zyte-Geolocation`` header
            (presumably an ISO country code such as ``"CA"`` -- confirm
            against the Zyte documentation).

    Raises:
        RuntimeError: If the ``ZYTE_PROXY`` environment variable is unset
            or empty.
    """
    # NOTE(review): the return value is only ever used as the ``verify``
    # argument, so it is presumably a CA bundle path -- confirm in zyte_setup.
    zyte_cert_path = setup_zyte()

    zyte_proxy = os.getenv("ZYTE_PROXY")
    if not zyte_proxy:
        # Assigning None into os.environ raises an opaque TypeError; fail
        # with a message that names the missing configuration instead.
        raise RuntimeError("ZYTE_PROXY environment variable is not set")
    os.environ["https_proxy"] = zyte_proxy

    # If this function already patched Session.request, wrap the pristine
    # implementation again rather than stacking wrapper on wrapper.
    original_request = getattr(
        requests.Session.request, "_zyte_original", requests.Session.request
    )

    def zyte_request(self, method, url, **kwargs):
        # Copy so the caller's dict is never mutated; tolerate headers=None.
        headers = dict(kwargs.get("headers") or {})
        headers["Zyte-Geolocation"] = country
        kwargs["headers"] = headers
        kwargs["verify"] = zyte_cert_path
        kwargs["proxies"] = {"http": zyte_proxy, "https": zyte_proxy}
        kwargs.setdefault("timeout", 60)
        return original_request(self, method, url, **kwargs)

    zyte_request._zyte_original = original_request
    requests.Session.request = zyte_request

    logger.debug("Testing Zyte proxy geolocation: %s", country)
    try:
        # ``verify`` and ``proxies`` are injected by the patched request.
        resp = requests.get("https://ipapi.co/json/", timeout=15)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        # Deliberately best-effort: the probe is diagnostic only, so any
        # failure is reported and the caller continues.
        logger.warning("Proxy geolocation test failed: %s", e)
        return

    logger.debug("Connected via Zyte proxy")
    logger.debug("Public IP: %s", data.get("ip"))
    logger.debug(
        "Country: %s (%s)", data.get("country_name"), data.get("country")
    )
    logger.debug("City: %s", data.get("city"))
456+
432457@handle_instagram_errors
433458def session ():
434459 L = Instaloader (user_agent = random .choice (USER_AGENTS ))
435- session_file = Path (__file__ ).resolve ().parent .parent / ("session-" + USERNAME )
460+ try :
461+ SESSION_CACHE_DIR = Path (os .getenv ("GITHUB_WORKSPACE" , "." )) / ".insta_cache"
462+ SESSION_CACHE_DIR .mkdir (exist_ok = True )
463+ session_file = SESSION_CACHE_DIR / f"session-{ USERNAME } "
464+ except Exception as e :
465+ session_file = Path (__file__ ).resolve ().parent .parent / ("session-" + USERNAME )
436466 try :
437467 if session_file .exists ():
438468 L .load_session_from_file (USERNAME , filename = str (session_file ))
@@ -457,6 +487,7 @@ def session():
457487
458488
459489if __name__ == "__main__" :
490+ test_zyte_proxy ("CA" )
460491 logger .info ("Attemping to load Instagram session..." )
461492 L = session ()
462493 if L :
0 commit comments