4040from utils .date_utils import parse_utc_datetime
4141
4242
43- MAX_POSTS = int (os .getenv ("MAX_POSTS" , "30 " ))
43+ MAX_POSTS = int (os .getenv ("MAX_POSTS" , "15 " ))
4444MAX_CONSEC_OLD_POSTS = 10
4545CUTOFF_DAYS = int (os .getenv ("CUTOFF_DAYS" , "2" ))
4646
@@ -311,38 +311,6 @@ def get_seen_shortcodes():
311311 return set ()
312312
313313
314- def safe_feed_posts (loader , retries = 3 , backoff = 60 ):
315- """
316- Yield posts from loader.get_feed_posts(), retrying on network errors.
317- On error, re-instantiate the session and skip already-yielded posts.
318- """
319- seen_shortcodes = set ()
320- attempts = 0
321- while attempts < retries :
322- try :
323- for post in loader .get_feed_posts ():
324- if hasattr (post , "shortcode" ):
325- if post .shortcode in seen_shortcodes :
326- continue
327- seen_shortcodes .add (post .shortcode )
328- yield post
329- time .sleep (random .uniform (30 , 90 ))
330- break # Finished all posts
331- except (ReadTimeout , ConnectionError , requests .exceptions .SSLError ) as e :
332- attempts += 1
333- logger .warning (f"Network error: { e !s} . Retrying in { backoff } seconds (attempt { attempts } /{ retries } )..." )
334- time .sleep (backoff )
335- try :
336- new_loader = session ()
337- loader .__dict__ .update (new_loader .__dict__ )
338- logger .info ("Session re-instantiated successfully. Continuing feed scrape." )
339- except Exception as session_e :
340- logger .error (f"Failed to re-instantiate session: { session_e } " )
341- break
342- if attempts >= retries :
343- logger .error ("Too many consecutive network errors. Aborting feed scrape." )
344-
345-
346314def process_recent_feed (
347315 loader ,
348316 cutoff = None ,
@@ -365,7 +333,7 @@ def process_recent_feed(
365333 seen_shortcodes = get_seen_shortcodes ()
366334
367335 try :
368- for post in safe_feed_posts ( loader ):
336+ for post in loader . get_feed_posts ( ):
369337 try :
370338 post_time = timezone .make_aware (post .date_utc ) if timezone .is_naive (post .date_utc ) else post .date_utc
371339 if post_time < cutoff :
@@ -496,6 +464,8 @@ def process_recent_feed(
496464 f"Reached { max_consec_old_posts } consecutive old posts, stopping."
497465 )
498466 break
467+
468+ time .sleep (random .uniform (30 , 90 ))
499469
500470 if not termination_reason :
501471 termination_reason = "no_more_posts"
@@ -513,7 +483,7 @@ def process_recent_feed(
513483 logger .info (f"Added { events_added } event(s) to Supabase" )
514484
515485
516- def test_zyte_proxy (country = "CA" ):
486+ def create_proxy_session (country = "CA" ):
517487 """
518488 Patch requests.Session to route through Zyte with geolocation,
519489 test Zyte proxy routing and geolocation
@@ -526,17 +496,11 @@ def test_zyte_proxy(country="CA"):
526496 logger .warning (
527497 "ZYTE_PROXY not set - skipping proxied geolocation test and trying direct request"
528498 )
529- try :
530- resp = requests .get ("https://ipapi.co/json/" , timeout = 15 )
531- resp .raise_for_status ()
532- logger .info ("Direct geolocation test succeeded" )
533- return True
534- except Exception as e :
535- logger .warning (f"Direct geolocation test failed: { e !s} " )
536- return False
537-
538- old_request = requests .Session .request
539-
499+ return requests .Session ()
500+
501+ session = requests .Session ()
502+ old_request = session .request
503+
540504 def zyte_request (self , method , url , ** kwargs ):
541505 headers = kwargs .get ("headers" , {})
542506 headers ["Zyte-Geolocation" ] = country
@@ -547,29 +511,30 @@ def zyte_request(self, method, url, **kwargs):
547511 kwargs ["timeout" ] = kwargs .get ("timeout" , 30 )
548- return old_request (self , method , url , ** kwargs )
512+ return old_request (method , url , ** kwargs )
549513
550- requests .Session .request = zyte_request
551-
514+ session .request = zyte_request .__get__ (session , requests .Session )
552515 logger .debug (f"Testing Zyte proxy geolocation: { country } " )
553516 try :
554- resp = requests .get (
555- "https://ipapi.co/json/" ,
556- timeout = 30 ,
557- verify = str (zyte_cert_path ) if zyte_cert_path else True ,
558- proxies = {"http" : zyte_proxy , "https" : zyte_proxy },
559- )
517+ resp = session .get ("https://ipapi.co/json/" )
560518 resp .raise_for_status ()
561519 data = resp .json ()
562520 logger .debug ("Connected via Zyte proxy" )
563521 logger .debug (f"Public IP: { data .get ('ip' )} " )
564522 logger .debug (f"Country: { data .get ('country_name' )} ({ data .get ('country' )} )" )
565523 logger .debug (f"City: { data .get ('city' )} " )
524+ return session
566525 except Exception as e :
567526 logger .warning (f"Proxied geolocation failed: { e !s} " )
568- return True
527+ return None
569528
570529
571530def session ():
531+ proxied_session = create_proxy_session ("CA" )
532+ if not proxied_session :
533+ logger .critical ("Failed to create proxied session, aborting..." )
534+ return None
535+
572536 L = Instaloader (user_agent = random .choice (USER_AGENTS ))
537+ L .context ._session = proxied_session
573538 L .context .request_timeout = 120
574539 L .context .max_connection_attempts = 5
575540 try :
@@ -603,11 +568,20 @@ def session():
603568
604569
605570if __name__ == "__main__" :
606- test_zyte_proxy ("CA" )
607- logger .info ("Attemping to load Instagram session..." )
608- L = session ()
609- if L :
610- logger .info ("Session created successfully!" )
611- process_recent_feed (L )
612- else :
613- logger .critical ("Failed to initialize Instagram session, stopping..." )
571+ lock_file_path = Path (__file__ ).parent / "scrape.lock"
572+ if lock_file_path .exists ():
573+ sys .exit ()
574+ try :
575+ lock_file_path .touch ()
576+ logger .info ("Attempting to load Instagram session..." )
577+ L = session ()
578+ if L :
579+ logger .info ("Session created successfully!" )
580+ process_recent_feed (L )
581+ else :
582+ logger .critical ("Failed to initialize Instagram session, stopping..." )
583+ except Exception as e :
584+ logger .error (f"An uncaught exception occurred: { e } " )
585+ finally :
586+ if lock_file_path .exists ():
587+ lock_file_path .unlink ()
0 commit comments