import csv
import logging
import os
import sys
import time
import traceback
from datetime import datetime, timedelta, timezone
from pathlib import Path

import psycopg2
from dotenv import load_dotenv
from fuzzywuzzy import fuzz
from instaloader import Instaloader

1617logging .basicConfig (
1718 level = logging .DEBUG ,
2324)
2425logger = logging .getLogger (__name__ )
2526
# Populate process configuration from the environment. The .env file is read
# first so local development sees the same variable names as production.
load_dotenv()

# Alias the lookup once; every setting below is a plain environment read.
_env = os.getenv

# Instagram login credentials and session cookies, plus the Postgres DSN.
USERNAME = _env("USERNAME")
PASSWORD = _env("PASSWORD")
CSRFTOKEN = _env("CSRFTOKEN")
SESSIONID = _env("SESSIONID")
DS_USER_ID = _env("DS_USER_ID")
MID = _env("MID")
IG_DID = _env("IG_DID")
SUPABASE_DB_URL = _env("SUPABASE_DB_URL")
2642def get_post_image_url (post ):
2743 try :
2844 if "image_versions2" in post ._node and post ._node ["image_versions2" ]:
@@ -56,23 +72,11 @@ def wrapper(*args, **kwargs):
5672 return wrapper
5773
5874
59- # Load environment variables from .env file
60- load_dotenv ()
61-
62- # Get credentials from environment variables
63- USERNAME = os .getenv ("USERNAME" )
64- PASSWORD = os .getenv ("PASSWORD" )
65- CSRFTOKEN = os .getenv ("CSRFTOKEN" )
66- SESSIONID = os .getenv ("SESSIONID" )
67- DS_USER_ID = os .getenv ("DS_USER_ID" )
68- MID = os .getenv ("MID" )
69- IG_DID = os .getenv ("IG_DID" )
70-
71-
7275def append_event_to_csv (event_data , club_ig , post_url , status = "success" ):
73- csv_file = "backend/scraping/events_scraped.csv"
74- os .makedirs (os .path .dirname (csv_file ), exist_ok = True )
75- file_exists = os .path .isfile (csv_file )
76+ csv_file = Path (__file__ ).resolve ().parent / "events_scraped.csv"
77+ csv_file .parent .mkdir (parents = True , exist_ok = True )
78+ file_exists = csv_file .exists ()
79+
7680 with open (csv_file , "a" , newline = "" , encoding = "utf-8" ) as csvfile :
7781 fieldnames = [
7882 "club_handle" , "url" , "name" , "date" , "start_time" , "end_time" ,
@@ -104,7 +108,7 @@ def insert_event_to_db(event_data, club_ig, post_url, sim_threshold=80):
104108 event_location = event_data .get ("location" ) #.title()
105109 conn = None
106110 try :
107- conn = psycopg2 .connect (os . getenv ( " SUPABASE_DB_URL" ) )
111+ conn = psycopg2 .connect (SUPABASE_DB_URL )
108112 cur = conn .cursor ()
109113
110114 # Check duplicates
@@ -146,6 +150,7 @@ def insert_event_to_db(event_data, club_ig, post_url, sim_threshold=80):
146150
147151 try :
148152 append_event_to_csv (event_data , club_ig , post_url , status = "success" )
153+ logger .info (f"Appended event to CSV: { event_data .get ('name' )} " )
149154 except Exception as csv_err :
150155 logger .error (f"Database insert succeeded, but failed to append to CSV: { csv_err } " )
151156 logger .error (f"Traceback: { traceback .format_exc ()} " )
@@ -159,6 +164,7 @@ def insert_event_to_db(event_data, club_ig, post_url, sim_threshold=80):
159164
160165 try :
161166 append_event_to_csv (event_data , club_ig , post_url , status = "failed" )
167+ logger .info (f"Appended event to CSV: { event_data .get ('name' )} " )
162168 except Exception as csv_err :
163169 logger .error (f"Database insert failed, and failed to append to CSV: { csv_err } " )
164170 logger .error (f"Traceback: { traceback .format_exc ()} " )
@@ -169,79 +175,72 @@ def insert_event_to_db(event_data, club_ig, post_url, sim_threshold=80):
169175 conn .close ()
170176
171177
def process_recent_feed(loader, cutoff=None, max_posts=100, max_consec_old_posts=3):
    """Scan the authenticated Instagram feed and persist event posts.

    For each recent feed post: upload its image to S3, parse the caption
    into structured event data, and insert complete events into the
    database. Scanning stops after ``max_consec_old_posts`` consecutive
    posts older than ``cutoff``, or once ``max_posts`` posts have been
    examined.

    Args:
        loader: Authenticated ``Instaloader`` session providing
            ``get_feed_posts()``.
        cutoff: Oldest acceptable post timestamp (timezone-aware UTC
            datetime). Defaults to 48 hours before *now*, computed per
            call — a ``datetime.now()`` default argument would be frozen
            at import time and grow stale in a long-lived process.
        max_posts: Upper bound on the number of feed posts examined.
        max_consec_old_posts: How many consecutive too-old posts to
            tolerate before assuming the rest of the feed is stale.
    """
    if cutoff is None:
        cutoff = datetime.now(timezone.utc) - timedelta(days=2)

    events_added = 0
    posts_processed = 0
    consec_old_posts = 0
    s3_uploader = S3ImageUploader()  # Initialize S3 uploader
    logger.info(f"Starting feed processing with cutoff: {cutoff}")

    for post in loader.get_feed_posts():
        # Enforce the post budget BEFORE touching this post, so exactly
        # max_posts posts are examined (previously the final post was
        # counted but the loop broke before processing it).
        if posts_processed >= max_posts:
            logger.info(f"Reached max post limit of {max_posts}, stopping.")
            break
        try:
            posts_processed += 1
            logger.info("\n" + "-" * 50)
            logger.info(f"Processing post: {post.shortcode} by {post.owner_username}")

            # Instaloader returns naive UTC datetimes; attach tzinfo so the
            # comparison against the aware cutoff is valid.
            post_time = post.date_utc.replace(tzinfo=timezone.utc)
            if post_time < cutoff:
                consec_old_posts += 1
                if consec_old_posts >= max_consec_old_posts:
                    logger.info(f"Reached {max_consec_old_posts} consecutive old posts, stopping.")
                    break
                continue  # to next post
            consec_old_posts = 0

            # Safely get image URL and upload to S3
            raw_image_url = get_post_image_url(post)
            if raw_image_url:
                image_url = s3_uploader.upload_image(raw_image_url)
                logger.info(f"Uploaded image to S3: {image_url}")
            else:
                logger.warning(f"No image URL found for post {post.shortcode}, skipping image upload")
                image_url = None

            event_data = parse_caption_for_event(post.caption, image_url)
            if event_data is None:
                logger.warning(f"AI client returned None for post {post.shortcode}")
                continue

            post_url = f"https://www.instagram.com/p/{post.shortcode}/"
            # Only events with all required fields are worth inserting.
            missing_fields = [
                key for key in ("name", "date", "location", "start_time")
                if not event_data.get(key)
            ]
            if not missing_fields:
                if insert_event_to_db(event_data, post.owner_username, post_url):
                    events_added += 1
                    logger.info(f"Successfully added event from {post.owner_username}")
            else:
                logger.warning(f"Missing required fields: {missing_fields}, skipping event")
            time.sleep(5)  # throttle between posts to stay under rate limits
        except Exception as e:
            # One bad post must not abort the whole run: log and continue.
            logger.error(f"Error processing post {post.shortcode} by {post.owner_username}: {str(e)}")
            logger.error(f"Traceback: {traceback.format_exc()}")
            continue  # with next post

    logger.info(f"Feed processing completed. Processed {posts_processed} posts, added {events_added} events")
    logger.info("\n--- Summary ---")
    logger.info(f"Added {events_added} event(s) to Supabase")
238237
239238
240239@handle_instagram_errors
241240def session ():
242241 L = Instaloader ()
242+ logger .info ("Attemping to load Instagram session..." )
243243 try :
244- logger .info ("Attemping to load Instagram session..." )
245244 L .load_session (
246245 USERNAME ,
247246 {
@@ -262,4 +261,4 @@ def session():
262261
if __name__ == "__main__":
    # Entry point: authenticate a session, then scrape the recent feed.
    insta_loader = session()
    process_recent_feed(insta_loader)
0 commit comments