2222from instaloader import Instaloader
2323
2424from example .embedding_utils import generate_event_embedding , is_duplicate_event
25+ from example .models import Events , Clubs
2526from services .openai_service import extract_event_from_caption
2627from services .storage_service import delete_image , upload_image_from_url
2728
@@ -113,10 +114,13 @@ def wrapper(*args, **kwargs):
def extract_s3_filename_from_url(image_url: str) -> "str | None":
    """Derive the S3 object key (under the ``events/`` prefix) from an image URL.

    The stored filename is the last path segment of the URL.

    Args:
        image_url: Full URL of the uploaded image; may be empty or None.

    Returns:
        The S3 key ``events/<filename>``, or None when ``image_url`` is
        falsy — callers must check before passing the result to delete_image.
    """
    if not image_url:
        # Annotated as str | None: empty/missing URLs yield no key.
        return None
    filename = image_url.split("/")[-1]
    return f"events/{filename}"
118119
119- def append_event_to_csv (event_data , club_ig , post_url , status = "success" , embedding = None ):
120+
121+ def append_event_to_csv (
122+ event_data , club_ig , post_url , status = "success" , embedding = None
123+ ):
120124 csv_file = Path (__file__ ).resolve ().parent / "events_scraped.csv"
121125 csv_file .parent .mkdir (parents = True , exist_ok = True )
122126 file_exists = csv_file .exists ()
@@ -135,7 +139,7 @@ def append_event_to_csv(event_data, club_ig, post_url, status="success", embeddi
135139 "registration" ,
136140 "image_url" ,
137141 "status" ,
138- "embedding"
142+ "embedding" ,
139143 ]
140144 writer = csv .DictWriter (csvfile , fieldnames = fieldnames )
141145 if not file_exists :
@@ -154,7 +158,7 @@ def append_event_to_csv(event_data, club_ig, post_url, status="success", embeddi
154158 "registration" : event_data .get ("registration" , False ),
155159 "image_url" : event_data .get ("image_url" , "" ),
156160 "status" : status ,
157- "embedding" : embedding or ""
161+ "embedding" : embedding or "" ,
158162 }
159163 )
160164
@@ -165,100 +169,81 @@ def insert_event_to_db(event_data, club_ig, post_url):
165169 event_date = event_data .get ("date" )
166170 event_location = event_data .get ("location" ) # .title()
167171 try :
168- with connection .cursor () as cur :
169- # Get club_type from club handle
170- cur .execute ("SELECT club_type FROM clubs WHERE ig = %s" , (club_ig ,))
171- club_row = cur .fetchone ()
172- club_type = club_row [0 ] if club_row else None
173- if not club_type :
174- logger .warning (
175- f"Club with handle { club_ig } not found in clubs. Inserting event with null club_type."
176- )
177-
178- # Check duplicates using vector similarity
179- logger .debug (
180- f"Checking for duplicates using vector similarity: { event_data } "
172+ # Get club_type based on club handle
173+ try :
174+ club = Clubs .objects .get (ig = club_ig )
175+ club_type = club .club_type
176+ except Clubs .DoesNotExist :
177+ club_type = None
178+ logger .warning (
179+ f"Club with handle { club_ig } not found, inserting event with null club_type"
181180 )
182181
183- # Check if this event is a duplicate using vector similarity
184- if is_duplicate_event (event_data ):
185- logger .debug (
186- f"Duplicate event found using vector similarity: { event_name } at { event_location } "
187- )
188- return False
189-
190- # Generate embedding for the event
191- embedding = generate_event_embedding (event_data )
192-
193- insert_query = """
194- INSERT INTO events (
195- club_handle, url, name, date, start_time, end_time, location, price, food, registration, image_url, embedding, club_type
196- )
197- VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::vector, %s)
198- ON CONFLICT DO NOTHING;
199- """
200- cur .execute (
201- insert_query ,
202- (
203- club_ig ,
204- post_url ,
205- event_name ,
206- event_date ,
207- event_data ["start_time" ],
208- event_data ["end_time" ] or None ,
209- event_location ,
210- event_data .get ("price" , None ),
211- event_data .get ("food" ) or "" ,
212- bool (event_data .get ("registration" , False )),
213- event_data .get ("image_url" ),
214- embedding ,
215- club_type ,
216- ),
217- )
218- logger .debug (f"Event inserted: { event_data .get ('name' )} from { club_ig } " )
182+ # Check duplicates using vector sim
183+ logger .debug (f"Checking duplicates for event with data: { event_data } " )
184+ if is_duplicate_event (event_data ):
185+ logger .debug (f"Duplicate event found: { event_name } at { event_location } " )
186+ return False
219187
188+ # Generate embedding
189+ embedding = generate_event_embedding (event_data )
190+
191+ # Create event using Django ORM
192+ event = Events .objects .create (
193+ club_handle = club_ig ,
194+ url = post_url ,
195+ name = event_name ,
196+ date = event_date ,
197+ start_time = event_data ["start_time" ],
198+ end_time = event_data ["end_time" ] or None ,
199+ location = event_location ,
200+ price = event_data .get ("price" , None ),
201+ food = event_data .get ("food" ) or "" ,
202+ registration = bool (event_data .get ("registration" , False )),
203+ image_url = event_data .get ("image_url" ),
204+ embedding = embedding ,
205+ club_type = club_type ,
206+ )
207+ logger .debug (f"Event inserted: { event_data .get ('name' )} from { club_ig } " )
220208 try :
221- append_event_to_csv (event_data , club_ig , post_url , status = "success" , embedding = embedding )
209+ append_event_to_csv (
210+ event_data , club_ig , post_url , status = "success" , embedding = embedding
211+ )
222212 logger .info (f"Appended event to CSV: { event_data .get ('name' )} " )
223213 except Exception as csv_err :
224214 logger .error (
225215 f"Database insert succeeded, but failed to append to CSV: { csv_err } "
226216 )
227217 logger .error (f"Traceback: { traceback .format_exc ()} " )
228218 return False
229-
230219 return True
231220 except Exception as e :
232221 logger .error (f"Database error: { str (e )} " )
233222 logger .error (f"Event data: { event_data } " )
234223 logger .error (f"Traceback: { traceback .format_exc ()} " )
235-
236224 try :
237225 embedding = generate_event_embedding (event_data )
238- append_event_to_csv (event_data , club_ig , post_url , status = "failed" , embedding = embedding )
226+ append_event_to_csv (
227+ event_data , club_ig , post_url , status = "failed" , embedding = embedding
228+ )
239229 logger .info (f"Appended event to CSV: { event_data .get ('name' )} " )
240230 except Exception as csv_err :
241- logger .error (
242- f"Database insert failed, and failed to append to CSV: { csv_err } "
243- )
231+ logger .error (f"Database and CSV inserts failed: { csv_err } " )
244232 logger .error (f"Traceback: { traceback .format_exc ()} " )
245-
246233 return False
247234
248235
def get_seen_shortcodes():
    """Return the set of Instagram post shortcodes already present in the
    events table, extracted from each stored post URL.

    On any database error, logs the failure and returns an empty set so the
    caller can proceed as if nothing has been seen.
    """
    logger.info("Fetching seen shortcodes from the database...")
    try:
        stored_urls = Events.objects.filter(url__isnull=False).values_list(
            "url", flat=True
        )
        seen = set()
        for url in stored_urls:
            if not url:
                continue
            # URLs look like .../p/<shortcode>/ — the shortcode is the
            # second-to-last slash-delimited segment.
            seen.add(url.split("/")[-2])
        return seen
    except Exception as e:
        logger.error(f"Could not fetch shortcodes from database: {e}")
        return set()
261-
246+
262247
263248def process_recent_feed (
264249 loader ,
@@ -272,7 +257,7 @@ def process_recent_feed(
272257 posts_processed = 0
273258 consec_old_posts = 0
274259 logger .info (f"Starting feed processing with cutoff: { cutoff } " )
275-
260+
276261 seen_shortcodes = get_seen_shortcodes ()
277262
278263 for post in loader .get_feed_posts ():
@@ -286,7 +271,7 @@ def process_recent_feed(
286271 )
287272 break
288273 continue # to next post
289-
274+
290275 consec_old_posts = 0
291276 posts_processed += 1
292277 logger .info ("\n " + "-" * 50 )
@@ -311,7 +296,9 @@ def process_recent_feed(
311296 if image_url :
312297 s3_filename = extract_s3_filename_from_url (image_url )
313298 if s3_filename and delete_image (s3_filename ):
314- logger .info (f"Deleted S3 file for failed event extraction: { s3_filename } " )
299+ logger .info (
300+ f"Deleted S3 file for failed event extraction: { s3_filename } "
301+ )
315302 continue
316303
317304 post_url = f"https://www.instagram.com/p/{ post .shortcode } /"
@@ -329,7 +316,9 @@ def process_recent_feed(
329316 if image_url :
330317 s3_filename = extract_s3_filename_from_url (image_url )
331318 if s3_filename and delete_image (s3_filename ):
332- logger .info (f"Deleted S3 file for failed DB insert: { s3_filename } " )
319+ logger .info (
320+ f"Deleted S3 file for failed DB insert: { s3_filename } "
321+ )
333322 else :
334323 missing_fields = [
335324 key
@@ -344,13 +333,19 @@ def process_recent_feed(
344333 if image_url :
345334 s3_filename = extract_s3_filename_from_url (image_url )
346335 if s3_filename and delete_image (s3_filename ):
347- logger .info (f"Deleted S3 file for event with missing fields: { s3_filename } " )
336+ logger .info (
337+ f"Deleted S3 file for event with missing fields: { s3_filename } "
338+ )
348339 append_event_to_csv (
349- event_data , post .owner_username , post_url , status = "missing_fields" , embedding = embedding
340+ event_data ,
341+ post .owner_username ,
342+ post_url ,
343+ status = "missing_fields" ,
344+ embedding = embedding ,
350345 )
351-
346+
352347 time .sleep (random .uniform (15 , 45 ))
353-
348+
354349 if posts_processed >= max_posts :
355350 logger .info (f"Reached max post limit of { max_posts } , stopping" )
356351 break
0 commit comments