 import random
 import time
 import traceback
+import re
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 
@@ -116,6 +117,32 @@ def extract_s3_filename_from_url(image_url: str) -> str:
     return f"events/{filename}"
 
 
+def normalize_string(s):
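+    # Lowercase and strip all non-word characters so that minor formatting
+    # differences (punctuation, spacing, case) don't defeat the duplicate check.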
+    if not s:
+        return ""
+    return re.sub(r"\W+", "", s).lower().strip()
+
+
+def is_duplicate_event(event_data):
+    """Check for duplicate events (same name, date, location, time)"""
+    name = normalize_string(event_data.get("name"))
+    location = normalize_string(event_data.get("location"))
+    date = event_data.get("date")
+    start_time = event_data.get("start_time")
+    end_time = event_data.get("end_time")
+
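+    # Narrow the query to events on the same date, then compare the
+    # normalized fields in Python.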
+    candidates = Events.objects.filter(date=date)
+    for c in candidates:
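+        # A missing end_time on the incoming event acts as a wildcard:
+        # end_time is only compared when the new event provides one.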
+        if (
+            normalize_string(c.name) == name and
+            normalize_string(c.location) == location and
+            str(c.start_time) == str(start_time) and
+            (not end_time or str(c.end_time) == str(end_time))
+        ):
+            return True
+    return False
+
+
 def append_event_to_csv(
     event_data, club_ig, post_url, status="success", embedding=None
 ):
@@ -169,6 +196,13 @@ def insert_event_to_db(event_data, club_ig, post_url):
     event_date = event_data.get("date")
     event_location = event_data.get("location")  # .title()
     try:
+        # Duplicate check
+        if is_duplicate_event(event_data):
+            logger.info(
+                f"Duplicate event detected, skipping {event_name} on {event_date} at {event_location}"
+            )
+            return False
+
         # Get club_type based on club handle
         try:
             club = Clubs.objects.get(ig=club_ig)
@@ -191,17 +225,18 @@ def insert_event_to_db(event_data, club_ig, post_url):
             for existing in Events.objects.filter(
                 id__in=candidate_ids, date=event_date
             ):
+                # Only replace if the new event has an image but the existing
+                # one doesn't, or if the new description is meaningfully longer
+                # (more than 10 extra characters of info)
+                new_img = event_data.get("image_url")
+                old_img = existing.image_url
+                new_desc = event_data.get("description") or ""
+                old_desc = existing.description or ""
                 if (
-                    (existing.location or "") == (event_location or "")
-                    and (existing.start_time or "")
-                    == (event_data.get("start_time") or "")
-                    and (
-                        (existing.end_time or None)
-                        == (event_data.get("end_time") or None)
-                    )
+                    (not old_img and new_img)
+                    or (len(new_desc) > len(old_desc) + 10)
                 ):
                     logger.info(
-                        f"Deleting older duplicate event id={existing.id} before inserting refreshed version"
+                        f"Replacing older event: id={existing.id} with newer one"
                     )
                     existing.delete()
         except Exception as dedup_err: