@@ -75,8 +75,50 @@ def insert_event_to_db(event_data, ig_handle, source_url, club_type=None):
7575 logger .warning (f"{ log_prefix } Event '{ title } ' missing categories, assigning 'Uncategorized'" )
7676 categories = ["Uncategorized" ]
7777
78- if is_duplicate_event (event_data , ig_handle = ig_handle , source_url = source_url ):
79- return "duplicate"
78+ detector = EventDuplicateDetector ()
79+ has_match , matched_event = detector .find_match (event_data , ig_handle = ig_handle , source_url = source_url )
80+
81+ if has_match :
82+ # If event is from same club, update event info
83+ if matched_event and matched_event .ig_handle == ig_handle :
84+ logger .info (f"{ log_prefix } Updating existing event '{ matched_event .title } ' (ID: { matched_event .id } ) with new information" )
85+
86+ matched_event .title = title
87+ matched_event .location = location
88+ matched_event .description = description or None
89+ matched_event .price = price or None
90+ matched_event .food = food or None
91+ matched_event .registration = registration
92+ matched_event .source_image_url = source_image_url or None
93+ matched_event .categories = categories
94+ matched_event .source_url = source_url
95+ matched_event .save ()
96+
97+ # Delete old event dates and create new ones
98+ EventDates .objects .filter (event = matched_event ).delete ()
99+ event_dates = []
100+ for occ in occurrences :
101+ dtstart_utc = parse_utc_datetime (occ .get ("dtstart_utc" ))
102+ dtend_utc_raw = occ .get ("dtend_utc" )
103+ dtend_utc = (
104+ parse_utc_datetime (dtend_utc_raw )
105+ if dtend_utc_raw and dtend_utc_raw .strip ()
106+ else None
107+ )
108+ event_dates .append (
109+ EventDates (
110+ event = matched_event ,
111+ dtstart_utc = dtstart_utc ,
112+ dtend_utc = dtend_utc ,
113+ duration = occ .get ("duration" ) or None ,
114+ tz = occ .get ("tz" ) or None ,
115+ )
116+ )
117+ EventDates .objects .bulk_create (event_dates )
118+ logger .info (f"{ log_prefix } Updated event with { len (event_dates )} new date(s)" )
119+ return "updated"
120+ else :
121+ return "duplicate"
80122
81123 # Only fetch if club_type wasn't passed in
82124 if club_type is None :
@@ -137,33 +179,102 @@ def insert_event_to_db(event_data, ig_handle, source_url, club_type=None):
137179 return False
138180
139181
140- def is_duplicate_event (event_data , ig_handle = None , source_url = None ):
141- """Check for duplicate events using title, occurrences, location, and description."""
142-
143- title = event_data .get ("title" ) or ""
144- location = event_data .get ("location" ) or ""
145- description = event_data .get ("description" ) or ""
146- occurrences = event_data .get ("occurrences" )
147-
148- log_prefix = f"[{ ig_handle } ] [{ source_url .split ('/' )[- 1 ] if source_url else 'UNKNOWN' } ]"
182+ class EventDuplicateDetector :
183+ """Handles duplicate event detection"""
184+
def __init__(
    self,
    *,
    same_club_title_threshold=0.8,
    title_similarity_threshold=0.7,
    location_similarity_threshold=0.5,
    description_similarity_threshold=0.3,
):
    """Configure the similarity cut-offs used by the detector.

    Every threshold is compared with a strict ``>``, so a score exactly
    equal to a threshold does not count as a match. The defaults are the
    values that used to be hard-coded, so ``EventDuplicateDetector()``
    behaves exactly as before; pass keyword arguments to tune matching.

    Args:
        same_club_title_threshold: Title similarity above which a post from
            the same club is treated as an update of an existing event.
        title_similarity_threshold: Title similarity that, together with a
            similar location, marks a same-day duplicate.
        location_similarity_threshold: Location similarity required by every
            same-day duplicate rule.
        description_similarity_threshold: Description similarity that,
            together with a similar location, marks a same-day duplicate.
    """
    # The attribute names stay upper-case because the other methods of the
    # class read them under these exact names.
    self.SAME_CLUB_TITLE_THRESHOLD = same_club_title_threshold
    self.TITLE_SIMILARITY_THRESHOLD = title_similarity_threshold
    self.LOCATION_SIMILARITY_THRESHOLD = location_similarity_threshold
    self.DESCRIPTION_SIMILARITY_THRESHOLD = description_similarity_threshold
190+
def find_match(self, event_data, ig_handle=None, source_url=None):
    """Look for a stored event that the incoming event duplicates or updates.

    Two strategies run in order and the first hit wins:

    1. Same-club update: an event from ``ig_handle`` with a very similar
       title, regardless of its date.
    2. Same-day duplicate: an event starting on the same calendar day as
       the incoming event's first occurrence, compared by title, location
       and description.

    Neither strategy runs unless the incoming event has at least one
    occurrence whose ``dtstart_utc`` parses; only the first occurrence is
    inspected.

    Args:
        event_data: Mapping describing the incoming event. The keys read
            here are "title", "location", "description" and "occurrences".
        ig_handle: Handle of the posting club; also used in log messages.
        source_url: URL of the source post. Only its last path segment is
            used, as part of the log prefix.

    Returns:
        tuple: (has_match: bool, matched_event: Event | None)
            - has_match: True if a matching event was found (it may be a
              duplicate or an update; the caller decides which)
            - matched_event: The matching event object if found, None otherwise
    """
    title = event_data.get("title") or ""
    location = event_data.get("location") or ""
    description = event_data.get("description") or ""
    occurrences = event_data.get("occurrences")

    post_id = source_url.split("/")[-1] if source_url else "UNKNOWN"
    log_prefix = f"[{ig_handle}] [{post_id}]"

    if not occurrences:
        return False, None

    target_start = parse_utc_datetime(occurrences[0].get("dtstart_utc"))
    if not target_start:
        return False, None

    try:
        # Strategy 1: same-club update (regardless of date)
        matched_event = self._check_same_club_update(ig_handle, title, log_prefix)
        if matched_event:
            return True, matched_event

        # Strategy 2: same-day duplicate
        matched_event = self._check_same_day_duplicate(
            target_start, title, location, description, ig_handle, log_prefix
        )
        if matched_event:
            return True, matched_event
    except Exception as exc:
        # Deliberately best effort: a failed lookup is reported as "no
        # match" so the caller can still proceed. logger.exception keeps
        # the traceback, which the previous logger.error call discarded.
        logger.exception(f"{log_prefix} Error during duplicate check: {exc!s}")

    return False, None
233+
def _check_same_club_update(self, ig_handle, title, log_prefix):
    """Find the same club's existing event that the new post most likely updates.

    Every event stored for ``ig_handle`` is compared by title only; dates
    are ignored so that a repost with a new date or location still matches.
    The title score is the larger of the Jaccard and sequence similarities
    and must be strictly greater than ``SAME_CLUB_TITLE_THRESHOLD``.

    When several events clear the threshold, the one with the highest score
    is returned. The queryset has no explicit ordering, so returning the
    first hit would make the updated event depend on database row order.

    Args:
        ig_handle: Handle of the posting club; falsy means "unknown club".
        title: Title of the incoming event.
        log_prefix: Prefix prepended to log messages.

    Returns:
        Event | None: The best-matching event if found, None otherwise
    """
    # An unknown club or an empty title cannot identify an earlier post.
    # NOTE(review): an empty title would presumably score as identical to
    # another untitled event and overwrite it - confirm against the helpers.
    if not ig_handle or not title:
        return None

    best_event = None
    best_title = ""
    # Starting from the threshold keeps the original strict ">" rule; ties
    # between candidates keep the first one encountered.
    best_sim = self.SAME_CLUB_TITLE_THRESHOLD

    for existing_event in Events.objects.filter(ig_handle=ig_handle):
        c_title = getattr(existing_event, "title", "") or ""
        title_sim = max(
            jaccard_similarity(c_title, title),
            sequence_similarity(c_title, title),
        )
        if title_sim > best_sim:
            best_event, best_title, best_sim = existing_event, c_title, title_sim

    if best_event is None:
        return None

    logger.warning(
        f"{log_prefix} Potential update detected: '{title}' matches existing event '{best_title}' "
        f"from same club (title_sim={best_sim:.3f}). This will be updated."
    )
    return best_event
261+
262+ def _check_same_day_duplicate (self , target_start , title , location , description , ig_handle , log_prefix ):
263+ """
264+ Check for duplicate events on the same day using title, location, and description similarity.
265+ Returns:
266+ Event | None: The matched event if found, None otherwise
267+ """
268+ from datetime import timedelta
269+
270+ day_start = target_start .replace (hour = 0 , minute = 0 , second = 0 , microsecond = 0 )
271+ day_end = day_start + timedelta (days = 1 )
272+
163273 candidates = EventDates .objects .select_related ("event" ).filter (
164274 dtstart_utc__gte = day_start ,
165275 dtstart_utc__lt = day_end
166276 )
277+
167278 if candidates :
168279 logger .debug (f"{ log_prefix } Found { len (candidates )} existing events on { day_start .date ()} for duplicate check." )
169280 for i , cand in enumerate (candidates [:3 ]):
@@ -183,35 +294,45 @@ def is_duplicate_event(event_data, ig_handle=None, source_url=None):
183294 if not c_start :
184295 continue
185296
297+ # Check substring match
186298 norm_title = normalize (title )
187299 norm_c_title = normalize (c_title )
188300 substring_match = norm_c_title in norm_title or norm_title in norm_c_title
189301
302+ # Calculate similarities
190303 title_sim = max (
191304 jaccard_similarity (c_title , title ),
192305 sequence_similarity (c_title , title ),
193306 )
194307 loc_sim = jaccard_similarity (c_loc , location )
195308 desc_sim = jaccard_similarity (c_desc , description )
196309
310+ # Check if substring match + similar location
197311 if substring_match :
198- logger .warning (
199- f"{ log_prefix } Duplicate by substring match: '{ title } ' @ '{ location } ' matches '{ c_title } ' @ '{ c_loc } '"
200- )
201- return True
312+ similar_location = loc_sim > self .LOCATION_SIMILARITY_THRESHOLD
313+ if similar_location :
314+ logger .warning (
315+ f"{ log_prefix } Duplicate by substring match + location: '{ title } ' @ '{ location } ' matches '{ c_title } ' @ '{ c_loc } ' "
316+ f"(ig_handle: { ig_handle } vs { existing_event .ig_handle } , loc_sim={ loc_sim :.3f} )"
317+ )
318+ return existing_event
319+ else :
320+ logger .debug (
321+ f"{ log_prefix } Substring match but different location: '{ title } ' ({ ig_handle } ) @ '{ location } ' vs '{ c_title } ' ({ existing_event .ig_handle } ) @ '{ c_loc } ' (loc_sim={ loc_sim :.3f} )"
322+ )
323+ continue
202324
203- if (title_sim > 0.7 and loc_sim > 0.5 ) or (
204- loc_sim > 0.5 and desc_sim > 0.3
325+ # Check similarity thresholds
326+ if (title_sim > self .TITLE_SIMILARITY_THRESHOLD and loc_sim > self .LOCATION_SIMILARITY_THRESHOLD ) or (
327+ loc_sim > self .LOCATION_SIMILARITY_THRESHOLD and desc_sim > self .DESCRIPTION_SIMILARITY_THRESHOLD
205328 ):
206329 logger .warning (
207330 f"{ log_prefix } Duplicate by similarity: '{ title } ' @ '{ location } ' matches '{ c_title } ' @ '{ c_loc } ' "
208331 f"(title_sim={ title_sim :.3f} , loc_sim={ loc_sim :.3f} , desc_sim={ desc_sim :.3f} )"
209332 )
210- return True
211- except Exception as exc :
212- logger .error (f"{ log_prefix } Error during duplicate check: { exc !s} " )
213-
214- return False
333+ return existing_event
334+
335+ return None
215336
216337
217338def append_event_to_csv (
0 commit comments