@@ -117,16 +117,24 @@ def is_duplicate_event(event_data):
             c_desc = getattr(c, "description", "") or ""
             c_dtstart_utc = getattr(c, "dtstart_utc", None)
             if c_dtstart_utc and c_dtstart_utc.date() == date.date():
-                # Use substring match and Jaccard similarity
-                if normalize(c_title) in normalize(title) or normalize(title) in normalize(c_title):
-                    return True
+                norm_title = normalize(title)
+                norm_c_title = normalize(c_title)
+                substring_match = norm_c_title in norm_title or norm_title in norm_c_title
                 title_sim = max(
                     jaccard_similarity(c_title, title),
                     SequenceMatcher(None, c_title.lower(), title.lower()).ratio(),
                 )
                 loc_sim = jaccard_similarity(c_loc, location)
                 desc_sim = jaccard_similarity(c_desc, description)
+                logger.debug(
+                    f"Comparing to candidate: '{c_title}' @ '{c_loc}': "
+                    f"substring_match={substring_match}, title_sim={title_sim:.3f}, loc_sim={loc_sim:.3f}, desc_sim={desc_sim:.3f}"
+                )
+                if substring_match:
+                    logger.debug("-> Duplicate by substring match")
+                    return True
                 if (title_sim > 0.4) or (loc_sim > 0.5 and desc_sim > 0.3):
+                    logger.debug("-> Duplicate by similarity threshold")
                     return True
     except Exception as e:
         logger.error(f"Error during duplicate check: {e!s}")
@@ -258,6 +266,16 @@ def insert_event_to_db(event_data, ig_handle, source_url):
         logger.info(
             f"Duplicate event detected, skipping {title} on {date} at {location}"
         )
+        try:
+            append_event_to_csv(
+                event_data,
+                ig_handle,
+                source_url,
+                added_to_db="duplicate",
+                embedding=event_data.get("embedding"),
+            )
+        except Exception as csv_e:
+            logger.error(f"Error writing duplicate event to CSV: {csv_e}")
         return False

     # Get club_type by matching ig_handle from Events to ig of Clubs