@@ -88,42 +88,55 @@ def normalize_string(s):
8888 return re .sub (r"\W+" , "" , s ).lower ().strip ()
8989
9090
def jaccard_similarity(a: str, b: str) -> float:
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and reduced to the set of their word tokens
    (maximal runs of word characters), so the comparison ignores case,
    punctuation, word order, and repeated words. The score is the number of
    shared tokens divided by the number of distinct tokens across both
    strings.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        A float between 0.0 and 1.0 inclusive. 0.0 is returned when either
        string contains no word tokens, including when both are empty, so two
        blank values never count as similar.
    """
    words_a = set(re.findall(r"\w+", a.lower()))
    words_b = set(re.findall(r"\w+", b.lower()))
    # With an empty side there is nothing to compare; returning early also
    # avoids a division by zero when both token sets are empty.
    if not words_a or not words_b:
        return 0.0
    return len(words_a & words_b) / len(words_a | words_b)
100+
101+
91102def is_duplicate_event (event_data ):
92103 """Check for duplicate events using ig_handle, title, datetime, location, and description."""
93- ig_handle = normalize_string ( event_data .get ("ig_handle" ) or "" )
94- title = normalize_string ( event_data .get ("title" ) or "" )
95- location = normalize_string ( event_data .get ("location" ) or "" )
96- description = normalize_string ( event_data .get ("description" ) or "" )
97- dtstart = event_data .get ("dtstart " )
98- if not dtstart :
104+ ig_handle = event_data .get ("ig_handle" ) or ""
105+ title = event_data .get ("title" ) or ""
106+ location = event_data .get ("location" ) or ""
107+ description = event_data .get ("description" ) or ""
108+ dtstart_utc = event_data .get ("dtstart_utc " )
109+ if not dtstart_utc :
99110 return False
100111
101112 try :
102- date = datetime .fromisoformat (dtstart )
103- # Candidates: same ig_handle and same day
113+ date = datetime .fromisoformat (dtstart_utc )
104114 candidates = Events .objects .filter (
105- dtstart__date = date .date (),
115+ dtstart_utc__date = date .date (),
106116 ig_handle__isnull = False ,
107117 )
108118 for c in candidates :
109- c_ig_handle = normalize_string ( getattr (c , "ig_handle" , "" ) or "" )
110- c_title = normalize_string ( getattr (c , "title" , "" ) or "" )
111- c_loc = normalize_string ( getattr (c , "location" , "" ) or "" )
112- c_desc = normalize_string ( getattr (c , "description" , "" ) or "" )
113- c_dtstart = getattr (c , "dtstart " , None )
114- # Require same ig_handle and dtstart within 2 hours
119+ c_ig_handle = getattr (c , "ig_handle" , "" ) or ""
120+ c_title = getattr (c , "title" , "" ) or ""
121+ c_loc = getattr (c , "location" , "" ) or ""
122+ c_desc = getattr (c , "description" , "" ) or ""
123+ c_dtstart_utc = getattr (c , "dtstart_utc " , None )
124+ # Require same ig_handle and dtstart_utc within 2 hours
115125 if (
116126 ig_handle
117127 and c_ig_handle
118128 and ig_handle == c_ig_handle
119- and c_dtstart
120- and abs (( c_dtstart - date ). total_seconds ()) <= 2 * 3600
129+ and c_dtstart_utc
130+ and c_dtstart_utc . date () == date . date ()
121131 ):
122- # Fuzzy match title, location, and description
123- title_sim = SequenceMatcher (None , c_title , title ).ratio ()
124- loc_sim = SequenceMatcher (None , c_loc , location ).ratio ()
125- desc_sim = SequenceMatcher (None , c_desc , description ).ratio ()
126- if title_sim > 0.55 and loc_sim > 0.55 and desc_sim > 0.55 :
132+ # Use Jaccard similarity for context-based matching
133+ title_sim = max (
134+ jaccard_similarity (c_title , title ),
135+ SequenceMatcher (None , c_title .lower (), title .lower ()).ratio (),
136+ )
137+ loc_sim = jaccard_similarity (c_loc , location )
138+ desc_sim = jaccard_similarity (c_desc , description )
139+ if (title_sim > 0.4 ) or (loc_sim > 0.5 and desc_sim > 0.3 ):
127140 return True
128141 except Exception as e :
129142 logger .error (f"Error during duplicate check: { e !s} " )
0 commit comments