Commit ff0ec98

add duplicate event detection logging
1 parent 0b5debb commit ff0ec98

File tree

1 file changed: +21 -3 lines changed

backend/scraping/instagram_feed.py

Lines changed: 21 additions & 3 deletions
@@ -117,16 +117,24 @@ def is_duplicate_event(event_data):
             c_desc = getattr(c, "description", "") or ""
             c_dtstart_utc = getattr(c, "dtstart_utc", None)
             if c_dtstart_utc and c_dtstart_utc.date() == date.date():
-                # Use substring match and Jaccard similarity
-                if normalize(c_title) in normalize(title) or normalize(title) in normalize(c_title):
-                    return True
+                norm_title = normalize(title)
+                norm_c_title = normalize(c_title)
+                substring_match = norm_c_title in norm_title or norm_title in norm_c_title
                 title_sim = max(
                     jaccard_similarity(c_title, title),
                     SequenceMatcher(None, c_title.lower(), title.lower()).ratio(),
                 )
                 loc_sim = jaccard_similarity(c_loc, location)
                 desc_sim = jaccard_similarity(c_desc, description)
+                logger.debug(
+                    f"Comparing to candidate: '{c_title}' @ '{c_loc}': "
+                    f"substring_match={substring_match}, title_sim={title_sim:.3f}, loc_sim={loc_sim:.3f}, desc_sim={desc_sim:.3f}"
+                )
+                if substring_match:
+                    logger.debug("-> Duplicate by substring match")
+                    return True
                 if (title_sim > 0.4) or (loc_sim > 0.5 and desc_sim > 0.3):
+                    logger.debug("-> Duplicate by similarity threshold")
                     return True
     except Exception as e:
         logger.error(f"Error during duplicate check: {e!s}")
@@ -258,6 +266,16 @@ def insert_event_to_db(event_data, ig_handle, source_url):
         logger.info(
             f"Duplicate event detected, skipping {title} on {date} at {location}"
         )
+        try:
+            append_event_to_csv(
+                event_data,
+                ig_handle,
+                source_url,
+                added_to_db="duplicate",
+                embedding=event_data.get("embedding"),
+            )
+        except Exception as csv_e:
+            logger.error(f"Error writing duplicate event to CSV: {csv_e}")
         return False
 
     # Get club_type by matching ig_handle from Events to ig of Clubs
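
append_event_to_csv is an existing helper in this module whose definition sits outside the diff; the version below is only a hypothetical sketch inferred from the call site (the column names and CSV path are assumptions) to illustrate how a skipped duplicate can still be recorded with added_to_db="duplicate":

import csv
import json
from pathlib import Path

# Hypothetical location of the audit log; the real module defines its own path.
EVENTS_CSV = Path("scraped_events.csv")

def append_event_to_csv(event_data, ig_handle, source_url, added_to_db="yes", embedding=None):
    # Sketch: append one scraped event to a CSV audit log (column set is assumed).
    row = {
        "title": event_data.get("title", ""),
        "date": event_data.get("date", ""),
        "location": event_data.get("location", ""),
        "ig_handle": ig_handle,
        "source_url": source_url,
        "added_to_db": added_to_db,  # "duplicate" marks events skipped by is_duplicate_event
        "embedding": json.dumps(embedding) if embedding is not None else "",
    }
    write_header = not EVENTS_CSV.exists()
    with EVENTS_CSV.open("a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(row))
        if write_header:
            writer.writeheader()
        writer.writerow(row)

Wrapping the call in its own try/except, as the commit does, keeps a CSV write failure from masking the return False that signals the duplicate to the caller.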
