Skip to content

Commit 8fcf97f

Browse files
committed
improve duplicate event detection
1 parent 10bf576 commit 8fcf97f

File tree

1 file changed

+43
-8
lines changed

1 file changed

+43
-8
lines changed

backend/scraping/instagram_feed.py

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import random
1414
import time
1515
import traceback
16+
import re
1617
from datetime import datetime, timedelta, timezone
1718
from pathlib import Path
1819

@@ -116,6 +117,32 @@ def extract_s3_filename_from_url(image_url: str) -> str:
116117
return f"events/{filename}"
117118

118119

120+
def normalize_string(s):
    """Reduce a value to a lowercase, punctuation-free comparison key.

    Args:
        s: The text to normalize. Falsy values (``None``, ``""``) yield
            ``""``. Non-string values are converted with ``str()`` first.

    Returns:
        The input with every run of non-word characters (whitespace,
        punctuation, symbols) removed and the remainder lowercased.
        Underscores count as word characters and are kept.
    """
    if not s:
        return ""
    # Coerce so a truthy non-string (e.g. a numeric room parsed from JSON)
    # does not make re.sub raise TypeError.
    text = s if isinstance(s, str) else str(s)
    # Whitespace is itself non-word, so no separate strip() is needed.
    return re.sub(r"\W+", "", text).lower()
124+
125+
126+
def is_duplicate_event(event_data):
    """Report whether a matching event already exists for the same date.

    Stored events are fetched with ``Events.objects.filter(date=...)`` and
    each one is compared against ``event_data``:

    * name and location must be equal after ``normalize_string``;
    * start time must be equal when both sides are rendered with ``str()``;
    * end time is compared the same way, but only when ``event_data``
      supplies a truthy ``end_time``; otherwise any stored end time matches.

    Args:
        event_data: Mapping read via ``.get`` for the keys ``"name"``,
            ``"location"``, ``"date"``, ``"start_time"`` and ``"end_time"``.

    Returns:
        True as soon as one stored event satisfies every check, else False.
    """
    wanted_name = normalize_string(event_data.get("name"))
    wanted_location = normalize_string(event_data.get("location"))
    wanted_date = event_data.get("date")
    wanted_start = event_data.get("start_time")
    wanted_end = event_data.get("end_time")

    # NOTE(review): times are compared as strings, so this presumably relies
    # on both sides rendering in the same format -- confirm against the model.
    for stored in Events.objects.filter(date=wanted_date):
        if normalize_string(stored.name) != wanted_name:
            continue
        if normalize_string(stored.location) != wanted_location:
            continue
        if str(stored.start_time) != str(wanted_start):
            continue
        # A missing end time on the incoming event acts as a wildcard.
        if wanted_end and str(stored.end_time) != str(wanted_end):
            continue
        return True
    return False
144+
145+
119146
def append_event_to_csv(
120147
event_data, club_ig, post_url, status="success", embedding=None
121148
):
@@ -169,6 +196,13 @@ def insert_event_to_db(event_data, club_ig, post_url):
169196
event_date = event_data.get("date")
170197
event_location = event_data.get("location") # .title()
171198
try:
199+
# Duplicate check
200+
if is_duplicate_event(event_data):
201+
logger.info(
202+
f"Duplicate event detected, skipping {event_name} on {event_date} at {event_location}"
203+
)
204+
return False
205+
172206
# Get club_type based on club handle
173207
try:
174208
club = Clubs.objects.get(ig=club_ig)
@@ -191,17 +225,18 @@ def insert_event_to_db(event_data, club_ig, post_url):
191225
for existing in Events.objects.filter(
192226
id__in=candidate_ids, date=event_date
193227
):
228+
# Only replace if new event has image but existing doesn't,
229+
# or if new description is more than 10 characters longer (more info)
230+
new_img = event_data.get("image_url")
231+
old_img = existing.image_url
232+
new_desc = event_data.get("description") or ""
233+
old_desc = existing.description or ""
194234
if (
195-
(existing.location or "") == (event_location or "")
196-
and (existing.start_time or "")
197-
== (event_data.get("start_time") or "")
198-
and (
199-
(existing.end_time or None)
200-
== (event_data.get("end_time") or None)
201-
)
235+
(not old_img and new_img)
236+
or (len(new_desc) > len(old_desc) + 10)
202237
):
203238
logger.info(
204-
f"Deleting older duplicate event id={existing.id} before inserting refreshed version"
239+
f"Replacing older event: id={existing.id} with newer one"
205240
)
206241
existing.delete()
207242
except Exception as dedup_err:

0 commit comments

Comments
 (0)