Skip to content

Commit 3c214d1

Browse files
committed
improve duplicate detection and event updates; add guard blocks to prevent scraping when no username is provided
1 parent 5f5ee61 commit 3c214d1

File tree

2 files changed

+171
-35
lines changed

2 files changed

+171
-35
lines changed

backend/scraping/main.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,15 @@
2323
def get_targets():
2424
"""
2525
Determine if we are in 'Single User' mode or 'Batch' mode.
26+
Returns:
27+
tuple: (mode: str, targets: list)
28+
- mode: "single" if TARGET_USERNAME env var is set, "batch" otherwise
29+
- targets: list of usernames to scrape
2630
"""
27-
username = os.getenv("TARGET_USERNAME")
28-
if username:
29-
return "single", [username]
31+
if "TARGET_USERNAME" in os.environ:
32+
username = os.getenv("TARGET_USERNAME")
33+
# Return username even if empty
34+
return "single", [username] if username else []
3035

3136
batch_users = [
3237
url.split("instagram.com/")[1].split("/")[0]
@@ -45,6 +50,16 @@ def filter_valid_posts(posts):
4550
def main():
4651
mode, targets = get_targets()
4752
logger.info(f"--- Workflow Started: {mode.upper()} ---")
53+
54+
# Validate targets before proceeding
55+
if not targets or not any(t and t.strip() for t in targets):
56+
if mode == "single":
57+
logger.error("Repository dispatch triggered but no valid username provided, exiting.")
58+
sys.exit(1)
59+
else:
60+
logger.warning("No valid targets found in batch mode, exiting.")
61+
sys.exit(0)
62+
4863
scraper = InstagramScraper()
4964
processor = EventProcessor(concurrency=5)
5065

backend/utils/scraping_utils.py

Lines changed: 153 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,50 @@ def insert_event_to_db(event_data, ig_handle, source_url, club_type=None):
7575
logger.warning(f"{log_prefix} Event '{title}' missing categories, assigning 'Uncategorized'")
7676
categories = ["Uncategorized"]
7777

78-
if is_duplicate_event(event_data, ig_handle=ig_handle, source_url=source_url):
79-
return "duplicate"
78+
detector = EventDuplicateDetector()
79+
has_match, matched_event = detector.find_match(event_data, ig_handle=ig_handle, source_url=source_url)
80+
81+
if has_match:
82+
# If event is from same club, update event info
83+
if matched_event and matched_event.ig_handle == ig_handle:
84+
logger.info(f"{log_prefix} Updating existing event '{matched_event.title}' (ID: {matched_event.id}) with new information")
85+
86+
matched_event.title = title
87+
matched_event.location = location
88+
matched_event.description = description or None
89+
matched_event.price = price or None
90+
matched_event.food = food or None
91+
matched_event.registration = registration
92+
matched_event.source_image_url = source_image_url or None
93+
matched_event.categories = categories
94+
matched_event.source_url = source_url
95+
matched_event.save()
96+
97+
# Delete old event dates and create new ones
98+
EventDates.objects.filter(event=matched_event).delete()
99+
event_dates = []
100+
for occ in occurrences:
101+
dtstart_utc = parse_utc_datetime(occ.get("dtstart_utc"))
102+
dtend_utc_raw = occ.get("dtend_utc")
103+
dtend_utc = (
104+
parse_utc_datetime(dtend_utc_raw)
105+
if dtend_utc_raw and dtend_utc_raw.strip()
106+
else None
107+
)
108+
event_dates.append(
109+
EventDates(
110+
event=matched_event,
111+
dtstart_utc=dtstart_utc,
112+
dtend_utc=dtend_utc,
113+
duration=occ.get("duration") or None,
114+
tz=occ.get("tz") or None,
115+
)
116+
)
117+
EventDates.objects.bulk_create(event_dates)
118+
logger.info(f"{log_prefix} Updated event with {len(event_dates)} new date(s)")
119+
return "updated"
120+
else:
121+
return "duplicate"
80122

81123
# Only fetch if club_type wasn't passed in
82124
if club_type is None:
@@ -137,33 +179,102 @@ def insert_event_to_db(event_data, ig_handle, source_url, club_type=None):
137179
return False
138180

139181

140-
def is_duplicate_event(event_data, ig_handle=None, source_url=None):
141-
"""Check for duplicate events using title, occurrences, location, and description."""
142-
143-
title = event_data.get("title") or ""
144-
location = event_data.get("location") or ""
145-
description = event_data.get("description") or ""
146-
occurrences = event_data.get("occurrences")
147-
148-
log_prefix = f"[{ig_handle}] [{source_url.split('/')[-1] if source_url else 'UNKNOWN'}]"
182+
class EventDuplicateDetector:
183+
"""Handles duplicate event detection"""
184+
185+
def __init__(self):
186+
self.SAME_CLUB_TITLE_THRESHOLD = 0.8
187+
self.TITLE_SIMILARITY_THRESHOLD = 0.7
188+
self.LOCATION_SIMILARITY_THRESHOLD = 0.5
189+
self.DESCRIPTION_SIMILARITY_THRESHOLD = 0.3
190+
191+
def find_match(self, event_data, ig_handle=None, source_url=None):
192+
"""
193+
Check for duplicate events using title, occurrences, location, and description.
194+
Returns:
195+
tuple: (has_match: bool, matched_event: Event | None)
196+
- has_match: True if a matching event was found (could be duplicate or update)
197+
- matched_event: The matching event object if found, None otherwise
198+
"""
199+
title = event_data.get("title") or ""
200+
location = event_data.get("location") or ""
201+
description = event_data.get("description") or ""
202+
occurrences = event_data.get("occurrences")
149203

150-
if not occurrences:
151-
return False
204+
log_prefix = f"[{ig_handle}] [{source_url.split('/')[-1] if source_url else 'UNKNOWN'}]"
152205

153-
target_start_str = occurrences[0].get("dtstart_utc")
154-
target_start = parse_utc_datetime(target_start_str)
155-
if not target_start:
156-
return False
206+
if not occurrences:
207+
return False, None
157208

158-
from datetime import timedelta
159-
day_start = target_start.replace(hour=0, minute=0, second=0, microsecond=0)
160-
day_end = day_start + timedelta(days=1)
209+
target_start_str = occurrences[0].get("dtstart_utc")
210+
target_start = parse_utc_datetime(target_start_str)
211+
if not target_start:
212+
return False, None
161213

162-
try:
214+
try:
215+
# Strategy 1: Check for same-club updates (regardless of date)
216+
matched_event = self._check_same_club_update(
217+
ig_handle, title, log_prefix
218+
)
219+
if matched_event:
220+
return True, matched_event
221+
222+
# Strategy 2: Check for same-day duplicates
223+
matched_event = self._check_same_day_duplicate(
224+
target_start, title, location, description, ig_handle, log_prefix
225+
)
226+
if matched_event:
227+
return True, matched_event
228+
229+
except Exception as exc:
230+
logger.error(f"{log_prefix} Error during duplicate check: {exc!s}")
231+
232+
return False, None
233+
234+
def _check_same_club_update(self, ig_handle, title, log_prefix):
235+
"""
236+
Check if the same club has posted a similar event before (regardless of date).
237+
This catches updates where the club reposts with a new date/location.
238+
Returns:
239+
Event | None: The matched event if found, None otherwise
240+
"""
241+
if not ig_handle:
242+
return None
243+
244+
same_club_events = Events.objects.filter(ig_handle=ig_handle)
245+
for existing_event in same_club_events:
246+
c_title = getattr(existing_event, "title", "") or ""
247+
248+
title_sim = max(
249+
jaccard_similarity(c_title, title),
250+
sequence_similarity(c_title, title),
251+
)
252+
253+
if title_sim > self.SAME_CLUB_TITLE_THRESHOLD:
254+
logger.warning(
255+
f"{log_prefix} Potential update detected: '{title}' matches existing event '{c_title}' "
256+
f"from same club (title_sim={title_sim:.3f}). This will be updated."
257+
)
258+
return existing_event
259+
260+
return None
261+
262+
def _check_same_day_duplicate(self, target_start, title, location, description, ig_handle, log_prefix):
263+
"""
264+
Check for duplicate events on the same day using title, location, and description similarity.
265+
Returns:
266+
Event | None: The matched event if found, None otherwise
267+
"""
268+
from datetime import timedelta
269+
270+
day_start = target_start.replace(hour=0, minute=0, second=0, microsecond=0)
271+
day_end = day_start + timedelta(days=1)
272+
163273
candidates = EventDates.objects.select_related("event").filter(
164274
dtstart_utc__gte=day_start,
165275
dtstart_utc__lt=day_end
166276
)
277+
167278
if candidates:
168279
logger.debug(f"{log_prefix} Found {len(candidates)} existing events on {day_start.date()} for duplicate check.")
169280
for i, cand in enumerate(candidates[:3]):
@@ -183,35 +294,45 @@ def is_duplicate_event(event_data, ig_handle=None, source_url=None):
183294
if not c_start:
184295
continue
185296

297+
# Check substring match
186298
norm_title = normalize(title)
187299
norm_c_title = normalize(c_title)
188300
substring_match = norm_c_title in norm_title or norm_title in norm_c_title
189301

302+
# Calculate similarities
190303
title_sim = max(
191304
jaccard_similarity(c_title, title),
192305
sequence_similarity(c_title, title),
193306
)
194307
loc_sim = jaccard_similarity(c_loc, location)
195308
desc_sim = jaccard_similarity(c_desc, description)
196309

310+
# Check if substring match + similar location
197311
if substring_match:
198-
logger.warning(
199-
f"{log_prefix} Duplicate by substring match: '{title}' @ '{location}' matches '{c_title}' @ '{c_loc}'"
200-
)
201-
return True
312+
similar_location = loc_sim > self.LOCATION_SIMILARITY_THRESHOLD
313+
if similar_location:
314+
logger.warning(
315+
f"{log_prefix} Duplicate by substring match + location: '{title}' @ '{location}' matches '{c_title}' @ '{c_loc}' "
316+
f"(ig_handle: {ig_handle} vs {existing_event.ig_handle}, loc_sim={loc_sim:.3f})"
317+
)
318+
return existing_event
319+
else:
320+
logger.debug(
321+
f"{log_prefix} Substring match but different location: '{title}' ({ig_handle}) @ '{location}' vs '{c_title}' ({existing_event.ig_handle}) @ '{c_loc}' (loc_sim={loc_sim:.3f})"
322+
)
323+
continue
202324

203-
if (title_sim > 0.7 and loc_sim > 0.5) or (
204-
loc_sim > 0.5 and desc_sim > 0.3
325+
# Check similarity thresholds
326+
if (title_sim > self.TITLE_SIMILARITY_THRESHOLD and loc_sim > self.LOCATION_SIMILARITY_THRESHOLD) or (
327+
loc_sim > self.LOCATION_SIMILARITY_THRESHOLD and desc_sim > self.DESCRIPTION_SIMILARITY_THRESHOLD
205328
):
206329
logger.warning(
207330
f"{log_prefix} Duplicate by similarity: '{title}' @ '{location}' matches '{c_title}' @ '{c_loc}' "
208331
f"(title_sim={title_sim:.3f}, loc_sim={loc_sim:.3f}, desc_sim={desc_sim:.3f})"
209332
)
210-
return True
211-
except Exception as exc:
212-
logger.error(f"{log_prefix} Error during duplicate check: {exc!s}")
213-
214-
return False
333+
return existing_event
334+
335+
return None
215336

216337

217338
def append_event_to_csv(

0 commit comments

Comments (0)