Skip to content

Commit 3de26b2

Browse files
committed
enhance duplicate event check with Jaccard similarity for improved context-based accuracy
1 parent 2d0f7d2 commit 3de26b2

File tree

1 file changed

+35
-22
lines changed

1 file changed

+35
-22
lines changed

backend/scraping/instagram_feed.py

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -88,42 +88,55 @@ def normalize_string(s):
8888
return re.sub(r"\W+", "", s).lower().strip()
8989

9090

91+
def jaccard_similarity(a, b):
    """Compute the word-level Jaccard similarity between two strings.

    Each input is lowercased and split into runs of word characters; the
    score is the size of the intersection of the two token sets divided by
    the size of their union.

    Args:
        a: First string. ``None`` is treated as an empty string.
        b: Second string. ``None`` is treated as an empty string.

    Returns:
        A float in ``[0.0, 1.0]``. Returns ``0.0`` when either input yields
        no word tokens, so two empty strings never count as similar.
    """
    # Guard against None so a missing field cannot raise AttributeError.
    tokens_a = set(re.findall(r"\w+", (a or "").lower()))
    tokens_b = set(re.findall(r"\w+", (b or "").lower()))
    if not tokens_a or not tokens_b:
        return 0.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
100+
101+
91102
def is_duplicate_event(event_data):
92103
"""Check for duplicate events using ig_handle, title, datetime, location, and description."""
93-
ig_handle = normalize_string(event_data.get("ig_handle") or "")
94-
title = normalize_string(event_data.get("title") or "")
95-
location = normalize_string(event_data.get("location") or "")
96-
description = normalize_string(event_data.get("description") or "")
97-
dtstart = event_data.get("dtstart")
98-
if not dtstart:
104+
ig_handle = event_data.get("ig_handle") or ""
105+
title = event_data.get("title") or ""
106+
location = event_data.get("location") or ""
107+
description = event_data.get("description") or ""
108+
dtstart_utc = event_data.get("dtstart_utc")
109+
if not dtstart_utc:
99110
return False
100111

101112
try:
102-
date = datetime.fromisoformat(dtstart)
103-
# Candidates: same ig_handle and same day
113+
date = datetime.fromisoformat(dtstart_utc)
104114
candidates = Events.objects.filter(
105-
dtstart__date=date.date(),
115+
dtstart_utc__date=date.date(),
106116
ig_handle__isnull=False,
107117
)
108118
for c in candidates:
109-
c_ig_handle = normalize_string(getattr(c, "ig_handle", "") or "")
110-
c_title = normalize_string(getattr(c, "title", "") or "")
111-
c_loc = normalize_string(getattr(c, "location", "") or "")
112-
c_desc = normalize_string(getattr(c, "description", "") or "")
113-
c_dtstart = getattr(c, "dtstart", None)
114-
# Require same ig_handle and dtstart within 2 hours
119+
c_ig_handle = getattr(c, "ig_handle", "") or ""
120+
c_title = getattr(c, "title", "") or ""
121+
c_loc = getattr(c, "location", "") or ""
122+
c_desc = getattr(c, "description", "") or ""
123+
c_dtstart_utc = getattr(c, "dtstart_utc", None)
124+
# Require same ig_handle and dtstart_utc on the same calendar day
115125
if (
116126
ig_handle
117127
and c_ig_handle
118128
and ig_handle == c_ig_handle
119-
and c_dtstart
120-
and abs((c_dtstart - date).total_seconds()) <= 2 * 3600
129+
and c_dtstart_utc
130+
and c_dtstart_utc.date() == date.date()
121131
):
122-
# Fuzzy match title, location, and description
123-
title_sim = SequenceMatcher(None, c_title, title).ratio()
124-
loc_sim = SequenceMatcher(None, c_loc, location).ratio()
125-
desc_sim = SequenceMatcher(None, c_desc, description).ratio()
126-
if title_sim > 0.55 and loc_sim > 0.55 and desc_sim > 0.55:
132+
# Use Jaccard similarity for context-based matching
133+
title_sim = max(
134+
jaccard_similarity(c_title, title),
135+
SequenceMatcher(None, c_title.lower(), title.lower()).ratio(),
136+
)
137+
loc_sim = jaccard_similarity(c_loc, location)
138+
desc_sim = jaccard_similarity(c_desc, description)
139+
if (title_sim > 0.4) or (loc_sim > 0.5 and desc_sim > 0.3):
127140
return True
128141
except Exception as e:
129142
logger.error(f"Error during duplicate check: {e!s}")

0 commit comments

Comments
 (0)