
Commit 2095fe9

move scraping utilities to separate file
1 parent b26da96 commit 2095fe9

File tree: 4 files changed, +64 -63 lines changed


.github/workflows/update-events-data.yml

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ jobs:
       EMAIL_ENCRYPTION_KEY: ${{ secrets.EMAIL_ENCRYPTION_KEY }}
       EMAIL_HASH_KEY: ${{ secrets.EMAIL_HASH_KEY }}
       SECRET_KEY: ${{ secrets.SECRET_KEY }}
+      CLERK_SECRET_KEY: ${{ secrets.CLERK_SECRET_KEY }}
 
     steps:
       - uses: actions/checkout@v4
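Note: the workflow change only exports CLERK_SECRET_KEY into the scraper's environment; the code that consumes it is not part of this diff. A minimal sketch, assuming the backend reads it the same way it reads the other environment variables in this commit (os.getenv); the variable name comes from the workflow step above, everything else is illustrative:

import os

# Hypothetical consumer, mirroring how SUPABASE_DB_URL and MAX_POSTS are read in the scraper.
CLERK_SECRET_KEY = os.getenv("CLERK_SECRET_KEY")
if not CLERK_SECRET_KEY:
    raise RuntimeError("CLERK_SECRET_KEY is not set in the environment")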

backend/scraping/instagram_feed.py

Lines changed: 23 additions & 56 deletions
@@ -11,12 +11,10 @@
 import csv
 import json
 import random
-import re
 import time
 import traceback
 from datetime import datetime, timedelta, timezone as pytimezone
 from pathlib import Path
-from difflib import SequenceMatcher
 
 import requests
 from requests.exceptions import ReadTimeout, ConnectionError
@@ -36,6 +34,13 @@
 from shared.constants.user_agents import USER_AGENTS
 from utils.embedding_utils import find_similar_events
 from utils.events_utils import clean_datetime, clean_duration
+from utils.scraping_utils import (
+    normalize,
+    jaccard_similarity,
+    sequence_similarity,
+    get_post_image_url,
+)
 
 MAX_POSTS = int(os.getenv("MAX_POSTS", "25"))
 MAX_CONSEC_OLD_POSTS = 10
@@ -55,48 +60,6 @@
 SUPABASE_DB_URL = os.getenv("SUPABASE_DB_URL")
 
 
-def get_post_image_url(post):
-    try:
-        if post._node.get("image_versions2"):
-            return post._node["image_versions2"]["candidates"][0]["url"]
-
-        if post._node.get("carousel_media"):
-            return post._node["carousel_media"][0]["image_versions2"]["candidates"][0][
-                "url"
-            ]
-
-        if post._node.get("display_url"):
-            return post._node["display_url"]
-        return None
-    except (KeyError, AttributeError) as e:
-        logger.error(
-            f"Error accessing image URL for post {getattr(post, 'shortcode', 'unknown')}: {e!s}"
-        )
-        return None
-
-
-def extract_s3_filename_from_url(image_url: str) -> str:
-    if not image_url:
-        return None
-    filename = image_url.split("/")[-1]
-    return f"events/{filename}"
-
-
-def normalize(s):
-    return re.sub(r"[^a-z0-9]", "", s.lower())
-
-
-def jaccard_similarity(a, b):
-    """Compute Jaccard similarity between two strings (case-insensitive, word-based)."""
-    set_a = set(re.findall(r"\w+", a.lower()))
-    set_b = set(re.findall(r"\w+", b.lower()))
-    if not set_a or not set_b:
-        return 0.0
-    intersection = set_a & set_b
-    union = set_a | set_b
-    return len(intersection) / len(union)
-
-
 def is_duplicate_event(event_data):
     """Check for duplicate events using title, datetime, location, and description."""
     title = event_data.get("title") or ""
@@ -109,9 +72,7 @@ def is_duplicate_event(event_data):
     try:
         date = datetime.fromisoformat(dtstart_utc)
         candidates = Events.objects.filter(dtstart_utc__date=date.date())
-        print(f"Checking for duplicates on {date.date()} - found {candidates.count()} candidates")
         for c in candidates:
-            print("Candidate:", c.title, c.location, c.dtstart_utc)
             c_title = getattr(c, "title", "") or ""
             c_loc = getattr(c, "location", "") or ""
             c_desc = getattr(c, "description", "") or ""
@@ -122,19 +83,20 @@ def is_duplicate_event(event_data):
             substring_match = norm_c_title in norm_title or norm_title in norm_c_title
             title_sim = max(
                 jaccard_similarity(c_title, title),
-                SequenceMatcher(None, c_title.lower(), title.lower()).ratio(),
+                sequence_similarity(c_title, title),
             )
             loc_sim = jaccard_similarity(c_loc, location)
             desc_sim = jaccard_similarity(c_desc, description)
-            logger.debug(
-                f"Comparing to candidate: '{c_title}' @ '{c_loc}'",
-                f"substring_match={substring_match}, title_sim={title_sim:.3f}, loc_sim={loc_sim:.3f}, desc_sim={desc_sim:.3f}"
-            )
             if substring_match:
-                logger.debug("-> Duplicate by substring match")
+                logger.warning(
+                    f"Duplicate by substring match: '{title}' @ '{location}' matches '{c_title}' @ '{c_loc}'"
+                )
                 return True
             if (title_sim > 0.4) or (loc_sim > 0.5 and desc_sim > 0.3):
-                logger.debug("-> Duplicate by similarity threshold")
+                logger.warning(
+                    f"Duplicate by similarity: '{title}' @ '{location}' matches '{c_title}' @ '{c_loc}' "
+                    f"(title_sim={title_sim:.3f}, loc_sim={loc_sim:.3f}, desc_sim={desc_sim:.3f})"
+                )
                 return True
     except Exception as e:
         logger.error(f"Error during duplicate check: {e!s}")
@@ -263,7 +225,7 @@ def insert_event_to_db(event_data, ig_handle, source_url):
     school = event_data.get("school", "")
 
     if is_duplicate_event(event_data):
-        logger.info(
+        logger.warning(
             f"Duplicate event detected, skipping {title} on {date} at {location}"
         )
         try:
@@ -276,7 +238,7 @@ def insert_event_to_db(event_data, ig_handle, source_url):
             )
         except Exception as csv_e:
             logger.error(f"Error writing duplicate event to CSV: {csv_e}")
-        return False
+        return "duplicate"
 
     # Get club_type by matching ig_handle from Events to ig of Clubs
     try:
@@ -531,11 +493,16 @@ def check_post_limit():
                     break
                 continue
 
-            if insert_event_to_db(event_data, post.owner_username, source_url):
+            result = insert_event_to_db(event_data, post.owner_username, source_url)
+            if result is True:
                 events_added += 1
                 logger.info(
                     f"[{post.shortcode}] [{post.owner_username}] Successfully added event '{event_data.get('title')}'"
                 )
+            elif result == "duplicate":
+                logger.warning(
+                    f"[{post.shortcode}] [{post.owner_username}] Duplicate event, not added: '{event_data.get('title')}'"
+                )
             else:
                 logger.error(
                     f"[{post.shortcode}] [{post.owner_username}] Failed to add event '{event_data.get('title')}'"

backend/utils/scraping_utils.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import re
+from difflib import SequenceMatcher
+from scraping.logging_config import logger
+
+
+def normalize(s):
+    """Normalize a string for comparison (lowercase, alphanumeric only)."""
+    return re.sub(r"[^a-z0-9]", "", s.lower())
+
+
+def jaccard_similarity(a, b):
+    """Compute Jaccard similarity between two strings (case-insensitive, word-based)."""
+    set_a = set(re.findall(r"\w+", a.lower()))
+    set_b = set(re.findall(r"\w+", b.lower()))
+    if not set_a or not set_b:
+        return 0.0
+    intersection = set_a & set_b
+    union = set_a | set_b
+    return len(intersection) / len(union)
+
+
+def sequence_similarity(a, b):
+    """Compute SequenceMatcher similarity between two strings (case-insensitive)."""
+    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
+
+
+def get_post_image_url(post):
+    try:
+        if post._node.get("image_versions2"):
+            return post._node["image_versions2"]["candidates"][0]["url"]
+        if post._node.get("carousel_media"):
+            return post._node["carousel_media"][0]["image_versions2"]["candidates"][0]["url"]
+        if post._node.get("display_url"):
+            return post._node["display_url"]
+        return None
+    except (KeyError, AttributeError) as e:
+        logger.warning(f"Failed to extract image URL from post: {e}")
+        return None
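A quick usage sketch for the extracted image helper: get_post_image_url only reads the post's private `_node` dict, trying image_versions2, then carousel_media, then display_url. The post object and URL below are hypothetical stand-ins:

from types import SimpleNamespace

from utils.scraping_utils import get_post_image_url

# Minimal stand-in for a scraped post; only the `_node` dict is accessed.
fake_post = SimpleNamespace(_node={"display_url": "https://example.com/p/abc.jpg"})

# image_versions2 and carousel_media are absent, so the display_url fallback is returned.
print(get_post_image_url(fake_post))   # https://example.com/p/abc.jpg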

frontend/src/shared/lib/eventUtils.ts

Lines changed: 2 additions & 7 deletions
@@ -25,13 +25,8 @@ export const isEventNew = (event: Event): boolean => {
   const now = new Date();
   const addedAt = new Date(event.added_at);
 
-  // Match the logic from handleToggleNewEvents
-  const todayAt7am = new Date();
-  todayAt7am.setHours(7, 0, 0, 0);
-
-  const cutoffDate = now >= todayAt7am ? todayAt7am : new Date(todayAt7am.getTime() - 24 * 60 * 60 * 1000);
-
-  return addedAt >= cutoffDate;
+  // New events are those added in past 24 hours
+  return (now.getTime() - addedAt.getTime()) <= 24 * 60 * 60 * 1000;
 };
 
 /**
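The behavioral difference is easiest to see with a concrete time (sketched in Python for consistency with the rest of this commit; the timestamps are made up):

from datetime import datetime, timedelta

now = datetime(2024, 9, 10, 8, 0)        # hypothetical "current" time, 08:00
added_at = datetime(2024, 9, 10, 6, 0)   # event added at 06:00 the same morning

# Old rule: once the clock passes 07:00, the cutoff is today at 07:00,
# so this two-hour-old event is NOT considered new.
old_cutoff = now.replace(hour=7, minute=0, second=0, microsecond=0)
print(added_at >= old_cutoff)                 # False

# New rule: anything added within the last 24 hours is new.
print(now - added_at <= timedelta(hours=24))  # True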
