1111import csv
1212import json
1313import random
14- import re
1514import time
1615import traceback
1716from datetime import datetime , timedelta , timezone as pytimezone
1817from pathlib import Path
19- from difflib import SequenceMatcher
2018
2119import requests
2220from requests .exceptions import ReadTimeout , ConnectionError
3634from shared .constants .user_agents import USER_AGENTS
3735from utils .embedding_utils import find_similar_events
3836from utils .events_utils import clean_datetime , clean_duration
37+ from utils .scraping_utils import (
38+ normalize ,
39+ jaccard_similarity ,
40+ sequence_similarity ,
41+ get_post_image_url ,
42+ )
43+
3944
4045MAX_POSTS = int (os .getenv ("MAX_POSTS" , "25" ))
4146MAX_CONSEC_OLD_POSTS = 10
5560SUPABASE_DB_URL = os .getenv ("SUPABASE_DB_URL" )
5661
5762
def get_post_image_url(post):
    """Return the best-available image URL for an Instagram post node.

    Checks, in order: the post's own image candidates, the first slide of a
    carousel, then the plain display URL.

    Args:
        post: an instaloader-style post whose ``_node`` dict carries the
            media payload (assumed shape — confirm against the scraper lib).

    Returns:
        The image URL string, or None when no image is present or the node
        layout is unexpected.
    """
    try:
        node = post._node
        # Single-image posts: candidates[0] is the primary rendition.
        if node.get("image_versions2"):
            return node["image_versions2"]["candidates"][0]["url"]
        # Carousel posts: use the first slide's primary rendition.
        if node.get("carousel_media"):
            return node["carousel_media"][0]["image_versions2"]["candidates"][0]["url"]
        if node.get("display_url"):
            return node["display_url"]
        return None
    except (KeyError, IndexError, AttributeError) as e:
        # IndexError added: an empty "candidates" list previously escaped
        # this handler and crashed the caller.
        logger.error(
            f"Error accessing image URL for post {getattr(post, 'shortcode', 'unknown')}: {e!s}"
        )
        return None
77-
def extract_s3_filename_from_url(image_url: str) -> "str | None":
    """Map a remote image URL to its S3 object key under the ``events/`` prefix.

    Args:
        image_url: full URL of the image; may be empty or None.

    Returns:
        ``"events/<basename>"`` where basename is everything after the last
        slash, or None when *image_url* is falsy. (Fixes the original
        ``-> str`` annotation, which was wrong for the None branch.)

    NOTE(review): a query string (``...jpg?token=x``) is kept in the basename.
    That matches the original behavior and may be what stored keys look like —
    confirm against the upload side before "fixing" it.
    """
    if not image_url:
        return None
    filename = image_url.rsplit("/", 1)[-1]
    return f"events/{filename}"
84-
def normalize(s):
    """Reduce *s* to a lowercase alphanumeric-only key for fuzzy matching.

    Every character outside [a-z0-9] (after lowercasing) is dropped.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    return "".join(ch for ch in s.lower() if ch in allowed)
88-
def jaccard_similarity(a, b):
    """Jaccard similarity of the word sets of two strings.

    Case-insensitive; tokens are maximal ``\\w+`` runs. Returns 0.0 when
    either string yields no tokens.
    """
    words_a = set(re.findall(r"\w+", a.lower()))
    words_b = set(re.findall(r"\w+", b.lower()))
    if not (words_a and words_b):
        return 0.0
    shared = words_a.intersection(words_b)
    combined = words_a.union(words_b)
    return len(shared) / len(combined)
99-
10063def is_duplicate_event (event_data ):
10164 """Check for duplicate events using title, datetime, location, and description."""
10265 title = event_data .get ("title" ) or ""
@@ -109,9 +72,7 @@ def is_duplicate_event(event_data):
10972 try :
11073 date = datetime .fromisoformat (dtstart_utc )
11174 candidates = Events .objects .filter (dtstart_utc__date = date .date ())
112- print (f"Checking for duplicates on { date .date ()} - found { candidates .count ()} candidates" )
11375 for c in candidates :
114- print ("Candidate:" , c .title , c .location , c .dtstart_utc )
11576 c_title = getattr (c , "title" , "" ) or ""
11677 c_loc = getattr (c , "location" , "" ) or ""
11778 c_desc = getattr (c , "description" , "" ) or ""
@@ -122,19 +83,20 @@ def is_duplicate_event(event_data):
12283 substring_match = norm_c_title in norm_title or norm_title in norm_c_title
12384 title_sim = max (
12485 jaccard_similarity (c_title , title ),
125- SequenceMatcher ( None , c_title . lower () , title . lower ()). ratio ( ),
86+ sequence_similarity ( c_title , title ),
12687 )
12788 loc_sim = jaccard_similarity (c_loc , location )
12889 desc_sim = jaccard_similarity (c_desc , description )
129- logger .debug (
130- f"Comparing to candidate: '{ c_title } ' @ '{ c_loc } '" ,
131- f"substring_match={ substring_match } , title_sim={ title_sim :.3f} , loc_sim={ loc_sim :.3f} , desc_sim={ desc_sim :.3f} "
132- )
13390 if substring_match :
134- logger .debug ("-> Duplicate by substring match" )
91+ logger .warning (
92+ f"Duplicate by substring match: '{ title } ' @ '{ location } ' matches '{ c_title } ' @ '{ c_loc } '"
93+ )
13594 return True
13695 if (title_sim > 0.4 ) or (loc_sim > 0.5 and desc_sim > 0.3 ):
137- logger .debug ("-> Duplicate by similarity threshold" )
96+ logger .warning (
97+ f"Duplicate by similarity: '{ title } ' @ '{ location } ' matches '{ c_title } ' @ '{ c_loc } ' "
98+ f"(title_sim={ title_sim :.3f} , loc_sim={ loc_sim :.3f} , desc_sim={ desc_sim :.3f} )"
99+ )
138100 return True
139101 except Exception as e :
140102 logger .error (f"Error during duplicate check: { e !s} " )
@@ -263,7 +225,7 @@ def insert_event_to_db(event_data, ig_handle, source_url):
263225 school = event_data .get ("school" , "" )
264226
265227 if is_duplicate_event (event_data ):
266- logger .info (
228+ logger .warning (
267229 f"Duplicate event detected, skipping { title } on { date } at { location } "
268230 )
269231 try :
@@ -276,7 +238,7 @@ def insert_event_to_db(event_data, ig_handle, source_url):
276238 )
277239 except Exception as csv_e :
278240 logger .error (f"Error writing duplicate event to CSV: { csv_e } " )
279- return False
241+ return "duplicate"
280242
281243 # Get club_type by matching ig_handle from Events to ig of Clubs
282244 try :
@@ -531,11 +493,16 @@ def check_post_limit():
531493 break
532494 continue
533495
534- if insert_event_to_db (event_data , post .owner_username , source_url ):
496+ result = insert_event_to_db (event_data , post .owner_username , source_url )
497+ if result is True :
535498 events_added += 1
536499 logger .info (
537500 f"[{ post .shortcode } ] [{ post .owner_username } ] Successfully added event '{ event_data .get ('title' )} '"
538501 )
502+ elif result == "duplicate" :
503+ logger .warning (
504+ f"[{ post .shortcode } ] [{ post .owner_username } ] Duplicate event, not added: '{ event_data .get ('title' )} '"
505+ )
539506 else :
540507 logger .error (
541508 f"[{ post .shortcode } ] [{ post .owner_username } ] Failed to add event '{ event_data .get ('title' )} '"
0 commit comments