 from utils.scraping_utils import insert_event_to_db
 
 
+def _get_all_images(post):
+    """
+    Returns all image URLs for a post.
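+    Carousel posts expose an "images" list; single-image posts may only carry "displayUrl".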
26+ """
27+ images = post .get ("images" , [])
28+ # Fallback to displayUrl if images missing
29+ if not images and post .get ("displayUrl" ):
30+ images = [post ["displayUrl" ]]
31+ return images
32+
2333class EventProcessor :
2434 def __init__ (self , concurrency = 5 ):
2535 self .concurrency = concurrency
@@ -66,7 +76,6 @@ def _save_event(self, event_data, ig_handle, source_url, club_type):
     @sync_to_async(thread_sensitive=True)
     def _ignore_post(self, shortcode):
         IgnoredPost.objects.get_or_create(shortcode=shortcode)
-
     async def _process_single_post_extraction(self, post):
        """Extracts event data from a single post using OpenAI."""
        async with self.semaphore:
@@ -75,33 +84,29 @@ async def _process_single_post_extraction(self, post):
 
             return await self._extract_events(
                 post.get("caption"),
-                post.get("source_image_url"),  # Using S3 URL
+                post.get("all_s3_urls"),
                 post_dt
             )
 
     async def process(self, posts_data, cutoff_date):
-        """Main entry point to process a list of raw posts."""
         logger.info(f"Processing {len(posts_data)} posts...")
-
+
         seen_shortcodes = await self._get_seen_shortcodes()
         valid_posts = []
 
         # 1. Filter Posts
         for post in posts_data:
             url = post.get("url")
-            if not url or "/p/" not in url: continue
-
-            # Basic validation
-            if not post.get("caption") or not post.get("displayUrl"): continue
-
-            # Date Check
+            if not url or "/p/" not in url:
+                continue
+            if not post.get("caption"):
+                continue
             post_dt = parse_utc_datetime(post.get("timestamp"))
-            if not post_dt or post_dt < cutoff_date: continue
-
-            # Duplicate Check
+            if not post_dt or post_dt < cutoff_date:
+                continue
             shortcode = url.strip("/").split("/")[-1]
-            if shortcode in seen_shortcodes: continue
-
+            if shortcode in seen_shortcodes:
+                continue
             valid_posts.append(post)
 
         if not valid_posts:
@@ -110,15 +115,29 @@ async def process(self, posts_data, cutoff_date):
 
         logger.info(f"Found {len(valid_posts)} new posts. Starting image uploads...")
 
-        # 2. Upload Images
-        upload_tasks = [self._upload_image(p.get("displayUrl")) for p in valid_posts]
-        s3_urls = await asyncio.gather(*upload_tasks)
-        for post, s3_url in zip(valid_posts, s3_urls, strict=False):
-            post["source_image_url"] = s3_url
+        # 2. Upload all images for each post (with carousel support)
+        all_image_tasks = []
+        for post in valid_posts:
+            image_urls = _get_all_images(post)
+            post["all_image_urls"] = image_urls
+            all_image_tasks.append([self._upload_image(img_url) for img_url in image_urls])
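+        # Flatten the nested per-post task lists so a single gather() runs every upload concurrently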
+        flat_tasks = [task for sublist in all_image_tasks for task in sublist]
+        flat_results = await asyncio.gather(*flat_tasks)
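+        # gather() returns results in task order, so the flat list can be sliced back per post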
+        # Map uploaded S3 URLs back to posts
+        idx = 0
+        for post in valid_posts:
+            n_imgs = len(post["all_image_urls"])
+            post["all_s3_urls"] = flat_results[idx:idx + n_imgs]
+            idx += n_imgs
 
         # 3. Extract Events
         logger.info("Extracting event data...")
-        extract_tasks = [self._process_single_post_extraction(p) for p in valid_posts]
+        extract_tasks = []
+        for post in valid_posts:
+            extract_tasks.append(self._process_single_post_extraction({
+                **post,
+                "all_s3_urls": post["all_s3_urls"]
+            }))
         results = await asyncio.gather(*extract_tasks)
 
         # 4. Save to DB
@@ -127,15 +146,44 @@ async def process(self, posts_data, cutoff_date):
             ig_handle = post.get("ownerUsername")
             source_url = post.get("url")
             shortcode = source_url.strip("/").split("/")[-1]
-            club_type = self._get_club_type(ig_handle)
+            all_s3_urls = post.get("all_s3_urls", [])
 
             if not extracted_events:
-                # Mark as ignored if AI found nothing
                 await self._ignore_post(shortcode)
                 continue
 
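+            # The extractor may return a single event dict; normalize to a list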
+            if not isinstance(extracted_events, list):
+                extracted_events = [extracted_events]
+
+            # If one image is provided but the AI returned multiple event objects,
+            # merge them into a single "Weekly/Summary" event.
+            if len(all_s3_urls) == 1 and len(extracted_events) > 1:
+                base_event = extracted_events[0]
+
+                # 1. Consolidate all dates from all events into the first event
+                combined_occurrences = []
+                for evt in extracted_events:
+                    combined_occurrences.extend(evt.get("occurrences") or [])
+                base_event["occurrences"] = combined_occurrences
+
+                # 2. Update the title/description to reflect that it's a summary
+                club_name = post.get("ownerFullName") or ig_handle or "Club"
+                base_event["title"] = f"{club_name} Weekly Events"
+                base_event["description"] = (base_event.get("description") or "") + "\n\n(Condensed from multiple events)"
+
+                extracted_events = [base_event]
+
             for event_data in extracted_events:
-                success = await self._save_event(event_data, ig_handle, source_url, club_type)
-                if success: saved_count += 1
+                # Map the correct picture to the event.
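+                # image_index (when the extractor supplies one) points at the carousel slide the event came from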
+                image_idx = event_data.get("image_index")
+                if image_idx is not None and isinstance(image_idx, int) and 0 <= image_idx < len(all_s3_urls):
+                    event_data["source_image_url"] = all_s3_urls[image_idx]
+                else:
+                    # Fallback: use the first image (cover) if no index was specified;
+                    # caption-only posts may have no uploaded images at all
+                    event_data["source_image_url"] = all_s3_urls[0] if all_s3_urls else None
+
+                success = await self._save_event(event_data, ig_handle, source_url, self._get_club_type(ig_handle))
+                if success:
+                    saved_count += 1
 
         logger.info(f"Processing complete. Saved {saved_count} new events.")
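
A minimal sketch of how the updated processor might be driven, assuming an async entry point; load_scraped_posts and the seven-day cutoff are hypothetical stand-ins, not part of this commit:

    import asyncio
    from datetime import datetime, timedelta, timezone

    async def main():
        processor = EventProcessor(concurrency=5)
        posts = load_scraped_posts()  # hypothetical: raw Apify-style post dicts
        cutoff = datetime.now(timezone.utc) - timedelta(days=7)
        await processor.process(posts, cutoff)

    asyncio.run(main())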