Commit 6178bf3 (parent: 28b3559)

Add image handling and extraction improvements for image carousels

4 files changed: +100 −41 lines

backend/scraping/event_processor.py

Lines changed: 73 additions & 25 deletions
@@ -20,6 +20,16 @@
 from utils.scraping_utils import insert_event_to_db
 
 
+def _get_all_images(post):
+    """
+    Returns all image URLs for a post.
+    """
+    images = post.get("images", [])
+    # Fallback to displayUrl if images missing
+    if not images and post.get("displayUrl"):
+        images = [post["displayUrl"]]
+    return images
+
 class EventProcessor:
     def __init__(self, concurrency=5):
         self.concurrency = concurrency
@@ -66,7 +76,6 @@ def _save_event(self, event_data, ig_handle, source_url, club_type):
     @sync_to_async(thread_sensitive=True)
     def _ignore_post(self, shortcode):
         IgnoredPost.objects.get_or_create(shortcode=shortcode)
-
     async def _process_single_post_extraction(self, post):
         """Extracts event data from a single post using OpenAI."""
         async with self.semaphore:
@@ -75,33 +84,29 @@ async def _process_single_post_extraction(self, post):
 
         return await self._extract_events(
             post.get("caption"),
-            post.get("source_image_url"),  # Using S3 URL
+            post.get("all_s3_urls"),
            post_dt
        )
 
     async def process(self, posts_data, cutoff_date):
-        """Main entry point to process a list of raw posts."""
         logger.info(f"Processing {len(posts_data)} posts...")
-
+
         seen_shortcodes = await self._get_seen_shortcodes()
         valid_posts = []
 
         # 1. Filter Posts
         for post in posts_data:
             url = post.get("url")
-            if not url or "/p/" not in url: continue
-
-            # Basic validation
-            if not post.get("caption") or not post.get("displayUrl"): continue
-
-            # Date Check
+            if not url or "/p/" not in url:
+                continue
+            if not post.get("caption"):
+                continue
             post_dt = parse_utc_datetime(post.get("timestamp"))
-            if not post_dt or post_dt < cutoff_date: continue
-
-            # Duplicate Check
+            if not post_dt or post_dt < cutoff_date:
+                continue
             shortcode = url.strip("/").split("/")[-1]
-            if shortcode in seen_shortcodes: continue
-
+            if shortcode in seen_shortcodes:
+                continue
             valid_posts.append(post)
 
         if not valid_posts:
@@ -110,15 +115,29 @@ async def process(self, posts_data, cutoff_date):
 
         logger.info(f"Found {len(valid_posts)} new posts. Starting image uploads...")
 
-        # 2. Upload Images
-        upload_tasks = [self._upload_image(p.get("displayUrl")) for p in valid_posts]
-        s3_urls = await asyncio.gather(*upload_tasks)
-        for post, s3_url in zip(valid_posts, s3_urls, strict=False):
-            post["source_image_url"] = s3_url
+        # 2. Upload all images for each post (with carousel support)
+        all_image_tasks = []
+        for post in valid_posts:
+            image_urls = _get_all_images(post)
+            post["all_image_urls"] = image_urls
+            all_image_tasks.append([self._upload_image(img_url) for img_url in image_urls])
+        flat_tasks = [task for sublist in all_image_tasks for task in sublist]
+        flat_results = await asyncio.gather(*flat_tasks)
+        # Map uploaded S3 URLs back to posts
+        idx = 0
+        for post in valid_posts:
+            n_imgs = len(post["all_image_urls"])
+            post["all_s3_urls"] = flat_results[idx:idx + n_imgs]
+            idx += n_imgs
 
         # 3. Extract Events
         logger.info("Extracting event data...")
-        extract_tasks = [self._process_single_post_extraction(p) for p in valid_posts]
+        extract_tasks = []
+        for post in valid_posts:
+            extract_tasks.append(self._process_single_post_extraction({
+                **post,
+                "all_s3_urls": post["all_s3_urls"]
+            }))
         results = await asyncio.gather(*extract_tasks)
 
         # 4. Save to DB
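
The flatten-and-re-map pattern in the hunk above is the heart of the carousel change: a variable number of upload coroutines per post are flattened into a single asyncio.gather call, and the flat result list is then sliced back onto each post by image count. A minimal standalone sketch of the same pattern, with a hypothetical upload stub standing in for self._upload_image:

import asyncio

async def upload(url: str) -> str:
    # Hypothetical stand-in for self._upload_image.
    await asyncio.sleep(0)
    return f"s3://bucket/{url.rsplit('/', 1)[-1]}"

async def upload_carousels(posts: list[dict]) -> None:
    # One coroutine per image, grouped per post so counts are preserved.
    grouped = [[upload(u) for u in p["all_image_urls"]] for p in posts]
    # Flatten so every upload runs concurrently under a single gather.
    flat_results = await asyncio.gather(*(c for group in grouped for c in group))
    # Slice the flat results back onto each post by its image count.
    idx = 0
    for post in posts:
        n = len(post["all_image_urls"])
        post["all_s3_urls"] = list(flat_results[idx:idx + n])
        idx += n

posts = [{"all_image_urls": ["a.jpg", "b.jpg"]},  # a 2-image carousel
         {"all_image_urls": ["c.jpg"]}]           # a single-image post
asyncio.run(upload_carousels(posts))
# posts[0]["all_s3_urls"] -> ['s3://bucket/a.jpg', 's3://bucket/b.jpg']

asyncio.gather preserves the order of its arguments in the result list, which is what makes the index-based re-mapping safe.
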
@@ -127,15 +146,44 @@ async def process(self, posts_data, cutoff_date):
             ig_handle = post.get("ownerUsername")
             source_url = post.get("url")
             shortcode = source_url.strip("/").split("/")[-1]
-            club_type = self._get_club_type(ig_handle)
+            all_s3_urls = post.get("all_s3_urls", [])
 
             if not extracted_events:
-                # Mark as ignored if AI found nothing
                 await self._ignore_post(shortcode)
                 continue
 
+            if not isinstance(extracted_events, list):
+                extracted_events = [extracted_events]
+
+            # If 1 image is provided, but AI returned multiple event objects,
+            # merge them into a single "Weekly/Summary" event.
+            if len(all_s3_urls) == 1 and len(extracted_events) > 1:
+                base_event = extracted_events[0]
+
+                # 1. Consolidate all dates from all events into the first event
+                combined_occurrences = []
+                for evt in extracted_events:
+                    combined_occurrences.extend(evt.get("occurrences") or [])
+                base_event["occurrences"] = combined_occurrences
+
+                # 2. Update title/description to reflect it's a summary
+                club_name = post.get("ownerFullName") or ig_handle or "Club"
+                base_event["title"] = f"{club_name} Weekly Events"
+                base_event["description"] = (base_event.get("description") or "") + "\n\n(Condensed from multiple events)"
+
+                extracted_events = [base_event]
+
             for event_data in extracted_events:
-                success = await self._save_event(event_data, ig_handle, source_url, club_type)
-                if success: saved_count += 1
+                # Map the correct picture to the event.
+                image_idx = event_data.get("image_index")
+                if image_idx is not None and isinstance(image_idx, int) and 0 <= image_idx < len(all_s3_urls):
+                    event_data["source_image_url"] = all_s3_urls[image_idx]
+                else:
+                    # Fallback: Use the first image (cover) if no index specified
+                    event_data["source_image_url"] = all_s3_urls[0]
+
+                success = await self._save_event(event_data, ig_handle, source_url, self._get_club_type(ig_handle))
+                if success:
+                    saved_count += 1
 
         logger.info(f"Processing complete. Saved {saved_count} new events.")

backend/scraping/main.py

Lines changed: 2 additions & 2 deletions
@@ -10,8 +10,8 @@
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.development")
 django.setup()
 
-from backend.scraping.event_processor import EventProcessor
-from backend.scraping.instagram_scraper import InstagramScraper
+from scraping.event_processor import EventProcessor
+from scraping.instagram_scraper import InstagramScraper
 from django.utils import timezone
 
 from scraping.logging_config import logger

backend/services/openai_service.py

Lines changed: 16 additions & 4 deletions
@@ -99,6 +99,7 @@ def generate_event_embedding(self, event) -> list[float]:
     def extract_events_from_caption(
         self,
         caption_text: str | None = None,
+        all_s3_urls: list[str] | None = None,
         source_image_url: str | None = None,
         post_created_at: datetime | None = None,
         school: str = "University of Waterloo",
@@ -126,14 +127,17 @@
         categories_str = "\n".join(f"- {cat}" for cat in EVENT_CATEGORIES)
 
         prompt = f"""
-        Analyze the following Instagram caption and image and extract event information if it's an event post.
+        Analyze the following Instagram caption and list of images. Extract event information if it's an event post.
 
         School context: This post is from {school}. Use this to guide location and timezone decisions.
         Current context: Today is {current_day_of_week}, {current_date}
         Post was created on: {context_day}, {context_date} at {context_time}
         Current semester end date: {semester_end_time}
 
         Caption: {caption_text}
+
+        Images (0-indexed):
+        {self._format_image_list_for_prompt(all_s3_urls) or []}
 
         STRICT CONTENT POLICY:
         - ONLY extract an event if the post is clearly announcing or describing a real-world event with BOTH:
@@ -156,6 +160,7 @@
             "price": number or null,
             "food": string,
             "registration": boolean,
+            "image_index": integer,
             "occurrences": [
                 {{
                     "dtstart_utc": string, // UTC start "YYYY-MM-DDTHH:MM:SSZ"
@@ -165,10 +170,15 @@
                 }}
             ],
             "school": string,
-            "source_image_url": string,
             "categories": list // one or more of the following, as a JSON array of strings: {categories_str}
         }}
 
+        IMAGE MAPPING RULES:
+        - You are provided with a list of images.
+        - For each extracted event, identify which specific image contains the relevant details (e.g., date/time/location).
+        - Set "image_index" to the 0-based index of that image.
+        - Otherwise, set "image_index": 0.
+
         OCCURRENCE RULES (CRITICAL):
         - Every event MUST include at least one occurrence with a concrete UTC start time.
         - Return ALL explicit dates and times mentioned in the post as separate entries in the occurrences array.
@@ -195,7 +205,6 @@
         - For description: Make this the caption text word-for-word. If there is no caption text, use the image text.
         - If information is not available, use empty string for strings, null for price/coordinates, and false for booleans.
         - Return ONLY the JSON array text, no extra commentary.
-        {f"- An image is provided at: {source_image_url}. If there are conflicts between caption and image information, prioritize the caption text." if source_image_url else ""}
         """
 
         try:
@@ -259,12 +268,15 @@
             "source_image_url",
             "categories",
             "occurrences",
+            "image_index"
         ]
 
         for event_obj in events_list:
             for field in required_fields:
                 if field not in event_obj:
-                    if field in ["price", "latitude", "longitude"]:
+                    if field == "image_index":
+                        event_obj[field] = 0
+                    elif field in ["price", "latitude", "longitude"]:
                         event_obj[field] = None
                     elif field in ["registration"]:
                         event_obj[field] = False
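
The new prompt interpolates self._format_image_list_for_prompt(all_s3_urls), a helper that is not part of this diff. A minimal sketch of what it might look like, assuming it simply renders each URL with its 0-based index; the body below is an assumption, not the repository's implementation:

def _format_image_list_for_prompt(self, all_s3_urls: list[str] | None) -> str:
    # Assumed behavior: list each URL with its 0-based index so the model
    # can refer back to a specific image via "image_index".
    if not all_s3_urls:
        return ""
    return "\n".join(f"{i}: {url}" for i, url in enumerate(all_s3_urls))

One quirk of the call site: because of the `or []` fallback, an empty or missing image list interpolates into the prompt as the literal text "[]" rather than as an empty string.
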

backend/utils/scraping_utils.py

Lines changed: 9 additions & 10 deletions
@@ -72,16 +72,15 @@ def insert_event_to_db(event_data, ig_handle, source_url, club_type=None):
     if is_duplicate_event(event_data):
         return "duplicate"
 
-    # Get club_type by matching ig_handle from Events to ig of Clubs
-    try:
-        club = Clubs.objects.get(ig=ig_handle)
-        club_type = club.club_type
-    except Clubs.DoesNotExist:
-        club_type = None
-        logger.warning(
-            f"Club with handle {ig_handle} not found, inserting event with club_type NULL"
-        )
-
+    # Only fetch if club_type wasn't passed in
+    if club_type is None:
+        try:
+            club = Clubs.objects.get(ig=ig_handle)
+            club_type = club.club_type
+        except Clubs.DoesNotExist:
+            club_type = None
+            logger.warning(f"Club {ig_handle} not found, setting club_type NULL")
+
     create_kwargs = {
         "ig_handle": ig_handle,
         "title": title,
