Skip to content

Commit 715de61

Browse files
committed
Improved image download and event update processes
1 parent 4a7c6b1 commit 715de61

File tree

3 files changed

+37
-12
lines changed

3 files changed

+37
-12
lines changed

backend/scraping/event_processor.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -127,13 +127,18 @@ async def process(self, posts_data, cutoff_date):
127127

128128
# 2. Upload all images for each post (with carousel support)
129129
all_image_tasks = []
130+
131+
async def _upload_image_bounded(url):
132+
async with self.semaphore:
133+
return await self._upload_image(url)
134+
130135
for post in valid_posts:
131136
ig_handle = post.get("ownerUsername")
132137
shortcode = post.get("url", "").strip("/").split("/")[-1]
133138
logger.info(f"[{ig_handle}] [{shortcode}] Uploading images...")
134139
image_urls = _get_all_images(post)
135140
post["all_image_urls"] = image_urls
136-
all_image_tasks.append([self._upload_image(img_url) for img_url in image_urls])
141+
all_image_tasks.append([_upload_image_bounded(img_url) for img_url in image_urls])
137142

138143
flat_tasks = [task for sublist in all_image_tasks for task in sublist]
139144
flat_results = await asyncio.gather(*flat_tasks, return_exceptions=True)
@@ -216,15 +221,23 @@ async def process(self, posts_data, cutoff_date):
216221

217222
club_type = await self._get_club_type(ig_handle)
218223
try:
219-
success = await self._save_event(event_data, ig_handle, source_url, club_type)
224+
result = await self._save_event(event_data, ig_handle, source_url, club_type)
220225
except Exception as e:
221226
append_event_to_csv(event_data, ig_handle, source_url, added_to_db="error", club_type=club_type)
222227
logger.error(f"[{ig_handle}] [{shortcode}] Error saving event: {e}")
223228
continue
224-
if success:
229+
230+
if result is True:
225231
append_event_to_csv(event_data, ig_handle, source_url, added_to_db="success", club_type=club_type)
226232
logger.info(f"[{ig_handle}] [{shortcode}] Saved event: '{event_data.get('title', '')}'")
227233
saved_count += 1
234+
elif result == "updated":
235+
append_event_to_csv(event_data, ig_handle, source_url, added_to_db="updated", club_type=club_type)
236+
logger.info(f"[{ig_handle}] [{shortcode}] Updated event: '{event_data.get('title', '')}'")
237+
saved_count += 1
238+
elif result == "duplicate":
239+
append_event_to_csv(event_data, ig_handle, source_url, added_to_db="duplicate_post", club_type=club_type)
240+
logger.info(f"[{ig_handle}] [{shortcode}] Duplicate event (no changes): '{event_data.get('title', '')}'")
228241

229242
logger.info(f"Processing complete. Saved {saved_count} new events.")
230243
return saved_count

backend/services/storage_service.py

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -53,13 +53,21 @@ def _validate_image(self, image_data: bytes) -> bool:
5353

5454
def _download_image_from_url(self, image_url: str) -> bytes | None:
5555
"""Download image from URL"""
56-
try:
57-
response = requests.get(image_url, timeout=60)
58-
response.raise_for_status()
59-
return response.content
60-
except Exception as e:
61-
logger.error(f"Failed to download image from {image_url}: {e}")
62-
return None
56+
import time
57+
max_retries = 3
58+
for attempt in range(max_retries):
59+
try:
60+
response = requests.get(image_url, timeout=60)
61+
response.raise_for_status()
62+
return response.content
63+
except Exception as e:
64+
if attempt == max_retries - 1:
65+
logger.error(f"Failed to download image from {image_url} after {max_retries} attempts: {e}")
66+
return None
67+
wait_time = 2 ** attempt
68+
logger.warning(f"Download failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {wait_time}s...")
69+
time.sleep(wait_time)
70+
return None
6371

6472
def upload_image_from_url(
6573
self, image_url: str, filename: str | None = None

backend/utils/scraping_utils.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
from pathlib import Path
66

77
from django.db import transaction
8+
from django.utils import timezone
89

910
from utils.date_utils import parse_utc_datetime
1011

@@ -92,6 +93,7 @@ def insert_event_to_db(event_data, ig_handle, source_url, club_type=None):
9293
matched_event.source_image_url = source_image_url or None
9394
matched_event.categories = categories
9495
matched_event.source_url = source_url
96+
matched_event.added_at = timezone.now() # Bump to top
9597
matched_event.save()
9698

9799
# Delete old event dates and create new ones
@@ -201,7 +203,8 @@ def find_match(self, event_data, ig_handle=None, source_url=None):
201203
description = event_data.get("description") or ""
202204
occurrences = event_data.get("occurrences")
203205

204-
log_prefix = f"[{ig_handle}] [{source_url.split('/')[-1] if source_url else 'UNKNOWN'}]"
206+
shortcode = source_url.strip("/").split("/")[-1] if source_url else "UNKNOWN"
207+
log_prefix = f"[{ig_handle}] [{shortcode}]"
205208

206209
if not occurrences:
207210
return False, None
@@ -418,4 +421,5 @@ def append_event_to_csv(
418421
"status": "CONFIRMED",
419422
}
420423
)
421-
logger.info(f"[{ig_handle}] [{source_url.split('/')[-1] if source_url else 'UNKNOWN'}] Event written to CSV: '{title}' - Status: {added_to_db}")
424+
shortcode = source_url.strip("/").split("/")[-1] if source_url else "UNKNOWN"
425+
logger.info(f"[{ig_handle}] [{shortcode}] Event written to CSV - Status: {added_to_db}")

0 commit comments

Comments
 (0)