
Commit 7326a1b

small improvements to scraping, processing, logging, workflows

1 parent 57910ba

File tree: 6 files changed (+22 −9 lines changed)

.github/workflows/process-single-user.yml
Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 name: Process Single User
+run-name: Process @${{ github.event.client_payload.username }}
 
 on:
   repository_dispatch:
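
A note on the workflow change: run-name sets the display title of each run, so dispatched runs appear in the Actions list as "Process @<username>" rather than the generic workflow name; github.event.client_payload carries the JSON payload sent with the repository_dispatch API call that triggers this workflow.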

backend/requirements.txt
Lines changed: 0 additions & 1 deletion

@@ -9,7 +9,6 @@ pgvector==0.4.1
 django-ratelimit==3.0.1
 
 # Scraping and web utilities
---find-links=./wheels
 setuptools>=65.5.0
 apify_client
 python-dotenv
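
(--find-links=./wheels told pip to also consider wheel files in a local ./wheels directory when resolving this file; with the line removed, dependencies resolve from the default package index only.)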

backend/scraping/event_processor.py
Lines changed: 8 additions & 1 deletion

@@ -128,11 +128,16 @@ async def process(self, posts_data, cutoff_date):
         # 2. Upload all images for each post (with carousel support)
         all_image_tasks = []
         for post in valid_posts:
+            ig_handle = post.get("ownerUsername")
+            shortcode = post.get("url", "").strip("/").split("/")[-1]
+            logger.info(f"[{ig_handle}] [{shortcode}] Uploading images...")
             image_urls = _get_all_images(post)
             post["all_image_urls"] = image_urls
             all_image_tasks.append([self._upload_image(img_url) for img_url in image_urls])
+
         flat_tasks = [task for sublist in all_image_tasks for task in sublist]
         flat_results = await asyncio.gather(*flat_tasks)
+
         # Map uploaded S3 URLs back to posts
         idx = 0
         for post in valid_posts:
@@ -141,9 +146,11 @@ async def process(self, posts_data, cutoff_date):
             idx += n_imgs
 
         # 3. Extract Events
-        logger.info(f"[{ig_handle}] [{shortcode}] Extracting event data...")
         extract_tasks = []
         for post in valid_posts:
+            ig_handle = post.get("ownerUsername")
+            shortcode = post.get("url", "").strip("/").split("/")[-1]
+            logger.info(f"[{ig_handle}] [{shortcode}] Extracting event data...")
             extract_tasks.append(self._process_single_post_extraction({
                 **post,
                 "all_s3_urls": post["all_s3_urls"]

backend/scraping/logging_config.py
Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ def setup_logging():
     logging.getLogger("httpx").setLevel(logging.WARNING)
     logging.getLogger("httpcore").setLevel(logging.WARNING)
     logging.getLogger("urllib3").setLevel(logging.WARNING)
-    fmt = "%(asctime)s - %(levelname)s - %(message)s"
+    fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
     handlers = [
         logging.StreamHandler(sys.stderr),
         logging.FileHandler(LOG_FILE, encoding="utf-8"),
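
Adding %(name)s to the format string surfaces which logger, typically the module path, emitted each record, which pairs well with the new per-post log lines in event_processor. A quick standalone check of the new format:

    import logging
    import sys

    fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(stream=sys.stderr, level=logging.INFO, format=fmt)

    logging.getLogger("scraping.event_processor").info("Uploading images...")
    # e.g. 2025-01-01 12:00:00,000 - scraping.event_processor - INFO - Uploading images...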

backend/scraping/main.py
Lines changed: 1 addition & 1 deletion

@@ -78,7 +78,7 @@ def main():
             sys.exit(0)
         else:
             logger.info("No new events were added")
-            sys.exit(2)
+            sys.exit(0)
     except Exception as e:
         logger.error(f"Critical error in processing: {e}", exc_info=True)
         sys.exit(1)
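
The likely motivation: any caller that checks the exit status, including a GitHub Actions step, treats nonzero as failure, so a routine run that simply found no new events was being reported as an error under the old exit code 2. Exiting 0 reserves nonzero statuses for genuine failures (1 on an unhandled exception).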

backend/utils/scraping_utils.py
Lines changed: 11 additions & 5 deletions

@@ -150,10 +150,20 @@ def is_duplicate_event(event_data):
     if not target_start:
         return False
 
+    from datetime import timedelta
+    day_start = target_start.replace(hour=0, minute=0, second=0, microsecond=0)
+    day_end = day_start + timedelta(days=1)
+
     try:
         candidates = EventDates.objects.select_related("event").filter(
-            dtstart_utc__date=target_start.date()
+            dtstart_utc__gte=day_start,
+            dtstart_utc__lt=day_end
         )
+        if candidates:
+            logger.debug(f"Found {len(candidates)} existing events on {day_start.date()} for duplicate check.")
+            for i, cand in enumerate(candidates[:3]):
+                evt = cand.event
+                logger.debug(f"  Candidate #{i+1}: '{evt.title}' @ {cand.dtstart_utc}")
 
         for candidate in candidates:
             existing_event = candidate.event
@@ -168,10 +178,6 @@
             if not c_start:
                 continue
 
-            # Compare same-day occurrences with fuzzy matching on title/location/description.
-            if c_start.date() != target_start.date():
-                continue
-
             norm_title = normalize(title)
             norm_c_title = normalize(c_title)
             substring_match = norm_c_title in norm_title or norm_title in norm_c_title
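
Two effects of this change. The in-Python same-day comparison became redundant: the database filter now guarantees every candidate starts within [day_start, day_end), so the loop can drop its date check. And a plausible reason for replacing dtstart_utc__date=... with an explicit range is that the __date lookup casts the stored timestamp row by row, which a plain index on dtstart_utc generally cannot serve, while half-open timestamp comparisons remain index-friendly. (The truthiness check on candidates also evaluates and caches the queryset, so the debug logging and the duplicate loop share one query.) A minimal sketch of the window math; the ORM lines are shown as comments since they need the project's models, and target_start here is a hypothetical value:

    from datetime import datetime, timedelta, timezone

    target_start = datetime(2025, 5, 4, 19, 30, tzinfo=timezone.utc)  # hypothetical event start

    # Half-open window [day_start, day_end) covering target_start's calendar day.
    day_start = target_start.replace(hour=0, minute=0, second=0, microsecond=0)
    day_end = day_start + timedelta(days=1)
    assert day_start <= target_start < day_end

    # Django ORM equivalents (require the project's EventDates model):
    #   before: EventDates.objects.filter(dtstart_utc__date=target_start.date())
    #   after:  EventDates.objects.filter(dtstart_utc__gte=day_start, dtstart_utc__lt=day_end)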
