Commit 5f5ee61

change schedules for batch workflow and validating event sources

1 parent 7851f76 commit 5f5ee61

File tree

5 files changed: +163 -5 lines changed


.github/workflows/update-events-data.yml

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@ name: Scrape Instagram, Update Events DB, Update Static Data
 
 on:
   schedule:
-    - cron: '0 9 * * *' # 4am EST (UTC-5)
+    - cron: '0 9 */2 * *' # Every 2 days at 4am EST (UTC-5)
 
 jobs:
   update_events_data:
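
A quick way to sanity-check the new expression is to expand its next few fire times. The sketch below uses the croniter library, which is an assumption on my part and not a dependency of this repo:

    # Expand the next few fire times of the new schedule.
    # croniter is assumed installed (pip install croniter); it is
    # not part of this repository.
    from datetime import datetime
    from croniter import croniter

    it = croniter('0 9 */2 * *', datetime(2024, 1, 1))
    for _ in range(3):
        print(it.get_next(datetime))
    # -> 2024-01-01 09:00, 2024-01-03 09:00, 2024-01-05 09:00

One caveat: */2 in the day-of-month field restarts every month, so a 31-day month ends with back-to-back runs (the 31st, then the 1st of the next month).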

.github/workflows/validate-event-sources.yml

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@ name: Validate Event Sources
 on:
   workflow_dispatch:
   schedule:
-    - cron: '0 9 * * *' # 9am UTC daily
+    - cron: '0 9 * * 1' # 9am UTC every Monday (4am EST)
 
 jobs:
   validate-events:
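
One thing the comment glosses over: GitHub Actions evaluates cron in UTC, and 9am UTC is 4am Eastern only during standard time. A small check, purely illustrative:

    # What 09:00 UTC means in US Eastern time on either side of DST.
    from datetime import datetime, timezone
    from zoneinfo import ZoneInfo

    eastern = ZoneInfo("America/New_York")
    winter = datetime(2024, 1, 8, 9, tzinfo=timezone.utc)  # a Monday under EST
    summer = datetime(2024, 7, 8, 9, tzinfo=timezone.utc)  # a Monday under EDT
    print(winter.astimezone(eastern))  # 2024-01-08 04:00:00-05:00
    print(summer.astimezone(eastern))  # 2024-07-08 05:00:00-04:00

So during daylight saving time the Monday run lands at 5am Eastern, presumably still fine for an off-peak job.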

backend/scraping/main.py

Lines changed: 2 additions & 2 deletions

@@ -53,8 +53,8 @@ def main():
         # Single user: 1 day lookback, 1 post limit
         posts = scraper.scrape(targets[0], results_limit=1, cutoff_days=1)
     else:
-        # Batch mode: 2 days lookback, 1 post per account
-        posts = scraper.scrape(targets, results_limit=1, cutoff_days=2)
+        # Batch mode: 4 days lookback, 1 post per account
+        posts = scraper.scrape(targets, results_limit=1, cutoff_days=4)
 
     raw_path = Path(__file__).parent / "apify_raw_results.json"
     with raw_path.open("w", encoding="utf-8") as f:
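
The wider lookback follows from the schedule change above: with runs now every 2 days, a 2-day cutoff leaves no slack if a run fails or starts late, while 4 days also covers one missed run. A hypothetical helper to make the relationship explicit (my illustration, not code from this repo):

    # Hypothetical: derive the lookback window from the schedule interval,
    # padding by one full interval so a single missed run drops nothing.
    def required_cutoff_days(schedule_interval_days: int) -> int:
        return schedule_interval_days * 2

    assert required_cutoff_days(2) == 4  # matches the new cutoff_days=4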

backend/scraping/main_rotated.py

Lines changed: 150 additions & 0 deletions

@@ -0,0 +1,150 @@
+"""
+Account Rotation System
+
+This script implements smart account rotation to reduce Apify credits while
+maintaining full event coverage.
+
+Usage:
+    python backend/scraping/main_rotated.py
+
+How it works:
+- Splits accounts into 3 groups
+- Group A: Scraped on Mon, Thu, Sun
+- Group B: Scraped on Tue, Fri
+- Group C: Scraped on Wed, Sat
+- Each account checked 2-3x per week
+"""
+
+import asyncio
+import json
+import os
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+
+# Setup Django
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import django
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.development")
+django.setup()
+
+from scraping.event_processor import EventProcessor
+from scraping.instagram_scraper import InstagramScraper
+from django.utils import timezone
+
+from scraping.logging_config import logger
+from shared.constants.urls_to_scrape import FULL_URLS
+
+
+def get_rotation_group():
+    """
+    Determine which group of accounts to scrape based on day of week.
+    Returns: (group_name, group_index)
+    """
+    day_of_week = datetime.now().weekday()  # 0=Monday, 6=Sunday
+
+    # Group A: Monday (0), Thursday (3), Sunday (6)
+    # Group B: Tuesday (1), Friday (4)
+    # Group C: Wednesday (2), Saturday (5)
+
+    rotation_schedule = {
+        0: ("A", 0),  # Monday -> Group A
+        1: ("B", 1),  # Tuesday -> Group B
+        2: ("C", 2),  # Wednesday -> Group C
+        3: ("A", 0),  # Thursday -> Group A
+        4: ("B", 1),  # Friday -> Group B
+        5: ("C", 2),  # Saturday -> Group C
+        6: ("A", 0),  # Sunday -> Group A
+    }
+
+    return rotation_schedule[day_of_week]
+
+
+def split_accounts_into_groups(accounts, num_groups=3):
+    """
+    Split accounts into N groups evenly.
+    Returns: list of lists
+    """
+    groups = [[] for _ in range(num_groups)]
+    for i, account in enumerate(accounts):
+        groups[i % num_groups].append(account)
+    return groups
+
+
+def get_targets():
+    """
+    Get Instagram accounts to scrape based on rotation schedule.
+    """
+    username = os.getenv("TARGET_USERNAME")
+    if username:
+        # Single user mode - no rotation
+        return "single", [username]
+
+    # Get all Instagram accounts
+    all_accounts = [
+        url.split("instagram.com/")[1].split("/")[0]
+        for url in FULL_URLS
+        if "instagram.com/" in url
+    ]
+
+    # Split into groups
+    groups = split_accounts_into_groups(all_accounts, num_groups=3)
+
+    # Get today's group
+    group_name, group_index = get_rotation_group()
+    today_accounts = groups[group_index]
+
+    logger.info(f"Rotation Group {group_name} ({len(today_accounts)}/{len(all_accounts)} accounts)")
+
+    return "rotated", today_accounts
+
+
+def filter_valid_posts(posts):
+    return [
+        post for post in posts
+        if not post.get("error") and not post.get("errorDescription")
+        and post.get("url") and "/p/" in post.get("url")
+    ]
+
+
+def main():
+    mode, targets = get_targets()
+    logger.info(f"--- Workflow Started: {mode.upper()} ---")
+    scraper = InstagramScraper()
+    processor = EventProcessor(concurrency=5)
+
+    # Configure run based on mode
+    if mode == "single":
+        # Single user: 1 day lookback, 1 post limit
+        posts = scraper.scrape(targets[0], results_limit=1, cutoff_days=1)
+    else:
+        # Rotated mode: 5 days lookback, 1 post per account
+        posts = scraper.scrape(targets, results_limit=1, cutoff_days=5)
+
+    raw_path = Path(__file__).parent / "apify_raw_results.json"
+    with raw_path.open("w", encoding="utf-8") as f:
+        json.dump(posts, f, ensure_ascii=False, indent=2)
+
+    # Filter out results not containing posts before processing
+    posts = filter_valid_posts(posts)
+    if not posts:
+        logger.info("No posts retrieved. Exiting.")
+        sys.exit(0)
+
+    cutoff_date = timezone.now() - timedelta(days=1)
+    try:
+        saved_count = asyncio.run(processor.process(posts, cutoff_date))
+
+        if saved_count > 0:
+            logger.info(f"Successfully added {saved_count} event(s)")
+            sys.exit(0)
+        else:
+            logger.info("No new events were added")
+            sys.exit(0)
+    except Exception as e:
+        logger.error(f"Critical error in processing: {e}", exc_info=True)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
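
To make the rotation concrete, here is how the helpers above behave on a made-up account list (hypothetical names, not repo data):

    accounts = ["club_a", "club_b", "club_c", "club_d", "club_e", "club_f", "club_g"]
    groups = split_accounts_into_groups(accounts, num_groups=3)
    # groups[0] == ["club_a", "club_d", "club_g"]  -> Group A (Mon, Thu, Sun)
    # groups[1] == ["club_b", "club_e"]            -> Group B (Tue, Fri)
    # groups[2] == ["club_c", "club_f"]            -> Group C (Wed, Sat)

    # On a Wednesday, get_rotation_group() returns ("C", 2),
    # so only groups[2] is scraped that day.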

backend/scripts/validate_event_sources.py

Lines changed: 9 additions & 1 deletion

@@ -19,7 +19,7 @@
 django.setup()
 
 import aiohttp  # noqa: E402
-from django.db import transaction  # noqa: E402
+from django.db import transaction, connection  # noqa: E402
 
 from apps.events.models import (  # noqa: E402
     EventDates,
@@ -203,6 +203,9 @@ async def check_event_source(self, session, event):
     def delete_event(self, event, reason):
         """Delete an event and all its related data"""
         try:
+            # Close existing database connections to prevent SSL SYSCALL errors
+            connection.close()
+
             with transaction.atomic():
                 event_id = event.id
                 event_title = event.title or "Untitled"
@@ -229,6 +232,8 @@ def delete_event(self, event, reason):
 
         except Exception as e:
             logger.error(f"Error deleting event {event.id}: {e}")
+            # Close connection on error to ensure clean state
+            connection.close()
             raise
 
     async def validate_events_batch(self, session, events, semaphore):
@@ -286,6 +291,9 @@ def validate_all_events(self, limit=None, school=None):
         # Run async validation
         asyncio.run(self._async_validate_all(events, total_events))
 
+        # Close stale database connections before deletion
+        connection.close()
+
         # Delete invalid events (must be done in sync context, not inside asyncio.run)
         logger.info(f"\nDeleting {len(self.events_to_delete)} invalid events...")
         for event, reason in self.events_to_delete:
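
A side note on the connection.close() pattern: Django also provides django.db.close_old_connections(), which discards only connections that are unusable or past CONN_MAX_AGE rather than closing unconditionally. A sketch of that variant, as an assumption about a possible refactor rather than what this commit does:

    from django.db import close_old_connections

    def delete_invalid_events(validator):
        # Hypothetical wrapper: drop only dead or expired connections,
        # then run the deletions; Django reopens a connection lazily
        # on the next query.
        close_old_connections()
        for event, reason in validator.events_to_delete:
            validator.delete_event(event, reason)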
