
Commit 7f1c438 (parent d1f7219)

implement single user scraping as separate workflow

4 files changed: +129 / -23 lines changed

New workflow file (name: Process Single User)
Lines changed: 80 additions & 0 deletions

```yaml
name: Process Single User

on:
  repository_dispatch:
    types: [new_instagram_post]

jobs:
  process-post:
    runs-on: ubuntu-latest
    permissions:
      contents: write
    env:
      # --- Django & App Config ---
      PRODUCTION: '1'
      DJANGO_SETTINGS_MODULE: 'config.settings.development'
      SECRET_KEY: ${{ secrets.SECRET_KEY }}
      CLERK_SECRET_KEY: ${{ secrets.CLERK_SECRET_KEY }}
      # --- Database Config ---
      DATABASE_URL: ${{ secrets.SUPABASE_DB_URL }}
      SUPABASE_DB_URL: ${{ secrets.SUPABASE_DB_URL }}
      POSTGRES_DB: ${{ secrets.POSTGRES_DB }}
      POSTGRES_USER: ${{ secrets.POSTGRES_USER }}
      POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
      POSTGRES_HOST: ${{ secrets.POSTGRES_HOST }}
      POSTGRES_PORT: ${{ secrets.POSTGRES_PORT }}
      # --- Service Keys ---
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_S3_BUCKET_NAME: ${{ secrets.AWS_S3_BUCKET_NAME }}
      AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
      RESEND_API_KEY: ${{ secrets.RESEND_API_KEY }}
      RESEND_FROM_EMAIL: ${{ secrets.RESEND_FROM_EMAIL }}
      EMAIL_ENCRYPTION_KEY: ${{ secrets.EMAIL_ENCRYPTION_KEY }}
      EMAIL_HASH_KEY: ${{ secrets.EMAIL_HASH_KEY }}
      # --- Apify Token ---
      APIFY_API_TOKEN: ${{ secrets.APIFY_API_TOKEN }}
      TARGET_USERNAME: ${{ github.event.client_payload.username }}
      MAX_CONCURRENT_TASKS: 1

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Create logs directory
        working-directory: backend/scraping
        run: mkdir -p logs

      - name: Cache pip
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('backend/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        working-directory: backend
        run: |
          pip install --prefer-binary -r requirements.txt

      - name: Run Scraper for Single User
        working-directory: backend/scraping
        run: |
          python -u process_single_user.py 2>&1 | tee logs/scraping.log

      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: logs-${{ github.run_number }}
          path: |
            backend/scraping/logs/events_scraped.csv
            backend/scraping/logs/scraping.log
            backend/scraping/apify_raw_results.json
          if-no-files-found: 'ignore'
```
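This workflow only fires when something sends a `repository_dispatch` event of type `new_instagram_post` carrying the target username in `client_payload`. The commit does not include the trigger itself, but a minimal sketch of such a dispatch against the GitHub REST API could look like the following — `OWNER/REPO` and the `GITHUB_TOKEN` environment variable are placeholders, not values from this repository:

```python
# Sketch: fire the repository_dispatch event the "Process Single User" workflow listens for.
# OWNER/REPO and GITHUB_TOKEN are placeholders; the real trigger code is not part of this commit.
import json
import os
import urllib.request

payload = {
    "event_type": "new_instagram_post",             # must match the workflow's `types` filter
    "client_payload": {"username": "some_account"},  # becomes github.event.client_payload.username
}

req = urllib.request.Request(
    "https://api.github.com/repos/OWNER/REPO/dispatches",
    data=json.dumps(payload).encode(),
    headers={
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    },
    method="POST",
)
urllib.request.urlopen(req)  # GitHub returns 204 No Content on success
```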

.github/workflows/update-events-data.yml
Lines changed: 0 additions & 8 deletions

```diff
@@ -3,12 +3,6 @@ name: Scrape Instagram, Update Events DB, Update Static Data
 on:
   schedule:
     - cron: '0 13 * * *' # 8am EST (UTC-5)
-  workflow_dispatch:
-    inputs:
-      username:
-        description: 'Instagram username to scrape'
-        required: true
-        type: string
 
 jobs:
   instagram_feed:
@@ -69,8 +63,6 @@ jobs:
 
       - name: Run scraper
         working-directory: backend/scraping
-        env:
-          SCRAPE_USERNAME: ${{ github.event.inputs.username }}
         run: |
           python -u instagram_feed.py 2>&1 | tee logs/scraping.log
         continue-on-error: false
```

backend/scraping/instagram_feed.py
Lines changed: 10 additions & 15 deletions

```diff
@@ -305,28 +305,22 @@ def get_seen_shortcodes():
     return set()
 
 
-def get_apify_input():
-    """
-    Builds the Apify actor input JSON for apify/instagram-post-scraper.
-    """
+def get_apify_input(username=None):
     cutoff_date = timezone.now() - timedelta(days=CUTOFF_DAYS)
     cutoff_str = cutoff_date.strftime("%Y-%m-%d")
     logger.info(f"Setting post cutoff date to {cutoff_str} ({CUTOFF_DAYS} day ago)")
 
-    # Scrape single username if provided via env
-    single_username = os.getenv("SCRAPE_USERNAME")
-    if single_username:
-        usernames = [single_username]
-        logger.info(f"Scraping @{single_username}")
+    if username:
+        usernames = [username]
+        logger.info(f"Scraping @{username}")
     else:
-        # Parse usernames from URLs
         usernames = []
         for url in FULL_URLS:
             try:
                 clean_url = url.split("instagram.com/")[1]
-                username = clean_url.split("/")[0]
-                if username and username not in usernames:
-                    usernames.append(username)
+                uname = clean_url.split("/")[0]
+                if uname and uname not in usernames:
+                    usernames.append(uname)
             except Exception:
                 logger.warning(f"Could not parse username from URL: {url}")
 
@@ -557,10 +551,11 @@ async def process_scraped_posts(posts_data, cutoff_date):
     logger.info(f"Added {total_events_added} event(s) to Supabase")
 
 
-def run_apify_scraper():
+def run_apify_scraper(username=None):
     """
     Initializes Apify client, runs the Instagram scraper,
     saves the raw results, and processes them.
+    If username is provided, only scrape that user.
     """
     if not APIFY_API_TOKEN:
         logger.critical("APIFY_API_TOKEN not found in environment. Aborting.")
@@ -569,7 +564,7 @@ def run_apify_scraper():
     posts_data = []
     try:
         client = ApifyClient(APIFY_API_TOKEN)
-        actor_input = get_apify_input()
+        actor_input = get_apify_input(username)
         logger.info("Starting Apify actor 'apify/instagram-post-scraper'...")
         run = client.actor("apify/instagram-post-scraper").call(run_input=actor_input)
         logger.info(f"Apify run started (ID: {run['id']}). Waiting for results...")
```
backend/scraping/process_single_user.py (new file)
Lines changed: 39 additions & 0 deletions

```python
import os
import sys
import django

# Setup Django
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.development")
django.setup()

from scraping.logging_config import logger
from scraping.instagram_feed import run_apify_scraper, process_scraped_posts
from django.utils import timezone
from datetime import timedelta


def main():
    target_user = os.environ.get("TARGET_USERNAME")
    if not target_user:
        logger.error("No TARGET_USERNAME provided.")
        sys.exit(1)

    logger.info(f"Scraping @{target_user}...")
    posts_data = run_apify_scraper(username=target_user)
    if not posts_data:
        logger.warning("No posts found.")
        return

    cutoff_date = timezone.now() - timedelta(days=2)
    import asyncio
    try:
        asyncio.run(process_scraped_posts(posts_data, cutoff_date))
        logger.info("Done.")
    except Exception as e:
        logger.error(f"Error during processing: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
```
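The script's only contract with the workflow is the `TARGET_USERNAME` environment variable, so it can be smoke-tested outside Actions as well. A hypothetical local invocation, assuming the Django settings and service credentials the module needs at import time are already present in the environment:

```python
# Hypothetical local run: satisfy the same env-var contract the workflow sets
# from github.event.client_payload.username, then call main() directly.
import os

os.environ["TARGET_USERNAME"] = "example_account"  # placeholder username

import process_single_user  # runs django.setup() at import time

process_single_user.main()
```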
