Skip to content

Commit b95692c

Browse files
authored
Merge pull request #116 from ericahan22/apify
Refactor: migrate scraping from Instaloader to Apify (pray)
2 parents a6207a4 + 5e74e88 commit b95692c

File tree

7 files changed: +519 additions, -322 deletions

.github/copilot-instructions.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,7 @@ frontend/
150150
- django-cors-headers
151151
- whitenoise
152152
- pgvector
153-
- instaloader (custom)
153+
- apify_client
154154
- requests, beautifulsoup4
155155
- openai
156156
- python-dotenv
@@ -172,7 +172,7 @@ frontend/
172172
## CI/CD
173173

174174
- GitHub Actions workflow runs daily scraping job.
175-
- Requires secrets: Instagram credentials, OpenAI API key, database URL.
175+
- Requires secrets: APIFY_API_TOKEN, OpenAI API key, database URL.
176176
- Uses Python 3.11, caches pip dependencies, uploads logs as artifacts.
177177

178178
---

.github/workflows/update-events-data.yml

Lines changed: 14 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -2,62 +2,42 @@ name: Scrape Instagram, Update Events DB, Update Static Data
22

33
on:
44
schedule:
5-
- cron: '0 12-23/4 * * *' # 7am, 11am, 3pm, 7pm, 11pm EST
5+
- cron: '0 13 * * *' # 8am EST (UTC-5)
66
workflow_dispatch:
77
inputs:
8-
run_scraper:
9-
required: true
10-
type: boolean
11-
default: false
12-
MAX_POSTS:
13-
required: false
14-
type: number
15-
default: 15
16-
CUTOFF_DAYS:
17-
required: false
18-
type: number
19-
default: 2
208

219
jobs:
2210
instagram_feed:
2311
runs-on: ubuntu-latest
2412
permissions:
2513
contents: write
2614
env:
27-
MAX_POSTS: ${{ github.event.inputs.MAX_POSTS || '15' }}
28-
CUTOFF_DAYS: ${{ github.event.inputs.CUTOFF_DAYS || '2' }}
15+
# --- Django & App Config ---
2916
PRODUCTION: '1'
3017
DJANGO_SETTINGS_MODULE: 'config.settings.development'
18+
SECRET_KEY: ${{ secrets.SECRET_KEY }}
19+
CLERK_SECRET_KEY: ${{ secrets.CLERK_SECRET_KEY }}
20+
# --- Database Config ---
3121
DATABASE_URL: ${{ secrets.SUPABASE_DB_URL }}
3222
SUPABASE_DB_URL: ${{ secrets.SUPABASE_DB_URL }}
3323
POSTGRES_DB: ${{ secrets.POSTGRES_DB }}
3424
POSTGRES_USER: ${{ secrets.POSTGRES_USER }}
3525
POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
3626
POSTGRES_HOST: ${{ secrets.POSTGRES_HOST }}
3727
POSTGRES_PORT: ${{ secrets.POSTGRES_PORT }}
38-
USERNAME: ${{ secrets.USERNAME }}
39-
PASSWORD: ${{ secrets.PASSWORD }}
40-
SESSIONID: ${{ secrets.SESSIONID }}
41-
CSRFTOKEN: ${{ secrets.CSRFTOKEN }}
42-
DS_USER_ID: ${{ secrets.DS_USER_ID }}
43-
IG_DID: ${{ secrets.IG_DID }}
44-
MID: ${{ secrets.MID }}
28+
# --- Service Keys ---
4529
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
46-
DOC_ID: ${{ secrets.DOC_ID }}
47-
USER_AGENT: ${{ secrets.USER_AGENT }}
48-
X_IG_APP_ID: ${{ secrets.X_IG_APP_ID }}
4930
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
5031
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
5132
AWS_S3_BUCKET_NAME: ${{ secrets.AWS_S3_BUCKET_NAME }}
5233
AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
5334
RESEND_API_KEY: ${{ secrets.RESEND_API_KEY }}
5435
RESEND_FROM_EMAIL: ${{ secrets.RESEND_FROM_EMAIL }}
55-
ZYTE_PROXY: ${{ secrets.ZYTE_PROXY }}
5636
EMAIL_ENCRYPTION_KEY: ${{ secrets.EMAIL_ENCRYPTION_KEY }}
5737
EMAIL_HASH_KEY: ${{ secrets.EMAIL_HASH_KEY }}
58-
SECRET_KEY: ${{ secrets.SECRET_KEY }}
59-
CLERK_SECRET_KEY: ${{ secrets.CLERK_SECRET_KEY }}
60-
38+
# --- Apify Token ---
39+
APIFY_API_TOKEN: ${{ secrets.APIFY_API_TOKEN }}
40+
6141
steps:
6242
- uses: actions/checkout@v4
6343

@@ -66,11 +46,9 @@ jobs:
6646
with:
6747
python-version: '3.11'
6848

69-
- name: Create logs and cache directories
49+
- name: Create logs directory
7050
working-directory: backend/scraping
71-
run: |
72-
mkdir -p logs
73-
mkdir -p $GITHUB_WORKSPACE/.insta_cache
51+
run: mkdir -p logs
7452

7553
- name: Cache pip
7654
uses: actions/cache@v4
@@ -80,34 +58,27 @@ jobs:
8058
restore-keys: |
8159
${{ runner.os }}-pip-
8260
83-
- name: Cache Instaloader session
84-
uses: actions/cache@v4
85-
with:
86-
path: ${{ github.workspace }}/.insta_cache
87-
key: ${{ runner.os }}-instaloader-session-${{ hashFiles('backend/requirements.txt') }}
88-
restore-keys: |
89-
${{ runner.os }}-instaloader-session-
90-
9161
- name: Install dependencies
9262
working-directory: backend
9363
run: |
94-
pip install --upgrade pip setuptools wheel
9564
pip install --prefer-binary -r requirements.txt
9665
9766
- name: Run scraper
98-
if: github.event_name == 'schedule' || github.event.inputs.run_scraper == 'true'
9967
working-directory: backend/scraping
10068
run: |
10169
python -u instagram_feed.py 2>&1 | tee logs/scraping.log
10270
continue-on-error: false
10371

10472
- name: Upload logs as artifacts
73+
if: always()
10574
uses: actions/upload-artifact@v4
10675
with:
10776
name: logs-${{ github.run_number }}
10877
path: |
10978
backend/scraping/logs/events_scraped.csv
11079
backend/scraping/logs/scraping.log
80+
backend/scraping/apify_raw_results.json
81+
if-no-files-found: 'ignore'
11182

11283
- name: Generate static data file
11384
id: generate_static

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ __pycache__/
1616
backend/scraping/*.csv
1717
backend/scraping/test*.py
1818
backend/testing/
19+
backend/apify_raw_results.json
1920

2021
# Log files
2122
*.log

CLAUDE.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ npm run dev # Development server on port 5173
5151
- **Apps**: `events`, `clubs`, `newsletter`, `promotions`, `core`
5252
- **Database**: PostgreSQL (production) / SQLite (local with USE_SQLITE=1)
5353
- **Services**: OpenAI integration, email service, S3 storage
54-
- **Scraping**: Instagram data extraction with custom instaloader build
54+
- **Scraping**: Instagram data extraction with Apify
5555

5656
### Frontend Structure (`/frontend/`)
5757
- **Feature-based architecture** under `/src/features/`

backend/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ django-ratelimit==3.0.1
1111
# Scraping and web utilities
1212
--find-links=./wheels
1313
setuptools>=65.5.0
14-
instaloader==4.14.2+custom1
14+
apify_client
1515
python-dotenv
1616
python-dateutil==2.9.0.post0
1717
requests==2.31.0

0 commit comments

Comments
 (0)