
Commit 49c1909

Refactor scraping for daily and live ingestion:

- Created a new `main.py` script to orchestrate the scraping process and handle both single and batch modes
- Deleted `process_single_user.py` and `instagram_feed.py`; `main.py` is now the single entry point
- Moved Apify-related functions to `apify_client.py`
- Added `post_processing.py` to manage processing of scraped posts (replaces `instagram_feed.py`)
- Updated `urls_to_scrape.py` to include only the most "important" clubs
1 parent 3fc37be commit 49c1909
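
The diff below covers only the workflow updates and the new `apify_client.py`; `main.py` itself is not reproduced in this excerpt. As a rough, hypothetical sketch of the orchestration the commit message describes (one entry point handling both single-user and batch runs), it might look something like the following. The `SINGLE_USER` environment variable, the `process_posts` helper in `post_processing.py`, and the `USERNAMES` constant in `urls_to_scrape.py` are assumed names for illustration, not code from this commit.

# Hypothetical sketch of the main.py orchestration, not the committed code.
# Assumes Django settings are already configured (apify_client uses django.utils.timezone).
import os
import logging

from apify_client import run_apify_scraper   # local module added in this commit
from post_processing import process_posts    # assumed helper name
from urls_to_scrape import USERNAMES         # assumed constant holding the club accounts

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    # Assumption: a SINGLE_USER env var switches the script into single-user mode.
    single_user = os.getenv("SINGLE_USER")
    if single_user:
        logger.info("Single-user mode for %s", single_user)
        items = run_apify_scraper(usernames=single_user, results_limit=50)
    else:
        logger.info("Batch mode for %d accounts", len(USERNAMES))
        items = run_apify_scraper(usernames=USERNAMES)
    process_posts(items)


if __name__ == "__main__":
    main()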

File tree

11 files changed, +501 -795 lines

.github/workflows/process-single-user.yml

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ jobs:
       - name: Run Scraper for Single User
         working-directory: backend/scraping
         run: |
-          python -u process_single_user.py 2>&1 | tee logs/scraping.log
+          python -u main.py 2>&1 | tee logs/scraping.log
 
       - name: Upload logs as artifacts
         if: always()

.github/workflows/update-events-data.yml

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ jobs:
       - name: Run scraper
         working-directory: backend/scraping
         run: |
-          python -u instagram_feed.py 2>&1 | tee logs/scraping.log
+          python -u main.py 2>&1 | tee logs/scraping.log
         continue-on-error: false
 
       - name: Upload logs as artifacts

backend/scraping/apify_client.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
import os
import logging
from datetime import timedelta
from django.utils import timezone
from apify_client import ApifyClient

logger = logging.getLogger(__name__)

APIFY_TOKEN = os.getenv("APIFY_API_TOKEN")
APIFY_ACTOR = "apify/instagram-post-scraper"

def run_apify_scraper(usernames=None, results_limit=None):
    """
    Run Apify actor for one or more usernames and return the results.
    """
    if not APIFY_TOKEN:
        logger.critical("APIFY_API_TOKEN not set")
        raise RuntimeError("APIFY_API_TOKEN not set")

    cutoff_date = timezone.now() - timedelta(days=1)
    cutoff_str = cutoff_date.strftime("%Y-%m-%d")

    client = ApifyClient(APIFY_TOKEN)

    input_data = {
        "skipPinnedPosts": True,
        "onlyPostsNewerThan": cutoff_str,
    }

    if usernames:
        if isinstance(usernames, str):
            usernames = [usernames]
        input_data["username"] = usernames

    if results_limit is not None:
        input_data["resultsLimit"] = results_limit

    logger.info(f"Starting Apify actor with cutoff {cutoff_str} for: {usernames}")

    run = client.actor(APIFY_ACTOR).call(run_input=input_data)

    if not run:
        logger.error("Apify run failed or returned no data.")
        return []

    logger.info(f"Apify run finished (ID: {run.get('id')}). Fetching dataset...")
    dataset_items = client.dataset(run["defaultDatasetId"]).list_items().items

    logger.info(f"Retrieved {len(dataset_items)} items from Apify.")
    return dataset_items
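
For reference, a minimal usage sketch of `run_apify_scraper` follows; it is not part of the commit, and the account names and item fields are illustrative assumptions. Both call styles work because the function wraps a bare string into a list. Note also that the module's filename matches the `apify-client` PyPI package it imports from, which can shadow that package depending on how the path is set up.

# Minimal usage sketch for run_apify_scraper; not part of the commit.
from apify_client import run_apify_scraper  # the module shown above

# Single account, capped at 20 posts (e.g. a live single-user run).
posts = run_apify_scraper(usernames="some_club_account", results_limit=20)

# Batch run: several accounts in one actor call, no explicit limit.
posts = run_apify_scraper(usernames=["club_a", "club_b", "club_c"])

# Field names below are typical of Instagram post scraper output (assumed here).
for post in posts:
    print(post.get("url"), (post.get("caption") or "")[:80])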
