Skip to content

Commit 55a04d8

Browse files
committed
update cron schedule and MAX_POSTS; switch to direct session for image download
1 parent 25d2885 commit 55a04d8

File tree

3 files changed

+42
-72
lines changed

3 files changed

+42
-72
lines changed

.github/workflows/update-events-data.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ name: Scrape Instagram, Update Events DB, Update Static Data
22

33
on:
44
schedule:
5-
- cron: '0 3,12,17,22 * * *' # 11pm, 8am, 1pm, 6pm EST
6-
workflow_dispatch: # Optional manual trigger
5+
- cron: '0 12-23/4 * * *' # fires at 12, 16, 20 UTC = 7am, 11am, 3pm EST
6+
workflow_dispatch:
77
inputs:
88
run_scraper:
99
required: true
@@ -12,7 +12,7 @@ on:
1212
MAX_POSTS:
1313
required: false
1414
type: number
15-
default: 30
15+
default: 15
1616
CUTOFF_DAYS:
1717
required: false
1818
type: number
@@ -24,7 +24,7 @@ jobs:
2424
permissions:
2525
contents: write
2626
env:
27-
MAX_POSTS: ${{ github.event.inputs.MAX_POSTS || '30' }}
27+
MAX_POSTS: ${{ github.event.inputs.MAX_POSTS || '15' }}
2828
CUTOFF_DAYS: ${{ github.event.inputs.CUTOFF_DAYS || '2' }}
2929
PRODUCTION: '1'
3030
DJANGO_SETTINGS_MODULE: 'config.settings.development'

backend/scraping/instagram_feed.py

Lines changed: 37 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
from utils.date_utils import parse_utc_datetime
4141

4242

43-
MAX_POSTS = int(os.getenv("MAX_POSTS", "30"))
43+
MAX_POSTS = int(os.getenv("MAX_POSTS", "15"))
4444
MAX_CONSEC_OLD_POSTS = 10
4545
CUTOFF_DAYS = int(os.getenv("CUTOFF_DAYS", "2"))
4646

@@ -311,38 +311,6 @@ def get_seen_shortcodes():
311311
return set()
312312

313313

314-
def safe_feed_posts(loader, retries=3, backoff=60):
315-
"""
316-
Yield posts from loader.get_feed_posts(), retrying on network errors.
317-
On error, re-instantiate the session and skip already-yielded posts.
318-
"""
319-
seen_shortcodes = set()
320-
attempts = 0
321-
while attempts < retries:
322-
try:
323-
for post in loader.get_feed_posts():
324-
if hasattr(post, "shortcode"):
325-
if post.shortcode in seen_shortcodes:
326-
continue
327-
seen_shortcodes.add(post.shortcode)
328-
yield post
329-
time.sleep(random.uniform(30, 90))
330-
break # Finished all posts
331-
except (ReadTimeout, ConnectionError, requests.exceptions.SSLError) as e:
332-
attempts += 1
333-
logger.warning(f"Network error: {e!s}. Retrying in {backoff} seconds (attempt {attempts}/{retries})...")
334-
time.sleep(backoff)
335-
try:
336-
new_loader = session()
337-
loader.__dict__.update(new_loader.__dict__)
338-
logger.info("Session re-instantiated successfully. Continuing feed scrape.")
339-
except Exception as session_e:
340-
logger.error(f"Failed to re-instantiate session: {session_e}")
341-
break
342-
if attempts >= retries:
343-
logger.error("Too many consecutive network errors. Aborting feed scrape.")
344-
345-
346314
def process_recent_feed(
347315
loader,
348316
cutoff=None,
@@ -365,7 +333,7 @@ def process_recent_feed(
365333
seen_shortcodes = get_seen_shortcodes()
366334

367335
try:
368-
for post in safe_feed_posts(loader):
336+
for post in loader.get_feed_posts():
369337
try:
370338
post_time = timezone.make_aware(post.date_utc) if timezone.is_naive(post.date_utc) else post.date_utc
371339
if post_time < cutoff:
@@ -496,6 +464,8 @@ def process_recent_feed(
496464
f"Reached {max_consec_old_posts} consecutive old posts, stopping."
497465
)
498466
break
467+
468+
time.sleep(random.uniform(30, 90))
499469

500470
if not termination_reason:
501471
termination_reason = "no_more_posts"
@@ -513,7 +483,7 @@ def process_recent_feed(
513483
logger.info(f"Added {events_added} event(s) to Supabase")
514484

515485

516-
def test_zyte_proxy(country="CA"):
486+
def create_proxy_session(country="CA"):
517487
"""
518488
Patch requests.Session to route through Zyte with geolocation,
519489
test Zyte proxy routing and geolocation
@@ -526,17 +496,11 @@ def test_zyte_proxy(country="CA"):
526496
logger.warning(
527497
"ZYTE_PROXY not set - skipping proxied geolocation test and trying direct request"
528498
)
529-
try:
530-
resp = requests.get("https://ipapi.co/json/", timeout=15)
531-
resp.raise_for_status()
532-
logger.info("Direct geolocation test succeeded")
533-
return True
534-
except Exception as e:
535-
logger.warning(f"Direct geolocation test failed: {e!s}")
536-
return False
537-
538-
old_request = requests.Session.request
539-
499+
return requests.Session()
500+
501+
session = requests.Session()
502+
old_request = session.request
503+
540504
def zyte_request(self, method, url, **kwargs):
541505
headers = kwargs.get("headers", {})
542506
headers["Zyte-Geolocation"] = country
@@ -547,29 +511,30 @@ def zyte_request(self, method, url, **kwargs):
547511
kwargs["timeout"] = kwargs.get("timeout", 30)
548512
return old_request(self, method, url, **kwargs)
549513

550-
requests.Session.request = zyte_request
551-
514+
session.request = zyte_request.__get__(session, requests.Session)
552515
logger.debug(f"Testing Zyte proxy geolocation: {country}")
553516
try:
554-
resp = requests.get(
555-
"https://ipapi.co/json/",
556-
timeout=30,
557-
verify=str(zyte_cert_path) if zyte_cert_path else True,
558-
proxies={"http": zyte_proxy, "https": zyte_proxy},
559-
)
517+
resp = session.get("https://ipapi.co/json/")
560518
resp.raise_for_status()
561519
data = resp.json()
562520
logger.debug("Connected via Zyte proxy")
563521
logger.debug(f"Public IP: {data.get('ip')}")
564522
logger.debug(f"Country: {data.get('country_name')} ({data.get('country')})")
565523
logger.debug(f"City: {data.get('city')}")
524+
return session
566525
except Exception as e:
567526
logger.warning(f"Proxied geolocation failed: {e!s}")
568-
return True
527+
return None
569528

570529

571530
def session():
531+
proxied_session = create_proxy_session("CA")
532+
if not proxied_session:
533+
logger.critical("Failed to create proxied session, aborting...")
534+
return None
535+
572536
L = Instaloader(user_agent=random.choice(USER_AGENTS))
537+
L.context._session = proxied_session
573538
L.context.request_timeout = 120
574539
L.context.max_connection_attempts = 5
575540
try:
@@ -603,11 +568,20 @@ def session():
603568

604569

605570
if __name__ == "__main__":
606-
test_zyte_proxy("CA")
607-
logger.info("Attempting to load Instagram session...")
608-
L = session()
609-
if L:
610-
logger.info("Session created successfully!")
611-
process_recent_feed(L)
612-
else:
613-
logger.critical("Failed to initialize Instagram session, stopping...")
571+
lock_file_path = Path(__file__).parent / "scrape.lock"
572+
if lock_file_path.exists():
573+
sys.exit()
574+
try:
575+
lock_file_path.touch()
576+
logger.info("Attempting to load Instagram session...")
577+
L = session()
578+
if L:
579+
logger.info("Session created successfully!")
580+
process_recent_feed(L)
581+
else:
582+
logger.critical("Failed to initialize Instagram session, stopping...")
583+
except Exception as e:
584+
logger.error(f"An uncaught exception occurred: {e}")
585+
finally:
586+
if lock_file_path.exists():
587+
lock_file_path.unlink()

backend/services/storage_service.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,7 @@ def _validate_image(self, image_data: bytes) -> bool:
5656
def _download_image_from_url(self, image_url: str) -> bytes | None:
5757
"""Download image from URL"""
5858
try:
59-
headers = {
60-
"User-Agent": random.choice(USER_AGENTS),
61-
"Referer": "https://www.instagram.com/",
62-
}
63-
response = requests.get(image_url, headers=headers, timeout=60)
59+
response = requests.get(image_url, timeout=60)
6460
response.raise_for_status()
6561

6662
return response.content

0 commit comments

Comments
 (0)