Skip to content

Commit 3acf46f

Browse files
authored
Merge pull request #99 from ericahan22/zyte
Implement Zyte proxy setup
2 parents 210f92d + 28fb4d1 commit 3acf46f

File tree

5 files changed

+127
-19
lines changed

5 files changed

+127
-19
lines changed

.github/workflows/update-events-data.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,18 @@ on:
1313
required: true
1414
type: boolean
1515
default: false
16+
MAX_POSTS:
17+
required: false
18+
type: number
19+
default: 100
1620

1721
jobs:
1822
instagram_feed:
1923
runs-on: ubuntu-latest
2024
permissions:
2125
contents: write
2226
env:
27+
MAX_POSTS: ${{ github.event.inputs.MAX_POSTS || '100' }}
2328
SUPABASE_DB_URL: ${{ secrets.SUPABASE_DB_URL }}
2429
POSTGRES_DB: ${{ secrets.POSTGRES_DB }}
2530
POSTGRES_USER: ${{ secrets.POSTGRES_USER }}
@@ -43,6 +48,7 @@ jobs:
4348
AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
4449
RESEND_API_KEY: ${{ secrets.RESEND_API_KEY }}
4550
RESEND_FROM_EMAIL: ${{ secrets.RESEND_FROM_EMAIL }}
51+
ZYTE_PROXY: ${{ secrets.ZYTE_PROXY }}
4652
EMAIL_ENCRYPTION_KEY: ${{ secrets.EMAIL_ENCRYPTION_KEY }}
4753
EMAIL_HASH_KEY: ${{ secrets.EMAIL_HASH_KEY }}
4854

@@ -68,6 +74,14 @@ jobs:
6874
restore-keys: |
6975
${{ runner.os }}-pip-
7076
77+
- name: Cache Instaloader session
78+
uses: actions/cache@v3
79+
with:
80+
path: .insta_cache
81+
key: insta-session-${{ hashFiles('**/requirements.txt') }}
82+
restore-keys: |
83+
insta-session-
84+
7185
- name: Install dependencies
7286
working-directory: backend
7387
run: |

backend/scraping/certs/zyte-ca.crt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
-----BEGIN CERTIFICATE-----
2+
MIIEUjCCAzqgAwIBAgIUTSEKJT9gFqPek0yW4Yq+QewypCgwDQYJKoZIhvcNAQEL
3+
BQAwgbkxCzAJBgNVBAYTAklFMRAwDgYDVQQIDAdNdW5zdGVyMQ0wCwYDVQQHDARD
4+
b3JrMRQwEgYDVQQKDAtTY3JhcGluZ0h1YjE1MDMGA1UECwwsTGVhZGluZyBUZWNo
5+
bm9sb2d5IGFuZCBQcm9mZXNzaW9uYWwgU2VydmljZXMxFDASBgNVBAMMC0NyYXds
6+
ZXJhIENBMSYwJAYJKoZIhvcNAQkBFhdzdXBwb3J0QHNjcmFwaW5naHViLmNvbTAe
7+
Fw0yNTA1MTYxODE3MTVaFw0zNTA1MTQxODE3MTVaMIG5MQswCQYDVQQGEwJJRTEQ
8+
MA4GA1UECAwHTXVuc3RlcjENMAsGA1UEBwwEQ29yazEUMBIGA1UECgwLU2NyYXBp
9+
bmdIdWIxNTAzBgNVBAsMLExlYWRpbmcgVGVjaG5vbG9neSBhbmQgUHJvZmVzc2lv
10+
bmFsIFNlcnZpY2VzMRQwEgYDVQQDDAtDcmF3bGVyYSBDQTEmMCQGCSqGSIb3DQEJ
11+
ARYXc3VwcG9ydEBzY3JhcGluZ2h1Yi5jb20wggEiMA0GCSqGSIb3DQEBAQUAA4IB
12+
DwAwggEKAoIBAQDcjecMfrYzsUdPocblk1L1wF5eYxloprFh2v7uKhKJVfOu4OQf
13+
e5foJBkuS9HYqkaJwc6d3QUzhLisdhChZzzZv9QaDCNB2LjsQPx+oI+6wRyCj/t5
14+
jfLZj8mK8RYqTp/P6AGW8QeMqPYTLNnsVoTFMQxBNN7bwSuhdM3JJRy1j2Omu0qu
15+
zyOMZd3axHoAcXUzo77kJP/mReG4M739D1N/KbvDEGxT+T7qGE0f5GRiU0S+S/R3
16+
y92S79kpMIG4v0M1OqBcW8CY6PQw0SN/mQWsX3mD6Tn4/E9TThM9S07BTfQZhsKM
17+
T55xiyhM4ODr6O5NCg2TdEW3QDTBlLeXLWudAgMBAAGjUDBOMB0GA1UdDgQWBBSf
18+
p9cc6kMixqzBMysrYPBw4cu1nTAfBgNVHSMEGDAWgBSfp9cc6kMixqzBMysrYPBw
19+
4cu1nTAMBgNVHRMEBTADAQH/MA0GCSqGSIb3DQEBCwUAA4IBAQBmkvLIUf5O165u
20+
Sf5NqBhI5yrUe3xCYrNEyvYcaLb0KiJKaGQZLa+cyJZ1gDL+iVuX209TglbyLwXL
21+
M4Xl6YTERPF1KXrqmDMbsPJRX9nl/Ok1VdmNyY201fL1E8fZ1LFf8lUmgVhhtpwJ
22+
MZttrBmmovS9x7nQO+ITzZXG//VPewT3NasY1rvMWizfwtYiyALzCj5DAoEypY5V
23+
oP/DmHGOdpy2zy4uy00gMJot6KyLv0U4H9t8soe2+1PMZUSDlsB9CfSa2ORUXbcg
24+
hxy7ujs8hx4Zb+3f5AvpoydCHxlKsq9v1a6lHXLKD1UnsxPnlZ4N2P2VBkT7n+gp
25+
xwhGbUVi
26+
-----END CERTIFICATE-----

backend/scraping/instagram_feed.py

Lines changed: 50 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@
1212
import csv
1313
import logging
1414
import random
15-
import re
1615
import time
1716
import traceback
17+
import re
18+
import requests
1819
import json
1920
from datetime import datetime, timedelta, timezone
2021
from pathlib import Path
@@ -26,6 +27,8 @@
2627
from apps.events.models import Events
2728
from services.openai_service import extract_events_from_caption, generate_embedding
2829
from services.storage_service import upload_image_from_url
30+
from zyte_setup import setup_zyte
31+
from logging_config import logger
2932
from utils.embedding_utils import find_similar_events
3033

3134
USER_AGENTS = [
@@ -36,23 +39,7 @@
3639
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
3740
]
3841

39-
LOG_DIR = Path("logs")
40-
LOG_DIR.mkdir(exist_ok=True)
41-
LOG_FILE = LOG_DIR / "scraping.log"
42-
43-
logging.getLogger("urllib3").setLevel(logging.WARNING)
44-
logging.getLogger("requests").setLevel(logging.WARNING)
45-
logging.basicConfig(
46-
level=logging.DEBUG,
47-
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
48-
handlers=[
49-
logging.StreamHandler(sys.stdout),
50-
logging.FileHandler(LOG_FILE, encoding="utf-8"),
51-
],
52-
)
53-
logger = logging.getLogger(__name__)
54-
55-
MAX_POSTS = 50
42+
MAX_POSTS = int(os.getenv("MAX_POSTS", "100"))
5643
MAX_CONSEC_OLD_POSTS = 10
5744
CUTOFF_DAYS = 2
5845

@@ -429,10 +416,53 @@ def process_recent_feed(
429416
logger.info(f"Added {events_added} event(s) to Supabase")
430417

431418

419+
def test_zyte_proxy(country="CA"):
    """
    Route ``requests`` traffic through the Zyte proxy and probe the exit location.

    Reads the proxy URL from the ``ZYTE_PROXY`` environment variable, installs
    the Zyte CA certificate via ``setup_zyte()``, and monkeypatches
    ``requests.Session.request`` so that every later request in this process
    carries the ``Zyte-Geolocation`` header, the proxy settings, the Zyte CA
    bundle and (unless the caller supplied one) a 60 second timeout. It then
    fetches https://ipapi.co/json/ once and logs the reported IP, country and
    city.

    Args:
        country: Value sent in the ``Zyte-Geolocation`` header.

    Returns:
        None. When ``ZYTE_PROXY`` is unset or empty nothing is patched and a
        warning is logged. A failed probe request is logged, not raised, and
        the patch stays installed.
    """
    zyte_proxy = os.getenv("ZYTE_PROXY")
    if not zyte_proxy:
        # os.environ only accepts str values, so assigning None would raise
        # TypeError. Bail out *before* setup_zyte(), which points the
        # CA-bundle environment variables at the Zyte certificate.
        logger.warning("ZYTE_PROXY is not set; skipping Zyte proxy setup")
        return

    # requests documents `verify` as a string path, so normalise the Path.
    zyte_cert_path = str(setup_zyte())
    os.environ["https_proxy"] = zyte_proxy

    # If this function already ran, unwrap to the pristine method so wrappers
    # do not stack (an inner, older wrapper would overwrite the new country).
    current_request = requests.Session.request
    old_request = getattr(current_request, "_zyte_original", current_request)

    def zyte_request(self, method, url, **kwargs):
        # Copy the headers: the caller's dict must not be mutated, and an
        # explicit headers=None must not break item assignment.
        headers = dict(kwargs.get("headers") or {})
        headers["Zyte-Geolocation"] = country
        kwargs["headers"] = headers
        kwargs["verify"] = zyte_cert_path
        kwargs["proxies"] = {"http": zyte_proxy, "https": zyte_proxy}
        kwargs.setdefault("timeout", 60)
        return old_request(self, method, url, **kwargs)

    zyte_request._zyte_original = old_request
    requests.Session.request = zyte_request

    logger.debug(f"Testing Zyte proxy geolocation: {country}")
    try:
        resp = requests.get(
            "https://ipapi.co/json/",
            timeout=15,
            verify=zyte_cert_path,
        )
        resp.raise_for_status()
        data = resp.json()
        logger.debug("Connected via Zyte proxy")
        logger.debug(f"Public IP: {data.get('ip')}")
        logger.debug(f"Country: {data.get('country_name')} ({data.get('country')})")
        logger.debug(f"City: {data.get('city')}")
    except Exception as e:
        # Best-effort probe: report the failure but let the scraper continue.
        logger.error(f"Proxy geolocation test failed: {e}")
455+
456+
432457
@handle_instagram_errors
433458
def session():
434459
L = Instaloader(user_agent=random.choice(USER_AGENTS))
435-
session_file = Path(__file__).resolve().parent.parent / ("session-" + USERNAME)
460+
try:
461+
SESSION_CACHE_DIR = Path(os.getenv("GITHUB_WORKSPACE", ".")) / ".insta_cache"
462+
SESSION_CACHE_DIR.mkdir(exist_ok=True)
463+
session_file = SESSION_CACHE_DIR / f"session-{USERNAME}"
464+
except Exception as e:
465+
session_file = Path(__file__).resolve().parent.parent / ("session-" + USERNAME)
436466
try:
437467
if session_file.exists():
438468
L.load_session_from_file(USERNAME, filename=str(session_file))
@@ -457,6 +487,7 @@ def session():
457487

458488

459489
if __name__ == "__main__":
490+
test_zyte_proxy("CA")
460491
logger.info("Attemping to load Instagram session...")
461492
L = session()
462493
if L:

backend/scraping/logging_config.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import logging
2+
import sys
3+
from pathlib import Path
4+
5+
# Log output is written to logs/scraping.log, relative to the working directory.
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
LOG_FILE = LOG_DIR.joinpath("scraping.log")

# Keep the HTTP client libraries at WARNING so they do not flood the DEBUG log.
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Root logger: DEBUG and above, mirrored to stdout and to the log file.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
    ],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)

# Logger object that other scraping modules import from this module.
logger = logging.getLogger(__name__)

backend/scraping/zyte_setup.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import os
2+
from pathlib import Path
3+
from logging_config import logger
4+
5+
6+
def setup_zyte():
    """
    Point the CA-bundle environment variables at the bundled Zyte certificate.

    Exports REQUESTS_CA_BUNDLE, CURL_CA_BUNDLE and SSL_CERT_FILE with the path
    of ``certs/zyte-ca.crt`` located next to this module. Any failure while
    exporting is logged instead of raised.

    Returns:
        Path of the Zyte CA certificate file.
    """
    ca_file = Path(__file__).parent / "certs" / "zyte-ca.crt"
    try:
        for env_name in ("REQUESTS_CA_BUNDLE", "CURL_CA_BUNDLE", "SSL_CERT_FILE"):
            os.environ[env_name] = str(ca_file)
    except Exception as e:
        logger.error(f"Failed to set Zyte cert env vars: {e}")
    return ca_file

0 commit comments

Comments
 (0)