Skip to content

Commit 9e4b3db

Browse files
committed
fix events_scraped.csv path again + other minor fixes
1 parent 02cca61 commit 9e4b3db

File tree

1 file changed

+82
-83
lines changed

1 file changed

+82
-83
lines changed

backend/scraping/instagram_feed.py

Lines changed: 82 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from instaloader import *
1+
from instaloader import Instaloader
22
from dotenv import load_dotenv
33
import os
44
import csv
@@ -8,10 +8,11 @@
88
import psycopg2
99
import logging
1010
import traceback
11-
from datetime import datetime
1211
import sys
1312
from fuzzywuzzy import fuzz
1413
import time
14+
from pathlib import Path
15+
1516

1617
logging.basicConfig(
1718
level=logging.DEBUG,
@@ -23,6 +24,21 @@
2324
)
2425
logger = logging.getLogger(__name__)
2526

27+
28+
# Load environment variables from .env file
29+
load_dotenv()
30+
31+
# Get credentials from environment variables
32+
USERNAME = os.getenv("USERNAME")
33+
PASSWORD = os.getenv("PASSWORD")
34+
CSRFTOKEN = os.getenv("CSRFTOKEN")
35+
SESSIONID = os.getenv("SESSIONID")
36+
DS_USER_ID = os.getenv("DS_USER_ID")
37+
MID = os.getenv("MID")
38+
IG_DID = os.getenv("IG_DID")
39+
SUPABASE_DB_URL = os.getenv("SUPABASE_DB_URL")
40+
41+
2642
def get_post_image_url(post):
2743
try:
2844
if "image_versions2" in post._node and post._node["image_versions2"]:
@@ -56,23 +72,11 @@ def wrapper(*args, **kwargs):
5672
return wrapper
5773

5874

59-
# Load environment variables from .env file
60-
load_dotenv()
61-
62-
# Get credentials from environment variables
63-
USERNAME = os.getenv("USERNAME")
64-
PASSWORD = os.getenv("PASSWORD")
65-
CSRFTOKEN = os.getenv("CSRFTOKEN")
66-
SESSIONID = os.getenv("SESSIONID")
67-
DS_USER_ID = os.getenv("DS_USER_ID")
68-
MID = os.getenv("MID")
69-
IG_DID = os.getenv("IG_DID")
70-
71-
7275
def append_event_to_csv(event_data, club_ig, post_url, status="success"):
73-
csv_file = "backend/scraping/events_scraped.csv"
74-
os.makedirs(os.path.dirname(csv_file), exist_ok=True)
75-
file_exists = os.path.isfile(csv_file)
76+
csv_file = Path(__file__).resolve().parent / "events_scraped.csv"
77+
csv_file.parent.mkdir(parents=True, exist_ok=True)
78+
file_exists = csv_file.exists()
79+
7680
with open(csv_file, "a", newline="", encoding="utf-8") as csvfile:
7781
fieldnames = [
7882
"club_handle", "url", "name", "date", "start_time", "end_time",
@@ -104,7 +108,7 @@ def insert_event_to_db(event_data, club_ig, post_url, sim_threshold=80):
104108
event_location = event_data.get("location") #.title()
105109
conn = None
106110
try:
107-
conn = psycopg2.connect(os.getenv("SUPABASE_DB_URL"))
111+
conn = psycopg2.connect(SUPABASE_DB_URL)
108112
cur = conn.cursor()
109113

110114
# Check duplicates
@@ -146,6 +150,7 @@ def insert_event_to_db(event_data, club_ig, post_url, sim_threshold=80):
146150

147151
try:
148152
append_event_to_csv(event_data, club_ig, post_url, status="success")
153+
logger.info(f"Appended event to CSV: {event_data.get('name')}")
149154
except Exception as csv_err:
150155
logger.error(f"Database insert succeeded, but failed to append to CSV: {csv_err}")
151156
logger.error(f"Traceback: {traceback.format_exc()}")
@@ -159,6 +164,7 @@ def insert_event_to_db(event_data, club_ig, post_url, sim_threshold=80):
159164

160165
try:
161166
append_event_to_csv(event_data, club_ig, post_url, status="failed")
167+
logger.info(f"Appended event to CSV: {event_data.get('name')}")
162168
except Exception as csv_err:
163169
logger.error(f"Database insert failed, and failed to append to CSV: {csv_err}")
164170
logger.error(f"Traceback: {traceback.format_exc()}")
@@ -169,79 +175,72 @@ def insert_event_to_db(event_data, club_ig, post_url, sim_threshold=80):
169175
conn.close()
170176

171177

172-
def process_recent_feed(cutoff=datetime.now(timezone.utc) - timedelta(days=2), max_posts=100, max_consec_old_posts=3):
178+
def process_recent_feed(loader, cutoff=datetime.now(timezone.utc) - timedelta(days=2), max_posts=100, max_consec_old_posts=3):
173179
# Process Instagram feed posts and extract event info. Stops
174180
# scraping once posts become older than cutoff.
175-
try:
176-
logger.info(f"Starting feed processing with cutoff: {cutoff}")
177-
events_added = 0
178-
posts_processed = 0
179-
consec_old_posts = 0
180-
s3_uploader = S3ImageUploader() # Initialize S3 uploader
181-
182-
183-
for post in L.get_feed_posts():
184-
try:
185-
posts_processed += 1
186-
logger.info("\n" + "-" * 50)
187-
logger.info(f"Processing post: {post.shortcode} by {post.owner_username}")
188-
189-
post_time = post.date_utc.replace(tzinfo=timezone.utc)
190-
if post_time < cutoff:
191-
consec_old_posts += 1
192-
logger.debug(f"Post {post.shortcode} is older than cutoff ({post_time}), consecutive old posts: {consec_old_posts}")
193-
if consec_old_posts >= max_consec_old_posts:
194-
logger.info(f"Reached {max_consec_old_posts} consecutive old posts, stopping.")
195-
break
196-
continue # to next post
197-
consec_old_posts = 0
198-
199-
if posts_processed >= max_posts:
200-
logger.info(f"Reached max post limit of {max_posts}, stopping.")
181+
events_added = 0
182+
posts_processed = 0
183+
consec_old_posts = 0
184+
s3_uploader = S3ImageUploader() # Initialize S3 uploader
185+
logger.info(f"Starting feed processing with cutoff: {cutoff}")
186+
187+
for post in loader.get_feed_posts():
188+
try:
189+
posts_processed += 1
190+
logger.info("\n" + "-" * 50)
191+
logger.info(f"Processing post: {post.shortcode} by {post.owner_username}")
192+
193+
post_time = post.date_utc.replace(tzinfo=timezone.utc)
194+
if post_time < cutoff:
195+
consec_old_posts += 1
196+
if consec_old_posts >= max_consec_old_posts:
197+
logger.info(f"Reached {max_consec_old_posts} consecutive old posts, stopping.")
201198
break
199+
continue # to next post
200+
consec_old_posts = 0
202201

203-
# Safely get image URL and upload to S3
204-
raw_image_url = get_post_image_url(post)
205-
if raw_image_url:
206-
image_url = s3_uploader.upload_image(raw_image_url)
207-
print(f"Uploaded image to S3: {image_url}")
208-
else:
209-
logger.warning(f"No image URL found for post {post.shortcode}, skipping image upload")
210-
image_url = None
211-
212-
event_data = parse_caption_for_event(post.caption, image_url)
213-
214-
if event_data is None:
215-
logger.warning(f"AI client returned None for post {post.shortcode}")
216-
continue
217-
218-
post_url = f"https://www.instagram.com/p/{post.shortcode}/"
219-
if event_data.get("name") and event_data.get("date") and event_data.get("location") and event_data.get("start_time"):
220-
if insert_event_to_db(event_data, post.owner_username, post_url):
221-
events_added += 1
222-
logger.info(f"Successfully added event from {post.owner_username}")
223-
else:
224-
missing_fields = [key for key in ['name', 'date', 'location', 'start_time'] if not event_data.get(key)]
225-
logger.warning(f"Missing required fields: {missing_fields}, skipping event")
226-
time.sleep(5)
227-
except Exception as e:
228-
logger.error(f"Error processing post {post.shortcode} by {post.owner_username}: {str(e)}")
229-
logger.error(f"Traceback: {traceback.format_exc()}")
230-
continue # with next post
231-
print(f"\n--- Summary ---")
232-
print(f"Added {events_added} event(s) to Supabase")
233-
logger.info(f"Feed processing completed. Processed {posts_processed} posts, added {events_added} events")
234-
except Exception as e:
235-
logger.error(f"Error in process_recent_feed: {str(e)}")
236-
logger.error(f"Traceback: {traceback.format_exc()}")
237-
raise
202+
if posts_processed >= max_posts:
203+
logger.info(f"Reached max post limit of {max_posts}, stopping.")
204+
break
205+
206+
# Safely get image URL and upload to S3
207+
raw_image_url = get_post_image_url(post)
208+
if raw_image_url:
209+
image_url = s3_uploader.upload_image(raw_image_url)
210+
logger.info(f"Uploaded image to S3: {image_url}")
211+
else:
212+
logger.warning(f"No image URL found for post {post.shortcode}, skipping image upload")
213+
image_url = None
214+
215+
event_data = parse_caption_for_event(post.caption, image_url)
216+
217+
if event_data is None:
218+
logger.warning(f"AI client returned None for post {post.shortcode}")
219+
continue
220+
221+
post_url = f"https://www.instagram.com/p/{post.shortcode}/"
222+
if event_data.get("name") and event_data.get("date") and event_data.get("location") and event_data.get("start_time"):
223+
if insert_event_to_db(event_data, post.owner_username, post_url):
224+
events_added += 1
225+
logger.info(f"Successfully added event from {post.owner_username}")
226+
else:
227+
missing_fields = [key for key in ['name', 'date', 'location', 'start_time'] if not event_data.get(key)]
228+
logger.warning(f"Missing required fields: {missing_fields}, skipping event")
229+
time.sleep(5)
230+
except Exception as e:
231+
logger.error(f"Error processing post {post.shortcode} by {post.owner_username}: {str(e)}")
232+
logger.error(f"Traceback: {traceback.format_exc()}")
233+
continue # with next post
234+
logger.info(f"Feed processing completed. Processed {posts_processed} posts, added {events_added} events")
235+
logger.info(f"\n--- Summary ---")
236+
logger.info(f"Added {events_added} event(s) to Supabase")
238237

239238

240239
@handle_instagram_errors
241240
def session():
242241
L = Instaloader()
242+
logger.info("Attempting to load Instagram session...")
243243
try:
244-
logger.info("Attempting to load Instagram session...")
245244
L.load_session(
246245
USERNAME,
247246
{
@@ -262,4 +261,4 @@ def session():
262261

263262
if __name__ == "__main__":
264263
L = session()
265-
process_recent_feed()
264+
process_recent_feed(L)

0 commit comments

Comments
 (0)