Skip to content

Commit 5163691

Browse files
committed
fix datetime handling (prefer Django's timezone utilities over the built-in datetime timezone); small improvements to logging in feed processing
1 parent 878cbf3 commit 5163691

File tree

4 files changed

+92
-59
lines changed

4 files changed

+92
-59
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ node_modules/
2626
# IDE/editor settings
2727
.vscode/
2828
chrome_profile/
29+
.flake8
2930

3031
# SQLite database (for local dev only)
3132
db.sqlite3

backend/scraping/instagram_feed.py

Lines changed: 87 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,15 @@
1414
import re
1515
import time
1616
import traceback
17-
from datetime import datetime, timedelta, timezone
17+
from datetime import datetime, timedelta
1818
from pathlib import Path
1919

2020
import requests
2121
from dotenv import load_dotenv
2222
from instaloader import Instaloader
2323
from logging_config import logger
2424
from zyte_setup import setup_zyte
25+
from django.utils import timezone
2526

2627
from apps.clubs.models import Clubs
2728
from apps.events.models import Events
@@ -124,58 +125,85 @@ def append_event_to_csv(
124125

125126
dtstart = event_data.get("dtstart", "")
126127
dtend = event_data.get("dtend", "")
127-
dtstart_utc, dtend_utc, duration, all_day = tz_compute(dtstart, dtend)
128+
dtstart_utc = event_data.get("dtstart_utc", "")
129+
dtend_utc = event_data.get("dtend_utc", "")
130+
duration = event_data.get("duration", "")
131+
all_day = event_data.get("all_day", False)
132+
location = event_data.get("location", "")
133+
food = event_data.get("food", "")
134+
price = event_data.get("price", "")
135+
registration = bool(event_data.get("registration", False))
136+
description = event_data.get("description", "")
137+
rrule = event_data.get("rrule", "")
138+
latitude = event_data.get("latitude", None)
139+
longitude = event_data.get("longitude", None)
140+
tz = event_data.get("tz", "")
141+
school = event_data.get("school", "")
142+
source_image_url = event_data.get("source_image_url", "")
143+
title = event_data.get("title", "")
144+
145+
fieldnames = [
146+
"ig_handle",
147+
"title",
148+
"source_url",
149+
"dtstart",
150+
"dtstart_utc",
151+
"dtend",
152+
"dtend_utc",
153+
"duration",
154+
"location",
155+
"food",
156+
"price",
157+
"registration",
158+
"description",
159+
"rrule",
160+
"latitude",
161+
"longitude",
162+
"tz",
163+
"school",
164+
"source_image_url",
165+
"all_day",
166+
"club_type",
167+
"raw_json",
168+
"added_to_db",
169+
"status",
170+
"embedding",
171+
]
128172

129173
with open(csv_file, "a", newline="", encoding="utf-8") as csvfile:
130-
fieldnames = [
131-
"ig_handle",
132-
"title",
133-
"source_url",
134-
"dtstart",
135-
"dtstart_utc",
136-
"dtend",
137-
"dtend_utc",
138-
"duration",
139-
"location",
140-
"food",
141-
"price",
142-
"registration",
143-
"description",
144-
"reactions",
145-
"embedding",
146-
"source_image_url",
147-
"all_day",
148-
"club_type",
149-
"raw_json",
150-
"added_to_db",
151-
]
152-
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
174+
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, lineterminator="\n")
153175
if not file_exists:
154176
writer.writeheader()
155177
writer.writerow(
156178
{
157179
"ig_handle": ig_handle,
158-
"title": event_data.get("title"),
180+
"title": title,
159181
"source_url": source_url,
160182
"dtstart": dtstart,
161183
"dtstart_utc": dtstart_utc,
162184
"dtend": dtend,
163185
"dtend_utc": dtend_utc,
164186
"duration": duration,
165-
"location": event_data.get("location"),
166-
"food": event_data.get("food", ""),
167-
"price": event_data.get("price", ""),
168-
"registration": bool(event_data.get("registration", False)),
169-
"description": event_data.get("description", ""),
170-
"reactions": json.dumps(event_data.get("reactions") or {}),
171-
"embedding": embedding or "",
172-
"source_image_url": event_data.get("source_image_url") or "",
187+
"location": location,
188+
"food": food,
189+
"price": price,
190+
"registration": registration,
191+
"description": description,
192+
"rrule": rrule,
193+
"latitude": latitude,
194+
"longitude": longitude,
195+
"tz": tz,
196+
"school": school,
197+
"source_image_url": source_image_url,
173198
"all_day": all_day,
174199
"club_type": club_type or event_data.get("club_type") or "",
175200
"raw_json": json.dumps(event_data, ensure_ascii=False),
176201
"added_to_db": added_to_db,
202+
"status": "CONFIRMED",
203+
"embedding": embedding or "",
177204
}
178205
)
206+
logger.info(f"Event written to CSV with status: {added_to_db}")
179207

180208

181209
def insert_event_to_db(event_data, ig_handle, source_url):
@@ -271,17 +299,21 @@ def insert_event_to_db(event_data, ig_handle, source_url):
271299
embedding=embedding,
272300
club_type=club_type,
273301
)
302+
logger.info("Event added successfully")
274303
return True
275304
except Exception as e:
276-
logger.error(f"Error inserting event to db: {e}")
277-
append_event_to_csv(
278-
event_data,
279-
ig_handle,
280-
source_url,
281-
added_to_db="failed",
282-
embedding=embedding,
283-
club_type=club_type,
284-
)
305+
logger.error(f"Error inserting event to DB: {e}")
306+
try:
307+
append_event_to_csv(
308+
event_data,
309+
ig_handle,
310+
source_url,
311+
added_to_db="failed",
312+
embedding=embedding,
313+
club_type=club_type,
314+
)
315+
except Exception as csv_e:
316+
logger.error(f"Error writing event to CSV after DB failure: {csv_e}")
285317
return False
286318

287319

@@ -327,7 +359,7 @@ def process_recent_feed(
327359
if post.shortcode in seen_shortcodes or post_time < cutoff:
328360
consec_old_posts += 1
329361
logger.debug(
330-
f"Skipping post {post.shortcode}; consec_old_posts={consec_old_posts}"
362+
f"[{post.shortcode}] [{post.owner_username}] Skipping post; consec_old_posts={consec_old_posts}"
331363
)
332364
if consec_old_posts >= max_consec_old_posts:
333365
termination_reason = (
@@ -342,19 +374,17 @@ def process_recent_feed(
342374
consec_old_posts = 0
343375
posts_processed += 1
344376
logger.info("-" * 100)
345-
logger.info(
346-
f"Processing post: {post.shortcode} by {post.owner_username}"
347-
)
377+
logger.info(f"[{post.shortcode}] [{post.owner_username}] Processing post")
348378

349379
# Safely get image URL and upload to S3
350380
raw_image_url = get_post_image_url(post)
351381
if raw_image_url:
352382
time.sleep(random.uniform(1, 3))
353383
source_image_url = upload_image_from_url(raw_image_url)
354-
logger.info(f"Uploaded image to S3: {source_image_url}")
384+
logger.info(f"[{post.shortcode}] [{post.owner_username}] Uploaded image to S3: {source_image_url}")
355385
else:
356386
logger.warning(
357-
f"No image URL found for post {post.shortcode}, skipping image upload"
387+
f"[{post.shortcode}] [{post.owner_username}] No image URL found for post, skipping image upload"
358388
)
359389
source_image_url = None
360390

@@ -363,14 +393,15 @@ def process_recent_feed(
363393
)
364394
if not events_data or len(events_data) == 0:
365395
logger.warning(
366-
f"AI client returned no events for post {post.shortcode}"
396+
f"[{post.shortcode}] [{post.owner_username}] AI client returned no events for post"
367397
)
368398
if posts_processed >= max_posts:
369399
termination_reason = f"reached_max_posts={max_posts}"
370400
logger.info(f"Reached max post limit of {max_posts}, stopping")
371401
break
372402
continue
373403

404+
logger.debug(f"[{post.shortcode}] [{post.owner_username}] Event data: {json.dumps(events_data, ensure_ascii=False, separators=(',', ':'))}")
374405
source_url = f"https://www.instagram.com/p/{post.shortcode}/"
375406
today = datetime.now(timezone.utc).date()
376407

@@ -388,7 +419,7 @@ def process_recent_feed(
388419
if not event_data.get(key)
389420
]
390421
logger.warning(
391-
f"Missing required fields for event '{event_data.get('title', 'Unknown')}': {missing_fields}, skipping event"
422+
f"[{post.shortcode}] [{post.owner_username}] Missing required fields for event '{event_data.get('title', 'Unknown')}': {missing_fields}, skipping event"
392423
)
393424
embedding = generate_event_embedding(event_data)
394425
append_event_to_csv(
@@ -403,18 +434,18 @@ def process_recent_feed(
403434
date = datetime.fromisoformat(event_data.get("dtstart")).date()
404435
if date < today:
405436
logger.info(
406-
f"Skipping event '{event_data.get('title')}' with past date {date}"
437+
f"[{post.shortcode}] [{post.owner_username}] Skipping event '{event_data.get('title')}' with past date {date}"
407438
)
408439
continue
409440

410441
if insert_event_to_db(event_data, post.owner_username, source_url):
411442
events_added += 1
412443
logger.info(
413-
f"Successfully added event '{event_data.get('title')}' from {post.owner_username}"
444+
f"[{post.shortcode}] [{post.owner_username}] Successfully added event '{event_data.get('title')}'"
414445
)
415446
else:
416447
logger.error(
417-
f"Failed to add event '{event_data.get('title')}' from {post.owner_username}"
448+
f"[{post.shortcode}] [{post.owner_username}] Failed to add event '{event_data.get('title')}'"
418449
)
419450

420451
if posts_processed >= max_posts:
@@ -426,9 +457,9 @@ def process_recent_feed(
426457

427458
except Exception as e:
428459
logger.error(
429-
f"Error processing post {post.shortcode} by {post.owner_username}: {e!s}"
460+
f"[{post.shortcode}] [{post.owner_username}] Error processing post: {e!s}"
430461
)
431-
logger.error(f"Traceback: {traceback.format_exc()}")
462+
logger.error(f"[{post.shortcode}] [{post.owner_username}] Traceback: {traceback.format_exc()}")
432463
time.sleep(random.uniform(3, 8))
433464
continue
434465

backend/scraping/logging_config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
logging.getLogger("requests").setLevel(logging.WARNING)
1313
logging.getLogger("botocore").setLevel(logging.WARNING)
1414
logging.getLogger("httpcore").setLevel(logging.WARNING)
15+
logging.getLogger("openai").setLevel(logging.WARNING)
1516

16-
fmt = "%(asctime)s - pid=%(process)d - thread=%(threadName)s - %(name)s - %(levelname)s - %(message)s"
17+
fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
1718
handlers = [
1819
logging.StreamHandler(sys.stdout),
1920
logging.FileHandler(LOG_FILE, encoding="utf-8"),

backend/services/openai_service.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def extract_events_from_caption(
136136
- Title-case event titles (e.g., "...talk" -> "...Talk", "COFFEE CRAWL" -> "Coffee Crawl")
137137
- If multiple dates are mentioned (e.g., "Friday and Saturday"), create separate events for each date
138138
- If recurring events are mentioned (e.g., "every Friday"), just create one event
139-
- For dtstart and dtend, if year not found, assume {now.year}
139+
- For dtstart and dtend, if year not found, assume {now.year}. If an event's end time is earlier than its start time (e.g., "from 7 pm - 12 am"), interpret end time as being on the NEXT calendar day
140140
- For dtstart_utc and dtend_utc: convert the local time (dtstart/dtend) to UTC using the timezone (tz). Format as YYYY-MM-DD HH:MM:SSZ. If dtstart/dtend are empty, then dtstart_utc/dtend_utc should also be empty
141141
- For duration: calculate the duration between dtstart and dtend in HH:MM:SS format (e.g., "02:30:00" for 2 hours 30 minutes). If dtend is empty or not available, use empty string ""
142142
- When interpreting relative terms like "tonight", "tomorrow", "weekly", "every Friday", use the current date context above and the date the post was made. If an explicit date is found in the image, use that date
@@ -365,7 +365,7 @@ def generate_recommended_filters(self, events_data: list[dict]) -> list[list[str
365365
Generate filter keywords that:
366366
1. Capture the most common themes in the events titles data above that are actually found as a string within the events data above. if you do something that's 2 words, it better be fully included in the event data above.
367367
2. Are SHORT (1-3 words max) and SPECIFIC
368-
3. Reflect actual themes in the event data above
368+
3. Reflect actual themes in the event data above
369369
4. The list MUST start with ["Food%20and%20Drink", "Pizza", "free food"]
370370
5. The Filter string MUST exist in atleast 3 event titles above.
371371

0 commit comments

Comments (0)