Skip to content

Commit a611722

Browse files
committed
add event categories (update events model to use JSONField, add to OpenAI prompt for category assignment)
1 parent c425745 commit a611722

File tree

7 files changed

+139
-8
lines changed

7 files changed

+139
-8
lines changed

.github/workflows/update-events-data.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,9 @@ jobs:
120120
id: generate_static
121121
working-directory: backend/scraping
122122
run: python generate_static_data.py
123-
if: github.event_name == 'schedule'
124123

125124
- name: Commit and push changes
126-
if: github.event_name == 'schedule' && steps.generate_static.outcome == 'success'
125+
if: steps.generate_static.outcome == 'success'
127126
run: |
128127
git config --global user.name 'github-actions[bot]'
129128
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 4.2.7 on 2025-11-03 20:29
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Alter ``Events.categories`` to a JSONField that defaults to an empty list."""

    # NOTE(review): the field being altered was previously a nullable
    # CharField holding comma-separated text; confirm that existing rows
    # convert cleanly on the target database before applying this.
    dependencies = [
        (
            'events',
            '0019_remove_events_events_school_status_dtstart_utc_dtend_utc_added_at_idx_and_more',
        ),
    ]

    operations = [
        migrations.AlterField(
            model_name='events',
            name='categories',
            field=models.JSONField(
                blank=True,
                default=list,
                help_text="List of event categories (e.g. ['Academic', 'Cultural'])",
            ),
        ),
    ]

backend/apps/events/models.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,10 @@ class Events(models.Model):
3838
duration = models.DurationField(blank=True, null=True, help_text="'8:00:00'")
3939

4040
# Event categorization
41-
categories = models.CharField(
42-
max_length=255,
43-
null=True,
41+
categories = models.JSONField(
42+
default=list,
4443
blank=True,
45-
help_text="'Career, Networking, Professional Development'",
44+
help_text="List of event categories (e.g. ['Academic', 'Cultural'])",
4645
)
4746

4847
# Timezone information

backend/scraping/instagram_feed.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ def append_event_to_csv(
135135
school = event_data.get("school", "")
136136
source_image_url = event_data.get("source_image_url", "")
137137
title = event_data.get("title", "")
138+
categories = event_data.get("categories", [])
138139

139140
fieldnames = [
140141
"ig_handle",
@@ -158,6 +159,7 @@ def append_event_to_csv(
158159
"source_image_url",
159160
"all_day",
160161
"club_type",
162+
"categories",
161163
"raw_json",
162164
"added_to_db",
163165
"status",
@@ -191,6 +193,7 @@ def append_event_to_csv(
191193
"source_image_url": source_image_url,
192194
"all_day": all_day,
193195
"club_type": club_type or event_data.get("club_type") or "",
196+
"categories": json.dumps(categories, ensure_ascii=False),
194197
"raw_json": json.dumps(event_data, ensure_ascii=False),
195198
"added_to_db": added_to_db,
196199
"status": "CONFIRMED",
@@ -223,6 +226,11 @@ def insert_event_to_db(event_data, ig_handle, source_url):
223226
latitude = event_data.get("latitude", None)
224227
longitude = event_data.get("longitude", None)
225228
school = event_data.get("school", "")
229+
categories = event_data.get("categories", [])
230+
231+
if not categories or not isinstance(categories, list):
232+
logger.warning(f"Event '{title}' missing categories, assigning 'Uncategorized'")
233+
categories = ["Uncategorized"]
226234

227235
if is_duplicate_event(event_data):
228236
try:
@@ -232,6 +240,7 @@ def insert_event_to_db(event_data, ig_handle, source_url):
232240
source_url,
233241
added_to_db="duplicate",
234242
embedding=event_data.get("embedding"),
243+
club_type=None,
235244
)
236245
except Exception as csv_e:
237246
logger.error(f"Error writing duplicate event to CSV: {csv_e}")
@@ -301,6 +310,7 @@ def insert_event_to_db(event_data, ig_handle, source_url):
301310
"longitude": longitude,
302311
"school": school,
303312
"rrule": event_data.get("rrule", ""),
313+
"categories": categories,
304314
}
305315

306316
try:
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import os
2+
import sys
3+
import django
4+
import openai
5+
import concurrent.futures
6+
import threading
7+
8+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
9+
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.development")
10+
django.setup()
11+
12+
from apps.events.models import Events
13+
from shared.constants.event_categories import EVENT_CATEGORIES
14+
15+
# Label assigned when no category from EVENT_CATEGORIES applies.
DEFAULT_CATEGORY = "Uncategorized"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Allowed categories rendered as a "- name" bullet list, one per line,
# ready to be embedded in the classification prompt.
CATEGORY_LIST = "\n".join("- " + cat for cat in EVENT_CATEGORIES)

# Shared lock held while worker threads compare and save event categories.
lock = threading.Lock()
20+
21+
22+
def _strip_code_fence(content):
    """Return ``content`` with any surrounding Markdown code fence removed.

    Handles fences with or without a language tag (for example a leading
    "json" after the opening backticks) and fences that are never closed.
    Text that does not start with a fence is returned stripped but
    otherwise unchanged.
    """
    text = content.strip()
    if not text.startswith("```"):
        return text
    inner = text[3:]
    closing = inner.rfind("```")
    if closing != -1:
        inner = inner[:closing]
    first_line, newline, remainder = inner.partition("\n")
    if newline and first_line.strip().isalnum():
        # The opening fence line carried only a language tag such as "json".
        inner = remainder
    elif inner.lstrip().lower().startswith("json"):
        # Tag and payload share one line, e.g. "```json [...]```".
        inner = inner.lstrip()[4:]
    return inner.strip()


def get_categories_from_openai(title, description, event_id=None):
    """Ask OpenAI to pick categories for an event and return the valid ones.

    Args:
        title: Event title inserted into the prompt.
        description: Event description inserted into the prompt.
        event_id: Optional identifier used only to prefix progress output.

    Returns:
        A list of category names that appear in ``EVENT_CATEGORIES``, in the
        order the model returned them and without duplicates. Falls back to
        ``[DEFAULT_CATEGORY]`` when the request fails, the reply is not a
        JSON array of strings, or none of the returned names are allowed.
    """
    import json

    # Show the fallback as real JSON (double quotes) so the model is not
    # nudged towards a reply that json.loads cannot parse.
    fallback_example = json.dumps([DEFAULT_CATEGORY])
    prompt = (
        "Given the following event title and description, select all applicable categories from this list (output as a JSON array of strings, case-sensitive, must use only these):\n"
        f"{CATEGORY_LIST}\n\n"
        f"Title: {title}\n"
        f"Description: {description}\n\n"
        f"Return a JSON array of one or more categories. If none fit, return {fallback_example}."
    )
    try:
        print(f"[{event_id}] Requesting categories from OpenAI...")
        client = openai.OpenAI(api_key=OPENAI_API_KEY)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an expert event classifier."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=64,
            temperature=0,
        )
        # The message content can be None; treat that as an empty reply.
        content = (response.choices[0].message.content or "").strip()
        print(f"[{event_id}] OpenAI response: {content}")
        cats = json.loads(_strip_code_fence(content))
        if isinstance(cats, list) and all(isinstance(c, str) for c in cats):
            # dict.fromkeys drops repeated labels while keeping their order.
            valid_cats = list(dict.fromkeys(c for c in cats if c in EVENT_CATEGORIES))
            if valid_cats:
                print(f"[{event_id}] Valid categories: {valid_cats}")
                return valid_cats
        print(f"[{event_id}] No valid categories found in response, using default.")
    except Exception as e:
        # Best-effort boundary: any API or parsing failure degrades to the
        # default category instead of aborting the whole backfill.
        print(f"[{event_id}] OpenAI error: {e}")
    return [DEFAULT_CATEGORY]
58+
59+
60+
def process_event(event):
    """Classify one event via OpenAI and save its categories if they changed.

    Args:
        event: Object exposing ``id``, ``title``, ``description``,
            ``categories`` and ``save(update_fields=...)``.

    Returns:
        1 when the categories were updated and saved, 0 when they already
        matched the OpenAI result.
    """
    event_id = event.id
    title = event.title or ""
    description = event.description or ""
    preview_suffix = "..." if len(description) > 100 else ""

    print(f"\nProcessing event (ID: {event_id})")
    print(f"[{event_id}] Title: {title}")
    print(f"[{event_id}] Description: {description[:100]}{preview_suffix}")

    cats = get_categories_from_openai(title, description, event_id=event_id)

    # The comparison and the save happen together under the shared lock.
    with lock:
        if event.categories == cats:
            print(f"[{event_id}] Categories already up to date.")
            return 0

        print(f"[{event_id}] Updating categories from {event.categories} to {cats}")
        event.categories = cats
        event.save(update_fields=["categories"])
        print(f"[{event_id}] Updated successfully.")
        return 1
77+
78+
79+
def main():
    """Re-categorize every event whose categories equal ``[DEFAULT_CATEGORY]``.

    Loads the matching events, runs ``process_event`` on each through a
    four-worker thread pool, and prints how many were updated.
    """
    print("Starting event category backfill (only uncategorized events)...")
    pending = list(Events.objects.filter(categories=[DEFAULT_CATEGORY]))
    total = len(pending)
    print(f"Found {total} uncategorized events to process.")

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        # process_event returns 1 per updated event, so the sum is the count.
        updated = sum(pool.map(process_event, pending))

    print(f"\nDone. Updated {updated} out of {total} uncategorized events with OpenAI-categorized categories.")


if __name__ == "__main__":
    main()

backend/services/openai_service.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from scraping.logging_config import logger
2121
from shared.constants.emojis import EMOJI_CATEGORIES
22+
from shared.constants.event_categories import EVENT_CATEGORIES
2223
from utils.events_utils import clean_datetime
2324
from utils.date_utils import get_current_semester_end_time
2425
from datetime import timezone as pytimezone
@@ -120,9 +121,10 @@ def extract_events_from_caption(
120121
context_date = context_datetime.strftime("%Y-%m-%d")
121122
context_day = context_datetime.strftime("%A")
122123
context_time = context_datetime.strftime("%H:%M")
123-
124+
124125
# Get current semester end time for inferring RRULE UNTIL dates
125126
semester_end_time = get_current_semester_end_time(school)
127+
categories_str = "\n".join(f"- {cat}" for cat in EVENT_CATEGORIES)
126128

127129
prompt = f"""
128130
Analyze the following Instagram caption and image and extract event information if it's an event post.
@@ -165,7 +167,8 @@ def extract_events_from_caption(
165167
"rdate": string, // comma-separated datetime strings in format "YYYYMMDDTHHMMSS,YYYYMMDDTHHMMSS,..." (e.g., "20251113T170000,20251204T170000,20251218T170000")
166168
"school": string,
167169
"source_image_url": string,
168-
"description": string
170+
"description": string,
171+
"categories": list // one or more of the following, as a JSON array of strings: {categories_str}
169172
}}
170173
171174
CONSOLIDATION RULES (VERY IMPORTANT):
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Allowed event category labels. Matching elsewhere is case-sensitive, so
# these strings must be kept exactly as written.
EVENT_CATEGORIES = [
    "Academic",
    "Career & Networking",
    "Social & Games",
    "Athletics",
    "Creative Arts",
    "Cultural",
    "Religious",
    "Advocacy & Causes",
    "Sales & Fundraising",
]

0 commit comments

Comments
 (0)