Skip to content

Commit d03c798

Browse files
committed
add reactions field to Events model, refactor feed processing to use Django ORM
1 parent 6aa4bb1 commit d03c798

File tree

2 files changed

+71
-75
lines changed

2 files changed

+71
-75
lines changed

backend/example/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class Events(models.Model):
4242
embedding = VectorField(dimensions=1536, blank=True, null=True)
4343
added_at = models.DateTimeField(auto_now_add=True, null=True)
4444
club_type = models.CharField(max_length=50, blank=True, null=True)
45+
reactions = models.JSONField(default=dict, blank=True)
4546

4647
class Meta:
4748
db_table = "events"

backend/scraping/instagram_feed.py

Lines changed: 70 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from instaloader import Instaloader
2323

2424
from example.embedding_utils import generate_event_embedding, is_duplicate_event
25+
from example.models import Events, Clubs
2526
from services.openai_service import extract_event_from_caption
2627
from services.storage_service import delete_image, upload_image_from_url
2728

@@ -113,10 +114,13 @@ def wrapper(*args, **kwargs):
113114
def extract_s3_filename_from_url(image_url: str) -> str:
114115
if not image_url:
115116
return None
116-
filename = image_url.split('/')[-1]
117+
filename = image_url.split("/")[-1]
117118
return f"events/{filename}"
118119

119-
def append_event_to_csv(event_data, club_ig, post_url, status="success", embedding=None):
120+
121+
def append_event_to_csv(
122+
event_data, club_ig, post_url, status="success", embedding=None
123+
):
120124
csv_file = Path(__file__).resolve().parent / "events_scraped.csv"
121125
csv_file.parent.mkdir(parents=True, exist_ok=True)
122126
file_exists = csv_file.exists()
@@ -135,7 +139,7 @@ def append_event_to_csv(event_data, club_ig, post_url, status="success", embeddi
135139
"registration",
136140
"image_url",
137141
"status",
138-
"embedding"
142+
"embedding",
139143
]
140144
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
141145
if not file_exists:
@@ -154,7 +158,7 @@ def append_event_to_csv(event_data, club_ig, post_url, status="success", embeddi
154158
"registration": event_data.get("registration", False),
155159
"image_url": event_data.get("image_url", ""),
156160
"status": status,
157-
"embedding": embedding or ""
161+
"embedding": embedding or "",
158162
}
159163
)
160164

@@ -165,100 +169,81 @@ def insert_event_to_db(event_data, club_ig, post_url):
165169
event_date = event_data.get("date")
166170
event_location = event_data.get("location") # .title()
167171
try:
168-
with connection.cursor() as cur:
169-
# Get club_type from club handle
170-
cur.execute("SELECT club_type FROM clubs WHERE ig = %s", (club_ig,))
171-
club_row = cur.fetchone()
172-
club_type = club_row[0] if club_row else None
173-
if not club_type:
174-
logger.warning(
175-
f"Club with handle {club_ig} not found in clubs. Inserting event with null club_type."
176-
)
177-
178-
# Check duplicates using vector similarity
179-
logger.debug(
180-
f"Checking for duplicates using vector similarity: {event_data}"
172+
# Get club_type based on club handle
173+
try:
174+
club = Clubs.objects.get(ig=club_ig)
175+
club_type = club.club_type
176+
except Clubs.DoesNotExist:
177+
club_type = None
178+
logger.warning(
179+
f"Club with handle {club_ig} not found, inserting event with null club_type"
181180
)
182181

183-
# Check if this event is a duplicate using vector similarity
184-
if is_duplicate_event(event_data):
185-
logger.debug(
186-
f"Duplicate event found using vector similarity: {event_name} at {event_location}"
187-
)
188-
return False
189-
190-
# Generate embedding for the event
191-
embedding = generate_event_embedding(event_data)
192-
193-
insert_query = """
194-
INSERT INTO events (
195-
club_handle, url, name, date, start_time, end_time, location, price, food, registration, image_url, embedding, club_type
196-
)
197-
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::vector, %s)
198-
ON CONFLICT DO NOTHING;
199-
"""
200-
cur.execute(
201-
insert_query,
202-
(
203-
club_ig,
204-
post_url,
205-
event_name,
206-
event_date,
207-
event_data["start_time"],
208-
event_data["end_time"] or None,
209-
event_location,
210-
event_data.get("price", None),
211-
event_data.get("food") or "",
212-
bool(event_data.get("registration", False)),
213-
event_data.get("image_url"),
214-
embedding,
215-
club_type,
216-
),
217-
)
218-
logger.debug(f"Event inserted: {event_data.get('name')} from {club_ig}")
182+
# Check duplicates using vector sim
183+
logger.debug(f"Checking duplicates for event with data: {event_data}")
184+
if is_duplicate_event(event_data):
185+
logger.debug(f"Duplicate event found: {event_name} at {event_location}")
186+
return False
219187

188+
# Generate embedding
189+
embedding = generate_event_embedding(event_data)
190+
191+
# Create event using Django ORM
192+
event = Events.objects.create(
193+
club_handle=club_ig,
194+
url=post_url,
195+
name=event_name,
196+
date=event_date,
197+
start_time=event_data["start_time"],
198+
end_time=event_data["end_time"] or None,
199+
location=event_location,
200+
price=event_data.get("price", None),
201+
food=event_data.get("food") or "",
202+
registration=bool(event_data.get("registration", False)),
203+
image_url=event_data.get("image_url"),
204+
embedding=embedding,
205+
club_type=club_type,
206+
)
207+
logger.debug(f"Event inserted: {event_data.get('name')} from {club_ig}")
220208
try:
221-
append_event_to_csv(event_data, club_ig, post_url, status="success", embedding=embedding)
209+
append_event_to_csv(
210+
event_data, club_ig, post_url, status="success", embedding=embedding
211+
)
222212
logger.info(f"Appended event to CSV: {event_data.get('name')}")
223213
except Exception as csv_err:
224214
logger.error(
225215
f"Database insert succeeded, but failed to append to CSV: {csv_err}"
226216
)
227217
logger.error(f"Traceback: {traceback.format_exc()}")
228218
return False
229-
230219
return True
231220
except Exception as e:
232221
logger.error(f"Database error: {str(e)}")
233222
logger.error(f"Event data: {event_data}")
234223
logger.error(f"Traceback: {traceback.format_exc()}")
235-
236224
try:
237225
embedding = generate_event_embedding(event_data)
238-
append_event_to_csv(event_data, club_ig, post_url, status="failed", embedding=embedding)
226+
append_event_to_csv(
227+
event_data, club_ig, post_url, status="failed", embedding=embedding
228+
)
239229
logger.info(f"Appended event to CSV: {event_data.get('name')}")
240230
except Exception as csv_err:
241-
logger.error(
242-
f"Database insert failed, and failed to append to CSV: {csv_err}"
243-
)
231+
logger.error(f"Database and CSV inserts failed: {csv_err}")
244232
logger.error(f"Traceback: {traceback.format_exc()}")
245-
246233
return False
247234

248235

249236
def get_seen_shortcodes():
250237
"""Fetches all post shortcodes from events table in DB"""
251238
logger.info("Fetching seen shortcodes from the database...")
252239
try:
253-
with connection.cursor() as cur:
254-
cur.execute("SELECT url FROM events WHERE url IS NOT NULL")
255-
urls = cur.fetchall()
256-
shortcodes = {url[0].split('/')[-2] for url in urls if url[0]}
257-
return shortcodes
240+
events = Events.objects.filter(url__isnull=False).values_list("url", flat=True)
241+
shortcodes = {url.split("/")[-2] for url in events if url}
242+
return shortcodes
258243
except Exception as e:
259244
logger.error(f"Could not fetch shortcodes from database: {e}")
260245
return set()
261-
246+
262247

263248
def process_recent_feed(
264249
loader,
@@ -272,7 +257,7 @@ def process_recent_feed(
272257
posts_processed = 0
273258
consec_old_posts = 0
274259
logger.info(f"Starting feed processing with cutoff: {cutoff}")
275-
260+
276261
seen_shortcodes = get_seen_shortcodes()
277262

278263
for post in loader.get_feed_posts():
@@ -286,7 +271,7 @@ def process_recent_feed(
286271
)
287272
break
288273
continue # to next post
289-
274+
290275
consec_old_posts = 0
291276
posts_processed += 1
292277
logger.info("\n" + "-" * 50)
@@ -311,7 +296,9 @@ def process_recent_feed(
311296
if image_url:
312297
s3_filename = extract_s3_filename_from_url(image_url)
313298
if s3_filename and delete_image(s3_filename):
314-
logger.info(f"Deleted S3 file for failed event extraction: {s3_filename}")
299+
logger.info(
300+
f"Deleted S3 file for failed event extraction: {s3_filename}"
301+
)
315302
continue
316303

317304
post_url = f"https://www.instagram.com/p/{post.shortcode}/"
@@ -329,7 +316,9 @@ def process_recent_feed(
329316
if image_url:
330317
s3_filename = extract_s3_filename_from_url(image_url)
331318
if s3_filename and delete_image(s3_filename):
332-
logger.info(f"Deleted S3 file for failed DB insert: {s3_filename}")
319+
logger.info(
320+
f"Deleted S3 file for failed DB insert: {s3_filename}"
321+
)
333322
else:
334323
missing_fields = [
335324
key
@@ -344,13 +333,19 @@ def process_recent_feed(
344333
if image_url:
345334
s3_filename = extract_s3_filename_from_url(image_url)
346335
if s3_filename and delete_image(s3_filename):
347-
logger.info(f"Deleted S3 file for event with missing fields: {s3_filename}")
336+
logger.info(
337+
f"Deleted S3 file for event with missing fields: {s3_filename}"
338+
)
348339
append_event_to_csv(
349-
event_data, post.owner_username, post_url, status="missing_fields", embedding=embedding
340+
event_data,
341+
post.owner_username,
342+
post_url,
343+
status="missing_fields",
344+
embedding=embedding,
350345
)
351-
346+
352347
time.sleep(random.uniform(15, 45))
353-
348+
354349
if posts_processed >= max_posts:
355350
logger.info(f"Reached max post limit of {max_posts}, stopping")
356351
break

0 commit comments

Comments
 (0)