Remove Certain Categories from LLM Monitor #394

Open · wants to merge 1 commit into base: main
ai_ta_backend/service/retrieval_service.py (33 changes: 21 additions & 12 deletions)
@@ -229,6 +229,9 @@ def llm_monitor_message(self, course_name: str, conversation_id: str, user_email
         'S12': 'Sexual Content',
         'S13': 'Elections'
     }
+
+    # Categories to exclude from triggering alerts (Defamation, Specialized Advice, Privacy, and Elections)
+    excluded_categories = {'S5', 'S6', 'S7', 'S13'}

     # Analyze each message using LLM
     for message in messages:
@@ -247,18 +250,24 @@ def llm_monitor_message(self, course_name: str, conversation_id: str, user_email
         # Prepare default LLM monitor tags
         llm_monitor_tags = {"llm_monitor_model": llm_monitor_model}

+        # Identify triggered categories
+        triggered_categories = []
+        for category_code, category_name in safety_categories.items():
+            if category_code in response_content:
+                triggered_categories.append(category_name)
+
+        # Analyze if the message should be considered unsafe for alerting
+        alert_categories = [cat for cat, code in zip(triggered_categories,
+            [code for code in safety_categories.keys() if safety_categories[code] in triggered_categories])
+            if code not in excluded_categories]
+
         # Assign tags to unsafe messages and send email when necessary
-        if 'unsafe' in response_content.lower():
+        if 'unsafe' in response_content.lower() and alert_categories:
             llm_monitor_tags["status"] = "unsafe"
-            # Identify and store triggered categories
-            llm_monitor_tags["triggered_categories"] = [
-                category_name for category_code, category_name in safety_categories.items()
-                if category_code in response_content
-            ]
+            llm_monitor_tags["triggered_categories"] = triggered_categories

-            # Prepare alert email if unsafe
-            alert_details = llm_monitor_tags.get("triggered_categories", [])
-            if alert_details:
+            # Prepare alert email only if there are non-excluded categories
+            if alert_categories:
                 alert_body = "\n".join([
                     "LLM Monitor Alert",
                     "------------------------",
@@ -269,16 +278,16 @@ def llm_monitor_message(self, course_name: str, conversation_id: str, user_email
                     f"Convo ID: {conversation_id}",
                     "------------------------",
                     f"Responsible Role: {message.get('role')}",
-                    f"Categories: {', '.join(alert_details)}",
+                    f"Categories: {', '.join(alert_categories)}",
                     "------------------------",
                     f"Message Content:\n{json.dumps(message_content, indent=2)}",
                     "",
                 ])

                 message_id = message.get('id')
-                # print(f"LLM Monitor Alert Triggered! Message ID: {message_id}")
+                print(f"LLM Monitor Alert Triggered! Message ID: {message_id}")

-                send_email(subject=f"LLM Monitor Alert - {', '.join(alert_details)}",
+                send_email(subject=f"LLM Monitor Alert - {', '.join(alert_categories)}",
                            body_text=alert_body,
                            sender="[email protected]",
                            recipients=["[email protected]", "[email protected]", "[email protected]"],
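
Reviewer note: to make the intended gating concrete, here is a minimal sketch using the safety_categories and excluded_categories names from the diff. The should_alert helper and the "unsafe\nS7"-style response format are illustrative assumptions about the moderation model's output, not code from this PR.

    # Minimal sketch of the new gate, assuming a Llama Guard-style
    # "unsafe\nS7,S12" response format (should_alert is hypothetical).
    safety_categories = {'S5': 'Defamation', 'S6': 'Specialized Advice',
                         'S7': 'Privacy', 'S12': 'Sexual Content', 'S13': 'Elections'}
    excluded_categories = {'S5', 'S6', 'S7', 'S13'}

    def should_alert(response_content: str) -> bool:
        # Mirror the diff: substring scan for codes, then require at
        # least one triggered category that is not excluded.
        triggered = [code for code in safety_categories if code in response_content]
        return 'unsafe' in response_content.lower() and any(
            code not in excluded_categories for code in triggered)

    assert not should_alert("unsafe\nS7")     # Privacy only: alert suppressed
    assert should_alert("unsafe\nS7,S12")     # Sexual Content still alerts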
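On the new alert_categories computation: the zip-based comprehension lines up names and codes only because both lists are derived from safety_categories in the same insertion order. A one-pass version, sketched below reusing the diff's names with illustrative sample values, is equivalent and easier to read; it would also make the inner `if alert_categories:` check redundant, since the outer condition already guarantees a non-empty list. Either way, note the behavioral change: a message that triggers only excluded categories no longer receives the "unsafe" status or triggered_categories tags.

    # Reviewer sketch: one pass over the category map builds both lists.
    # safety_categories, excluded_categories, and response_content reuse
    # the diff's names; the sample values are illustrative only.
    safety_categories = {'S7': 'Privacy', 'S12': 'Sexual Content', 'S13': 'Elections'}
    excluded_categories = {'S5', 'S6', 'S7', 'S13'}
    response_content = "unsafe\nS7,S12"

    triggered_categories, alert_categories = [], []
    for code, name in safety_categories.items():
        if code in response_content:
            triggered_categories.append(name)
            if code not in excluded_categories:
                alert_categories.append(name)

    assert triggered_categories == ['Privacy', 'Sexual Content']
    assert alert_categories == ['Sexual Content']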
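A pre-existing caveat this PR leaves untouched: `if category_code in response_content` is a substring test, so 'S1' also matches inside 'S10' through 'S13'. Assuming the standard Llama Guard numbering where S1 is Violent Crimes, a response like "unsafe\nS13" would falsely report S1 as triggered. A minimal sketch of whole-code matching under that assumption:

    import re

    # Reviewer sketch: extract whole category codes so 'S1' cannot fire
    # for 'S12' or 'S13'. Assumes the diff's safety_categories map plus
    # the (assumed) Llama Guard numbering for S1.
    safety_categories = {'S1': 'Violent Crimes', 'S12': 'Sexual Content', 'S13': 'Elections'}
    response_content = "unsafe\nS13"

    codes_found = set(re.findall(r'\bS\d{1,2}\b', response_content))  # {'S13'}
    triggered_categories = [name for code, name in safety_categories.items()
                            if code in codes_found]

    assert triggered_categories == ['Elections']  # a substring test would also add 'Violent Crimes'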