Skip to content

Commit 3ab87af

Browse files
committed
added more robust personal email rejection handling
1 parent 8456159 commit 3ab87af

11 files changed

Lines changed: 6234 additions & 4156 deletions

src/agent/assets/all_email_provider_domains.txt

Lines changed: 6104 additions & 0 deletions
Large diffs are not rendered by default.

src/agent/company_verification_layer.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import re
2+
from functools import lru_cache
3+
from pathlib import Path
24
from typing import Dict, Optional
35

46
from openai import AsyncOpenAI
@@ -27,14 +29,54 @@
2729
"protonmail.com",
2830
}
2931

32+
EMAIL_PROVIDER_BLACKLIST_PATH = Path(__file__).parent / "assets" / "all_email_provider_domains.txt"
33+
3034

3135
EMAIL_REGEX = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.IGNORECASE)
3236

3337

38+
def _normalize_domain(domain: Optional[str]) -> str:
39+
raw = (domain or "").strip().lower()
40+
if not raw:
41+
return ""
42+
43+
if "@" in raw:
44+
raw = raw.split("@")[-1]
45+
if raw.startswith("www."):
46+
raw = raw[4:]
47+
return raw.strip(". ")
48+
49+
50+
@lru_cache(maxsize=1)
def load_personal_email_blacklist() -> set[str]:
    """Build the set of blacklisted personal/provider email domains.

    The result starts from ``COMMON_PERSONAL_DOMAINS`` (lowercased) and is
    extended with every valid domain listed, one per line, in
    ``EMAIL_PROVIDER_BLACKLIST_PATH``. Blank lines and lines starting with
    ``#`` or ``/*`` are treated as comments and skipped.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    file is read at most once per process and every caller receives the same
    set object; callers should treat it as read-only. If the file cannot be
    read, a warning is logged and the built-in domains alone are returned
    (and cached).

    Returns:
        Set of lowercase domain names considered personal/provider domains.
    """
    domains = {d.lower() for d in COMMON_PERSONAL_DOMAINS}
    # Compiled once here instead of being looked up for every line.
    domain_pattern = re.compile(r"[a-z0-9.-]+\.[a-z]{2,}")

    try:
        text = EMAIL_PROVIDER_BLACKLIST_PATH.read_text(encoding="utf-8", errors="ignore")
    except OSError as exc:
        # Best effort: fall back to the built-in list rather than failing.
        logger.warning("Failed to load email provider blacklist from %s: %s", EMAIL_PROVIDER_BLACKLIST_PATH, exc)
        return domains

    for line in text.splitlines():
        entry = line.strip()
        # Detect comments on the raw line. Normalization drops everything
        # before an "@", so a comment such as "# contact: admin@example.com"
        # would otherwise lose its "#" and add example.com to the blacklist.
        if not entry or entry.startswith(("#", "/*")):
            continue

        candidate = _normalize_domain(entry)
        if candidate and domain_pattern.fullmatch(candidate):
            domains.add(candidate)

    return domains
67+
68+
69+
def is_blacklisted_email_domain(domain: Optional[str]) -> bool:
    """Report whether a domain (or email address) is on the blacklist.

    The input is passed through ``_normalize_domain`` first. An input that
    normalizes to an empty string is never blacklisted, and in that case the
    blacklist is not loaded at all.

    Args:
        domain: Domain or email string to check; may be ``None``.

    Returns:
        ``True`` when the normalized domain is in the set returned by
        ``load_personal_email_blacklist``, otherwise ``False``.
    """
    cleaned = _normalize_domain(domain)
    # Short-circuit keeps the blacklist untouched for empty input.
    return bool(cleaned) and cleaned in load_personal_email_blacklist()
74+
75+
3476
def _extract_email_domain(email: Optional[str]) -> str:
    """Return the normalized domain of an email address.

    Args:
        email: Email address to inspect; may be ``None``.

    Returns:
        ``""`` when ``email`` is empty or contains no ``@``; otherwise the
        result of ``_normalize_domain(email)``.
    """
    if not email or "@" not in email:
        return ""
    # Single normalization path shared with the blacklist check, replacing
    # the earlier inline split/strip/lower.
    return _normalize_domain(email)
3880

3981

4082
def _extract_form_email(email_subject: str, email_body: str) -> str:
@@ -121,14 +163,14 @@ async def run_company_verification(
121163
"status": "company verification skipped: missing form email domain",
122164
}
123165

124-
if form_domain in COMMON_PERSONAL_DOMAINS:
166+
if is_blacklisted_email_domain(form_domain):
125167
fallback = CompanyVerificationResult(
126168
is_corporate_email=False,
127169
is_legit_company=False,
128170
company_type="unknown",
129171
company_name=company_name or None,
130172
sender_domain=form_domain,
131-
reason="form email domain is a known personal email provider",
173+
reason="form email domain is in personal/provider blacklist",
132174
)
133175
return {
134176
"company_verification": fallback.model_dump(),

src/agent/email_classification_graph.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from openai import AsyncOpenAI
1111

1212
from src.agent.config import Config
13+
from src.agent.company_verification_layer import is_blacklisted_email_domain
1314
from src.agent.graph_schemas import (
1415
EmailClassificationOutput,
1516
EmailClassificationRequest,
@@ -341,6 +342,31 @@ async def classify_email(state: EmailClassificationState):
341342
}
342343

343344

345+
async def enforce_personal_email_blacklist(state: EmailClassificationState):
    """Disqualify a classified email whose contact domain is blacklisted.

    Reads the ``"email"`` entry of ``state.classification``, takes the part
    after the last ``@`` and checks it with ``is_blacklisted_email_domain``.

    Returns:
        A dict that always has a ``"status"`` message. Only when the domain
        is blacklisted does it also carry ``"classification"``: a copy of the
        incoming classification with ``action`` set to ``"disqualify"``,
        ``salesperson`` set to ``"none"`` and a ``blacklist_reason`` added.
    """
    snapshot = dict(state.classification or {})
    address = str(snapshot.get("email") or "").strip().lower()

    # rpartition yields an empty separator when there is no "@" at all.
    _, at_sign, host = address.rpartition("@")
    email_domain = host if at_sign else ""

    if not email_domain:
        return {"status": "email blacklist check skipped: missing contact email domain"}

    if is_blacklisted_email_domain(email_domain):
        # Override the classifier's decision on the copy, not on the state.
        overridden = {
            **snapshot,
            "action": "disqualify",
            "salesperson": "none",
            "blacklist_reason": f"personal/provider email domain blocked: {email_domain}",
        }
        return {
            "classification": overridden,
            "status": "email disqualified by personal/provider blacklist",
        }

    return {"status": "email blacklist check passed"}
368+
369+
344370
email_classification_graph_builder = StateGraph(
345371
EmailClassificationState,
346372
input_schema=EmailClassificationRequest,
@@ -350,10 +376,12 @@ async def classify_email(state: EmailClassificationState):
350376
email_classification_graph_builder.add_node("build_query_text", build_query_text)
351377
email_classification_graph_builder.add_node("retrieve_context", retrieve_context)
352378
email_classification_graph_builder.add_node("classify_email", classify_email)
379+
email_classification_graph_builder.add_node("enforce_personal_email_blacklist", enforce_personal_email_blacklist)
353380

354381
email_classification_graph_builder.add_edge(START, "build_query_text")
355382
email_classification_graph_builder.add_edge("build_query_text", "retrieve_context")
356383
email_classification_graph_builder.add_edge("retrieve_context", "classify_email")
357-
email_classification_graph_builder.add_edge("classify_email", END)
384+
email_classification_graph_builder.add_edge("classify_email", "enforce_personal_email_blacklist")
385+
email_classification_graph_builder.add_edge("enforce_personal_email_blacklist", END)
358386

359387
email_classification_graph = email_classification_graph_builder.compile()

src/agent/email_ingestion_graph.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from langgraph.graph import END, START, StateGraph
1414

1515
from src.agent.config import Config
16+
from src.agent.company_verification_layer import is_blacklisted_email_domain
1617
from src.agent.graph_schemas import EmailClassificationResult, EmailIngestionOutput, EmailIngestionRequest, EmailIngestionState
1718
from src.agent.logger import get_logger
1819
from src.agent.excel_tracker import EmailClassificationExcelTracker
@@ -537,6 +538,31 @@ async def classify_email(state: EmailIngestionState):
537538
}
538539

539540

541+
async def enforce_personal_email_blacklist(state: EmailIngestionState):
    """Force a disqualification when the contact email domain is blacklisted.

    The contact address comes from the ``"email"`` key of
    ``state.classification``. Its domain (text after the last ``@``) is
    checked with ``is_blacklisted_email_domain``.

    Returns:
        A dict with a ``"status"`` message. If the domain is blacklisted, the
        dict also contains ``"classification"``: a copy of the incoming
        classification with ``action``, ``salesperson`` and
        ``blacklist_reason`` overwritten.
    """
    status_skipped = "email blacklist check skipped: missing contact email domain"
    status_passed = "email blacklist check passed"
    status_blocked = "email disqualified by personal/provider blacklist"

    # Work on a copy so the incoming state object is left untouched.
    result = dict(state.classification or {})
    raw_email = result.get("email") or ""
    normalized_email = str(raw_email).strip().lower()

    email_domain = ""
    if "@" in normalized_email:
        email_domain = normalized_email.rsplit("@", 1)[1]

    if email_domain == "":
        return {"status": status_skipped}

    blocked = is_blacklisted_email_domain(email_domain)
    if not blocked:
        return {"status": status_passed}

    result.update(
        action="disqualify",
        salesperson="none",
        blacklist_reason=f"personal/provider email domain blocked: {email_domain}",
    )
    return {"classification": result, "status": status_blocked}
564+
565+
540566
def _append_to_excel_sync(
541567
thread_id: str,
542568
created_at: str,
@@ -729,13 +755,15 @@ async def forward_to_salesperson(state: EmailIngestionState):
729755
email_ingestion_graph_builder.add_node("download_attachments", download_attachments)
730756
email_ingestion_graph_builder.add_node("extract_attachment_text", extract_attachment_text)
731757
email_ingestion_graph_builder.add_node("classify_email", classify_email)
758+
email_ingestion_graph_builder.add_node("enforce_personal_email_blacklist", enforce_personal_email_blacklist)
732759
email_ingestion_graph_builder.add_node("forward_to_salesperson", forward_to_salesperson)
733760

734761
email_ingestion_graph_builder.add_edge(START, "get_email_messages")
735762
email_ingestion_graph_builder.add_edge("get_email_messages", "download_attachments")
736763
email_ingestion_graph_builder.add_edge("download_attachments", "extract_attachment_text")
737764
email_ingestion_graph_builder.add_edge("extract_attachment_text", "classify_email")
738-
email_ingestion_graph_builder.add_edge("classify_email", "forward_to_salesperson")
765+
email_ingestion_graph_builder.add_edge("classify_email", "enforce_personal_email_blacklist")
766+
email_ingestion_graph_builder.add_edge("enforce_personal_email_blacklist", "forward_to_salesperson")
739767
email_ingestion_graph_builder.add_edge("forward_to_salesperson", END)
740768

741769
email_ingestion_graph = email_ingestion_graph_builder.compile()

src/agent/get_access_token.py

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import os
2-
from dotenv import load_dotenv
3-
from msal import ConfidentialClientApplication
2+
import json
43
from pathlib import Path
4+
from urllib.parse import urlencode
5+
from urllib.request import Request, urlopen
6+
7+
from dotenv import load_dotenv
58

69
# Standalone script for fetching an access token for Postman or other testing purposes.
710

@@ -18,25 +21,30 @@ def get_access_token(self):
1821
if not all([client_id, client_secret, tenant_id]):
1922
raise ValueError(f"Missing credentials: CLIENT_ID={client_id}, CLIENT_SECRET={bool(client_secret)}, TENANT_ID={tenant_id}")
2023

21-
msal_app = ConfidentialClientApplication(
22-
client_id=client_id,
23-
client_credential=client_secret,
24-
authority=f"https://login.microsoftonline.com/{tenant_id}",
25-
)
26-
27-
result = msal_app.acquire_token_silent(
28-
scopes=["https://graph.microsoft.com/.default"],
29-
account=None,
24+
token_url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"
25+
payload = {
26+
"client_id": client_id,
27+
"client_secret": client_secret,
28+
"scope": "https://graph.microsoft.com/.default",
29+
"grant_type": "client_credentials",
30+
}
31+
32+
body = urlencode(payload).encode("utf-8")
33+
request = Request(
34+
token_url,
35+
data=body,
36+
headers={"Content-Type": "application/x-www-form-urlencoded"},
37+
method="POST",
3038
)
31-
if not result:
32-
result = msal_app.acquire_token_for_client(
33-
scopes=["https://graph.microsoft.com/.default"]
34-
)
35-
36-
if not result:
37-
return None
3839

39-
return result.get("access_token")
40+
try:
41+
with urlopen(request, timeout=30) as response:
42+
response_data = response.read().decode("utf-8", errors="ignore")
43+
result = json.loads(response_data or "{}")
44+
return result.get("access_token")
45+
except Exception as exc:
46+
print(f"Token request failed: {exc}")
47+
return None
4048

4149
token = GraphClient().get_access_token()
4250
print(f"{token} ..." if token else "NO TOKEN")

0 commit comments

Comments
 (0)