Skip to content

Commit a2c6857

Browse files
committed
feat: add LLM result filtering
- Created `filter_results` function to evaluate search results - Condenses SerpApi results to save LLM tokens - Filters out non-company results (blogs, directories, news) - Gracefully falls back to original results on error - Saves time and cost by not scraping junk leads
1 parent 7ad3c8d commit a2c6857

File tree

1 file changed

+82
-2
lines changed

1 file changed

+82
-2
lines changed

backend/python/src/handlers/process_run.py

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,84 @@ def rewrite_query(query: str, groq_api_key: str) -> Dict[str, Any]:
112112
return json.loads(content)
113113

114114

115+
def filter_results(results: list, query: str, groq_api_key: str) -> list:
    """Use the Groq LLM to drop non-company search results before scraping.

    Sends a condensed view of each SerpApi result (index, title, snippet,
    domain) to the LLM, which returns the indices of results that look like
    actual company homepages/about pages. Blogs, news, directories, social
    profiles, and forum threads are rejected to save scraping time and cost.

    Args:
        results: Raw SerpApi result dicts (each may carry "title",
            "snippet"/"description", and "link"/"website" keys).
        query: The user's company-search query, embedded in the prompt.
        groq_api_key: Bearer token for the Groq chat-completions API.

    Returns:
        The subset of ``results`` the LLM judged to be real companies.
        Falls back to the original ``results`` list on any API/parse error,
        or if filtering would leave nothing (best-effort by design).
    """
    if not results:
        return []

    print(f"Filtering {len(results)} results using LLM...")

    # Prepare a condensed list of results for the LLM to save tokens
    condensed_results = []
    for i, r in enumerate(results):
        condensed_results.append(
            {
                "index": i,
                "title": r.get("title", ""),
                "snippet": r.get("snippet", r.get("description", "")),
                "domain": parse_domain(r.get("link", r.get("website", ""))),
            }
        )

    prompt = f"""
The user is looking for companies matching: {json.dumps(query)}

Below is a list of search results. Your job is to filter out the junk.
Identify which results are ACTUAL company homepages or about pages.

REJECT the following types of results:
- Blog posts, listicles (e.g. "10 Best Coffee Shops")
- News articles
- Directory listings (Yelp, TripAdvisor, LinkedIn, Crunchbase)
- Social media profiles (Facebook, Instagram, Twitter)
- Forum threads (Reddit, Quora)

Results to evaluate:
{json.dumps(condensed_results, indent=2)}

Return ONLY a JSON object with a single key "valid_indices" containing an array of integers (the indices of the valid companies).
{{
"valid_indices": [0, 2, 5]
}}
"""
    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_REWRITE_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 500,
        "temperature": 0.1,
        # Constrains the model to emit valid JSON so json.loads below succeeds.
        "response_format": {"type": "json_object"},
    }

    try:
        response = requests.post(
            GROQ_API_URL,
            headers=headers,
            json=payload,
            timeout=45,
        )
        response.raise_for_status()
        choices = response.json().get("choices", [])
        if not choices:
            return results  # Fallback if LLM fails

        content = choices[0].get("message", {}).get("content", "{}")
        result_json = json.loads(content)

        # LLMs occasionally return indices as strings (e.g. "0") or other
        # junk despite the prompt; coerce defensively and drop anything
        # that is not a usable integer rather than silently filtering
        # everything out.
        valid_indices = set()
        for idx in result_json.get("valid_indices", []):
            try:
                valid_indices.add(int(idx))
            except (TypeError, ValueError):
                continue

        filtered = [r for i, r in enumerate(results) if i in valid_indices]
        print(f"Filtered down to {len(filtered)} valid companies.")

        # Fallback to returning all if filtering wiped out everything (safeguard)
        return filtered if filtered else results
    except Exception as e:
        # Deliberate broad catch: filtering is a best-effort optimization,
        # so any network/HTTP/JSON failure degrades to "no filtering".
        print(f"Failed to filter results: {str(e)}")
        return results
191+
192+
115193
def handler(event: Dict[str, Any], context: Any) -> None:
116194
for record in event.get("Records", []):
117195
if record.get("eventName") != "INSERT":
@@ -198,11 +276,13 @@ def handler(event: Dict[str, Any], context: Any) -> None:
198276
print(f"SerpApi fetch failed for '{sq}': {str(serp_e)}")
199277

200278
print(f"Found {len(all_results)} total unique results")
201-
results = all_results
279+
280+
# Filter results using LLM
281+
filtered_results = filter_results(all_results, query, groq_api_key)
202282

203283
# 3. Map to Leads
204284
leads = []
205-
for i, r in enumerate(results):
285+
for i, r in enumerate(filtered_results):
206286
link = r.get("link", r.get("website", ""))
207287
domain = parse_domain(link)
208288
lead = {

0 commit comments

Comments
 (0)