Skip to content

Commit df5347b

Browse files
authored
Merge pull request #34 from armaan-71/feat/result-filtering
Feat/result filtering
2 parents 7ad3c8d + a8743a7 commit df5347b

File tree

1 file changed

+81
-2
lines changed

1 file changed

+81
-2
lines changed

backend/python/src/handlers/process_run.py

Lines changed: 81 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -112,6 +112,83 @@ def rewrite_query(query: str, groq_api_key: str) -> Dict[str, Any]:
112112
return json.loads(content)
113113

114114

115+
def _parse_valid_indices(raw: object, count: int) -> set:
    """Return the usable result positions found in the model's ``valid_indices``.

    Accepts ints, integral floats and decimal strings; silently drops anything
    else (bools, nested objects, negative or out-of-range numbers) so that one
    malformed entry does not invalidate the rest of the reply.
    """
    if not isinstance(raw, (list, tuple)):
        return set()

    indices = set()
    for item in raw:
        # bool is a subclass of int; True/False are not meaningful positions.
        if isinstance(item, bool):
            continue
        if isinstance(item, str):
            text = item.strip()
            if not text.isdecimal():
                continue
            item = int(text)
        elif isinstance(item, float):
            if not item.is_integer():
                continue
            item = int(item)
        elif not isinstance(item, int):
            continue
        if 0 <= item < count:
            indices.add(item)
    return indices


def filter_results(results: list, query: str, groq_api_key: str) -> list:
    """Keep only the search results the LLM judges to be real company pages.

    A condensed view of every result (index, title, snippet, domain) is sent,
    together with ``query``, to the Groq chat endpoint. The model is asked to
    answer with a JSON object of the form ``{"valid_indices": [...]}``.

    Args:
        results: Search-result dicts. ``title``, ``snippet`` (or
            ``description``) and ``link`` (or ``website``) are read from each.
        query: The user's search query, embedded JSON-encoded in the prompt.
        groq_api_key: Bearer token sent in the ``Authorization`` header.

    Returns:
        The results whose positions the model listed, in their original
        order. The unfiltered ``results`` are returned instead when the
        request or the response parsing fails, when the reply contains no
        choices, or when filtering would remove every result. A falsy
        ``results`` yields ``[]`` without calling the API.
    """
    if not results:
        return []

    print(f"Filtering {len(results)} results using LLM...")

    # Prepare a condensed list of results for the LLM to save tokens
    condensed_results = [
        {
            "index": i,
            "title": r.get("title", ""),
            "snippet": r.get("snippet", r.get("description", "")),
            "domain": parse_domain(r.get("link", r.get("website", ""))),
        }
        for i, r in enumerate(results)
    ]

    prompt = f"""
    The user is looking for companies matching: {json.dumps(query)}

    Below is a list of search results. Your job is to filter out the junk.
    Identify which results are ACTUAL company homepages or about pages.

    REJECT the following types of results:
    - Blog posts, listicles (e.g. "10 Best Coffee Shops")
    - News articles
    - Directory listings (Yelp, TripAdvisor, LinkedIn, Crunchbase)
    - Social media profiles (Facebook, Instagram, Twitter)
    - Forum threads (Reddit, Quora)

    Results to evaluate:
    {json.dumps(condensed_results, indent=2)}

    Return ONLY a JSON object with a single key "valid_indices" containing an array of integers (the indices of the valid companies).
    {{
        "valid_indices": [0, 2, 5]
    }}
    """
    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_REWRITE_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        # NOTE(review): a reply longer than this budget would presumably be
        # cut off and fail JSON parsing, triggering the fallback below —
        # confirm 500 is enough for the largest expected result set.
        "max_tokens": 500,
        "temperature": 0.1,
        "response_format": {"type": "json_object"},
    }

    # Filtering is best-effort: any failure from here on must degrade to
    # "return everything" rather than abort the caller.
    try:
        response = requests.post(
            GROQ_API_URL,
            headers=headers,
            json=payload,
            timeout=45,
        )
        response.raise_for_status()
        choices = response.json().get("choices", [])
        if not choices:
            return results  # Fallback if LLM fails

        content = choices[0].get("message", {}).get("content", "{}")
        result_json = json.loads(content)

        # The model may answer with something other than the requested
        # object, or with indices encoded as strings; normalise instead of
        # discarding the whole reply over one bad entry.
        raw_indices = (
            result_json.get("valid_indices", [])
            if isinstance(result_json, dict)
            else []
        )
        valid_indices = _parse_valid_indices(raw_indices, len(results))

        filtered = [r for i, r in enumerate(results) if i in valid_indices]
        print(f"Filtered down to {len(filtered)} valid companies.")

        # Fallback to returning all if filtering wiped out everything (safeguard)
        return filtered if filtered else results

    except Exception as e:
        print(f"Failed to filter results: {str(e)}")
        return results
190+
191+
115192
def handler(event: Dict[str, Any], context: Any) -> None:
116193
for record in event.get("Records", []):
117194
if record.get("eventName") != "INSERT":
@@ -198,11 +275,13 @@ def handler(event: Dict[str, Any], context: Any) -> None:
198275
print(f"SerpApi fetch failed for '{sq}': {str(serp_e)}")
199276

200277
print(f"Found {len(all_results)} total unique results")
201-
results = all_results
278+
279+
# Filter results using LLM
280+
filtered_results = filter_results(all_results, query, groq_api_key)
202281

203282
# 3. Map to Leads
204283
leads = []
205-
for i, r in enumerate(results):
284+
for i, r in enumerate(filtered_results):
206285
link = r.get("link", r.get("website", ""))
207286
domain = parse_domain(link)
208287
lead = {

0 commit comments

Comments
 (0)