@@ -112,6 +112,84 @@ def rewrite_query(query: str, groq_api_key: str) -> Dict[str, Any]:
112112 return json .loads (content )
113113
114114
def filter_results(results: list, query: str, groq_api_key: str) -> list:
    """Use an LLM to filter raw search results down to real company pages.

    Sends a condensed view of ``results`` to the Groq chat-completions API
    and asks it to reject non-company pages (listicles, news, directories,
    social profiles, forum threads).

    Args:
        results: Raw search-result dicts; each is expected to carry
            "title", "snippet" or "description", and "link" or "website"
            keys (missing keys default to "").
        query: The user's company-search query, echoed into the prompt.
        groq_api_key: Bearer token for the Groq API.

    Returns:
        The subset of ``results`` the LLM judged valid, in original order.
        Falls back to returning ``results`` unchanged if the API call
        fails, returns no choices, or the filter would discard everything.
    """
    if not results:
        return []

    print(f"Filtering {len(results)} results using LLM...")

    # Prepare a condensed list of results for the LLM to save tokens
    condensed_results = [
        {
            "index": i,
            "title": r.get("title", ""),
            "snippet": r.get("snippet", r.get("description", "")),
            "domain": parse_domain(r.get("link", r.get("website", ""))),
        }
        for i, r in enumerate(results)
    ]

    prompt = f"""
The user is looking for companies matching: {json.dumps(query)}

Below is a list of search results. Your job is to filter out the junk.
Identify which results are ACTUAL company homepages or about pages.

REJECT the following types of results:
- Blog posts, listicles (e.g. "10 Best Coffee Shops")
- News articles
- Directory listings (Yelp, TripAdvisor, LinkedIn, Crunchbase)
- Social media profiles (Facebook, Instagram, Twitter)
- Forum threads (Reddit, Quora)

Results to evaluate:
{json.dumps(condensed_results, indent=2)}

Return ONLY a JSON object with a single key "valid_indices" containing an array of integers (the indices of the valid companies).
{{
  "valid_indices": [0, 2, 5]
}}
"""
    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_REWRITE_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 500,
        "temperature": 0.1,
        # Forces the model to emit valid JSON so json.loads below succeeds.
        "response_format": {"type": "json_object"},
    }

    try:
        response = requests.post(
            GROQ_API_URL,
            headers=headers,
            json=payload,
            timeout=45,
        )
        response.raise_for_status()
        choices = response.json().get("choices", [])
        if not choices:
            return results  # Fallback if LLM fails

        content = choices[0].get("message", {}).get("content", "{}")
        result_json = json.loads(content)

        # LLMs occasionally return indices as strings or hallucinate
        # out-of-range values; coerce to int and bound-check so valid
        # entries aren't silently dropped by the set-membership test.
        valid_indices = set()
        for idx in result_json.get("valid_indices", []):
            try:
                idx = int(idx)
            except (TypeError, ValueError):
                continue  # skip non-numeric junk from the LLM
            if 0 <= idx < len(results):
                valid_indices.add(idx)

        filtered = [r for i, r in enumerate(results) if i in valid_indices]
        print(f"Filtered down to {len(filtered)} valid companies.")

        # Fallback to returning all if filtering wiped out everything (safeguard)
        return filtered if filtered else results

    # Deliberately broad: filtering is best-effort, and any failure
    # (network, HTTP status, malformed JSON) should degrade to "no filter"
    # rather than crash the pipeline.
    except Exception as e:
        print(f"Failed to filter results: {str(e)}")
        return results
192+
115193def handler (event : Dict [str , Any ], context : Any ) -> None :
116194 for record in event .get ("Records" , []):
117195 if record .get ("eventName" ) != "INSERT" :
@@ -198,11 +276,13 @@ def handler(event: Dict[str, Any], context: Any) -> None:
198276 print (f"SerpApi fetch failed for '{ sq } ': { str (serp_e )} " )
199277
200278 print (f"Found { len (all_results )} total unique results" )
201- results = all_results
279+
280+ # Filter results using LLM
281+ filtered_results = filter_results (all_results , query , groq_api_key )
202282
203283 # 3. Map to Leads
204284 leads = []
205- for i , r in enumerate (results ):
285+ for i , r in enumerate (filtered_results ):
206286 link = r .get ("link" , r .get ("website" , "" ))
207287 domain = parse_domain (link )
208288 lead = {
0 commit comments