@@ -112,6 +112,83 @@ def rewrite_query(query: str, groq_api_key: str) -> Dict[str, Any]:
112112 return json .loads (content )
113113
114114
def filter_results(results: list, query: str, groq_api_key: str) -> list:
    """Use the Groq LLM to drop search results that are not real company pages.

    Sends a condensed view of ``results`` (index/title/snippet/domain) to the
    Groq chat-completions endpoint and keeps only the entries whose index the
    model returns under ``valid_indices``.

    Args:
        results: Raw search-result dicts (expected keys: "title",
            "snippet"/"description", "link"/"website").
        query: The user's search query, included to give the LLM context.
        groq_api_key: Bearer token for the Groq API.

    Returns:
        The subset of ``results`` the model judged to be actual company
        homepages/about pages. On any failure (HTTP error, empty or
        unparseable LLM response) or if the model rejects every result,
        the original ``results`` list is returned so the pipeline degrades
        gracefully instead of losing all leads.
    """
    if not results:
        return []

    print(f"Filtering {len(results)} results using LLM...")

    # Condense each result to the few fields the LLM needs, to save tokens.
    condensed_results = [
        {
            "index": i,
            "title": r.get("title", ""),
            "snippet": r.get("snippet", r.get("description", "")),
            "domain": parse_domain(r.get("link", r.get("website", ""))),
        }
        for i, r in enumerate(results)
    ]

    prompt = f"""
    The user is looking for companies matching: {json.dumps(query)}

    Below is a list of search results. Your job is to filter out the junk.
    Identify which results are ACTUAL company homepages or about pages.

    REJECT the following types of results:
    - Blog posts, listicles (e.g. "10 Best Coffee Shops")
    - News articles
    - Directory listings (Yelp, TripAdvisor, LinkedIn, Crunchbase)
    - Social media profiles (Facebook, Instagram, Twitter)
    - Forum threads (Reddit, Quora)

    Results to evaluate:
    {json.dumps(condensed_results, indent=2)}

    Return ONLY a JSON object with a single key "valid_indices" containing an array of integers (the indices of the valid companies).
    {{
        "valid_indices": [0, 2, 5]
    }}
    """
    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_REWRITE_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 500,
        "temperature": 0.1,
        # Forces the model to emit syntactically valid JSON.
        "response_format": {"type": "json_object"},
    }

    try:
        response = requests.post(
            GROQ_API_URL,
            headers=headers,
            json=payload,
            timeout=45,
        )
        response.raise_for_status()
        choices = response.json().get("choices", [])
        if not choices:
            return results  # Fallback if the LLM returned no choices.

        content = choices[0].get("message", {}).get("content", "{}")
        result_json = json.loads(content)

        # Robustness: models sometimes emit indices as strings ("0"), which
        # would silently match nothing against enumerate()'s int indices.
        # Coerce each value to int and skip anything non-index-like.
        valid_indices = set()
        for idx in result_json.get("valid_indices", []):
            try:
                valid_indices.add(int(idx))
            except (TypeError, ValueError):
                continue

        filtered = [r for i, r in enumerate(results) if i in valid_indices]
        print(f"Filtered down to {len(filtered)} valid companies.")

        # Safeguard: if filtering wiped out everything, keep the originals.
        return filtered if filtered else results

    except Exception as e:
        print(f"Failed to filter results: {str(e)}")
        return results
190+
191+
115192def handler (event : Dict [str , Any ], context : Any ) -> None :
116193 for record in event .get ("Records" , []):
117194 if record .get ("eventName" ) != "INSERT" :
@@ -198,11 +275,13 @@ def handler(event: Dict[str, Any], context: Any) -> None:
198275 print (f"SerpApi fetch failed for '{ sq } ': { str (serp_e )} " )
199276
200277 print (f"Found { len (all_results )} total unique results" )
201- results = all_results
278+
279+ # Filter results using LLM
280+ filtered_results = filter_results (all_results , query , groq_api_key )
202281
203282 # 3. Map to Leads
204283 leads = []
205- for i , r in enumerate (results ):
284+ for i , r in enumerate (filtered_results ):
206285 link = r .get ("link" , r .get ("website" , "" ))
207286 domain = parse_domain (link )
208287 lead = {
0 commit comments