import os
import json
import time
import requests
from dotenv import load_dotenv
from openai import OpenAI
from serpapi.google_search import GoogleSearch
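
# Assumed pip package names for the imports above: openai, python-dotenv,
# requests, and google-search-results (which provides the serpapi module).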

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Initialize clients
client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
serp_api_key = os.getenv("SERP_API_KEY")
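
# The DeepSeek API is OpenAI-compatible, so the stock OpenAI client works with
# base_url pointed at api.deepseek.com. Expected .env keys (values not shown):
# DEEPSEEK_API_KEY, FIRECRAWL_API_KEY, SERP_API_KEY.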

def search_google(query):
    """Search Google using SerpAPI and return top results."""
    print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
    search = GoogleSearch({"q": query, "api_key": serp_api_key})
    return search.get_dict().get("organic_results", [])
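
# Each organic result is a dict whose 'title', 'link', and 'snippet' fields are
# forwarded to the model in select_urls_with_r1 below.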

def select_urls_with_r1(company, objective, serp_results):
    """
    Use R1 to select the most relevant URLs from SERP results for the given company and objective.
    Returns a list of URLs.
    """
    try:
        # Prepare the data for R1
        serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
                     for r in serp_results if r.get("link")]

        response = client.chat.completions.create(
            model="deepseek-reasoner",
            messages=[
                {
                    "role": "system",
                    "content": "You are a URL selector that always responds with valid JSON. You select URLs from the SERP results relevant to the company and objective. Your response must be a JSON object with a 'selected_urls' array property containing strings."
                },
                {
                    "role": "user",
                    "content": (
                        f"Company: {company}\n"
                        f"Objective: {objective}\n"
                        f"SERP Results: {json.dumps(serp_data)}\n\n"
                        "Return a JSON object with a property 'selected_urls' that contains an array "
                        "of URLs most likely to help meet the objective. Add a /* to the end of the URL if you think it should search all of the pages in the site. Do not return any social media links. For example: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}"
                    )
                }
            ]
        )
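
        # deepseek-reasoner is prompted (not forced) to emit JSON, so parsing may
        # fail; both branches below fall back to scanning the raw text for URLs.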
        try:
            # First try to parse as JSON
            result = json.loads(response.choices[0].message.content)
            if isinstance(result, dict) and "selected_urls" in result:
                urls = result["selected_urls"]
            else:
                # If JSON doesn't have the expected structure, fall back to text parsing
                response_text = response.choices[0].message.content
                urls = [line.strip() for line in response_text.split('\n')
                        if line.strip().startswith(('http://', 'https://'))]
        except json.JSONDecodeError:
            # If JSON parsing fails, fall back to text parsing
            response_text = response.choices[0].message.content
            urls = [line.strip() for line in response_text.split('\n')
                    if line.strip().startswith(('http://', 'https://'))]

        # Clean up URLs - remove wildcards and trailing slashes
        cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls]
        cleaned_urls = [url for url in cleaned_urls if url]

        if not cleaned_urls:
            print(f"{Colors.YELLOW}No valid URLs found.{Colors.RESET}")
            return []

        print(f"{Colors.CYAN}Selected URLs for extraction by R1:{Colors.RESET}")
        for url in cleaned_urls:
            print(f"- {url}")

        return cleaned_urls

    except Exception as e:
        print(f"{Colors.RED}Error selecting URLs with R1: {e}{Colors.RESET}")
        return []

def extract_company_info(urls, prompt, company, api_key):
    """Use requests to call Firecrawl's extract endpoint with selected URLs."""
    print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }

    payload = {
        "urls": urls,
        "prompt": prompt + " for " + company,
        "enableWebSearch": True
    }
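
    # enableWebSearch (per Firecrawl's v1 extract API) asks the extractor to
    # supplement the listed URLs with additional context found via web search.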
    try:
        response = requests.post(
            "https://api.firecrawl.dev/v1/extract",
            headers=headers,
            json=payload,
            timeout=30
        )

        data = response.json()

        if not data.get('success'):
            print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}")
            return None

        # The extract endpoint is asynchronous: a successful request returns a job ID
        extraction_id = data.get('id')
        if not extraction_id:
            print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
            return None

        # Poll for the extraction result
        return poll_firecrawl_result(extraction_id, api_key)

    except requests.exceptions.RequestException as e:
        print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
        return None
    except json.JSONDecodeError as e:
        print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}")
        return None
    except Exception as e:
        print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
        return None

def poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=36):
    """Poll the Firecrawl API for the extraction result (at most interval * max_attempts seconds, ~3 minutes by default)."""
    url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}"
    headers = {
        'Authorization': f'Bearer {api_key}'
    }

    for attempt in range(1, max_attempts + 1):
        try:
            # print(f"{Colors.YELLOW}Polling for extraction result (Attempt {attempt}/{max_attempts})...{Colors.RESET}")
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

            if data.get('success') and data.get('data'):
                print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")
                print(json.dumps(data['data'], indent=2))
                return data['data']
            elif data.get('success') and not data.get('data'):
                # Job accepted but still running; wait before the next poll
                time.sleep(interval)
            else:
                print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}")
                return None

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            # A transient network or parse error shouldn't abort the whole poll; retry
            print(f"{Colors.YELLOW}Polling attempt {attempt} failed: {e}{Colors.RESET}")
            time.sleep(interval)
        except Exception as e:
            print(f"{Colors.RED}Unexpected error while polling: {e}{Colors.RESET}")
            return None

    print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}")
    return None

def main():
    company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")

    serp_results = search_google(company)
    if not serp_results:
        print(f"{Colors.RED}No search results found.{Colors.RESET}")
        return

    # Ask R1 to select URLs
    selected_urls = select_urls_with_r1(company, objective, serp_results)

    if not selected_urls:
        print(f"{Colors.RED}R1 did not return any URLs.{Colors.RESET}")
        return

    data = extract_company_info(selected_urls, objective, company, firecrawl_api_key)

    if data:
        print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}")
    else:
        print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")

if __name__ == "__main__":
    main()
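
# Example session (illustrative values only):
#   Enter the company name: Example Corp
#   Enter what information you want about the company: current pricing tiers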