@@ -41,28 +41,46 @@ func NewWebScraper(jc types.JobConfiguration, statsCollector *stats.StatsCollect
}

func (ws *WebScraper) ExecuteJob(j types.Job) (types.JobResult, error) {
+    logrus.Info("Starting ExecuteJob for web scraper")
+
+    // Step 1: Unmarshal arguments
    args := &WebScraperArgs{}
-    j.Arguments.Unmarshal(args)
+    logrus.Info("Unmarshaling job arguments")
+    if err := j.Arguments.Unmarshal(args); err != nil {
+        logrus.Errorf("Failed to unmarshal job arguments: %v", err)
+        return types.JobResult{Error: fmt.Sprintf("Invalid arguments: %v", err)}, err
+    }
+    logrus.Infof("Job arguments unmarshaled successfully: %+v", args)

+    // Step 2: Validate URL against blacklist
+    logrus.Info("Validating URL against blacklist")
    for _, u := range ws.configuration.Blacklist {
+        logrus.Debugf("Checking if URL contains blacklisted term: %s", u)
        if strings.Contains(args.URL, u) {
+            logrus.Warnf("URL %s is blacklisted due to term: %s", args.URL, u)
            ws.stats.Add(stats.WebInvalid, 1)
            logrus.Errorf("Blacklisted URL: %s", args.URL)
            return types.JobResult{
                Error: fmt.Sprintf("URL blacklisted: %s", args.URL),
            }, nil
        }
    }
+    logrus.Infof("URL %s passed blacklist validation", args.URL)

+    // Step 3: Perform web scraping
+    logrus.Infof("Initiating web scraping for URL: %s with depth: %d", args.URL, args.Depth)
    result, err := scrapeWeb([]string{args.URL}, args.Depth)
    if err != nil {
+        logrus.Errorf("Web scraping failed for URL %s: %v", args.URL, err)
        ws.stats.Add(stats.WebErrors, 1)
        return types.JobResult{Error: err.Error()}, err
    }
+    logrus.Infof("Web scraping succeeded for URL %s: %v", args.URL, result)

-    // Do the web scraping here
-    // For now, just return the URL
+    // Step 4: Process result and return
+    logrus.Info("Updating statistics for successful web scraping")
    ws.stats.Add(stats.WebSuccess, 1)
+    logrus.Infof("Returning web scraping result for URL %s", args.URL)
    return types.JobResult{
        Data: result,
    }, nil
@@ -107,64 +125,82 @@ type CollectedData struct {
// logrus.WithField("result", string(res)).Info("Scraping completed")
// }()
func scrapeWeb(uri []string, depth int) ([]byte, error) {
+    logrus.Infof("Starting scrapeWeb with parameters: URIs=%v, Depth=%d", uri, depth)
    // Set default depth to 1 if 0 is provided
    if depth <= 0 {
+        logrus.Infof("Invalid depth (%d) provided, setting default depth to 1", depth)
        depth = 1
    }

+    logrus.Info("Initializing CollectedData struct")
    var collectedData CollectedData

+    logrus.Info("Creating new Colly collector")
    c := colly.NewCollector(
        colly.Async(true), // Enable asynchronous requests
        colly.AllowURLRevisit(),
        colly.IgnoreRobotsTxt(),
        colly.MaxDepth(depth),
    )
+    logrus.Info("Colly collector created successfully")

    // Adjust the parallelism and delay based on your needs and server capacity
+    logrus.Info("Setting scraping limits with parallelism and delay")
    limitRule := colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 4,                      // Increased parallelism
        Delay:       500 * time.Millisecond, // Reduced delay
    }
+    logrus.Info("Applying scraping limits to the collector")
    if err := c.Limit(&limitRule); err != nil {
        logrus.Errorf("[-] Unable to set scraper limit. Using default. Error: %v", err)
    }

    // Increase the timeout slightly if necessary
-    c.SetRequestTimeout(240 * time.Second) // Increased to 4 minutes
+    logrus.Info("Setting request timeout to 240 seconds")
+    c.SetRequestTimeout(240 * time.Second)

    // Initialize a backoff strategy
+    logrus.Info("Initializing exponential backoff strategy")
    backoffStrategy := backoff.NewExponentialBackOff()

+    logrus.Info("Registering OnError callback to handle request errors")
    c.OnError(func(r *colly.Response, err error) {
+        logrus.Errorf("Error occurred during request to URL: %s. StatusCode: %d, Error: %v", r.Request.URL, r.StatusCode, err)
        if r.StatusCode == http.StatusTooManyRequests {
            // Parse the Retry-After header (in seconds)
            retryAfter, convErr := strconv.Atoi(r.Headers.Get("Retry-After"))
            if convErr != nil {
                // If not in seconds, it might be a date. Handle accordingly.
-                logrus.Debugf("[-] Retry-After: %s", r.Headers.Get("Retry-After"))
+                logrus.Warnf("Retry-After header is present but in an unrecognized format: %s", r.Headers.Get("Retry-After"))
            }
            // Calculate the next delay
            nextDelay := backoffStrategy.NextBackOff()
            if retryAfter > 0 {
                nextDelay = time.Duration(retryAfter) * time.Second
            }
-            logrus.Warnf("[-] Rate limited. Retrying after %v", nextDelay)
+            logrus.Warnf("Rate limited for URL: %s. Retrying after %v", r.Request.URL, nextDelay)
            time.Sleep(nextDelay)
            // Retry the request
+            logrus.Info("Retrying the request")
            _ = r.Request.Retry()
+
        } else {
-            logrus.Errorf("[-] Request URL: %s failed with error: %v", r.Request.URL, err)
+            logrus.Errorf("Request failed for URL: %s with error: %v", r.Request.URL, err)
        }
    })

+    logrus.Info("Registering OnHTML callback for h1, h2 elements (titles)")
    c.OnHTML("h1, h2", func(e *colly.HTMLElement) {
+        logrus.Infof("Title (h1/h2) found: %s", e.Text)
        // Directly append a new Section to collectedData.Sections
        collectedData.Sections = append(collectedData.Sections, Section{Title: e.Text})
    })

+    logrus.Info("Registering OnHTML callback for paragraph elements")
    c.OnHTML("p", func(e *colly.HTMLElement) {
+        logrus.Infof("Paragraph detected: %s", e.Text)
        // Check if there are any sections to append paragraphs to
        if len(collectedData.Sections) > 0 {
            // Get a reference to the last section
@@ -185,15 +221,19 @@ func scrapeWeb(uri []string, depth int) ([]byte, error) {
        }
    })

+    logrus.Info("Registering OnHTML callback for image elements")
    c.OnHTML("img", func(e *colly.HTMLElement) {
+        logrus.Infof("Image detected with source URL: %s", e.Attr("src"))
        imageURL := e.Request.AbsoluteURL(e.Attr("src"))
        if len(collectedData.Sections) > 0 {
            lastSection := &collectedData.Sections[len(collectedData.Sections)-1]
            lastSection.Images = append(lastSection.Images, imageURL)
        }
    })

+    logrus.Info("Registering OnHTML callback for anchor elements")
    c.OnHTML("a", func(e *colly.HTMLElement) {
+        logrus.Infof("Link detected: %s", e.Attr("href"))
        pageURL := e.Request.AbsoluteURL(e.Attr("href"))
        // Check if the URL protocol is supported (http or https)
        if strings.HasPrefix(pageURL, "http://") || strings.HasPrefix(pageURL, "https://") {
@@ -202,16 +242,24 @@ func scrapeWeb(uri []string, depth int) ([]byte, error) {
        }
    })

+    logrus.Infof("Starting to visit URLs: %v", uri)
    for _, u := range uri {
-        err := c.Visit(u)
+        logrus.Infof("Visiting URL: %s", u)
+        err := c.Visit(u)
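+        // With colly.Async(true), Visit mostly enqueues the URL; errors here come from pre-flight checks, and fetching finishes at c.Wait().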
        if err != nil {
+            logrus.Errorf("Failed to visit URL: %s. Error: %v", u, err)
            return nil, err
        }
    }

    // Wait for all requests to finish
+    logrus.Info("Waiting for all requests to complete")
    c.Wait()

+    logrus.Info("Scraping completed, marshaling collected data into JSON format")
    j, _ := json.Marshal(collectedData)
+
+    logrus.Infof("Scraping successful. Returning data for URIs: %v", uri)
    return j, nil
}