58
58
59
59
/**
60
60
* A queue-based fetcher.
61
- *
61
+ *
62
62
* <p>
63
63
* This fetcher uses a well-known model of one producer (a QueueFeeder) and many
64
64
* consumers (FetcherThread-s).
65
- *
65
+ *
66
66
* <p>
67
67
* QueueFeeder reads input fetchlists and populates a set of FetchItemQueue-s,
68
68
* which hold FetchItem-s that describe the items to be fetched. There are as
69
69
* many queues as there are unique hosts, but at any given time the total number
70
70
* of fetch items in all queues is less than a fixed number (currently set to a
71
71
* multiple of the number of threads).
72
- *
72
+ *
73
73
* <p>
74
74
* As items are consumed from the queues, the QueueFeeder continues to add new
75
75
* input items, so that their total count stays fixed (FetcherThread-s may also
76
76
* add new items to the queues, e.g. as a result of redirection) - until all
77
77
* input items are exhausted, at which point the number of items in the queues
78
78
* begins to decrease. When this number reaches 0, the fetcher will finish.
79
- *
79
+ *
80
80
* <p>
81
81
* This fetcher implementation handles per-host blocking itself, instead of
82
82
* delegating this work to protocol-specific plugins. Each per-host queue
* handles its own "politeness" settings and keeps track of a
85
85
* list of requests in progress, and the time the last request was finished. As
86
86
* FetcherThread-s ask for new items to be fetched, queues may return eligible
87
87
* items or null if for "politeness" reasons this host's queue is not yet ready.
88
- *
88
+ *
89
89
* <p>
90
90
* If there are still unfetched items in the queues, but none of the items are
91
91
* ready, FetcherThread-s will spin-wait until either some items become
92
92
* available, or a timeout is reached (at which point the Fetcher will abort,
93
93
* assuming the task is hung).
94
- *
94
+ *
95
95
* @author Andrzej Bialecki
96
96
*/
97
97
public class Fetcher extends NutchTool implements Tool {
@@ -147,7 +147,7 @@ public static class FetcherRun extends
147
147
private AtomicInteger activeThreads = new AtomicInteger (0 );
148
148
private AtomicInteger spinWaiting = new AtomicInteger (0 );
149
149
private long start = System .currentTimeMillis ();
150
- private AtomicLong lastRequestStart = new AtomicLong (start );
150
+ private AtomicLong lastRequestStart = new AtomicLong (start );
151
151
private AtomicLong bytes = new AtomicLong (0 ); // total bytes fetched
152
152
private AtomicInteger pages = new AtomicInteger (0 ); // total pages fetched
153
153
private AtomicInteger errors = new AtomicInteger (0 ); // total pages errored
@@ -157,7 +157,7 @@ public static class FetcherRun extends
157
157
/**
 * Returns the {@code activeThreads} counter itself, i.e. the shared
 * {@link AtomicInteger} instance held by this object rather than a copy of
 * its current value.
 *
 * @return the {@code activeThreads} field
 */
private AtomicInteger getActiveThreads () {
158
158
return activeThreads ;
159
159
}
160
-
160
+
161
161
private void reportStatus (Context context , FetchItemQueues fetchQueues , int pagesLastSec , int bytesLastSec )
162
162
throws IOException {
163
163
StringBuilder status = new StringBuilder ();
@@ -184,13 +184,13 @@ private void reportStatus(Context context, FetchItemQueues fetchQueues, int page
184
184
context .setStatus (status .toString ());
185
185
}
186
186
187
- @ Override
187
+ @ Override
188
188
public void setup (Mapper <Text , CrawlDatum , Text , NutchWritable >.Context context ) {
189
189
Configuration conf = context .getConfiguration ();
190
190
segmentName = conf .get (Nutch .SEGMENT_NAME_KEY );
191
191
storingContent = isStoringContent (conf );
192
192
parsing = isParsing (conf );
193
- }
193
+ }
194
194
195
195
@ Override
196
196
public void run (Context innerContext )
@@ -218,11 +218,6 @@ public void run(Context innerContext)
218
218
feeder = new QueueFeeder (innerContext , fetchQueues ,
219
219
threadCount * queueDepthMultiplier );
220
220
221
- // the value of the time limit is either -1 or the time where it should
222
- // finish
223
- long timelimit = conf .getLong ("fetcher.timelimit" , -1 );
224
- if (timelimit != -1 )
225
- feeder .setTimeLimit (timelimit );
226
221
feeder .start ();
227
222
228
223
int startDelay = conf .getInt ("fetcher.threads.start.delay" , 10 );
@@ -427,9 +422,12 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
427
422
* fetches started during half of the MapReduce task timeout
428
423
* (mapreduce.task.timeout, default value: 10 minutes). In order to
429
424
* avoid that the task timeout is hit and the fetcher job is failed,
430
- * we stop the fetching now.
425
+ * we stop the fetching now. See also the property
426
+ * fetcher.threads.timeout.divisor.
431
427
*/
432
428
if ((System .currentTimeMillis () - lastRequestStart .get ()) > timeout ) {
429
+ LOG .warn ("Timeout reached with no new requests since {} seconds." ,
430
+ timeout );
433
431
LOG .warn ("Aborting with {} hung threads{}." , activeThreads ,
434
432
feeder .isAlive () ? " (queue feeder still alive)" : "" );
435
433
innerContext .getCounter ("FetcherStatus" , "hungThreads" )
@@ -448,6 +446,18 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
448
446
LOG .warn (sb .toString ());
449
447
}
450
448
}
449
+
450
+ /*
451
+ * signal the queue feeder that the timeout is reached and wait
452
+ * briefly for it to shut down
453
+ */
454
+ fetchQueues .setTimeoutReached ();
455
+ if (feeder .isAlive ()) {
456
+ LOG .info (
457
+ "Signaled QueueFeeder to stop, waiting 1.5 seconds before exiting." );
458
+ Thread .sleep (1500 );
459
+ }
460
+
451
461
/*
452
462
* log and count queued items dropped from the fetch queues because
453
463
* of the timeout
@@ -469,7 +479,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
469
479
}
470
480
}
471
481
472
- public void fetch (Path segment , int threads ) throws IOException ,
482
+ public void fetch (Path segment , int threads ) throws IOException ,
473
483
InterruptedException , ClassNotFoundException {
474
484
475
485
checkConfiguration ();
@@ -626,7 +636,7 @@ public Map<String, Object> run(Map<String, Object> args, String crawlId) throws
626
636
else {
627
637
String segmentDir = crawlId +"/segments" ;
628
638
File segmentsDir = new File (segmentDir );
629
- File [] segmentsList = segmentsDir .listFiles ();
639
+ File [] segmentsList = segmentsDir .listFiles ();
630
640
Arrays .sort (segmentsList , (f1 , f2 ) -> {
631
641
if (f1 .lastModified ()>f2 .lastModified ())
632
642
return -1 ;
0 commit comments