File tree Expand file tree Collapse file tree
main/java/edu/uci/ics/crawler4j
test/java/edu/uci/ics/crawler4j/tests/fetcher Expand file tree Collapse file tree Original file line number Diff line number Diff line change 2121 * Several core components of crawler4j extend this class
2222 * to make them configurable.
2323 *
24+ * @deprecated This will be removed without notice.
2425 * @author Yasser Ganjisaffar
2526 */
27+ @ Deprecated
2628public abstract class Configurable {
2729
2830 protected CrawlConfig config ;
Original file line number Diff line number Diff line change 4343 *
4444 * @author Yasser Ganjisaffar
4545 */
46- public class CrawlController extends Configurable {
46+ public class CrawlController {
4747
4848 static final Logger logger = LoggerFactory .getLogger (CrawlController .class );
49+ private final CrawlConfig config ;
4950
5051 /**
5152 * The 'customData' object can be used for passing custom crawl-related
@@ -80,9 +81,9 @@ public class CrawlController extends Configurable {
8081
8182 public CrawlController (CrawlConfig config , PageFetcher pageFetcher ,
8283 RobotstxtServer robotstxtServer ) throws Exception {
83- super (config );
84-
8584 config .validate ();
85+ this .config = config ;
86+
8687 File folder = new File (config .getCrawlStorageFolder ());
8788 if (!folder .exists ()) {
8889 if (folder .mkdirs ()) {
@@ -233,8 +234,6 @@ protected <T extends WebCrawler> void start(final WebCrawlerFactory<T> crawlerFa
233234 }
234235
235236 final CrawlController controller = this ;
236- final CrawlConfig config = this .getConfig ();
237-
238237 Thread monitorThread = new Thread (new Runnable () {
239238
240239 @ Override
@@ -547,4 +546,8 @@ public void shutdown() {
547546 pageFetcher .shutDown ();
548547 frontier .finish ();
549548 }
549+
550+ public CrawlConfig getConfig () {
551+ return config ;
552+ }
550553}
Original file line number Diff line number Diff line change 6161import org .slf4j .Logger ;
6262import org .slf4j .LoggerFactory ;
6363
64- import edu .uci .ics .crawler4j .crawler .Configurable ;
6564import edu .uci .ics .crawler4j .crawler .CrawlConfig ;
6665import edu .uci .ics .crawler4j .crawler .authentication .AuthInfo ;
6766import edu .uci .ics .crawler4j .crawler .authentication .BasicAuthInfo ;
7473/**
7574 * @author Yasser Ganjisaffar
7675 */
77- public class PageFetcher extends Configurable {
76+ public class PageFetcher {
7877 protected static final Logger logger = LoggerFactory .getLogger (PageFetcher .class );
7978 protected final Object mutex = new Object ();
79+ /**
80+ * This field is protected for backward compatibility. Please use the getter method getConfig() to
81+ * read this field.
82+ */
83+ protected final CrawlConfig config ;
8084 protected PoolingHttpClientConnectionManager connectionManager ;
8185 protected CloseableHttpClient httpClient ;
8286 protected long lastFetchTime = 0 ;
8387 protected IdleConnectionMonitorThread connectionMonitorThread = null ;
8488
8589 public PageFetcher (CrawlConfig config ) {
86- super ( config ) ;
90+ this . config = config ;
8791
8892 RequestConfig requestConfig = RequestConfig .custom ()
8993 .setExpectContinueEnabled (false )
@@ -336,4 +340,7 @@ protected HttpUriRequest newHttpUriRequest(String url) {
336340 return new HttpGet (url );
337341 }
338342
343+ protected CrawlConfig getConfig () {
344+ return config ;
345+ }
339346}
Original file line number Diff line number Diff line change 3232import com .sleepycat .je .OperationStatus ;
3333import com .sleepycat .je .Transaction ;
3434
35- import edu .uci .ics .crawler4j .crawler .Configurable ;
3635import edu .uci .ics .crawler4j .crawler .CrawlConfig ;
3736import edu .uci .ics .crawler4j .util .Util ;
3837
3938/**
4039 * @author Yasser Ganjisaffar
4140 */
42- public class Counters extends Configurable {
41+ public class Counters {
4342 private static final Logger logger = LoggerFactory .getLogger (Counters .class );
4443
4544 public static class ReservedCounterNames {
@@ -56,8 +55,6 @@ public static class ReservedCounterNames {
5655 protected Map <String , Long > counterValues ;
5756
5857 public Counters (Environment env , CrawlConfig config ) {
59- super (config );
60-
6158 this .env = env ;
6259 this .counterValues = new HashMap <>();
6360
Original file line number Diff line number Diff line change 2727import com .sleepycat .je .Environment ;
2828import com .sleepycat .je .OperationStatus ;
2929
30- import edu .uci .ics .crawler4j .crawler .Configurable ;
3130import edu .uci .ics .crawler4j .crawler .CrawlConfig ;
3231import edu .uci .ics .crawler4j .util .Util ;
3332
3433/**
3534 * @author Yasser Ganjisaffar
3635 */
3736
38- public class DocIDServer extends Configurable {
37+ public class DocIDServer {
3938 private static final Logger logger = LoggerFactory .getLogger (DocIDServer .class );
4039
4140 private final Database docIDsDB ;
@@ -46,7 +45,6 @@ public class DocIDServer extends Configurable {
4645 private int lastDocID ;
4746
4847 public DocIDServer (Environment env , CrawlConfig config ) {
49- super (config );
5048 DatabaseConfig dbConfig = new DatabaseConfig ();
5149 dbConfig .setAllowCreate (true );
5250 dbConfig .setTransactional (config .isResumableCrawling ());
Original file line number Diff line number Diff line change 2525import com .sleepycat .je .DatabaseException ;
2626import com .sleepycat .je .Environment ;
2727
28- import edu .uci .ics .crawler4j .crawler .Configurable ;
2928import edu .uci .ics .crawler4j .crawler .CrawlConfig ;
3029import edu .uci .ics .crawler4j .url .WebURL ;
3130
3231/**
3332 * @author Yasser Ganjisaffar
3433 */
3534
36- public class Frontier extends Configurable {
35+ public class Frontier {
3736 protected static final Logger logger = LoggerFactory .getLogger (Frontier .class );
3837
3938 private static final String DATABASE_NAME = "PendingURLsDB" ;
4039 private static final int IN_PROCESS_RESCHEDULE_BATCH_SIZE = 100 ;
40+ private final CrawlConfig config ;
4141 protected WorkQueues workQueues ;
4242
4343 protected InProcessPagesDB inProcessPages ;
@@ -52,7 +52,7 @@ public class Frontier extends Configurable {
5252 protected Counters counters ;
5353
5454 public Frontier (Environment env , CrawlConfig config ) {
55- super ( config ) ;
55+ this . config = config ;
5656 this .counters = new Counters (env , config );
5757 try {
5858 workQueues = new WorkQueues (env , DATABASE_NAME , config .isResumableCrawling ());
Original file line number Diff line number Diff line change 3535import org .slf4j .Logger ;
3636import org .slf4j .LoggerFactory ;
3737
38- import edu .uci .ics .crawler4j .crawler .Configurable ;
3938import edu .uci .ics .crawler4j .crawler .CrawlConfig ;
4039import edu .uci .ics .crawler4j .crawler .Page ;
4140import edu .uci .ics .crawler4j .crawler .exceptions .ParseException ;
4746/**
4847 * @author Yasser Ganjisaffar
4948 */
50- public class Parser extends Configurable {
49+ public class Parser {
5150
5251 protected static final Logger logger = LoggerFactory .getLogger (Parser .class );
5352
5453 private final HtmlParser htmlParser ;
5554 private final ParseContext parseContext ;
55+ private final CrawlConfig config ;
5656
5757 public Parser (CrawlConfig config ) throws InstantiationException , IllegalAccessException {
58- super ( config ) ;
58+ this . config = config ;
5959 htmlParser = new HtmlParser ();
6060 parseContext = new ParseContext ();
6161 parseContext .set (HtmlMapper .class , AllTagMapper .class .newInstance ());
Original file line number Diff line number Diff line change @@ -28,8 +28,8 @@ public PageFetchResult fetchPage(WebURL webUrl)
2828
2929 synchronized (mutex ) {
3030 long now = new Date ().getTime ();
31- if (now - this .lastFetchTime < this . config .getPolitenessDelay ()) {
32- Thread .sleep (this . config .getPolitenessDelay () - (now - this .lastFetchTime ));
31+ if (now - this .lastFetchTime < getConfig () .getPolitenessDelay ()) {
32+ Thread .sleep (getConfig () .getPolitenessDelay () - (now - this .lastFetchTime ));
3333 }
3434 this .lastFetchTime = new Date ().getTime ();
3535 }
You can’t perform that action at this time.
0 commit comments