Skip to content

Commit 3acdacc

Browse files
committed
yasserg#145, see details
- no need for core classes to extend Configurable, good old IoC is better - deprecated Configurable
1 parent 684381f commit 3acdacc

8 files changed

Lines changed: 30 additions & 23 deletions

File tree

crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121
* Several core components of crawler4j extend this class
2222
* to make them configurable.
2323
*
24+
* @deprecated This will be removed without notice.
2425
* @author Yasser Ganjisaffar
2526
*/
27+
@Deprecated
2628
public abstract class Configurable {
2729

2830
protected CrawlConfig config;

crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,10 @@
4343
*
4444
* @author Yasser Ganjisaffar
4545
*/
46-
public class CrawlController extends Configurable {
46+
public class CrawlController {
4747

4848
static final Logger logger = LoggerFactory.getLogger(CrawlController.class);
49+
private final CrawlConfig config;
4950

5051
/**
5152
* The 'customData' object can be used for passing custom crawl-related
@@ -80,9 +81,9 @@ public class CrawlController extends Configurable {
8081

8182
public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
8283
RobotstxtServer robotstxtServer) throws Exception {
83-
super(config);
84-
8584
config.validate();
85+
this.config = config;
86+
8687
File folder = new File(config.getCrawlStorageFolder());
8788
if (!folder.exists()) {
8889
if (folder.mkdirs()) {
@@ -233,8 +234,6 @@ protected <T extends WebCrawler> void start(final WebCrawlerFactory<T> crawlerFa
233234
}
234235

235236
final CrawlController controller = this;
236-
final CrawlConfig config = this.getConfig();
237-
238237
Thread monitorThread = new Thread(new Runnable() {
239238

240239
@Override
@@ -547,4 +546,8 @@ public void shutdown() {
547546
pageFetcher.shutDown();
548547
frontier.finish();
549548
}
549+
550+
public CrawlConfig getConfig() {
551+
return config;
552+
}
550553
}

crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@
6161
import org.slf4j.Logger;
6262
import org.slf4j.LoggerFactory;
6363

64-
import edu.uci.ics.crawler4j.crawler.Configurable;
6564
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
6665
import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;
6766
import edu.uci.ics.crawler4j.crawler.authentication.BasicAuthInfo;
@@ -74,16 +73,21 @@
7473
/**
7574
* @author Yasser Ganjisaffar
7675
*/
77-
public class PageFetcher extends Configurable {
76+
public class PageFetcher {
7877
protected static final Logger logger = LoggerFactory.getLogger(PageFetcher.class);
7978
protected final Object mutex = new Object();
79+
/**
80+
* This field is protected for backward compatibility. Please use the getter method getConfig() to
81+
* read this field.
82+
*/
83+
protected final CrawlConfig config;
8084
protected PoolingHttpClientConnectionManager connectionManager;
8185
protected CloseableHttpClient httpClient;
8286
protected long lastFetchTime = 0;
8387
protected IdleConnectionMonitorThread connectionMonitorThread = null;
8488

8589
public PageFetcher(CrawlConfig config) {
86-
super(config);
90+
this.config = config;
8791

8892
RequestConfig requestConfig = RequestConfig.custom()
8993
.setExpectContinueEnabled(false)
@@ -336,4 +340,7 @@ protected HttpUriRequest newHttpUriRequest(String url) {
336340
return new HttpGet(url);
337341
}
338342

343+
protected CrawlConfig getConfig() {
344+
return config;
345+
}
339346
}

crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Counters.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,13 @@
3232
import com.sleepycat.je.OperationStatus;
3333
import com.sleepycat.je.Transaction;
3434

35-
import edu.uci.ics.crawler4j.crawler.Configurable;
3635
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
3736
import edu.uci.ics.crawler4j.util.Util;
3837

3938
/**
4039
* @author Yasser Ganjisaffar
4140
*/
42-
public class Counters extends Configurable {
41+
public class Counters {
4342
private static final Logger logger = LoggerFactory.getLogger(Counters.class);
4443

4544
public static class ReservedCounterNames {
@@ -56,8 +55,6 @@ public static class ReservedCounterNames {
5655
protected Map<String, Long> counterValues;
5756

5857
public Counters(Environment env, CrawlConfig config) {
59-
super(config);
60-
6158
this.env = env;
6259
this.counterValues = new HashMap<>();
6360

crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,14 @@
2727
import com.sleepycat.je.Environment;
2828
import com.sleepycat.je.OperationStatus;
2929

30-
import edu.uci.ics.crawler4j.crawler.Configurable;
3130
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
3231
import edu.uci.ics.crawler4j.util.Util;
3332

3433
/**
3534
* @author Yasser Ganjisaffar
3635
*/
3736

38-
public class DocIDServer extends Configurable {
37+
public class DocIDServer {
3938
private static final Logger logger = LoggerFactory.getLogger(DocIDServer.class);
4039

4140
private final Database docIDsDB;
@@ -46,7 +45,6 @@ public class DocIDServer extends Configurable {
4645
private int lastDocID;
4746

4847
public DocIDServer(Environment env, CrawlConfig config) {
49-
super(config);
5048
DatabaseConfig dbConfig = new DatabaseConfig();
5149
dbConfig.setAllowCreate(true);
5250
dbConfig.setTransactional(config.isResumableCrawling());

crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,19 @@
2525
import com.sleepycat.je.DatabaseException;
2626
import com.sleepycat.je.Environment;
2727

28-
import edu.uci.ics.crawler4j.crawler.Configurable;
2928
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
3029
import edu.uci.ics.crawler4j.url.WebURL;
3130

3231
/**
3332
* @author Yasser Ganjisaffar
3433
*/
3534

36-
public class Frontier extends Configurable {
35+
public class Frontier {
3736
protected static final Logger logger = LoggerFactory.getLogger(Frontier.class);
3837

3938
private static final String DATABASE_NAME = "PendingURLsDB";
4039
private static final int IN_PROCESS_RESCHEDULE_BATCH_SIZE = 100;
40+
private final CrawlConfig config;
4141
protected WorkQueues workQueues;
4242

4343
protected InProcessPagesDB inProcessPages;
@@ -52,7 +52,7 @@ public class Frontier extends Configurable {
5252
protected Counters counters;
5353

5454
public Frontier(Environment env, CrawlConfig config) {
55-
super(config);
55+
this.config = config;
5656
this.counters = new Counters(env, config);
5757
try {
5858
workQueues = new WorkQueues(env, DATABASE_NAME, config.isResumableCrawling());

crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
import org.slf4j.Logger;
3636
import org.slf4j.LoggerFactory;
3737

38-
import edu.uci.ics.crawler4j.crawler.Configurable;
3938
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
4039
import edu.uci.ics.crawler4j.crawler.Page;
4140
import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
@@ -47,15 +46,16 @@
4746
/**
4847
* @author Yasser Ganjisaffar
4948
*/
50-
public class Parser extends Configurable {
49+
public class Parser {
5150

5251
protected static final Logger logger = LoggerFactory.getLogger(Parser.class);
5352

5453
private final HtmlParser htmlParser;
5554
private final ParseContext parseContext;
55+
private final CrawlConfig config;
5656

5757
public Parser(CrawlConfig config) throws InstantiationException, IllegalAccessException {
58-
super(config);
58+
this.config = config;
5959
htmlParser = new HtmlParser();
6060
parseContext = new ParseContext();
6161
parseContext.set(HtmlMapper.class, AllTagMapper.class.newInstance());

crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlOnly.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ public PageFetchResult fetchPage(WebURL webUrl)
2828

2929
synchronized (mutex) {
3030
long now = new Date().getTime();
31-
if (now - this.lastFetchTime < this.config.getPolitenessDelay()) {
32-
Thread.sleep(this.config.getPolitenessDelay() - (now - this.lastFetchTime));
31+
if (now - this.lastFetchTime < getConfig().getPolitenessDelay()) {
32+
Thread.sleep(getConfig().getPolitenessDelay() - (now - this.lastFetchTime));
3333
}
3434
this.lastFetchTime = new Date().getTime();
3535
}

0 commit comments

Comments
 (0)