Skip to content

Commit 3c30e7a

Browse files
committed
Merge yasserg#155 into refactored codebase
2 parents: c7f62d0 + ccf2cc2; commit: 3c30e7a

2 files changed

Lines changed: 9 additions & 4 deletions

File tree

crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -298,7 +298,7 @@ public void run() {
298298
if (!someoneIsWorking) {
299299
if (!shuttingDown) {
300300
long queueLength = frontier.getQueueLength();
301-
if (queueLength > 0) {
301+
if (! frontier.isFinished() && queueLength > 0) {
302302
continue;
303303
}
304304
logger.info(
@@ -308,7 +308,7 @@ public void run() {
308308
" seconds to make sure...");
309309
sleep(config.getThreadShutdownDelaySeconds());
310310
queueLength = frontier.getQueueLength();
311-
if (queueLength > 0) {
311+
if (! frontier.isFinished() && queueLength > 0) {
312312
continue;
313313
}
314314
}

crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ public class Frontier {
4848
protected boolean isFinished = false;
4949

5050
protected long scheduledPages;
51+
protected long fetchedPages = 0;
5152

5253
protected Counters counters;
5354

@@ -83,7 +84,7 @@ public Frontier(Environment env, CrawlConfig config) {
8384
}
8485

8586
public void scheduleAll(List<WebURL> urls) {
86-
int maxPagesToFetch = config.getMaxPagesToFetch();
87+
int maxPagesToFetch = -1; //config.getMaxPagesToFetch();
8788
synchronized (mutex) {
8889
int newScheduledPage = 0;
8990
for (WebURL url : urls) {
@@ -110,7 +111,7 @@ public void scheduleAll(List<WebURL> urls) {
110111
}
111112

112113
public void schedule(WebURL url) {
113-
int maxPagesToFetch = config.getMaxPagesToFetch();
114+
int maxPagesToFetch = -1; //config.getMaxPagesToFetch();
114115
synchronized (mutex) {
115116
try {
116117
if (maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch) {
@@ -125,8 +126,11 @@ public void schedule(WebURL url) {
125126
}
126127

127128
public void getNextURLs(int max, List<WebURL> result) {
129+
int maxPagesToFetch = config.getMaxPagesToFetch();
128130
while (true) {
129131
synchronized (mutex) {
132+
if (maxPagesToFetch > 0 && fetchedPages > maxPagesToFetch)
133+
finish();
130134
if (isFinished) {
131135
return;
132136
}
@@ -162,6 +166,7 @@ public void getNextURLs(int max, List<WebURL> result) {
162166
}
163167

164168
public void setProcessed(WebURL webURL) {
169+
fetchedPages++;
165170
counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES);
166171
if (inProcessPages != null) {
167172
if (!inProcessPages.removeURL(webURL)) {

0 commit comments

Comments
 (0)