diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java index aee1144cd..7b7ddcda5 100644 --- a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java +++ b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java @@ -279,7 +279,7 @@ public void run() { if (!someoneIsWorking) { if (!shuttingDown) { long queueLength = frontier.getQueueLength(); - if (queueLength > 0) { + if (!frontier.isFinished() && queueLength > 0) { continue; } logger.info( @@ -287,7 +287,7 @@ public void run() { "sure..."); sleep(10); queueLength = frontier.getQueueLength(); - if (queueLength > 0) { + if (!frontier.isFinished() && queueLength > 0) { continue; } } diff --git a/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java b/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java index 6a43d5e35..59a8b216c 100644 --- a/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java +++ b/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java @@ -48,6 +48,7 @@ public class Frontier extends Configurable { protected boolean isFinished = false; protected long scheduledPages; + protected long fetchedPages = 0; protected Counters counters; @@ -82,7 +83,7 @@ public Frontier(Environment env, CrawlConfig config) { } public void scheduleAll(List urls) { - int maxPagesToFetch = config.getMaxPagesToFetch(); + int maxPagesToFetch = -1; // was config.getMaxPagesToFetch(); the page limit is now enforced at fetch time in getNextURLs() synchronized (mutex) { int newScheduledPage = 0; for (WebURL url : urls) { @@ -108,7 +109,7 @@ public void scheduleAll(List urls) { } public void schedule(WebURL url) { - int maxPagesToFetch = config.getMaxPagesToFetch(); + int maxPagesToFetch = -1; // was config.getMaxPagesToFetch(); the page limit is now enforced at fetch time in getNextURLs() synchronized (mutex) { try { if (maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch) { @@ -123,8 +124,11 @@ public void schedule(WebURL url) { } public void getNextURLs(int max, List result) { + int maxPagesToFetch = config.getMaxPagesToFetch(); 
while (true) { synchronized (mutex) { + if (maxPagesToFetch > 0 && fetchedPages >= maxPagesToFetch) + finish(); if (isFinished) { return; } @@ -160,6 +164,7 @@ public void getNextURLs(int max, List result) { } public void setProcessed(WebURL webURL) { + synchronized (mutex) { fetchedPages++; } counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES); if (inProcessPages != null) { if (!inProcessPages.removeURL(webURL)) {