diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index c11df693c..f822b5602 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -49,12 +49,10 @@ public class Page {
private byte[] bytes;
- private List targetRequests = new ArrayList();
+ private List targetRequests = new ArrayList<>();
private String charset;
- public Page() {
- }
public static Page fail(){
Page page = new Page();
@@ -105,9 +103,9 @@ public Json getJson() {
/**
* @param html html
- * @deprecated since 0.4.0
* The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead.
*/
+
public void setHtml(Html html) {
this.html = html;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 886e74a92..d9117ce2f 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -62,7 +62,7 @@ public class Spider implements Runnable, Task {
protected Downloader downloader;
- protected List pipelines = new ArrayList();
+ protected List pipelines = new ArrayList<>();
protected PageProcessor pageProcessor;
@@ -86,11 +86,11 @@ public class Spider implements Runnable, Task {
protected boolean exitWhenComplete = true;
- protected final static int STAT_INIT = 0;
+ protected static final int STAT_INIT = 0;
- protected final static int STAT_RUNNING = 1;
+ protected static final int STAT_RUNNING = 1;
- protected final static int STAT_STOPPED = 2;
+ protected static final int STAT_STOPPED = 2;
protected boolean spawnUrl = true;
@@ -246,7 +246,7 @@ public Spider setPipelines(List pipelines) {
* @return this
*/
public Spider clearPipeline() {
- pipelines = new ArrayList();
+ pipelines = new ArrayList<>();
return this;
}
@@ -313,7 +313,8 @@ public void run() {
// wait until new url added
waitNewUrl();
} else {
- threadPool.execute(new Runnable() {
+ threadPool.execute(
+ new Runnable() {
@Override
public void run() {
try {
@@ -427,7 +428,6 @@ private void onDownloadSuccess(Request request, Page page) {
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
sleep(site.getSleepTime());
- return;
}
private void onDownloaderFail(Request request) {
@@ -458,6 +458,8 @@ protected void sleep(int time) {
Thread.sleep(time);
} catch (InterruptedException e) {
logger.error("Thread interrupted when sleep",e);
+ // Restore the interrupt flag that catching InterruptedException cleared.
+ Thread.currentThread().interrupt();
}
}
@@ -564,6 +566,7 @@ private void waitNewUrl() {
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
logger.warn("waitNewUrl - interrupted, error {}", e);
+ Thread.currentThread().interrupt();
} finally {
newUrlLock.unlock();
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
index ff8e26998..e4ffd38d1 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
@@ -15,7 +15,9 @@
*/
@Experimental
public class SmartContentSelector implements Selector {
-
+ /**
+ * Default constructor; performs no initialization.
+ */
public SmartContentSelector() {
}
@@ -33,7 +35,7 @@ public String select(String html) {
int start;
int end;
StringBuilder text = new StringBuilder();
- ArrayList indexDistribution = new ArrayList();
+ ArrayList indexDistribution = new ArrayList<>();
lines = Arrays.asList(html.split("\n"));
@@ -47,39 +49,42 @@ public String select(String html) {
}
start = -1; end = -1;
- boolean boolstart = false, boolend = false;
+ boolean boolstart = false;
+ boolean boolend = false;
text.setLength(0);
-
- for (int i = 0; i < indexDistribution.size() - 1; i++) {
- if (indexDistribution.get(i) > threshold && ! boolstart) {
- if (indexDistribution.get(i+1).intValue() != 0
+ // Walk indexDistribution, collecting spans of lines into text; i is advanced manually.
+ int i=0;
+ while (i < indexDistribution.size() - 1) {
+ // A span starts at i when entry i exceeds threshold and any of the next three entries is non-zero.
+ if ((indexDistribution.get(i) > threshold && ! boolstart)
+ && (indexDistribution.get(i+1).intValue() != 0
|| indexDistribution.get(i+2).intValue() != 0
- || indexDistribution.get(i+3).intValue() != 0) {
+ || indexDistribution.get(i+3).intValue() != 0) ){
boolstart = true;
start = i;
- continue;
+ i++; continue; // while-loop equivalent of the removed for-loop 'continue'
}
- }
- if (boolstart) {
- if (indexDistribution.get(i).intValue() == 0
- || indexDistribution.get(i+1).intValue() == 0) {
+ // An open span ends at i when entry i or entry i+1 is zero.
+ if ((boolstart) && (indexDistribution.get(i).intValue() == 0
+ || indexDistribution.get(i+1).intValue() == 0) ){
end = i;
boolend = true;
}
- }
+ // Once a span is closed, copy its lines of at least 5 characters into the result.
+
StringBuilder tmp = new StringBuilder();
if (boolend) {
- //System.out.println(start+1 + "\t\t" + end+1);
for (int ii = start; ii <= end; ii++) {
- if (lines.get(ii).length() < 5) continue;
+ if (lines.get(ii).length() < 5) continue; // skip short lines; must not touch the outer index i
tmp.append(lines.get(ii) + "\n");
}
String str = tmp.toString();
- //System.out.println(str);
- if (str.contains("Copyright") ) continue;
+ // Spans containing "Copyright" are not appended; the span flags stay set, as before the refactor.
+ if (str.contains("Copyright")) { i++; continue; }
text.append(str);
boolstart = boolend = false;
}
+ i++;
+ }
return text.toString();
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java
index 79b9efece..721401f8b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java
@@ -11,6 +11,14 @@
public class FilePersistentBase {
protected String path;
+
+ public FilePersistentBase() {
+ setPath("/data/webmagic/");
+ }
+
+ public FilePersistentBase(String path) {
+ setPath(path);
+ }
public static String PATH_SEPERATOR = "/";
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
index 0db9b819d..22d818217 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
@@ -28,11 +28,11 @@ public class FilePageModelPipeline extends FilePersistentBase implements PageMod
* new JsonFilePageModelPipeline with default path "/data/webmagic/"
*/
public FilePageModelPipeline() {
- setPath("/data/webmagic/");
+ super();
}
public FilePageModelPipeline(String path) {
- setPath(path);
+ super(path);
}
@Override
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
index 7a7f80a25..7b3b3035c 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
@@ -29,11 +29,11 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag
* new JsonFilePageModelPipeline with default path "/data/webmagic/"
*/
public JsonFilePageModelPipeline() {
- setPath("/data/webmagic/");
+ super();
}
public JsonFilePageModelPipeline(String path) {
- setPath(path);
+ super(path);
}
@Override
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java
index 3d416964b..dafb7ce70 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java
@@ -12,7 +12,7 @@
*/
public abstract class IPUtils {
- public static String getFirstNoLoopbackIPAddresses() throws SocketException {
+ public static String getFirstNoLoopbackIPAddresses() throws SocketException, NullPointerException{
Enumeration networkInterfaces = NetworkInterface.getNetworkInterfaces();