Skip to content

Commit bdbdc3e

Browse files
committed
Merge pull request #54 from MadEgg/binary-content
Added configuration parameter processBinaryContentInCrawling
2 parents 373f09d + 705cac0 commit bdbdc3e

2 files changed

Lines changed: 21 additions & 1 deletion

File tree

src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,11 @@ public class CrawlConfig {
7979
* Should we fetch binary content such as images, audio, ...?
8080
*/
8181
private boolean includeBinaryContentInCrawling = false;
82+
83+
/**
84+
* Should we process binary content such as images, audio, ... using Tika?
85+
*/
86+
private boolean processBinaryContentInCrawling = false;
8287

8388
/**
8489
* Maximum Connections per host
@@ -306,6 +311,17 @@ public boolean isIncludeBinaryContentInCrawling() {
306311
public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) {
307312
this.includeBinaryContentInCrawling = includeBinaryContentInCrawling;
308313
}
314+
315+
public boolean isProcessBinaryContentInCrawling() {
316+
return processBinaryContentInCrawling;
317+
}
318+
319+
/**
320+
* Should we process binary content such as images, audio, ... using Tika?
321+
*/
322+
public void setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling) {
323+
this.processBinaryContentInCrawling = processBinaryContentInCrawling;
324+
}
309325

310326
public int getMaxConnectionsPerHost() {
311327
return maxConnectionsPerHost;

src/main/java/edu/uci/ics/crawler4j/parser/Parser.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,11 @@ public void parse(Page page, String contextURL) throws NotAllowedContentExceptio
6060
if (Util.hasBinaryContent(page.getContentType())) { // BINARY
6161
BinaryParseData parseData = new BinaryParseData();
6262
if (config.isIncludeBinaryContentInCrawling()) {
63-
parseData.setBinaryContent(page.getContentData());
63+
if (config.isProcessBinaryContentInCrawling()) {
64+
parseData.setBinaryContent(page.getContentData());
65+
} else {
66+
parseData.setHtml("<html></html>");
67+
}
6468
page.setParseData(parseData);
6569
if (parseData.getHtml() == null) {
6670
throw new ParseException();

0 commit comments

Comments
 (0)