28 changes: 28 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,28 @@
version: "3.7"

services:

# ----------------------------------------
# Build the indexer
# ----------------------------------------

indexer:
build:
context: .
args:
http_proxy: ${HTTP_PROXY}
https_proxy: ${HTTPS_PROXY}
environment:
- "SOLR_URL=http://warc-solr:8983/solr/discovery"


# ----------------------------------------
# Solr instances for running tests against
# ----------------------------------------

warc-solr:
image: ukwa/webarchive-discovery-solr:master
ports:
- 8983:8983
volumes:
- $PWD/tmp/logs:/opt/solr/server/logs
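
For context, the indexer container reads its Solr endpoint from the SOLR_URL environment variable set above, pointing at the warc-solr test instance. A minimal SolrJ sketch of consuming that variable is shown below; the class name and the smoke-test document are illustrative assumptions, not the project's actual entry point.

    import java.io.IOException;

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.common.SolrInputDocument;

    public class SolrUrlSmokeTest {
        public static void main(String[] args) throws IOException, SolrServerException {
            // Fall back to the docker-compose default if SOLR_URL is not set.
            String solrUrl = System.getenv().getOrDefault("SOLR_URL",
                    "http://warc-solr:8983/solr/discovery");
            try (SolrClient solr = new HttpSolrClient.Builder(solrUrl).build()) {
                SolrInputDocument doc = new SolrInputDocument();
                doc.addField("id", "smoke-test-1");
                solr.add(doc);
                solr.commit();
            }
        }
    }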
21 changes: 14 additions & 7 deletions pom.xml
@@ -52,14 +52,16 @@
<url>https://github.com/ukwa/webarchive-discovery.git</url>
</scm>
<properties>
<solr.version>8.7.0</solr.version>
<solr.version>8.11.0</solr.version>
<webarchive.commons.version>1.1.9</webarchive.commons.version>
<jdk.version>1.8</jdk.version>
<hadoop.version>0.20.2</hadoop.version>
<hadoop.version>3.3.4</hadoop.version>
<spark.version>3.3.0</spark.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<ram>1024</ram>
<jackson.version>2.11.3</jackson.version>
<slf4j.version.override>1.5.11</slf4j.version.override> <!-- last version compatible with ancient Hadoop -->
<jackson.version>2.13.3</jackson.version>
<guava.version>18.0</guava.version>
<slf4j.version.override>1.7.32</slf4j.version.override> <!-- last version compatible with Hadoop/Spark -->
<log4j.version>2.19.0</log4j.version>
</properties>
<build>
@@ -85,8 +87,7 @@
<!-- Show 100% of the lines from the stack trace (doesn't
work) -->
<trimStackTrace>false</trimStackTrace>
<forkMode>always</forkMode>
<argLine>-Xms${ram}m -Xmx${ram}m</argLine>
<argLine>-Xms${ram}m -Xmx${ram}m --illegal-access=warn</argLine>
</configuration>
<executions>
<execution>
@@ -168,7 +169,7 @@
</build>

<dependencies>
<!-- Logging-->
<!-- Logging and testing -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
@@ -179,5 +180,11 @@
<artifactId>jcl-over-slf4j</artifactId>
<version>${slf4j.version.override}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
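
The root POM keeps routing legacy commons-logging calls through SLF4J via jcl-over-slf4j, while the indexer module (below) binds SLF4J to Log4j 2. As a rough illustration, assuming a binding such as log4j-slf4j-impl is on the runtime classpath, code written against either API ends up in the same backend:

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class LoggingBridgeExample {
        // Older Hadoop/Solr-era code often logs via commons-logging...
        private static final Log JCL_LOG = LogFactory.getLog(LoggingBridgeExample.class);
        // ...while newer code logs via the SLF4J API directly.
        private static final Logger SLF4J_LOG = LoggerFactory.getLogger(LoggingBridgeExample.class);

        public static void main(String[] args) {
            // With jcl-over-slf4j on the classpath, both calls are bridged to SLF4J,
            // so a single Log4j 2 configuration covers everything.
            JCL_LOG.info("via commons-logging (bridged by jcl-over-slf4j)");
            SLF4J_LOG.info("via slf4j-api");
        }
    }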
153 changes: 124 additions & 29 deletions warc-hadoop-indexer/pom.xml
@@ -41,20 +41,67 @@
</execution>
</executions>
</plugin>

<!-- Shaded job jar, roughly following https://cloud.google.com/blog/products/data-analytics/managing-java-dependencies-apache-spark-applications-cloud-dataproc -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<finalName>${project.artifactId}-${project.version}-shaded-job</finalName>
<transformers>
<!-- in case there are multiple parser services jars -->
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<relocations>
<relocation>
<!-- avoid Guava version conflict -->
<pattern>com</pattern>
<shadedPattern>uk.bl.wa.shaded.com</shadedPattern>
<includes>
<include>com.google.common.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>

<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-1.2-api</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<version>${log4j.version}</version>
</dependency>

<dependency>
<groupId>uk.bl.wa.discovery</groupId>
@@ -77,6 +124,18 @@
<groupId>org.apache.logging.log4j</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>bouncycastle</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.bouncycastle</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.antlr</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
@@ -85,24 +144,60 @@
<version>${solr.version}</version>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-test</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.mrunit</groupId>
<artifactId>mrunit</artifactId>
<version>0.9.0-incubating</version>
<classifier>hadoop1</classifier>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-minicluster</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
</dependency>
<!-- The following dependencies are needed for the Hadoop MiniCluster test jobs to run.
They come from the hadoop-client dependency tree, but apparently still need to be declared explicitly here. -->
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-all</artifactId>
<version>4.1.74.Final</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcprov-jdk15on</artifactId>
<version>1.60</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>2.28.2</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
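
The shade plugin configured above relocates com.google.common into uk.bl.wa.shaded.com.google.common so that the Guava bundled in the job jar cannot clash with the older Guava on a Hadoop cluster's classpath. Source code keeps importing Guava as usual; only the bytecode in the shaded job jar is rewritten. A sketch of the kind of code this protects (illustrative only):

    import java.util.List;

    import com.google.common.base.Splitter;

    public class GuavaRelocationExample {
        public static void main(String[] args) {
            // Compiled against plain Guava; in the shaded job jar this reference is
            // rewritten to uk.bl.wa.shaded.com.google.common.base.Splitter, so the
            // version shipped by the cluster no longer matters.
            List<String> parts = Splitter.on(',').trimResults().splitToList("a, b ,c");
            System.out.println(parts); // prints [a, b, c]
        }
    }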
@@ -205,7 +205,7 @@ public int run(String[] args) throws IOException, ParseException,
return 0;
}

private void setup(String[] args, JobConf conf) throws ParseException {
private void setup(String[] args, JobConf conf) throws ParseException, IOException {
// Process Hadoop args first:
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
@@ -108,7 +108,7 @@ public void configure( JobConf job ) {
LOG.info("Got task.id " + mapTaskId + " and input.file " + inputFile);

// Set up a decent font cache location for PDFBox
System.setProperty("pdfbox.fontcache", job.get("mapred.child.tmp"));
System.setProperty("pdfbox.fontcache", job.get("mapreduce.task.tmp.dir", "/tmp"));
}

private void configureAnnotations() {
@@ -135,6 +135,7 @@ public void reduce(IntWritable key, Iterator<WritableSolrRecord> values,
noValues++;

if (!opts.dummyRun) {
//solr.getSolrDocument().writeMap(ew);
docs.add(solr.getSolrDocument());
// Have we exceeded the batchSize?
checkSubmission(docs, opts.solr.batchSize, reporter);
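
The reducer above buffers documents and flushes them to Solr once opts.solr.batchSize is reached (via checkSubmission). A rough sketch of that batching pattern with SolrJ, using illustrative names rather than the project's actual helper:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.common.SolrInputDocument;

    public class BatchSubmitter {
        private final SolrClient solr;
        private final int batchSize;
        private final List<SolrInputDocument> docs = new ArrayList<>();

        public BatchSubmitter(SolrClient solr, int batchSize) {
            this.solr = solr;
            this.batchSize = batchSize;
        }

        // Queue a document, flushing to Solr whenever the batch is full.
        public void add(SolrInputDocument doc) throws IOException, SolrServerException {
            docs.add(doc);
            if (docs.size() >= batchSize) {
                flush();
            }
        }

        // Send any queued documents and clear the buffer.
        public void flush() throws IOException, SolrServerException {
            if (!docs.isEmpty()) {
                solr.add(docs);
                docs.clear();
            }
        }
    }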
@@ -42,7 +42,7 @@
@SuppressWarnings( "deprecation" )
public class OutlinkExtractor extends Configured implements Tool {

private boolean wait = false;
private boolean wait = true;

protected void createJobConf(JobConf conf, String[] args)
throws IOException {
@@ -61,6 +61,9 @@ protected void createJobConf(JobConf conf, String[] args)

conf.setOutputKeyClass( Text.class );
conf.setOutputValueClass( Text.class );

// FIXME Hard-coded to a single reducer for now:
conf.setNumReduceTasks(1);
}

public int run(String[] args) throws IOException {
@@ -76,6 +79,7 @@ public int run(String[] args) throws IOException {
} else {
JobClient client = new JobClient(conf);
client.submitJob(conf);
client.close();
}
return 0;
}
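
The wait flag toggled above selects between blocking until the MapReduce job finishes and simply submitting it and returning. A hedged sketch of the two modes with the old mapred API (method names here are mine, not the project's):

    import java.io.IOException;

    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.RunningJob;

    public class SubmitModes {
        // Blocks until the job completes, reporting progress (what wait = true selects).
        static void runAndWait(JobConf conf) throws IOException {
            JobClient.runJob(conf);
        }

        // Fire-and-forget: submit the job and return immediately (wait = false).
        static void submitOnly(JobConf conf) throws IOException {
            JobClient client = new JobClient(conf);
            try {
                RunningJob job = client.submitJob(conf);
                System.out.println("Submitted job " + job.getID());
            } finally {
                client.close();
            }
        }
    }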