Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
404 changes: 404 additions & 0 deletions eclipse-formatter.xml

Large diffs are not rendered by default.

139 changes: 83 additions & 56 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<?xml version="1.0" encoding="UTF-8"?>
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.commoncrawl</groupId>
Expand Down Expand Up @@ -29,60 +32,6 @@
<junit.jupiter.version>5.13.4</junit.jupiter.version>
</properties>

<build>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.14.1</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.5.4</version>
<configuration>
<includes>
<include>**/Test*.java</include>
<include>**/*Test.java</include>
</includes>
<properties>
<excludeTags>performance</excludeTags>
</properties>
<argLine>
<!-- required for Java 17 -->
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
</argLine>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.8.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<finalName>${project.artifactId}-${project.version}</finalName>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>


<dependencyManagement>
<dependencies>
<dependency>
Expand Down Expand Up @@ -167,4 +116,82 @@
</dependency>

</dependencies>

<build>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.14.1</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.5.4</version>
<configuration>
<includes>
<include>**/Test*.java</include>
<include>**/*Test.java</include>
</includes>
<properties>
<excludeTags>performance</excludeTags>
</properties>
<argLine>
<!-- required for Java 17 -->
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
</argLine>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.8.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<finalName>${project.artifactId}-${project.version}</finalName>
</configuration>
<executions>
<execution>
<goals>
<goal>single</goal>
</goals>
<phase>package</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>2.46.1</version>
<configuration>
<pom>
<!-- These are the defaults, you can override if you want -->
<includes>
<include>pom.xml</include>
</includes>
<sortPom>
<indentAttribute>all</indentAttribute>
<keepBlankLines>true</keepBlankLines>
<expandEmptyElements>false</expandEmptyElements>
<nrOfIndentSpace>-1</nrOfIndentSpace>
<predefinedSortOrder>recommended_2008_06</predefinedSortOrder>
</sortPom>
</pom>
<java>
<eclipse>
<file>${project.basedir}/eclipse-formatter.xml</file>
</eclipse>
</java>
</configuration>
</plugin>
</plugins>
</build>
</project>
41 changes: 21 additions & 20 deletions src/main/java/org/commoncrawl/net/HostName.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,7 @@ public class HostName {
private static final Logger LOG = LoggerFactory.getLogger(HostName.class);

public static enum Type {
hostname,
IPv4,
IPv6
hostname, IPv4, IPv6
}

private Type type;
Expand All @@ -70,7 +68,10 @@ public static enum Type {
public static final Pattern IPV4_ADDRESS_PATTERN_VARIANT_DECIMAL = Pattern
.compile("(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){0,3}(?:[0-9]+)");

/** Lazy pattern to catch IPv6 addresses (or what looks similar, does not validate) */
/**
* Lazy pattern to catch IPv6 addresses (or what looks similar, does not
* validate)
*/
public static final Pattern IPV6_ADDRESS_PATTERN = Pattern.compile("\\[[0-9a-fA-F:]+\\]");

public HostName(String hostName) {
Expand Down Expand Up @@ -148,7 +149,7 @@ private void setHostName(String name) {
}
}
if (hostName.endsWith(".")) {
hostName = hostName.substring(0, hostName.length()-1);
hostName = hostName.substring(0, hostName.length() - 1);
}
revHost = reverseHost(hostName);
EffectiveTLD privateETld = EffectiveTldFinder.getEffectiveTLD(hostName, false);
Expand Down Expand Up @@ -218,7 +219,7 @@ public String getHostNameReversed() {
*/
public static String[] reverseHost(String hostName) {
String[] rev = SPLIT_HOST_PATTERN.split(hostName);
for (int i = 0; i < (rev.length/2); i++) {
for (int i = 0; i < (rev.length / 2); i++) {
String temp = rev[i];
rev[i] = rev[rev.length - i - 1];
rev[rev.length - i - 1] = temp;
Expand Down Expand Up @@ -250,7 +251,8 @@ private String canonicalizeIpAddress(String ipAddrStr) throws IllegalArgumentExc
* otherwise specified):
* <ol>
* <li>host name
* <li>parts 1 - 5 of the reversed host name (first part is the top-level domain)
* <li>parts 1 - 5 of the reversed host name (first part is the top-level
* domain)
* <li>registry suffix
* <li>domain name below registry suffix
* <li>private suffix
Expand All @@ -264,18 +266,17 @@ private String canonicalizeIpAddress(String ipAddrStr) throws IllegalArgumentExc
* @return row
*/
public Row asRow() {
return RowFactory.create(
hostName,
((revHost != null && revHost.length > 0) ? revHost[0] : null),
((revHost != null && revHost.length > 1) ? revHost[1] : null),
((revHost != null && revHost.length > 2) ? revHost[2] : null),
((revHost != null && revHost.length > 3) ? revHost[3] : null),
((revHost != null && revHost.length > 4) ? revHost[4] : null),
getRegistrySuffix(),
getDomainNameUnderRegistrySuffix(),
getPrivateSuffix(),
getPrivateDomainName(),
getHostNameReversed()
);
return RowFactory.create( //
hostName, //
((revHost != null && revHost.length > 0) ? revHost[0] : null), //
((revHost != null && revHost.length > 1) ? revHost[1] : null), //
((revHost != null && revHost.length > 2) ? revHost[2] : null), //
((revHost != null && revHost.length > 3) ? revHost[3] : null), //
((revHost != null && revHost.length > 4) ? revHost[4] : null), //
getRegistrySuffix(), //
getDomainNameUnderRegistrySuffix(), //
getPrivateSuffix(), //
getPrivateDomainName(), //
getHostNameReversed());
}
}
45 changes: 22 additions & 23 deletions src/main/java/org/commoncrawl/spark/CCIndex2Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public CdxLine(String line) throws IOException {
mime = getString("mime");
mimeDetected = getString("mime-detected");

filename = getString("filename");
filename = getString("filename");
offset = getInt("offset");
length = getInt("length");
status = getHttpStatus("status");
Expand Down Expand Up @@ -90,37 +90,36 @@ public static Row convertCdxLine(String line) {
}
if (useBuiltinNestedSchema) {
// Note: the row layout must be congruent with the built-in schema
return RowFactory.create(
RowFactory.create(
cdx.urlkey,
cdx.uri.getUrlString(),
cdx.uri.getHostName().asRow(),
cdx.uri.getProtocol(),
cdx.uri.getPort(),
cdx.uri.getPath(),
cdx.uri.getQuery()),
RowFactory.create(cdx.timestamp, cdx.status, cdx.redirect),
RowFactory.create(cdx.digest, cdx.mime, cdx.mimeDetected,
cdx.charset, cdx.languages, cdx.truncated),
RowFactory.create(cdx.filename, cdx.offset, cdx.length, cdx.segment),
return RowFactory.create(RowFactory.create( //
cdx.urlkey, //
cdx.uri.getUrlString(), //
cdx.uri.getHostName().asRow(), //
cdx.uri.getProtocol(), //
cdx.uri.getPort(), //
cdx.uri.getPath(), //
cdx.uri.getQuery()), //
RowFactory.create(cdx.timestamp, cdx.status, cdx.redirect), //
RowFactory.create(cdx.digest, cdx.mime, cdx.mimeDetected, cdx.charset, cdx.languages,
cdx.truncated), //
RowFactory.create(cdx.filename, cdx.offset, cdx.length, cdx.segment), //
cdx.crawl, cdx.subset);
} else {
Row h = cdx.uri.getHostName().asRow();
return RowFactory.create(
// SURT and complete URL
cdx.urlkey,
cdx.urlkey, //
cdx.uri.getUrlString(),
// host
h.get(0), h.get(1),
h.get(2), h.get(3),
h.get(4), h.get(5),
h.get(6), h.get(7),
h.get(8), h.get(9),
h.get(0), h.get(1), //
h.get(2), h.get(3), //
h.get(4), h.get(5), //
h.get(6), h.get(7), //
h.get(8), h.get(9), //
h.get(10),
// URL components
cdx.uri.getProtocol(),
cdx.uri.getPort(),
cdx.uri.getPath(),
cdx.uri.getProtocol(), //
cdx.uri.getPort(), //
cdx.uri.getPath(), //
cdx.uri.getQuery(),
// fetch info
cdx.timestamp, cdx.status,
Expand Down
34 changes: 16 additions & 18 deletions src/main/java/org/commoncrawl/spark/EOTIndexTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,11 @@
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
* Convert End of Term Web Archive's CDX index into a tabular format.
*/
Expand All @@ -35,8 +34,7 @@ public class EOTIndexTable extends IndexTable {
private static final Logger LOG = LoggerFactory.getLogger(EOTIndexTable.class);
protected String name = EOTIndexTable.class.getCanonicalName();

protected static final Pattern filenameAnalyzer = Pattern
.compile("^crawl-data/([^/]+)/segments/([^/]+)/(warc)/");
protected static final Pattern filenameAnalyzer = Pattern.compile("^crawl-data/([^/]+)/segments/([^/]+)/(warc)/");

protected static class CdxLine extends IndexTable.CdxLine {
String digest;
Expand All @@ -54,7 +52,7 @@ public CdxLine(String line) throws IOException {
digest = getString("digest");
mime = getString("mime");

filename = getString("filename");
filename = getString("filename");
offset = getLong("offset");
length = getLong("length");
status = getHttpStatus("status");
Expand All @@ -79,22 +77,22 @@ public static Row convertCdxLine(String line) {
return null;
}
Row h = cdx.uri.getHostName().asRow();
return RowFactory.create(
return RowFactory.create( //
// SURT and complete URL
cdx.urlkey,
cdx.uri.getUrlString(),
cdx.urlkey, //
cdx.uri.getUrlString(), //
// host
h.get(0), h.get(1),
h.get(2), h.get(3),
h.get(4), h.get(5),
h.get(6), h.get(7),
h.get(8), h.get(9),
h.get(10),
h.get(0), h.get(1), //
h.get(2), h.get(3), //
h.get(4), h.get(5), //
h.get(6), h.get(7), //
h.get(8), h.get(9), //
h.get(10), //
// URL components
cdx.uri.getProtocol(),
cdx.uri.getPort(),
cdx.uri.getPath(),
cdx.uri.getQuery(),
cdx.uri.getProtocol(), //
cdx.uri.getPort(), //
cdx.uri.getPath(), //
cdx.uri.getQuery(), //
// fetch info
cdx.timestamp, cdx.status,
// content-related
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/commoncrawl/spark/IndexTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ public void run(String inputPaths, String outputPath, Function<String, Row> mapI
String[] partitionColumns = {};
if (!partitionBy.trim().isEmpty()) {
partitionColumns = partitionBy.trim().split("\\s*,\\s*");
Column[] pCols = new Column[partitionColumns.length + 1];
Column[] pCols = new Column[partitionColumns.length + 1];
for (int i = 0; i < partitionColumns.length; i++) {
pCols[i] = df.col(partitionColumns[i]);
}
Expand Down Expand Up @@ -538,7 +538,7 @@ public void run(String[] args) throws IOException {
String inputPaths = arguments[0];
String outputPath = arguments[1];

if ("orc".equals(outputFormat) && "gzip".equals(outputCompression) ) {
if ("orc".equals(outputFormat) && "gzip".equals(outputCompression)) {
// gzip for Parquet, zlib for ORC
outputCompression = "zlib";
}
Expand Down
Loading