Skip to content

Commit 7828aa0

Browse files
committed
ExtractTool: Add --concurrent option
1 parent 55353a7 commit 7828aa0

File tree

2 files changed

+79
-17
lines changed

2 files changed

+79
-17
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package org.netpreserve.jwarc;
2+
3+
import java.net.URI;
4+
import java.util.HashSet;
5+
import java.util.Set;
6+
7+
/**
8+
* A set for testing whether WARC records are concurrent (i.e. part of the same capture event).
9+
*/
10+
public class ConcurrentRecordSet {
11+
private final Set<URI> set = new HashSet<>();
12+
13+
/**
14+
* Adds a record to the set.
15+
*/
16+
public void add(WarcRecord record) {
17+
set.add(record.id());
18+
if (record instanceof WarcCaptureRecord) {
19+
set.addAll(((WarcCaptureRecord) record).concurrentTo());
20+
}
21+
}
22+
23+
/**
24+
* Tests if the given record is concurrent to any previously added record.
25+
*/
26+
public boolean contains(WarcRecord record) {
27+
if (set.contains(record.id())) return true;
28+
if (record instanceof WarcCaptureRecord) {
29+
for (URI id : ((WarcCaptureRecord) record).concurrentTo()) {
30+
if (set.contains(id)) return true;
31+
}
32+
}
33+
return false;
34+
}
35+
36+
/**
37+
* Removes all records from the set.
38+
*/
39+
public void clear() {
40+
set.clear();
41+
}
42+
}

src/org/netpreserve/jwarc/tools/ExtractTool.java

+37-17
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@
1515
import java.nio.channels.WritableByteChannel;
1616
import java.nio.file.Path;
1717
import java.nio.file.Paths;
18-
import java.util.ArrayList;
19-
import java.util.List;
20-
import java.util.Optional;
18+
import java.util.*;
2119

2220
import static java.nio.charset.StandardCharsets.US_ASCII;
2321
import static java.nio.charset.StandardCharsets.UTF_8;
@@ -85,6 +83,7 @@ private static void usage(int exitValue) {
8583
System.err.println();
8684
System.err.println("Options:");
8785
System.err.println();
86+
System.err.println(" --concurrent\talso outputs any immediately following concurrent records");
8887
System.err.println(" --headers\toutput only record (and HTTP) headers");
8988
System.err.println(" --payload\toutput only record payload, if necessary");
9089
System.err.println(" \tdecode transfer and/or content encoding");
@@ -95,11 +94,16 @@ public static void main(String[] args) throws IOException {
9594
ExtractAction action = ExtractAction.RECORD;
9695
Path warcFile = null;
9796
List<Long> offsets = new ArrayList<>();
97+
boolean extractConcurrent = false;
9898
for (String arg : args) {
9999
switch (arg) {
100100
case "-h":
101101
case "--help":
102102
usage(0);
103+
break;
104+
case "--concurrent":
105+
extractConcurrent = true;
106+
break;
103107
case "--headers":
104108
action = ExtractAction.HEADERS;
105109
break;
@@ -128,7 +132,9 @@ public static void main(String[] args) throws IOException {
128132
}
129133
if (warcFile == null || offsets.isEmpty()) {
130134
usage(1);
135+
return;
131136
}
137+
WritableByteChannel out = Channels.newChannel(System.out);
132138
for (long offset : offsets) {
133139
try (FileChannel channel = FileChannel.open(warcFile);
134140
WarcReader reader = new WarcReader(channel.position(offset))) {
@@ -137,22 +143,36 @@ public static void main(String[] args) throws IOException {
137143
System.err.println("No record found at position " + offset);
138144
System.exit(1);
139145
}
140-
WritableByteChannel out = Channels.newChannel(System.out);
141-
switch (action) {
142-
case RECORD:
143-
writeWarcHeaders(out, record.get());
144-
writeBody(out, record.get().body());
145-
out.write(ByteBuffer.wrap("\r\n\r\n".getBytes(US_ASCII)));
146-
break;
147-
case HEADERS:
148-
writeWarcHeaders(out, record.get());
149-
writeHttpHeaders(out, record.get());
150-
break;
151-
case PAYLOAD:
152-
writePayload(out, record.get());
153-
break;
146+
147+
writeRecord(record.get(), out, action);
148+
149+
if (extractConcurrent) {
150+
ConcurrentRecordSet concurrentSet = new ConcurrentRecordSet();
151+
while (true) {
152+
concurrentSet.add(record.get());
153+
record = reader.next();
154+
if (!record.isPresent() || !concurrentSet.contains(record.get())) break;
155+
writeRecord(record.get(), out, action);
156+
}
154157
}
155158
}
156159
}
157160
}
161+
162+
private static void writeRecord(WarcRecord record, WritableByteChannel out, ExtractAction action) throws IOException {
163+
switch (action) {
164+
case RECORD:
165+
writeWarcHeaders(out, record);
166+
writeBody(out, record.body());
167+
out.write(ByteBuffer.wrap("\r\n\r\n".getBytes(US_ASCII)));
168+
break;
169+
case HEADERS:
170+
writeWarcHeaders(out, record);
171+
writeHttpHeaders(out, record);
172+
break;
173+
case PAYLOAD:
174+
writePayload(out, record);
175+
break;
176+
}
177+
}
158178
}

0 commit comments

Comments
 (0)