Skip to content

Commit 9eafca9

Browse files
committed
CdxWriter: add sort option
1 parent 8d57881 commit 9eafca9

File tree

4 files changed

+49
-13
lines changed

4 files changed

+49
-13
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
### New features
66

77
- CdxRecord: surt(), format(), values() and toString()
8-
- CdxWriter: CDXJ output support
8+
- CdxWriter
9+
- CDXJ output support
10+
- sort option
911
- HttpMessage: `Content-Encoding: zstd` support
1012
- HttpRequest: `Content-Encoding: chunked` support
1113
- WarcReader: [Zstandard compressed WARC Files](https://iipc.github.io/warc-specifications/specifications/warc-zstd/) support

src/org/netpreserve/jwarc/cdx/CdxWriter.java

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import java.io.Writer;
1111
import java.net.URI;
1212
import java.nio.file.Path;
13+
import java.util.ArrayList;
1314
import java.util.List;
1415
import java.util.function.Consumer;
1516
import java.util.function.Predicate;
@@ -25,6 +26,7 @@ public class CdxWriter implements Closeable {
2526
private boolean postAppend = false;
2627
private Predicate<WarcRecord> recordFilter;
2728
private Consumer<String> warningHandler;
29+
private List<String> sortBuffer;
2830

2931
public CdxWriter(Writer writer) {
3032
this.writer = writer;
@@ -58,8 +60,16 @@ public void write(WarcCaptureRecord capture, String filename,
5860
String rawUrlKey = capture.target() + (capture.target().contains("?") ? '&' : '?') + encodedRequest;
5961
urlKey = URIs.toNormalizedSurt(rawUrlKey);
6062
}
61-
writer.write(format.format(capture, filename, position, length, urlKey));
62-
writer.write('\n');
63+
writeLine(format.format(capture, filename, position, length, urlKey));
64+
}
65+
66+
private void writeLine(String line) throws IOException {
67+
if (sortBuffer != null) {
68+
sortBuffer.add(line);
69+
} else {
70+
writer.write(line);
71+
writer.write('\n');
72+
}
6373
}
6474

6575
/**
@@ -166,8 +176,24 @@ public void onWarning(Consumer<String> warningHandler) {
166176
this.warningHandler = warningHandler;
167177
}
168178

179+
public void setSort(boolean sort) {
180+
if (sort) {
181+
sortBuffer = new ArrayList<>();
182+
} else {
183+
sortBuffer = null;
184+
}
185+
}
186+
169187
@Override
170188
public void close() throws IOException {
189+
if (sortBuffer != null) {
190+
sortBuffer.sort(null);
191+
for (String line : sortBuffer) {
192+
writer.write(line);
193+
writer.write('\n');
194+
}
195+
sortBuffer = null;
196+
}
171197
writer.close();
172198
}
173199
}

src/org/netpreserve/jwarc/tools/CdxTool.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ public static void main(String[] args) throws IOException {
3131
boolean fullFilePath = false;
3232
boolean postAppend = false;
3333
boolean digestUnchanged = false;
34+
boolean sort = false;
3435
Predicate<WarcRecord> filter = null;
3536
for (int i = 0; i < args.length; i++) {
3637
if (args[i].startsWith("-")) {
@@ -86,6 +87,10 @@ public static void main(String[] args) throws IOException {
8687
case "--revisits-excluded":
8788
filter = record -> !(record instanceof WarcRevisit);
8889
break;
90+
case "-s":
91+
case "--sort":
92+
sort = true;
93+
break;
8994
case "-w":
9095
case "--warc-full-path":
9196
fullFilePath = true;
@@ -112,6 +117,7 @@ public static void main(String[] args) throws IOException {
112117
cdxWriter.setFormat(format);
113118
cdxWriter.setPostAppend(postAppend);
114119
cdxWriter.setRecordFilter(filter);
120+
cdxWriter.setSort(sort);
115121

116122
if (printHeader) cdxWriter.writeHeaderLine();
117123
cdxWriter.process(files, fullFilePath);

test/org/netpreserve/jwarc/cdx/CdxWriterTest.java

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,26 +28,28 @@ public void test() throws IOException {
2828
HttpResponse httpResponse = new HttpResponse.Builder(404, "Not Found")
2929
.body(MediaType.HTML, new byte[0])
3030
.build();
31-
warcWriter.write(new WarcResponse.Builder("http://example.org/")
32-
.date(Instant.parse("2022-03-01T12:44:34Z"))
31+
warcWriter.write(new WarcRevisit.Builder("http://example.org/")
32+
.date(Instant.parse("2022-03-02T21:44:34Z"))
3333
.body(httpResponse)
3434
.payloadDigest("sha256", "b04af472c47a8b1b5059b3404caac0e1bfb5a3c07b329be66f65cfab5ee8d3f3")
3535
.build());
36-
warcWriter.write(new WarcRevisit.Builder("http://example.org/")
37-
.date(Instant.parse("2022-03-02T21:44:34Z"))
36+
warcWriter.write(new WarcResponse.Builder("http://example.org/")
37+
.date(Instant.parse("2022-03-01T12:44:34Z"))
3838
.body(httpResponse)
3939
.payloadDigest("sha256", "b04af472c47a8b1b5059b3404caac0e1bfb5a3c07b329be66f65cfab5ee8d3f3")
4040
.build());
4141
}
4242

4343
StringWriter cdxBuffer = new StringWriter();
44-
CdxWriter cdxWriter = new CdxWriter(cdxBuffer);
45-
cdxWriter.setFormat(new CdxFormat.Builder().digestUnchanged().build());
46-
cdxWriter.writeHeaderLine();
47-
cdxWriter.process(Collections.singletonList(testWarcFile), true);
44+
try (CdxWriter cdxWriter = new CdxWriter(cdxBuffer)) {
45+
cdxWriter.setFormat(new CdxFormat.Builder().digestUnchanged().build());
46+
cdxWriter.setSort(true);
47+
cdxWriter.writeHeaderLine();
48+
cdxWriter.process(Collections.singletonList(testWarcFile), true);
49+
}
4850
assertEquals(" CDX N b a m s k r M S V g\n" +
49-
"org,example)/ 20220301124434 http://example.org/ text/html 404 sha256:WBFPI4WEPKFRWUCZWNAEZKWA4G73LI6APMZJXZTPMXH2WXXI2PZQ==== - - 398 0 " + testWarcFile + "\n" +
50-
"org,example)/ 20220302214434 http://example.org/ warc/revisit 404 sha256:WBFPI4WEPKFRWUCZWNAEZKWA4G73LI6APMZJXZTPMXH2WXXI2PZQ==== - - 397 398 " + testWarcFile + "\n",
51+
"org,example)/ 20220301124434 http://example.org/ text/html 404 sha256:WBFPI4WEPKFRWUCZWNAEZKWA4G73LI6APMZJXZTPMXH2WXXI2PZQ==== - - 398 397 " + testWarcFile + "\n" +
52+
"org,example)/ 20220302214434 http://example.org/ warc/revisit 404 sha256:WBFPI4WEPKFRWUCZWNAEZKWA4G73LI6APMZJXZTPMXH2WXXI2PZQ==== - - 397 0 " + testWarcFile + "\n",
5153
cdxBuffer.toString());
5254
}
5355

0 commit comments

Comments
 (0)