Skip to content

Commit 8d57881

Browse files
committed
CDXJ format: omit status and mime when unknown
This matches the behaviour of webrecorder/cdxj-indexer.
1 parent: 3b64b2a · commit: 8d57881

File tree

2 files changed

+34
-18
lines changed

2 files changed

+34
-18
lines changed

src/org/netpreserve/jwarc/cdx/CdxFormat.java

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,20 @@ private void formatJsonBlock(WarcCaptureRecord record, String filename, long pos
119119
value = PYWB_REVISIT_MIMETYPE;
120120
} else {
121121
try {
122-
value = record.payload().map(p -> p.type().base()).orElse(MediaType.OCTET_STREAM).toString();
122+
if (record instanceof WarcResponse &&
123+
record.contentType().equals(MediaType.HTTP_RESPONSE)) {
124+
value = ((WarcResponse) record).http().headers().first("Content-Type")
125+
.map(s -> MediaType.parseLeniently(s).base().toString())
126+
.orElse(null);
127+
} else if (record instanceof WarcResource) {
128+
value = record.headers().first("Content-Type")
129+
.map(s -> MediaType.parseLeniently(s).base().toString())
130+
.orElse(null);
131+
} else {
132+
value = record.payload()
133+
.map(p -> p.type().base())
134+
.map(Object::toString).orElse(null);
135+
}
123136
} catch (IOException e) {
124137
value = null;
125138
}
@@ -130,7 +143,8 @@ private void formatJsonBlock(WarcCaptureRecord record, String filename, long pos
130143
break;
131144
case "status":
132145
try {
133-
value = String.valueOf(statusCode(record));
146+
Integer status = statusCode(record);
147+
value = status == 0 ? null : String.valueOf(status);
134148
} catch (IOException e) {
135149
value = null;
136150
}
@@ -184,7 +198,7 @@ private static int statusCode(WarcCaptureRecord record) throws IOException {
184198
return ((WarcResponse) record).gemini().statusHttpEquivalent();
185199
}
186200
}
187-
return 200;
201+
return 0;
188202
}
189203

190204
String formatField(byte fieldName, WarcCaptureRecord record, String filename, long position, long size, String urlkey) throws IOException {
@@ -223,17 +237,9 @@ String formatField(byte fieldName, WarcCaptureRecord record, String filename, lo
223237
return "-";
224238
}
225239
case RESPONSE_CODE:
226-
if (record instanceof WarcResponse || record instanceof WarcRevisit) {
227-
if (record instanceof WarcRevisit) {
228-
return Integer.toString(((WarcRevisit) record).http().status());
229-
}
230-
else if (record.contentType().base().equals(MediaType.HTTP)) {
231-
return Integer.toString(((WarcResponse) record).http().status());
232-
} else if (record.contentType().base().equals(MediaType.GEMINI)) {
233-
return String.format("%02d", ((WarcResponse) record).gemini().statusHttpEquivalent());
234-
}
235-
}
236-
return Integer.toString(statusCode(record));
240+
int status = statusCode(record);
241+
if (status == 0) status = 200;
242+
return Integer.toString(status);
237243
default:
238244
throw new IllegalArgumentException("Unknown CDX field: " + (char) fieldName);
239245
}

test/org/netpreserve/jwarc/cdx/CdxFormatTest.java

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
package org.netpreserve.jwarc.cdx;
22

33
import org.junit.Test;
4-
import org.netpreserve.jwarc.HttpResponse;
5-
import org.netpreserve.jwarc.MediaType;
6-
import org.netpreserve.jwarc.WarcResponse;
7-
import org.netpreserve.jwarc.WarcRevisit;
4+
import org.netpreserve.jwarc.*;
85

96
import java.io.IOException;
7+
import java.net.URI;
108
import java.nio.file.Path;
119
import java.nio.file.Paths;
1210
import java.time.Instant;
@@ -129,4 +127,16 @@ public void testCdxj() throws Exception {
129127
assertEquals("org,example)/ 20220302214434 {\"url\": \"http://example.org/\", \"mime\": \"text/html\", \"status\": \"404\", \"digest\": \"sha1:AQLNJ7DOPHK477BWWC726H7Y5XBPBNF7\", \"length\": \"456\", \"offset\": \"123\", \"filename\": \"example.warc.gz\"}",
130128
CdxFormat.CDXJ.format(response, path.getFileName().toString(), 123, 456));
131129
}
130+
131+
@Test
132+
public void testCdxjResource() throws Exception {
133+
Path path = Paths.get("/home/jwarc/resource.warc.gz");
134+
WarcResource resource = new WarcResource.Builder(URI.create("http://example.org/"))
135+
.date(Instant.parse("2022-03-02T21:44:34Z"))
136+
.payloadDigest("sha1", "AQLNJ7DOPHK477BWWC726H7Y5XBPBNF7")
137+
.body(null, new byte[0])
138+
.build();
139+
assertEquals("org,example)/ 20220302214434 {\"url\": \"http://example.org/\", \"digest\": \"sha1:AQLNJ7DOPHK477BWWC726H7Y5XBPBNF7\", \"length\": \"456\", \"offset\": \"123\", \"filename\": \"resource.warc.gz\"}",
140+
CdxFormat.CDXJ.format(resource, path.getFileName().toString(), 123, 456));
141+
}
132142
}

0 commit comments

Comments
 (0)