Skip to content

Commit e113046

Browse files
committed
WaczWriter: add support for generating pages.jsonl
1 parent efb1662 commit e113046

File tree

4 files changed

+164
-0
lines changed

4 files changed

+164
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
* Copyright (C) 2025 National Library of Australia and the jwarc contributors
4+
*/
5+
6+
package org.netpreserve.jwarc;
7+
8+
import java.io.Closeable;
9+
import java.io.IOException;
10+
import java.io.Writer;
11+
import java.util.LinkedHashMap;
12+
import java.util.Map;
13+
14+
/**
15+
* Writes pages.jsonl records.
16+
* @see <a href="https://specs.webrecorder.net/wacz/latest/#pages-jsonl">WACZ spec</a>
17+
*/
18+
public class JsonPagesWriter implements Closeable {
19+
private final Writer writer;
20+
private boolean headerWritten = false;
21+
22+
public JsonPagesWriter(Writer writer) throws IOException {
23+
this.writer = writer;
24+
}
25+
26+
public void process(WarcReader reader) throws IOException {
27+
for (WarcRecord record : reader) {
28+
if (record instanceof WarcResponse || record instanceof WarcResource) {
29+
WarcCaptureRecord capture = (WarcCaptureRecord) record;
30+
MediaType type = capture.payloadType();
31+
if (type.base().equals(MediaType.HTML) || type.base().equals(MediaType.XHTML)) {
32+
addPage(capture.target(), capture.date().toString(), null);
33+
}
34+
}
35+
}
36+
}
37+
38+
/**
39+
* Writes the mandatory header line for pages.jsonl.
40+
*/
41+
private void writeHeader() throws IOException {
42+
Map<String, Object> header = new LinkedHashMap<>();
43+
header.put("format", "json-pages-1.0");
44+
header.put("id", "pages");
45+
header.put("title", "All Pages");
46+
Json.write(writer, header);
47+
writer.write("\n");
48+
}
49+
50+
/**
51+
* Adds a page entry to pages.jsonl.
52+
*
53+
* @param url the URL of the page
54+
* @param ts the timestamp in RFC3339 format
55+
* @param title an optional title for the page
56+
*/
57+
public void addPage(String url, String ts, String title) throws IOException {
58+
if (!headerWritten) {
59+
writeHeader();
60+
headerWritten = true;
61+
}
62+
Map<String, Object> page = new LinkedHashMap<>();
63+
page.put("url", url);
64+
page.put("ts", ts);
65+
if (title != null) page.put("title", title);
66+
// Add other optional fields (id, text, size) as needed
67+
Json.write(writer, page);
68+
writer.write("\n");
69+
}
70+
71+
@Override
72+
public void close() throws IOException {
73+
writer.close();
74+
}
75+
76+
public void flush() throws IOException {
77+
writer.flush();
78+
}
79+
}

src/org/netpreserve/jwarc/MediaType.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ private static byte[] init__media_type_eof_actions_0()
236236
public static final MediaType JSON = MediaType.parse("application/json");
237237
public static MediaType HTML = MediaType.parse("text/html");
238238
public static MediaType HTML_UTF8 = MediaType.parse("text/html;charset=utf-8");
239+
public static final MediaType XHTML = MediaType.parse("application/xhtml+xml");
239240
public static MediaType HTTP = MediaType.parse("application/http");
240241
public static MediaType HTTP_REQUEST = MediaType.parse("application/http;msgtype=request");
241242
public static MediaType HTTP_RESPONSE = MediaType.parse("application/http;msgtype=response");

src/org/netpreserve/jwarc/WaczWriter.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ public class WaczWriter implements Closeable {
4343
private final MessageDigest messageDigest;
4444
private final FileChannel cdxChannel;
4545
private final CdxWriter cdxWriter;
46+
private final FileChannel pagesChannel;
47+
private final JsonPagesWriter pagesWriter;
48+
private boolean autoPages = true;
4649
private boolean finished;
4750

4851
public WaczWriter(Path path) throws IOException {
@@ -62,6 +65,11 @@ public void close() throws IOException {
6265
this.cdxWriter = new CdxWriter(new BufferedWriter(new OutputStreamWriter(cdxGzipStream, StandardCharsets.UTF_8)));
6366
cdxWriter.setFormat(CdxFormat.CDXJ);
6467
cdxWriter.setSort(true);
68+
69+
Path pagesTempFile = Files.createTempFile("jwarc-pages", ".jsonl");
70+
this.pagesChannel = FileChannel.open(pagesTempFile, READ, WRITE, DELETE_ON_CLOSE);
71+
this.pagesWriter = new JsonPagesWriter(new BufferedWriter(new OutputStreamWriter(Channels.newOutputStream(pagesChannel), StandardCharsets.UTF_8)));
72+
6573
set("profile", "data-package");
6674
set("wacz_version", "1.1.1");
6775
try {
@@ -117,6 +125,10 @@ private void writeResource(String path, SeekableByteChannel source) throws IOExc
117125
long start = source.position();
118126
cdxWriter.process(new WarcReader(source), filename);
119127
source.position(start);
128+
if (autoPages) {
129+
pagesWriter.process(new WarcReader(source));
130+
source.position(start);
131+
}
120132
}
121133

122134
zip.putNextEntry(entry);
@@ -151,6 +163,17 @@ private static long calculateCrc32(InputStream stream) throws IOException {
151163
return crc.getValue();
152164
}
153165

166+
/**
167+
* Adds a page entry to pages.jsonl.
168+
*
169+
* @param url the URL of the page
170+
* @param ts the timestamp in RFC3339 format
171+
* @param title an optional title for the page
172+
*/
173+
public void addPage(String url, String ts, String title) throws IOException {
174+
pagesWriter.addPage(url, ts, title);
175+
}
176+
154177
private void writeDatapackageJson() throws IOException {
155178
// https://specs.webrecorder.net/wacz/1.1.1/#datapackage-json
156179
zip.putNextEntry(new ZipEntry("datapackage.json"));
@@ -180,6 +203,12 @@ public void finish() throws IOException {
180203
cdxChannel.position(0);
181204
writeResource("indexes/index.cdx.gz", cdxChannel);
182205
}
206+
pagesWriter.flush();
207+
boolean pagesProvided = resources.stream().anyMatch(r -> "pages/pages.jsonl".equals(r.get("path")));
208+
if (!pagesProvided && pagesChannel.size() > 0) {
209+
pagesChannel.position(0);
210+
writeResource("pages/pages.jsonl", pagesChannel);
211+
}
183212
writeDatapackageJson();
184213
zip.finish();
185214
metadata.clear();
@@ -202,6 +231,11 @@ public void close() throws IOException {
202231
} catch (IOException e) {
203232
// ignore
204233
}
234+
try {
235+
pagesChannel.close();
236+
} catch (IOException e) {
237+
// ignore
238+
}
205239
zip.close();
206240
}
207241
}
@@ -215,4 +249,12 @@ public void close() throws IOException {
215249
public void set(String name, Object value) {
216250
metadata.put(name, value);
217251
}
252+
253+
/**
254+
* Enables or disables automatic detection of pages from WARC records.
255+
* If disabled, only pages added via {@link #addPage} will be included.
256+
*/
257+
public void setAutoPages(boolean enabled) {
258+
this.autoPages = enabled;
259+
}
218260
}

test/org/netpreserve/jwarc/WaczWriterTest.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
import java.io.IOException;
99
import java.io.InputStream;
1010
import java.io.InputStreamReader;
11+
import java.nio.charset.StandardCharsets;
1112
import java.nio.file.Files;
1213
import java.nio.file.Path;
1314
import java.security.MessageDigest;
1415
import java.security.NoSuchAlgorithmException;
16+
import java.util.Arrays;
1517
import java.util.List;
1618
import java.util.Map;
1719
import java.util.zip.GZIPInputStream;
@@ -71,6 +73,20 @@ public void test() throws Exception {
7173
assertEquals("archive/test.warc.gz", resource.get("path"));
7274
assertEquals("test.warc.gz", resource.get("name"));
7375
assertEquals("sha256:" + warcSha256, resource.get("hash"));
76+
77+
ZipEntry pagesEntry = zipFile.getEntry("pages/pages.jsonl");
78+
assertNotNull(pagesEntry);
79+
try (InputStream is = zipFile.getInputStream(pagesEntry);
80+
BufferedReader reader = new BufferedReader(new InputStreamReader(is))) {
81+
String header = reader.readLine();
82+
assertNotNull(header);
83+
assertTrue(header.contains("json-pages-1.0"));
84+
String page = reader.readLine();
85+
assertNotNull(page);
86+
assertTrue(page.contains("http://example.org/"));
87+
assertTrue(page.contains("\"url\":"));
88+
assertTrue(page.contains("\"ts\":"));
89+
}
7490

7591
String datapackageDigest = sha256(zipFile.getInputStream(datapackageEntry));
7692
ZipEntry digestEntry = zipFile.getEntry("datapackage-digest.json");
@@ -81,6 +97,32 @@ public void test() throws Exception {
8197
}
8298
}
8399

100+
@Test
101+
public void testManualPages() throws Exception {
102+
Path waczFile = temporaryFolder.newFile("manual.wacz").toPath();
103+
Path pagesFile = temporaryFolder.newFile("pages.jsonl").toPath();
104+
Files.write(pagesFile, Arrays.asList("{\"format\": \"json-pages-1.0\", \"id\": \"pages\", \"title\": \"Manual Pages\"}",
105+
"{\"url\": \"http://example.org/manual\", \"ts\": \"2023-01-01T00:00:00Z\"}"), StandardCharsets.UTF_8);
106+
107+
try (WaczWriter waczWriter = new WaczWriter(Files.newOutputStream(waczFile))) {
108+
waczWriter.setAutoPages(false);
109+
waczWriter.writeResource("pages/pages.jsonl", pagesFile);
110+
}
111+
112+
try (ZipFile zipFile = new ZipFile(waczFile.toFile())) {
113+
ZipEntry pagesEntry = zipFile.getEntry("pages/pages.jsonl");
114+
assertNotNull(pagesEntry);
115+
try (InputStream is = zipFile.getInputStream(pagesEntry);
116+
BufferedReader reader = new BufferedReader(new InputStreamReader(is))) {
117+
String header = reader.readLine();
118+
assertTrue(header.contains("Manual Pages"));
119+
String page = reader.readLine();
120+
assertTrue(page.contains("http://example.org/manual"));
121+
assertNull(reader.readLine());
122+
}
123+
}
124+
}
125+
84126
public String sha256(Path path) throws Exception {
85127
try (InputStream stream = Files.newInputStream(path)) {
86128
return sha256(stream);

0 commit comments

Comments
 (0)