Skip to content

Commit 5f97636

Browse files
committed
Add skeleton of WaczReader, WaczWriter, WaczTool
1 parent 403771a commit 5f97636

File tree

6 files changed

+577
-0
lines changed

6 files changed

+577
-0
lines changed
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
* Copyright (C) 2025 National Library of Australia and the jwarc contributors
4+
*/
5+
6+
package org.netpreserve.jwarc;
7+
8+
import java.io.*;
9+
import java.nio.charset.StandardCharsets;
10+
import java.util.*;
11+
12+
/**
 * Minimal JSON reader and writer built only on the standard library.
 *
 * <p>Parsed values are represented as {@code String}, {@code Long}, {@code Double}, {@code Boolean},
 * {@code null}, {@code Map<String, Object>} (a {@code LinkedHashMap}, so member order is kept) and
 * {@code ArrayList<Object>}. The writer accepts the same kinds of values plus any other
 * {@code Number}, {@code Map} or {@code Collection}.
 */
class Json {
    /**
     * Parses one JSON value from a UTF-8 encoded byte stream.
     *
     * @param stream source of the JSON text; it is not closed by this method
     * @return the parsed value (see the class description for the possible types)
     * @throws EOFException if the input ends before a complete value has been read
     * @throws IOException  if the input is malformed or cannot be read
     */
    static Object read(InputStream stream) throws IOException {
        return read(new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)));
    }

    /**
     * Parses one JSON value from a character stream. Parsing stops after the first complete value;
     * anything that follows it is not examined.
     *
     * @param reader source of the JSON text; it is not closed by this method
     * @return the parsed value (see the class description for the possible types)
     * @throws EOFException if the input ends before a complete value has been read
     * @throws IOException  if the input is malformed or cannot be read
     */
    static Object read(Reader reader) throws IOException {
        return new Parser(reader).value();
    }

    /**
     * Recursive descent parser with a single character of lookahead.
     */
    private static class Parser {
        /** Marker stored in {@link #peek} when no character is buffered. */
        private static final int UNREAD = -2;

        private final Reader reader;
        /** Buffered lookahead character, -1 for end of input, or {@link #UNREAD}. */
        private int peek = UNREAD;

        Parser(Reader reader) {
            this.reader = reader;
        }

        /** Returns the next character without consuming it (-1 at end of input). */
        private int peek() throws IOException {
            if (peek == UNREAD) peek = reader.read();
            return peek;
        }

        /** Consumes and returns the next character (-1 at end of input). */
        private int next() throws IOException {
            int c = peek();
            peek = UNREAD;
            return c;
        }

        /** Skips JSON whitespace and returns the following character without consuming it. */
        private int look() throws IOException {
            int c = peek();
            while (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                c = reader.read();
            }
            peek = c;
            return c;
        }

        /** Consumes the next character, failing unless it is {@code expected}. */
        private void consume(int expected) throws IOException {
            int c = next();
            if (c == expected) return;
            if (c < 0) throw new EOFException("Expected '" + (char) expected + "' but reached end of input");
            throw new IOException("Expected '" + (char) expected + "'");
        }

        /**
         * Parses the next value of any type, skipping leading whitespace.
         *
         * @throws EOFException if there is no more input
         * @throws IOException  if the next character cannot start a JSON value
         */
        Object value() throws IOException {
            int c = look();
            if (c == '"') return string();
            if (c == '{') return object();
            if (c == '[') return array();
            if (c == 't') return literal("true", true);
            if (c == 'f') return literal("false", false);
            if (c == 'n') return literal("null", null);
            if (c == '-' || (c >= '0' && c <= '9')) return number();
            if (c == -1) throw new EOFException();
            throw new IOException("Unexpected character");
        }

        /**
         * Parses a number. Returns a {@code Double} when the text contains a fraction or exponent
         * and a {@code Long} otherwise.
         */
        private Object number() throws IOException {
            StringBuilder buffer = new StringBuilder();
            boolean dbl = false;
            while (true) {
                int c = peek();
                if (c == 'e' || c == 'E' || c == '.') {
                    dbl = true;
                } else if ((c < '0' || c > '9') && c != '-' && c != '+') {
                    // First character that cannot belong to a number (or end of input): convert
                    // what was collected and leave that character unconsumed for the caller.
                    try {
                        if (dbl) return Double.parseDouble(buffer.toString());
                        return Long.parseLong(buffer.toString());
                    } catch (NumberFormatException e) {
                        throw new IOException("Invalid number: " + buffer, e);
                    }
                }
                buffer.append((char) next());
            }
        }

        /** Consumes the exact text {@code s} and returns {@code value}. */
        private Object literal(String s, Boolean value) throws IOException {
            for (int i = 0; i < s.length(); i++) {
                if (next() != s.charAt(i)) throw new IOException("Expected '" + s + "'");
            }
            return value;
        }

        /** Parses an array into an {@code ArrayList}. */
        private Object array() throws IOException {
            consume('[');
            Collection<Object> list = new ArrayList<>();
            if (look() != ']') {
                while (true) {
                    list.add(value());
                    if (look() == ']') break;
                    consume(',');
                }
            }
            consume(']');
            return list;
        }

        /** Parses an object into a {@code LinkedHashMap}; a repeated key keeps its last value. */
        private Object object() throws IOException {
            consume('{');
            Map<String, Object> map = new LinkedHashMap<>();
            if (look() != '}') {
                while (true) {
                    String key = string();
                    // Whitespace is permitted between the member name and the colon.
                    look();
                    consume(':');
                    map.put(key, value());
                    if (look() == '}') break;
                    consume(',');
                    look();
                }
            }
            consume('}');
            return map;
        }

        /** Parses a quoted string, decoding backslash escapes. */
        private String string() throws IOException {
            consume('"');
            StringBuilder sb = new StringBuilder();
            while (true) {
                int c = next();
                if (c < 0) throw new EOFException("Unterminated JSON string");
                if (c == '"') return sb.toString();
                if (c != '\\') {
                    sb.append((char) c);
                    continue;
                }
                c = next();
                if (c < 0) throw new EOFException("Unterminated JSON string");
                if (c == '"' || c == '\\' || c == '/') sb.append((char) c);
                else if (c == 'b') sb.append('\b');
                else if (c == 'f') sb.append('\f');
                else if (c == 'n') sb.append('\n');
                else if (c == 'r') sb.append('\r');
                else if (c == 't') sb.append('\t');
                else if (c == 'u') {
                    // Exactly four hex digits form one UTF-16 code unit.
                    int x = 0;
                    for (int i = 0; i < 4; i++) {
                        c = next();
                        if (c < 0) throw new EOFException("Unterminated JSON string");
                        int digit = Character.digit(c, 16);
                        if (digit < 0) throw new IOException("Invalid hex digit in unicode escape: " + (char) c);
                        x = (x << 4) + digit;
                    }
                    sb.append((char) x);
                } else {
                    throw new IOException("Invalid escape character: \\" + (char) c);
                }
            }
        }
    }

    /**
     * Serializes {@code value} as UTF-8 encoded JSON. All output is pushed to {@code out} before
     * this method returns, but {@code out} itself is left open.
     *
     * @param out   destination stream; never closed by this method
     * @param value the value to serialize, see {@link #write(Appendable, Object)}
     * @throws IOException              if writing fails
     * @throws IllegalArgumentException if {@code value} contains an unsupported type
     */
    static void write(OutputStream out, Object value) throws IOException {
        OutputStream keepOpen = new FilterOutputStream(out) {
            @Override
            public void write(byte[] b, int off, int len) throws IOException {
                // FilterOutputStream's inherited implementation forwards one byte at a time.
                this.out.write(b, off, len);
            }

            @Override
            public void close() {
                // Intentionally empty: closing the writer below must not close the caller's stream.
            }
        };
        try (Writer writer = new BufferedWriter(new OutputStreamWriter(keepOpen, StandardCharsets.UTF_8))) {
            write(writer, value);
        }
    }

    /**
     * Serializes {@code value} as compact JSON text (no insignificant whitespace).
     *
     * <p>Supported values are {@code null}, {@code Boolean}, {@code String}, {@code Number},
     * {@code Map} and {@code Collection}, nested arbitrarily. Map keys are written as JSON strings
     * using {@code String.valueOf(key)}. Numbers are written with their {@code toString()} form.
     *
     * @param out   destination for the JSON text
     * @param value the value to serialize
     * @throws IOException              if {@code out} fails
     * @throws IllegalArgumentException if {@code value} contains an unsupported type
     */
    static void write(Appendable out, Object value) throws IOException {
        if (value == null) {
            out.append("null");
        } else if (value instanceof Boolean) {
            out.append(value.toString());
        } else if (value instanceof String) {
            writeString(out, (String) value);
        } else if (value instanceof Number) {
            // NOTE(review): non-finite doubles (NaN, Infinity) are passed through as-is and are
            // not valid JSON - confirm whether callers can ever supply them.
            out.append(value.toString());
        } else if (value instanceof Map) {
            out.append('{');
            Map<?, ?> map = (Map<?, ?>) value;
            boolean first = true;
            for (Map.Entry<?, ?> entry : map.entrySet()) {
                if (!first) out.append(',');
                first = false;
                // JSON member names must be strings, whatever the key's Java type is.
                writeString(out, String.valueOf(entry.getKey()));
                out.append(':');
                write(out, entry.getValue());
            }
            out.append('}');
        } else if (value instanceof Collection) {
            out.append('[');
            Collection<?> coll = (Collection<?>) value;
            boolean first = true;
            for (Object o : coll) {
                if (!first) out.append(',');
                first = false;
                write(out, o);
            }
            out.append(']');
        } else {
            throw new IllegalArgumentException("unsupported JSON type: " + value.getClass());
        }
    }

    /** Writes {@code s} as a quoted JSON string, escaping quotes, backslashes and control characters. */
    private static void writeString(Appendable out, String s) throws IOException {
        out.append('"');
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '"') out.append("\\\"");
            else if (c == '\\') out.append("\\\\");
            else if (c == '\b') out.append("\\b");
            else if (c == '\f') out.append("\\f");
            else if (c == '\n') out.append("\\n");
            else if (c == '\r') out.append("\\r");
            else if (c == '\t') out.append("\\t");
            else if (c <= 0x1f) {
                // Remaining control characters have no short escape form.
                out.append("\\u00");
                out.append(Character.forDigit((c & 0xf0) >>> 4, 16));
                out.append(Character.forDigit(c & 0xf, 16));
            } else {
                out.append(c);
            }
        }
        out.append('"');
    }
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
* Copyright (C) 2025 National Library of Australia and the jwarc contributors
4+
*/
5+
6+
package org.netpreserve.jwarc;
7+
8+
import java.io.Closeable;
9+
import java.io.IOException;
10+
import java.io.InputStream;
11+
import java.nio.file.Path;
12+
import java.util.Map;
13+
import java.util.zip.ZipEntry;
14+
import java.util.zip.ZipFile;
15+
16+
/**
17+
* Reader for Web Archive Collection Zipped (WACZ) files.
18+
*
19+
* @see <a href="https://specs.webrecorder.net/wacz/latest/">WACZ Specification</a>
20+
*/
21+
public class WaczReader implements Closeable {
22+
private final ZipFile zip;
23+
private Map<String, Object> metadata;
24+
25+
public WaczReader(Path path) throws IOException {
26+
this.zip = new ZipFile(path.toFile());
27+
}
28+
29+
@Override
30+
public void close() throws IOException {
31+
zip.close();
32+
}
33+
34+
/**
35+
* Retrieves the metadata from the WACZ file, reading it from "datapackage.json".
36+
* Caches the result.
37+
*
38+
* @return a Map representing the metadata contained in "datapackage.json".
39+
* @throws IOException if the WACZ file does not contain "datapackage.json",
40+
* if the file cannot be read,
41+
* or if "datapackage.json" is not a valid JSON object.
42+
*/
43+
public Map<String, Object> metadata() throws IOException {
44+
if (metadata != null) return metadata;
45+
ZipEntry entry = zip.getEntry("datapackage.json");
46+
if (entry == null) throw new IOException("WACZ file is missing datapackage.json");
47+
try (InputStream stream = zip.getInputStream(entry)) {
48+
Object value = Json.read(stream);
49+
if (!(value instanceof Map)) throw new IOException("datapackage.json is not a JSON object");
50+
//noinspection unchecked
51+
this.metadata = (Map<String, Object>) value;
52+
return metadata;
53+
}
54+
}
55+
}

0 commit comments

Comments
 (0)