Skip to content

Commit 0c8281f

Browse files
committed
Improve URIs.toNormalizedSurt() compatibility with Python surt
I'm pretty sure there are still corner cases we're not handling (Python surt has many strange quirks) but this should be a lot closer.
1 parent 87d23af commit 0c8281f

File tree

4 files changed

+269
-48
lines changed

4 files changed

+269
-48
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
- WarcReader: [Zstandard compressed WARC Files](https://iipc.github.io/warc-specifications/specifications/warc-zstd/) support
1212
- WarcServer: resource record support
1313

14+
### Fixed
15+
16+
- URIs.toNormalizedSurt(): Improved compatibility with Python [surt](https://github.com/internetarchive/surt).
17+
1418
## 0.32.0
1519

1620
### Added

src/org/netpreserve/jwarc/URIs.java

Lines changed: 226 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,25 @@ public class URIs {
2323
"(?:[#](.*))?" + // fragment
2424
"\\Z", DOTALL);
2525
private static final int SCHEME = 1, SLASHES = 2, AUTHORITY = 3, PATH = 4, QUERY = 5, FRAGMENT = 6;
26-
private final static Pattern AUTHORITY_REGEX = Pattern.compile("([^@]*@)?(.*?)(?::([0-9]+))?", DOTALL);
27-
private final static Pattern IPV4_REGEX = Pattern.compile("[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}");
26+
// Splits a whole URL (anchored with \A ... \Z) into the named groups "scheme", "authority",
// "path", "query" and "fragment". The scheme, authority, query and fragment parts are each
// wrapped in an optional group, so the corresponding group may be null after a match.
private final static Pattern SURT_URL_REGEX = Pattern.compile(
27+
"\\A(?:(?<scheme>[A-Za-z][A-Za-z0-9+\\-.]*):)?" +
28+
"(?:(?://(?<authority>[^/?#]*))?" +
29+
"(?<path>[^?#]*)" +
30+
"(?:\\?(?<query>[^#]*))?)?" +
31+
"(?:#(?<fragment>.*))?\\Z", DOTALL);
32+
// "www", any number of digits, then a dot. trimWWW() applies it with lookingAt(), so only a
// prefix at the very start of the host is removed.
private final static Pattern WWW_REGEX = Pattern.compile("www\\d*\\.");
33+
// A scheme followed by ':' at the start of the input. toNormalizedSurt() prepends "http://"
// to non-empty input that does not start with this.
private final static Pattern HAS_PROTOCOL_REGEX = Pattern.compile("\\A[a-zA-Z][a-zA-Z0-9+\\-.]*:");
34+
// Session-id query parameters (jsessionid, phpsessid, sid, aspsessionid*, cfid+cftoken), each
// followed by '&' or end of input. The names are spelled in lower case because
// toNormalizedSurt() lower-cases the query before removing every match.
private final static Pattern QUERY_SESSIONID_REGEX = Pattern.compile(
35+
"(?:jsessionid=[0-9a-zA-Z]{32}"
36+
+ "|phpsessid=[0-9a-zA-Z]{32}"
37+
+ "|sid=[0-9a-zA-Z]{32}"
38+
+ "|aspsessionid[a-zA-Z]{8}=[a-zA-Z]{24}"
39+
+ "|cfid=[^&]+&cftoken=[^&]+"
40+
+ ")(?:&|$)");
41+
// Parenthesised session ids embedded in a path segment ahead of an "aspx" resource. Group 1 is
// the remainder of the path, which toNormalizedSurt() keeps via replaceFirst("$1").
// NOTE(review): the '.' before "aspx" is unescaped, so it matches any character, not only a dot.
private static final Pattern[] PATH_SESSIONID_REGEXS = new Pattern[]{
42+
Pattern.compile("/\\([a-z]\\([0-9a-z]{24}\\)\\)(/[^?]+.aspx)"),
43+
Pattern.compile("/\\([0-9a-z]{24}\\)(/[^?]+.aspx)"),
44+
};
2845

2946
// According to https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/net/URI.html#uri-syntax-and-components-heading
3047
private static final String ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
@@ -36,6 +53,7 @@ public class URIs {
3653

3754
private static final BitSet PATH_ALLOWED = charBitSet("/@" + UNRESERVED + PUNCT);
3855
private static final BitSet QUERY_ALLOWED = charBitSet(UNRESERVED + RESERVED);
56+
private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
3957

4058
private static BitSet charBitSet(String chars) {
4159
BitSet bitSet = new BitSet(128);
@@ -135,54 +153,117 @@ private static String percentEncodeIfNeeded(String s, BitSet allowed) {
135153
out.append(c); // an 'other' unicode character
136154
} else {
137155
for (byte b : Character.toString(c).getBytes(UTF_8)) {
138-
out.append('%').append(String.format("%02x", (int) b));
156+
appendPercentEncoding(out, b);
139157
}
140158
}
141159
}
142160
return out.toString();
143161
}
144162

163+
private static void appendPercentEncoding(StringBuilder out, byte b) {
164+
out.append('%');
165+
out.append(HEX_DIGITS[(b >> 4) & 0xf]);
166+
out.append(HEX_DIGITS[b & 0xf]);
167+
}
168+
169+
/**
170+
* Converts a given URI into its normalized SURT (Sort-friendly URI Reordering Transform) format.
171+
* <p>
172+
* There are many slightly different implementations of SURT. This one tries to produce the same output as the
173+
* Python <a href="https://github.com/internetarchive/surt">surt</a> module for compatibility with pywb.
174+
*/
145175
public static String toNormalizedSurt(String uri) {
146-
Matcher urlMatcher = URL_REGEX.matcher(uri);
176+
if (uri.startsWith("filedesc")) return uri;
177+
178+
uri = trimSpaces(uri);
179+
uri = uri.replace("\r", "");
180+
uri = uri.replace("\n", "");
181+
uri = uri.replace("\t", "");
182+
183+
if (!uri.isEmpty() && !HAS_PROTOCOL_REGEX.matcher(uri).lookingAt()) {
184+
uri = "http://" + uri;
185+
}
186+
187+
Matcher urlMatcher = SURT_URL_REGEX.matcher(uri);
147188
if (!urlMatcher.matches()) {
148-
throw new IllegalArgumentException("invalid URL: " + uri);
189+
return uri; // shouldn't be possible
149190
}
150-
String authority = urlMatcher.group(AUTHORITY);
151-
String path = urlMatcher.group(PATH);
152-
String query = urlMatcher.group(QUERY);
153-
String fragment = urlMatcher.group(FRAGMENT);
191+
String scheme = urlMatcher.group("scheme");
192+
String authority = urlMatcher.group("authority");
193+
String path = urlMatcher.group("path");
194+
String query = urlMatcher.group("query");
195+
String fragment = urlMatcher.group("fragment");
196+
197+
String host = null;
198+
String port = null;
154199

155-
Matcher authorityMatcher = AUTHORITY_REGEX.matcher(authority);
156-
if (!authorityMatcher.matches()) throw new IllegalStateException("authority didn't match");
157-
String host = authorityMatcher.group(2);
158-
String port = authorityMatcher.group(3);
200+
if (authority != null) {
201+
int atIndex = authority.indexOf('@');
202+
int colonIndex = -1;
203+
for (int i = authority.length() - 1; i > atIndex; i--) {
204+
char c = authority.charAt(i);
205+
if (c == ':') {
206+
colonIndex = i;
207+
break;
208+
} else if (!isAsciiDigit(c)) {
209+
break;
210+
}
211+
}
212+
if (colonIndex >= 0) {
213+
host = authority.substring(atIndex + 1, colonIndex);
214+
port = authority.substring(colonIndex + 1);
215+
} else {
216+
host = authority.substring(atIndex + 1);
217+
}
218+
}
159219

160220
StringBuilder output = new StringBuilder();
161-
if (IPV4_REGEX.matcher(host).matches()) {
162-
output.append(host);
221+
if (host == null) {
222+
if (scheme != null) {
223+
output.append(scheme);
224+
output.append(':');
225+
}
163226
} else {
164-
List<String> hostSegments = Arrays.asList(host.toLowerCase(Locale.ROOT).split("\\."));
165-
if (hostSegments.get(0).equals("www")) {
166-
hostSegments = hostSegments.subList(1, hostSegments.size());
227+
// remove IPv6 brackets
228+
if (host.startsWith("[")) {
229+
host = host.substring(1, host.length() - 1);
167230
}
168-
Collections.reverse(hostSegments);
169-
output.append(normalizePercentEncoding(String.join(",", hostSegments)));
170-
}
171-
if (port != null) {
172-
output.append(':');
173-
output.append(port);
231+
232+
host = host.toLowerCase(Locale.ROOT);
233+
host = trimWWW(host);
234+
host = reverseHost(host);
235+
output.append(normalizePercentEncoding(host));
236+
if (port != null && !port.isEmpty() && !isDefaultPort(scheme, port)) {
237+
output.append(':');
238+
output.append(port);
239+
}
240+
output.append(')');
174241
}
175-
output.append(')');
242+
176243
if (path != null) {
177-
output.append(normalizePercentEncoding(normalizePathSegments(path.toLowerCase(Locale.ROOT))));
244+
path = fullyPercentDecode(path);
245+
path = path.toLowerCase(Locale.ROOT);
246+
if (host != null) path = normalizePathSegments(path);
247+
for (Pattern PATH_SESSIONID : PATH_SESSIONID_REGEXS) {
248+
path = PATH_SESSIONID.matcher(path).replaceFirst("$1");
249+
}
250+
output.append(percentEncodeIllegals(path));
178251
} else {
179252
output.append('/');
180253
}
181-
if (query != null) {
254+
if (query != null && !query.isEmpty()) {
182255
output.append('?');
183-
String[] params = normalizePercentEncoding(query).toLowerCase(Locale.ROOT).split("&", -1);
256+
query = normalizePercentEncoding(query);
257+
query = query.toLowerCase(Locale.ROOT);
258+
query = QUERY_SESSIONID_REGEX.matcher(query).replaceAll("");
259+
String[] params = query.split("&", -1);
184260
Arrays.sort(params);
185-
output.append(String.join("&", params));
261+
boolean first = true;
262+
for (String param : params) {
263+
if (!first) output.append('&');
264+
first = false;
265+
output.append(param);
266+
}
186267
}
187268
if (fragment != null) {
188269
output.append('#');
@@ -191,24 +272,111 @@ public static String toNormalizedSurt(String uri) {
191272
return output.toString();
192273
}
193274

275+
private static boolean isAsciiDigit(char c) {
276+
return c >= '0' && c <= '9';
277+
}
278+
279+
private static String trimWWW(String host) {
280+
Matcher matcher = WWW_REGEX.matcher(host);
281+
if (matcher.lookingAt()) {
282+
return host.substring(matcher.end());
283+
}
284+
return host;
285+
}
286+
287+
private static String reverseHost(String s) {
288+
if (s == null || s.isEmpty()) return s;
289+
290+
StringBuilder result = new StringBuilder();
291+
int end = s.length();
292+
293+
while (end > 0) {
294+
int start = s.lastIndexOf('.', end - 1);
295+
296+
if (result.length() > 0) {
297+
result.append(',');
298+
}
299+
300+
if (start == -1) {
301+
if (end == s.length()) return s;
302+
result.append(s, 0, end);
303+
break;
304+
} else {
305+
result.append(s, start + 1, end);
306+
end = start;
307+
}
308+
}
309+
310+
return result.toString();
311+
}
312+
313+
/**
314+
* Removes leading and trailing spaces from a string.
315+
*/
316+
private static String trimSpaces(String s) {
317+
int start = 0;
318+
int end = s.length();
319+
320+
while (start < end && s.charAt(start) == ' ') {
321+
start++;
322+
}
323+
324+
while (end > start && s.charAt(end - 1) == ' ') {
325+
end--;
326+
}
327+
328+
return s.substring(start, end);
329+
}
330+
331+
private static boolean isDefaultPort(String scheme, String port) {
332+
return (scheme.equalsIgnoreCase("http") && port.equals("80")) ||
333+
(scheme.equalsIgnoreCase("https") && port.equals("443"));
334+
}
335+
194336
static String normalizePathSegments(String path) {
195-
ArrayList<String> output = new ArrayList<>();
196-
for (String segment : path.split("/")) {
197-
switch (segment) {
198-
case "":
199-
case ".":
200-
break;
201-
case "..":
202-
if (!output.isEmpty()) {
203-
output.remove(output.size() - 1);
204-
}
205-
break;
206-
default:
207-
output.add(segment);
208-
break;
337+
if (path == null || path.isEmpty()) return "/";
338+
339+
int len = path.length();
340+
int[] segmentStarts = new int[len / 2 + 1];
341+
int[] segmentEnds = new int[len / 2 + 1];
342+
int size = 0;
343+
344+
for (int i = 0; i < len; ) {
345+
// Skip slashes
346+
if (path.charAt(i) == '/') {
347+
i++;
348+
continue;
349+
}
350+
351+
// Find end of segment
352+
int start = i;
353+
while (i < len && path.charAt(i) != '/') {
354+
i++;
355+
}
356+
int end = i;
357+
358+
int segmentLen = end - start;
359+
//noinspection StatementWithEmptyBody
360+
if (segmentLen == 1 && path.charAt(start) == '.') {
361+
// Ignore "."
362+
} else if (segmentLen == 2 && path.charAt(start) == '.' && path.charAt(start + 1) == '.') {
363+
// Handle ".."
364+
if (size > 0) size--;
365+
} else {
366+
// Valid segment
367+
segmentStarts[size] = start;
368+
segmentEnds[size] = end;
369+
size++;
209370
}
210371
}
211-
return "/" + String.join("/", output);
372+
373+
if (size == 0) return "/";
374+
375+
StringBuilder sb = new StringBuilder(len);
376+
for (int i = 0; i < size; i++) {
377+
sb.append('/').append(path, segmentStarts[i], segmentEnds[i]);
378+
}
379+
return sb.toString();
212380
}
213381

214382
static String normalizePercentEncoding(String s) {
@@ -225,12 +393,23 @@ private static String fullyPercentDecode(String s) {
225393
}
226394

227395
public static String percentEncodeIllegals(String s) {
396+
// optimisation: in the common case there are none, return the original string
397+
boolean seen = false;
398+
for (int i = 0; i < s.length(); i++) {
399+
char c = s.charAt(i);
400+
if (c == '%' || c == '#' || c <= 0x20 || c >= 0x7f) {
401+
seen = true;
402+
break;
403+
}
404+
}
405+
if (!seen) return s;
406+
228407
StringBuilder out = new StringBuilder();
229408
byte[] bytes = s.getBytes(UTF_8);
230409
for (byte rawByte : bytes) {
231410
int b = rawByte & 0xff;
232411
if (b == '%' || b == '#' || b <= 0x20 || b >= 0x7f) {
233-
out.append('%').append(String.format("%02x", b));
412+
appendPercentEncoding(out, (byte) b);
234413
} else {
235414
out.append((char) b);
236415
}
@@ -243,6 +422,7 @@ public static String percentPlusDecode(String s) {
243422
}
244423

245424
private static String percentDecode(String s) {
425+
if (s.indexOf('%') == -1) return s;
246426
ByteBuffer bb = null;
247427
StringBuilder out = new StringBuilder();
248428
for (int i = 0; i < s.length(); i++) {
@@ -277,7 +457,7 @@ private static void tryDecodeUtf8(ByteBuffer bb, StringBuilder out) {
277457
CoderResult result = decoder.decode(bb, cb, true);
278458
if (result.isMalformed()) {
279459
for (int i = 0; i < result.length(); i++) {
280-
out.append('%').append(String.format("%02x", bb.get()));
460+
appendPercentEncoding(out, bb.get());
281461
}
282462
}
283463
out.append(cb.flip());

0 commit comments

Comments
 (0)