@@ -23,8 +23,25 @@ public class URIs {
2323 "(?:[#](.*))?" + // fragment
2424 "\\ Z" , DOTALL );
2525 private static final int SCHEME = 1 , SLASHES = 2 , AUTHORITY = 3 , PATH = 4 , QUERY = 5 , FRAGMENT = 6 ;
26- private final static Pattern AUTHORITY_REGEX = Pattern .compile ("([^@]*@)?(.*?)(?::([0-9]+))?" , DOTALL );
27- private final static Pattern IPV4_REGEX = Pattern .compile ("[0-9]{1,3}\\ .[0-9]{1,3}\\ .[0-9]{1,3}\\ .[0-9]{1,3}" );
// Splits a URI into the named groups scheme, authority, path, query and fragment.
// toNormalizedSurt() matches it against the whole (trimmed, scheme-prefixed) input.
26+ private final static Pattern SURT_URL_REGEX = Pattern .compile (
27+ "\\ A(?:(?<scheme>[A-Za-z][A-Za-z0-9+\\ -.]*):)?" +
28+ "(?:(?://(?<authority>[^/?#]*))?" +
29+ "(?<path>[^?#]*)" +
30+ "(?:\\ ?(?<query>[^#]*))?)?" +
31+ "(?:#(?<fragment>.*))?\\ Z" , DOTALL );
// Host prefix stripped by trimWWW() via lookingAt(), i.e. only at the start of the host.
// NOTE(review): presumably intended to match "www.", "www2." etc. - confirm against the unmangled literal.
32+ private final static Pattern WWW_REGEX = Pattern .compile ("www\\ d*\\ ." );
// Used with lookingAt() to decide whether the input already starts with a scheme;
// when it does not, toNormalizedSurt() prepends "http://".
33+ private final static Pattern HAS_PROTOCOL_REGEX = Pattern .compile ("\\ A[a-zA-Z][a-zA-Z0-9+\\ -.]*:" );
// Session-id style query parameters. Every match (including its trailing '&', if any)
// is removed from the lower-cased query string before the parameters are sorted.
34+ private final static Pattern QUERY_SESSIONID_REGEX = Pattern .compile (
35+ "(?:jsessionid=[0-9a-zA-Z]{32}"
36+ + "|phpsessid=[0-9a-zA-Z]{32}"
37+ + "|sid=[0-9a-zA-Z]{32}"
38+ + "|aspsessionid[a-zA-Z]{8}=[a-zA-Z]{24}"
39+ + "|cfid=[^&]+&cftoken=[^&]+"
40+ + ")(?:&|$)" );
// Session tokens embedded in the path. Each pattern is applied to the lower-cased path with
// replaceFirst("$1"), so the first match is replaced by its capture group 1 (the part after the token).
41+ private static final Pattern [] PATH_SESSIONID_REGEXS = new Pattern []{
42+ Pattern .compile ("/\\ ([a-z]\\ ([0-9a-z]{24}\\ )\\ )(/[^?]+.aspx)" ),
43+ Pattern .compile ("/\\ ([0-9a-z]{24}\\ )(/[^?]+.aspx)" ),
44+ };
2845
2946 // According to https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/net/URI.html#uri-syntax-and-components-heading
3047 private static final String ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" ;
@@ -36,6 +53,7 @@ public class URIs {
3653
3754 private static final BitSet PATH_ALLOWED = charBitSet ("/@" + UNRESERVED + PUNCT );
3855 private static final BitSet QUERY_ALLOWED = charBitSet (UNRESERVED + RESERVED );
56+ private static final char [] HEX_DIGITS = "0123456789abcdef" .toCharArray ();
3957
4058 private static BitSet charBitSet (String chars ) {
4159 BitSet bitSet = new BitSet (128 );
@@ -135,54 +153,117 @@ private static String percentEncodeIfNeeded(String s, BitSet allowed) {
135153 out .append (c ); // an 'other' unicode character
136154 } else {
137155 for (byte b : Character .toString (c ).getBytes (UTF_8 )) {
138- out . append ( '%' ). append ( String . format ( "%02x" , ( int ) b ) );
156+ appendPercentEncoding ( out , b );
139157 }
140158 }
141159 }
142160 return out .toString ();
143161 }
144162
163+ private static void appendPercentEncoding (StringBuilder out , byte b ) {
164+ out .append ('%' );
165+ out .append (HEX_DIGITS [(b >> 4 ) & 0xf ]);
166+ out .append (HEX_DIGITS [b & 0xf ]);
167+ }
168+
169+ /**
170+ * Converts a given URI into its normalized SURT (Sort-friendly URI Reordering Transform) format.
171+ * <p>
172+ * There are many slightly different implementations of SURT. This one tries to produce the same output as the
173+ * Python <a href="https://github.com/internetarchive/surt">surt</a> module for compatibility with pywb.
174+ */
145175 public static String toNormalizedSurt (String uri ) {
146- Matcher urlMatcher = URL_REGEX .matcher (uri );
176+ if (uri .startsWith ("filedesc" )) return uri ;
177+
178+ uri = trimSpaces (uri );
179+ uri = uri .replace ("\r " , "" );
180+ uri = uri .replace ("\n " , "" );
181+ uri = uri .replace ("\t " , "" );
182+
183+ if (!uri .isEmpty () && !HAS_PROTOCOL_REGEX .matcher (uri ).lookingAt ()) {
184+ uri = "http://" + uri ;
185+ }
186+
187+ Matcher urlMatcher = SURT_URL_REGEX .matcher (uri );
147188 if (!urlMatcher .matches ()) {
148- throw new IllegalArgumentException ( "invalid URL: " + uri );
189+ return uri ; // shouldn't be possible
149190 }
150- String authority = urlMatcher .group (AUTHORITY );
151- String path = urlMatcher .group (PATH );
152- String query = urlMatcher .group (QUERY );
153- String fragment = urlMatcher .group (FRAGMENT );
191+ String scheme = urlMatcher .group ("scheme" );
192+ String authority = urlMatcher .group ("authority" );
193+ String path = urlMatcher .group ("path" );
194+ String query = urlMatcher .group ("query" );
195+ String fragment = urlMatcher .group ("fragment" );
196+
197+ String host = null ;
198+ String port = null ;
154199
155- Matcher authorityMatcher = AUTHORITY_REGEX .matcher (authority );
156- if (!authorityMatcher .matches ()) throw new IllegalStateException ("authority didn't match" );
157- String host = authorityMatcher .group (2 );
158- String port = authorityMatcher .group (3 );
200+ if (authority != null ) {
201+ int atIndex = authority .indexOf ('@' );
202+ int colonIndex = -1 ;
203+ for (int i = authority .length () - 1 ; i > atIndex ; i --) {
204+ char c = authority .charAt (i );
205+ if (c == ':' ) {
206+ colonIndex = i ;
207+ break ;
208+ } else if (!isAsciiDigit (c )) {
209+ break ;
210+ }
211+ }
212+ if (colonIndex >= 0 ) {
213+ host = authority .substring (atIndex + 1 , colonIndex );
214+ port = authority .substring (colonIndex + 1 );
215+ } else {
216+ host = authority .substring (atIndex + 1 );
217+ }
218+ }
159219
160220 StringBuilder output = new StringBuilder ();
161- if (IPV4_REGEX .matcher (host ).matches ()) {
162- output .append (host );
221+ if (host == null ) {
222+ if (scheme != null ) {
223+ output .append (scheme );
224+ output .append (':' );
225+ }
163226 } else {
164- List < String > hostSegments = Arrays . asList ( host . toLowerCase ( Locale . ROOT ). split ( " \\ ." ));
165- if (hostSegments . get ( 0 ). equals ( "www " )) {
166- hostSegments = hostSegments . subList (1 , hostSegments . size () );
227+ // remove IPv6 brackets
228+ if (host . startsWith ( "[ " )) {
229+ host = host . substring (1 , host . length () - 1 );
167230 }
168- Collections .reverse (hostSegments );
169- output .append (normalizePercentEncoding (String .join ("," , hostSegments )));
170- }
171- if (port != null ) {
172- output .append (':' );
173- output .append (port );
231+
232+ host = host .toLowerCase (Locale .ROOT );
233+ host = trimWWW (host );
234+ host = reverseHost (host );
235+ output .append (normalizePercentEncoding (host ));
236+ if (port != null && !port .isEmpty () && !isDefaultPort (scheme , port )) {
237+ output .append (':' );
238+ output .append (port );
239+ }
240+ output .append (')' );
174241 }
175- output . append ( ')' );
242+
176243 if (path != null ) {
177- output .append (normalizePercentEncoding (normalizePathSegments (path .toLowerCase (Locale .ROOT ))));
244+ path = fullyPercentDecode (path );
245+ path = path .toLowerCase (Locale .ROOT );
246+ if (host != null ) path = normalizePathSegments (path );
247+ for (Pattern PATH_SESSIONID : PATH_SESSIONID_REGEXS ) {
248+ path = PATH_SESSIONID .matcher (path ).replaceFirst ("$1" );
249+ }
250+ output .append (percentEncodeIllegals (path ));
178251 } else {
179252 output .append ('/' );
180253 }
181- if (query != null ) {
254+ if (query != null && ! query . isEmpty () ) {
182255 output .append ('?' );
183- String [] params = normalizePercentEncoding (query ).toLowerCase (Locale .ROOT ).split ("&" , -1 );
256+ query = normalizePercentEncoding (query );
257+ query = query .toLowerCase (Locale .ROOT );
258+ query = QUERY_SESSIONID_REGEX .matcher (query ).replaceAll ("" );
259+ String [] params = query .split ("&" , -1 );
184260 Arrays .sort (params );
185- output .append (String .join ("&" , params ));
261+ boolean first = true ;
262+ for (String param : params ) {
263+ if (!first ) output .append ('&' );
264+ first = false ;
265+ output .append (param );
266+ }
186267 }
187268 if (fragment != null ) {
188269 output .append ('#' );
@@ -191,24 +272,111 @@ public static String toNormalizedSurt(String uri) {
191272 return output .toString ();
192273 }
193274
275+ private static boolean isAsciiDigit (char c ) {
276+ return c >= '0' && c <= '9' ;
277+ }
278+
279+ private static String trimWWW (String host ) {
280+ Matcher matcher = WWW_REGEX .matcher (host );
281+ if (matcher .lookingAt ()) {
282+ return host .substring (matcher .end ());
283+ }
284+ return host ;
285+ }
286+
287+ private static String reverseHost (String s ) {
288+ if (s == null || s .isEmpty ()) return s ;
289+
290+ StringBuilder result = new StringBuilder ();
291+ int end = s .length ();
292+
293+ while (end > 0 ) {
294+ int start = s .lastIndexOf ('.' , end - 1 );
295+
296+ if (result .length () > 0 ) {
297+ result .append (',' );
298+ }
299+
300+ if (start == -1 ) {
301+ if (end == s .length ()) return s ;
302+ result .append (s , 0 , end );
303+ break ;
304+ } else {
305+ result .append (s , start + 1 , end );
306+ end = start ;
307+ }
308+ }
309+
310+ return result .toString ();
311+ }
312+
313+ /**
314+ * Removes leading and trailing spaces from a string.
315+ */
316+ private static String trimSpaces (String s ) {
317+ int start = 0 ;
318+ int end = s .length ();
319+
320+ while (start < end && s .charAt (start ) == ' ' ) {
321+ start ++;
322+ }
323+
324+ while (end > start && s .charAt (end - 1 ) == ' ' ) {
325+ end --;
326+ }
327+
328+ return s .substring (start , end );
329+ }
330+
331+ private static boolean isDefaultPort (String scheme , String port ) {
332+ return (scheme .equalsIgnoreCase ("http" ) && port .equals ("80" )) ||
333+ (scheme .equalsIgnoreCase ("https" ) && port .equals ("443" ));
334+ }
335+
194336 static String normalizePathSegments (String path ) {
195- ArrayList <String > output = new ArrayList <>();
196- for (String segment : path .split ("/" )) {
197- switch (segment ) {
198- case "" :
199- case "." :
200- break ;
201- case ".." :
202- if (!output .isEmpty ()) {
203- output .remove (output .size () - 1 );
204- }
205- break ;
206- default :
207- output .add (segment );
208- break ;
337+ if (path == null || path .isEmpty ()) return "/" ;
338+
339+ int len = path .length ();
340+ int [] segmentStarts = new int [len / 2 + 1 ];
341+ int [] segmentEnds = new int [len / 2 + 1 ];
342+ int size = 0 ;
343+
344+ for (int i = 0 ; i < len ; ) {
345+ // Skip slashes
346+ if (path .charAt (i ) == '/' ) {
347+ i ++;
348+ continue ;
349+ }
350+
351+ // Find end of segment
352+ int start = i ;
353+ while (i < len && path .charAt (i ) != '/' ) {
354+ i ++;
355+ }
356+ int end = i ;
357+
358+ int segmentLen = end - start ;
359+ //noinspection StatementWithEmptyBody
360+ if (segmentLen == 1 && path .charAt (start ) == '.' ) {
361+ // Ignore "."
362+ } else if (segmentLen == 2 && path .charAt (start ) == '.' && path .charAt (start + 1 ) == '.' ) {
363+ // Handle ".."
364+ if (size > 0 ) size --;
365+ } else {
366+ // Valid segment
367+ segmentStarts [size ] = start ;
368+ segmentEnds [size ] = end ;
369+ size ++;
209370 }
210371 }
211- return "/" + String .join ("/" , output );
372+
373+ if (size == 0 ) return "/" ;
374+
375+ StringBuilder sb = new StringBuilder (len );
376+ for (int i = 0 ; i < size ; i ++) {
377+ sb .append ('/' ).append (path , segmentStarts [i ], segmentEnds [i ]);
378+ }
379+ return sb .toString ();
212380 }
213381
214382 static String normalizePercentEncoding (String s ) {
@@ -225,12 +393,23 @@ private static String fullyPercentDecode(String s) {
225393 }
226394
227395 public static String percentEncodeIllegals (String s ) {
396+ // optimisation: in the common case there are none, return the original string
397+ boolean seen = false ;
398+ for (int i = 0 ; i < s .length (); i ++) {
399+ char c = s .charAt (i );
400+ if (c == '%' || c == '#' || c <= 0x20 || c >= 0x7f ) {
401+ seen = true ;
402+ break ;
403+ }
404+ }
405+ if (!seen ) return s ;
406+
228407 StringBuilder out = new StringBuilder ();
229408 byte [] bytes = s .getBytes (UTF_8 );
230409 for (byte rawByte : bytes ) {
231410 int b = rawByte & 0xff ;
232411 if (b == '%' || b == '#' || b <= 0x20 || b >= 0x7f ) {
233- out . append ( '%' ). append ( String . format ( "%02x" , b ) );
412+ appendPercentEncoding ( out , ( byte ) b );
234413 } else {
235414 out .append ((char ) b );
236415 }
@@ -243,6 +422,7 @@ public static String percentPlusDecode(String s) {
243422 }
244423
245424 private static String percentDecode (String s ) {
425+ if (s .indexOf ('%' ) == -1 ) return s ;
246426 ByteBuffer bb = null ;
247427 StringBuilder out = new StringBuilder ();
248428 for (int i = 0 ; i < s .length (); i ++) {
@@ -277,7 +457,7 @@ private static void tryDecodeUtf8(ByteBuffer bb, StringBuilder out) {
277457 CoderResult result = decoder .decode (bb , cb , true );
278458 if (result .isMalformed ()) {
279459 for (int i = 0 ; i < result .length (); i ++) {
280- out . append ( '%' ). append ( String . format ( "%02x" , bb .get () ));
460+ appendPercentEncoding ( out , bb .get ());
281461 }
282462 }
283463 out .append (cb .flip ());
0 commit comments