diff --git a/exist-core/src/main/java/org/exist/xquery/functions/xmldb/XMLDBURIFunctions.java b/exist-core/src/main/java/org/exist/xquery/functions/xmldb/XMLDBURIFunctions.java index 23aa8c1f01b..c9de6bdf10c 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/xmldb/XMLDBURIFunctions.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/xmldb/XMLDBURIFunctions.java @@ -98,7 +98,8 @@ public Sequence eval(Sequence[] args, Sequence contextSequence) } else if(isCalledAs("encode-uri")) { return new AnyURIValue(this, URIUtils.encodeXmldbUriFor(args[0].getStringValue())); } else { - return new StringValue(this, URIUtils.urlDecodeUtf8(args[0].getStringValue())); + // RFC 3986 percent-decoding: '+' is a literal plus, not a space (#1824, #44) + return new StringValue(this, URIUtils.decodeForURI(args[0].getStringValue())); } } catch(final URISyntaxException e) { logger.error(e.getMessage(), e); diff --git a/exist-core/src/main/java/org/exist/xquery/util/URIUtils.java b/exist-core/src/main/java/org/exist/xquery/util/URIUtils.java index b87ec3f25d8..b9c0c34609d 100644 --- a/exist-core/src/main/java/org/exist/xquery/util/URIUtils.java +++ b/exist-core/src/main/java/org/exist/xquery/util/URIUtils.java @@ -245,7 +245,77 @@ public static String encodeForURI(final String pathComponent) { return new String(buf.buf, 0, buf.count); } - + + /** + * Decodes a percent-encoded URI path component back to its literal form, the inverse of + * {@link #encodeForURI(String)}. Each {@code %XX} escape is decoded to a byte; consecutive + * escapes are interpreted together as a UTF-8 byte sequence. Every other character is left + * unchanged. + * + * Unlike {@link #urlDecodeUtf8(String)} (which wraps {@link java.net.URLDecoder} and therefore + * follows application/x-www-form-urlencoded rules), this method treats {@code '+'} as a literal + * plus sign, per RFC 3986. 
This is required for round-tripping names through the xmldb URI + * functions (see eXist-db/exist#1824, #44): {@code decodeForURI(encodeForURI(s))} equals + * {@code s} for every {@code s}. + * + *

<p>This is deliberately a standalone percent-decoder rather than a call to + * {@link java.net.URI#getPath()}. {@code java.net.URI} is unsuitable as a general decoder for the + * arbitrary strings that {@code xmldb:decode}/{@code xmldb:decode-uri} accept: it throws + * {@code URISyntaxException} on inputs that are perfectly valid here (a literal space, a trailing + * or malformed {@code %}, characters such as <code>{</code> or <code>}</code>), and worse, it + * silently truncates at {@code '?'} and {@code '#'} (parsing the remainder as a query or + * fragment) — losing data with no error. This decoder never throws on non-null input and never truncates: any + * {@code '%'} not followed by two hex digits is preserved verbatim. The one lossy case is a run of escapes that is not well-formed UTF-8, which decodes to U+FFFD replacement characters. See {@code URIUtilsTest}.</p>

+ * + * @param uriComponent the percent-encoded path component to decode; must not be {@code null}. + * + * @return the decoded path component; {@code uriComponent} itself when it contains no {@code '%'}. + */ + public static String decodeForURI(final String uriComponent) { + if (uriComponent.indexOf('%') == -1) { + // fast path: nothing percent-encoded, nothing to decode + return uriComponent; + } + + final int len = uriComponent.length(); + final StringBuilder out = new StringBuilder(len); + final java.io.ByteArrayOutputStream pending = new java.io.ByteArrayOutputStream(); + + int i = 0; + while (i < len) { + final char c = uriComponent.charAt(i); + if (c == '%' && i + 2 < len && isHexDigit(uriComponent.charAt(i + 1)) && isHexDigit(uriComponent.charAt(i + 2))) { + pending.write((hexValue(uriComponent.charAt(i + 1)) << 4) | hexValue(uriComponent.charAt(i + 2))); + i += 3; + } else { + if (pending.size() > 0) { + out.append(pending.toString(UTF_8)); + pending.reset(); + } + out.append(c); + i++; + } + } + if (pending.size() > 0) { + out.append(pending.toString(UTF_8)); + } + return out.toString(); + } + + private static boolean isHexDigit(final char c) { + return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); + } + + private static int hexValue(final char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } + if (c >= 'A' && c <= 'F') { + return c - 'A' + 10; + } + return c - 'a' + 10; + } + public static String iriToURI(String uriPart) { String result = urlEncodeUtf8(uriPart); result = result.replaceAll("%23", "#"); diff --git a/exist-core/src/test/java/org/exist/xquery/util/URIUtilsTest.java b/exist-core/src/test/java/org/exist/xquery/util/URIUtilsTest.java index 7086a8c7080..6077ff3390e 100644 --- a/exist-core/src/test/java/org/exist/xquery/util/URIUtilsTest.java +++ b/exist-core/src/test/java/org/exist/xquery/util/URIUtilsTest.java @@ -192,4 +192,99 @@ void encodeForURIPathComponentUtf8() { encoded = URIUtils.encodeForURI("\uD802\uDD07"); assertEquals("%F0%90%A4%87", encoded); } + + /** + * decodeForURI must treat 
'+' as a literal plus sign, not a space — the regression in + * eXist-db/exist#1824 and #44, where URLDecoder's form-encoding rules turned '+' into ' '. + */ + @Test + void decodeForURIPlusIsLiteral() { + // a percent-encoded plus decodes back to a plus + assertEquals("1+2", URIUtils.decodeForURI("1%2B2")); + + // a bare '+' (nothing percent-encoded) is returned literally + assertEquals("a+b", URIUtils.decodeForURI("a+b")); + } + + @Test + void decodeForURISpaceAndPercent() { + // space + assertEquals("hello world", URIUtils.decodeForURI("hello%20world")); + + // percent sign + assertEquals("99%", URIUtils.decodeForURI("99%25")); + + // a literal "%2F" in a name encodes to "%252F" and must decode back to "%2F" + assertEquals("%2F", URIUtils.decodeForURI("%252F")); + + // double percent sign + assertEquals("99%%100", URIUtils.decodeForURI("99%25%25100")); + } + + @Test + void decodeForURIUnreservedUnchanged() { + assertEquals("ABCabc019-._~", URIUtils.decodeForURI("ABCabc019-._~")); + } + + @Test + void decodeForURIUtf8() { + // 2 byte character - yen sign + assertEquals("\u00A5", URIUtils.decodeForURI("%C2%A5")); + + // 3 byte character - samaritan letter tsaadiy + assertEquals("\u0811", URIUtils.decodeForURI("%E0%A0%91")); + + // 4 byte character - phoenician letter het + assertEquals("\uD802\uDD07", URIUtils.decodeForURI("%F0%90%A4%87")); + } + + /** + * decodeForURI is the exact inverse of encodeForURI for any input — the bijective property + * the xmldb URI functions rely on. 
+ */ + @Test + void decodeForURIRoundTripsEncodeForURI() { + final String[] names = { + "plain", "dash-case", "file.ext", "snake_case", "~home", + "hello world", "1+2", "99%", "%2F", "99%%100", + "a:b", "x/y", "Goodbye?", "#comment", "[predicate", "predicate]", "adam@work", + "Hello!", "$100", "Jack&Jill", "it's", "(comment", "comment)", "1*2", "x,y", "a;b", "n=1", + "caf\u00E9", "\u041F\u0440\u0438\u0432\u0435\u0442", "\u6587\u66F8", "\u00A5", "\u0811", "\uD802\uDD07" + }; + for (final String name : names) { + assertEquals(name, URIUtils.decodeForURI(URIUtils.encodeForURI(name)), + "encode/decode round-trip failed for: " + name); + } + } + + /** + * decodeForURI must never throw and never truncate, even on input that is not the output of + * encodeForURI — xmldb:decode/xmldb:decode-uri accept arbitrary user strings. Each case here + * is one that {@code new java.net.URI(s).getPath()} mishandles (throws URISyntaxException, or + * silently drops everything from a '?' or '#' onward), which is why this is a standalone decoder. + */ + @Test + void decodeForURIRobustOnMalformedAndReservedInput() { + // a lone '%' not followed by two hex digits is preserved verbatim (URI: throws) + assertEquals("100%", URIUtils.decodeForURI("100%")); + + // a truncated escape is preserved verbatim (URI: throws) + assertEquals("a%2", URIUtils.decodeForURI("a%2")); + + // a '%' followed by non-hex is preserved verbatim (URI: throws) + assertEquals("a%ZZb", URIUtils.decodeForURI("a%ZZb")); + + // a literal space is left as-is (URI: throws on an unencoded space) + assertEquals("a b", URIUtils.decodeForURI("a b")); + + // '?' 
and '#' are ordinary characters here, not query/fragment delimiters (URI: truncates to "a") + assertEquals("a?b", URIUtils.decodeForURI("a?b")); + assertEquals("a#b", URIUtils.decodeForURI("a#b")); + + // braces are ordinary characters (URI: throws) + assertEquals("a{b}c", URIUtils.decodeForURI("a{b}c")); + + // valid escapes still decode even when mixed with characters URI would reject or treat as delimiters + assertEquals("a b?c", URIUtils.decodeForURI("a%20b?c")); + } } diff --git a/exist-core/src/test/xquery/xmldb/uri-encoding-tests.xql b/exist-core/src/test/xquery/xmldb/uri-encoding-tests.xql new file mode 100644 index 00000000000..aa9e8438074 --- /dev/null +++ b/exist-core/src/test/xquery/xmldb/uri-encoding-tests.xql @@ -0,0 +1,62 @@ +(: + : eXist-db Open Source Native XML Database + : Copyright (C) 2001 The eXist-db Authors + : + : info@exist-db.org + : http://www.exist-db.org + : + : This library is free software; you can redistribute it and/or + : modify it under the terms of the GNU Lesser General Public + : License as published by the Free Software Foundation; either + : version 2.1 of the License, or (at your option) any later version. + : + : This library is distributed in the hope that it will be useful, + : but WITHOUT ANY WARRANTY; without even the implied warranty of + : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + : Lesser General Public License for more details. + : + : You should have received a copy of the GNU Lesser General Public + : License along with this library; if not, write to the Free Software + : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + :) +xquery version "3.1"; + +module namespace t="http://exist-db.org/testsuite/xmldb-uri-encoding"; + +declare namespace test="http://exist-db.org/xquery/xqsuite"; + +(:~ + : Regression tests for xmldb:decode / xmldb:decode-uri percent-decoding. 
+ : eXist-db/exist#1824 and #44: a '+' must be decoded as a literal plus sign (RFC 3986), + : not turned into a space (which is application/x-www-form-urlencoded behavior). + :) + +declare + %test:assertEquals("a+b") +function t:decode-uri-plus-is-literal-when-encoded() { + xmldb:decode-uri(xs:anyURI("a%2Bb")) +}; + +declare + %test:assertEquals("a+b") +function t:decode-uri-bare-plus-is-literal() { + xmldb:decode-uri(xs:anyURI("a+b")) +}; + +declare + %test:assertEquals("a+b") +function t:decode-plus-is-literal-when-encoded() { + xmldb:decode("a%2Bb") +}; + +declare + %test:assertEquals("a b") +function t:decode-uri-percent-20-is-space() { + xmldb:decode-uri(xs:anyURI("a%20b")) +}; + +declare + %test:assertEquals("My Report (2024)+final.xml") +function t:decode-uri-mixed() { + xmldb:decode-uri(xs:anyURI("My%20Report%20%282024%29%2Bfinal.xml")) +};