eXist-db · joewiz · Jun 7, 2026 · Jun 7, 2026 · reinhapa · Jun 7, 2026
@@ -98,7 +98,8 @@ public Sequence eval(Sequence[] args, Sequence contextSequence)
 			} else if(isCalledAs("encode-uri")) {
 				return new AnyURIValue(this, URIUtils.encodeXmldbUriFor(args[0].getStringValue()));
 			} else {
-				return new StringValue(this, URIUtils.urlDecodeUtf8(args[0].getStringValue()));
+				// RFC 3986 percent-decoding: '+' is a literal plus, not a space (#1824, #44)
+				return new StringValue(this, URIUtils.decodeForURI(args[0].getStringValue()));
 			}
 		} catch(final URISyntaxException e) {
             logger.error(e.getMessage(), e);

@@ -245,7 +245,77 @@ public static String encodeForURI(final String pathComponent) {
 
 		return new String(buf.buf, 0, buf.count);
 	}
-
+
+	/**
+	 * Reverses the percent-encoding produced by {@link #encodeForURI(String)} for a URI path
+	 * component.
+	 *
+	 * <p>Each {@code %XX} triplet, where both {@code X} are hex digits in either case, yields one
+	 * byte. A run of adjacent triplets is collected and converted to text as one UTF-8 byte
+	 * sequence, so multi-byte characters are reassembled. Any other character is copied to the
+	 * result unchanged.</p>
+	 *
+	 * <p>{@code '+'} is kept as a literal plus sign, per RFC 3986. That is the difference from
+	 * {@link #urlDecodeUtf8(String)}, which wraps {@link java.net.URLDecoder} and therefore applies
+	 * application/x-www-form-urlencoded rules. The xmldb URI functions rely on this to round-trip
+	 * names (see eXist-db/exist#1824, #44): {@code decodeForURI(encodeForURI(s))} equals {@code s}
+	 * for every {@code s}.</p>
+	 *
+	 * <p>The decoding is done by hand rather than through {@link java.net.URI#getPath()} on purpose.
+	 * {@code xmldb:decode}/{@code xmldb:decode-uri} accept arbitrary strings, and
+	 * {@code java.net.URI} throws {@code URISyntaxException} on inputs that are valid here (a
+	 * literal space, a trailing or malformed {@code %}, characters such as <code>{</code> or
+	 * <code>}</code>) and silently truncates at {@code '?'} and {@code '#'} by parsing the rest as
+	 * a query or fragment. This decoder never throws and never truncates: a {@code '%'} that is not
+	 * followed by two hex digits is kept verbatim. See {@code URIUtilsTest}.</p>
+	 *
+	 * @param uriComponent the percent-encoded path component to decode.
+	 *
+	 * @return the decoded path component.
+	 */
+	public static String decodeForURI(final String uriComponent) {
+		// without any '%' there cannot be an escape, so the input is already the answer
+		if (uriComponent.indexOf('%') < 0) {
+			return uriComponent;
+		}
+
+		final int length = uriComponent.length();
+		final StringBuilder decoded = new StringBuilder(length);
+		// bytes of the current run of consecutive %XX escapes, not yet turned into characters
+		final java.io.ByteArrayOutputStream escapedBytes = new java.io.ByteArrayOutputStream();
+
+		int pos = 0;
+		while (pos < length) {
+			final char current = uriComponent.charAt(pos);
+			final boolean wellFormedEscape = current == '%'
+					&& pos + 2 < length
+					&& isHexDigit(uriComponent.charAt(pos + 1))
+					&& isHexDigit(uriComponent.charAt(pos + 2));
+
+			if (wellFormedEscape) {
+				final int highNibble = hexValue(uriComponent.charAt(pos + 1));
+				final int lowNibble = hexValue(uriComponent.charAt(pos + 2));
+				escapedBytes.write((highNibble << 4) | lowNibble);
+				pos += 3;
+				continue;
+			}
+
+			// a literal character ends the run of escapes: decode the collected bytes first
+			if (escapedBytes.size() != 0) {
+				decoded.append(escapedBytes.toString(UTF_8));
+				escapedBytes.reset();
+			}
+			decoded.append(current);
+			pos++;
+		}
+
+		// the input may end in the middle of a run of escapes
+		if (escapedBytes.size() != 0) {
+			decoded.append(escapedBytes.toString(UTF_8));
+		}
+		return decoded.toString();
+	}
+
+	/**
+	 * Tells whether a character is an ASCII hexadecimal digit: {@code 0-9}, {@code A-F} or
+	 * {@code a-f}.
+	 *
+	 * @param c the character to test.
+	 *
+	 * @return true if the character is an ASCII hex digit, false otherwise.
+	 */
+	private static boolean isHexDigit(final char c) {
+		if ('0' <= c && c <= '9') {
+			return true;
+		}
+		if ('A' <= c && c <= 'F') {
+			return true;
+		}
+		return 'a' <= c && c <= 'f';
+	}
+
+	/**
+	 * Converts a hexadecimal digit character to its numeric value.
+	 *
+	 * The argument is not validated: anything that is neither {@code 0-9} nor {@code A-F} is
+	 * computed as if it were a lower-case hex letter.
+	 *
+	 * @param c the hex digit character.
+	 *
+	 * @return the value 0-15 for a hex digit.
+	 */
+	private static int hexValue(final char c) {
+		if (c >= '0' && c <= '9') {
+			return c - '0';
+		}
+		// letters start at ten; choose the base character matching the letter's case
+		final char base = (c >= 'A' && c <= 'F') ? 'A' : 'a';
+		return c - base + 10;
+	}
+
 	public static String iriToURI(String uriPart) {
 		String result = urlEncodeUtf8(uriPart);
 		result = result.replaceAll("%23", "#");

@@ -192,4 +192,99 @@ void encodeForURIPathComponentUtf8() {
         encoded = URIUtils.encodeForURI("\uD802\uDD07");
         assertEquals("%F0%90%A4%87", encoded);
     }
+
+    /**
+     * Guards against the regression reported in eXist-db/exist#1824 and #44: a plus sign must come
+     * out of decodeForURI as a plus sign, not as the space that URLDecoder's form-encoding rules
+     * would produce.
+     */
+    @Test
+    void decodeForURIPlusIsLiteral() {
+        // "%2B" is the escape for '+'
+        final String fromEscapedPlus = URIUtils.decodeForURI("1%2B2");
+        assertEquals("1+2", fromEscapedPlus);
+
+        // input without any '%': the bare '+' is handed back as it is
+        final String fromBarePlus = URIUtils.decodeForURI("a+b");
+        assertEquals("a+b", fromBarePlus);
+    }
+
+    /**
+     * Checks the escapes for a space and for the percent sign itself, including an escaped
+     * percent sign that is followed by hex digits or by another escaped percent sign.
+     */
+    @Test
+    void decodeForURISpaceAndPercent() {
+        // "%20" is a space
+        final String withSpace = URIUtils.decodeForURI("hello%20world");
+        assertEquals("hello world", withSpace);
+
+        // "%25" is a percent sign
+        final String withPercent = URIUtils.decodeForURI("99%25");
+        assertEquals("99%", withPercent);
+
+        // the name "%2F" is encoded as "%252F"; decoding must stop at "%2F" and not decode twice
+        final String literalEscape = URIUtils.decodeForURI("%252F");
+        assertEquals("%2F", literalEscape);
+
+        // two escaped percent signs in a row
+        final String doublePercent = URIUtils.decodeForURI("99%25%25100");
+        assertEquals("99%%100", doublePercent);
+    }
+
+    /**
+     * Letters, digits and the marks {@code - . _ ~} contain nothing to decode and must be
+     * returned unchanged.
+     */
+    @Test
+    void decodeForURIUnreservedUnchanged() {
+        final String unreserved = "ABCabc019-._~";
+        final String decoded = URIUtils.decodeForURI(unreserved);
+        assertEquals(unreserved, decoded);
+    }
+
+    /**
+     * Consecutive escapes must be combined into a single UTF-8 encoded character, for encodings
+     * that are two, three and four bytes long.
+     */
+    @Test
+    void decodeForURIUtf8() {
+        // yen sign: two bytes
+        final String twoBytes = URIUtils.decodeForURI("%C2%A5");
+        assertEquals("\u00A5", twoBytes);
+
+        // samaritan letter tsasdiy: three bytes
+        final String threeBytes = URIUtils.decodeForURI("%E0%A0%91");
+        assertEquals("\u0811", threeBytes);
+
+        // phoenician letter het: four bytes, a surrogate pair in Java
+        final String fourBytes = URIUtils.decodeForURI("%F0%90%A4%87");
+        assertEquals("\uD802\uDD07", fourBytes);
+    }
+
+    /**
+     * Encoding a name with encodeForURI and decoding the result with decodeForURI must give the
+     * name back unchanged. The xmldb URI functions rely on this inverse relationship.
+     */
+    @Test
+    void decodeForURIRoundTripsEncodeForURI() {
+        final String[] names = {
+                "plain", "dash-case", "file.ext", "snake_case", "~home",
+                "hello world", "1+2", "99%", "%2F", "99%%100",
+                "a:b", "x/y", "Goodbye?", "#comment", "[predicate", "predicate]", "adam@work",
+                "Hello!", "$100", "Jack&Jill", "it's", "(comment", "comment)", "1*2", "x,y", "a;b", "n=1",
+                "caf\u00E9", "\u041F\u0440\u0438\u0432\u0435\u0442", "\u6587\u66F8", "\u00A5", "\u0811", "\uD802\uDD07"
+        };
+        for (int i = 0; i < names.length; i++) {
+            final String original = names[i];
+            final String encoded = URIUtils.encodeForURI(original);
+            final String roundTripped = URIUtils.decodeForURI(encoded);
+            assertEquals(original, roundTripped, "encode/decode round-trip failed for: " + original);
+        }
+    }
+
+    /**
+     * xmldb:decode/xmldb:decode-uri accept arbitrary user strings, so decodeForURI must neither
+     * throw nor truncate on input that encodeForURI would never produce. Every case below is one
+     * that {@code new java.net.URI(s).getPath()} mishandles, either by throwing
+     * URISyntaxException or by silently dropping everything from a '?' or '#' onward, which is
+     * why decodeForURI is a standalone decoder.
+     */
+    @Test
+    void decodeForURIRobustOnMalformedAndReservedInput() {
+        // inputs that must come back exactly as they went in
+        final String[] keptVerbatim = {
+                "100%",     // lone '%' without two hex digits after it (URI: throws)
+                "a%2",      // truncated escape (URI: throws)
+                "a%ZZb",    // '%' followed by non-hex characters (URI: throws)
+                "a b",      // unencoded space (URI: throws)
+                "a?b",      // '?' is an ordinary character here (URI: truncates to "a")
+                "a#b",      // '#' is an ordinary character here (URI: truncates to "a")
+                "a{b}c"     // braces are ordinary characters (URI: throws)
+        };
+        for (final String input : keptVerbatim) {
+            assertEquals(input, URIUtils.decodeForURI(input));
+        }
+
+        // a valid escape is still decoded when it sits next to characters URI would reject
+        final String mixed = URIUtils.decodeForURI("a%20b?c");
+        assertEquals("a b?c", mixed);
+    }
 }
@@ -0,0 +1,62 @@
+(:
+ : eXist-db Open Source Native XML Database
+ : Copyright (C) 2001 The eXist-db Authors
+ :
+ : info@exist-db.org
+ : http://www.exist-db.org
+ :
+ : This library is free software; you can redistribute it and/or
+ : modify it under the terms of the GNU Lesser General Public
+ : License as published by the Free Software Foundation; either
+ : version 2.1 of the License, or (at your option) any later version.
+ :
+ : This library is distributed in the hope that it will be useful,
+ : but WITHOUT ANY WARRANTY; without even the implied warranty of
+ : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ : Lesser General Public License for more details.
+ :
+ : You should have received a copy of the GNU Lesser General Public
+ : License along with this library; if not, write to the Free Software
+ : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ :)
+xquery version "3.1";
+
+module namespace t="http://exist-db.org/testsuite/xmldb-uri-encoding";
+
+declare namespace test="http://exist-db.org/xquery/xqsuite";
+
+(:~
+ : Regression tests for xmldb:decode / xmldb:decode-uri percent-decoding.
+ : eXist-db/exist#1824 and #44: a '+' must be decoded as a literal plus sign (RFC 3986),
+ : not turned into a space (which is application/x-www-form-urlencoded behavior).
+ :)
+
+(:~
+ : The escape %2B passed to xmldb:decode-uri must be decoded to a plus sign.
+ :)
+declare
+    %test:assertEquals("a+b")
+function t:decode-uri-plus-is-literal-when-encoded() {
+    let $encoded := xs:anyURI("a%2Bb")
+    return
+        xmldb:decode-uri($encoded)
+};
+
+(:~
+ : An unescaped plus sign passed to xmldb:decode-uri must stay a plus sign, not become a space.
+ :)
+declare
+    %test:assertEquals("a+b")
+function t:decode-uri-bare-plus-is-literal() {
+    let $unescaped := xs:anyURI("a+b")
+    return
+        xmldb:decode-uri($unescaped)
+};
+
+(:~
+ : The escape %2B passed to xmldb:decode as a plain string must be decoded to a plus sign.
+ :)
+declare
+    %test:assertEquals("a+b")
+function t:decode-plus-is-literal-when-encoded() {
+    let $encoded := "a%2Bb"
+    return
+        xmldb:decode($encoded)
+};
+
+(:~
+ : The escape %20 passed to xmldb:decode-uri must be decoded to a space.
+ :)
+declare
+    %test:assertEquals("a b")
+function t:decode-uri-percent-20-is-space() {
+    let $encoded := xs:anyURI("a%20b")
+    return
+        xmldb:decode-uri($encoded)
+};
+
+(:~
+ : A file name mixing escaped spaces, parentheses and a plus sign must be decoded completely
+ : by xmldb:decode-uri.
+ :)
+declare
+    %test:assertEquals("My Report (2024)+final.xml")
+function t:decode-uri-mixed() {
+    let $encoded := xs:anyURI("My%20Report%20%282024%29%2Bfinal.xml")
+    return
+        xmldb:decode-uri($encoded)
+};