Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ public Sequence eval(Sequence[] args, Sequence contextSequence)
} else if(isCalledAs("encode-uri")) {
return new AnyURIValue(this, URIUtils.encodeXmldbUriFor(args[0].getStringValue()));
} else {
return new StringValue(this, URIUtils.urlDecodeUtf8(args[0].getStringValue()));
// RFC 3986 percent-decoding: '+' is a literal plus, not a space (#1824, #44)
return new StringValue(this, URIUtils.decodeForURI(args[0].getStringValue()));
}
} catch(final URISyntaxException e) {
logger.error(e.getMessage(), e);
Expand Down
72 changes: 71 additions & 1 deletion exist-core/src/main/java/org/exist/xquery/util/URIUtils.java

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, what about using Java's `java.net.URI`, as it is more precise for paths and handles encoding/decoding automatically? Provided this has no negative performance implications, it would greatly reduce the existing code within the URIUtils class.

import java.net.URI;
import java.net.URISyntaxException;

// Illustration: percent-encode a path with java.net.URI, then decode it again.
String pathPart = "my path/with spaces+plus";
try {
    // 5-argument constructor: (scheme, authority, path, query, fragment)
    URI uri = new URI(null, null, pathPart, null, null);
    String encodedPath = uri.toString(); // Automatically encodes path
    System.out.println(encodedPath); // Output: my%20path/with%20spaces+plus

    // To decode:
    URI decodedUri = new URI(encodedPath);
    String decodedPath = decodedUri.getPath();
    // The path is relative, so no leading slash is added; '+' stays a literal plus
    System.out.println(decodedPath); // Output: my path/with spaces+plus
} catch (URISyntaxException e) {
    e.printStackTrace();
}

Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,77 @@ public static String encodeForURI(final String pathComponent) {

return new String(buf.buf, 0, buf.count);
}


/**
 * Percent-decodes a URI path component; intended as the counterpart of
 * {@link #encodeForURI(String)}.
 *
 * <p>Each well-formed {@code %XX} escape (two hexadecimal digits, upper or lower case) yields
 * one byte, and every maximal run of adjacent escapes is decoded as one UTF-8 byte sequence.
 * All other characters are copied through untouched. In particular {@code '+'} remains a plus
 * sign (RFC 3986 percent-decoding), in contrast to {@link #urlDecodeUtf8(String)}, which
 * wraps {@link java.net.URLDecoder} and so applies application/x-www-form-urlencoded rules.
 * That difference is what eXist-db/exist#1824 and #44 are about.</p>
 *
 * <p>Malformed input is tolerated: a {@code '%'} that is not followed by two hex digits is
 * kept verbatim instead of raising an error. Escaped bytes that do not form valid UTF-8 are
 * not rejected either; they are replaced by the charset's replacement character, so such
 * input does not survive unchanged.</p>
 *
 * <p>The decoder is intentionally standalone rather than built on
 * {@link java.net.URI#getPath()}: {@code java.net.URI} throws {@code URISyntaxException} for
 * strings that are acceptable here (a literal space, a dangling {@code %}, braces) and treats
 * {@code '?'} and {@code '#'} as query/fragment delimiters, dropping what follows from the
 * path. See {@code URIUtilsTest}.</p>
 *
 * @param uriComponent the percent-encoded path component to decode; must not be {@code null}.
 *
 * @return the decoded path component; the same instance is returned when the input contains
 *         no {@code '%'}.
 *
 * @throws NullPointerException if {@code uriComponent} is {@code null}.
 */
public static String decodeForURI(final String uriComponent) {
    final int firstPercent = uriComponent.indexOf('%');
    if (firstPercent < 0) {
        // no '%' anywhere, so there is nothing to decode
        return uriComponent;
    }

    final int length = uriComponent.length();
    final StringBuilder decoded = new StringBuilder(length);
    // everything in front of the first '%' is copied as-is
    decoded.append(uriComponent, 0, firstPercent);

    // an escape occupies three characters per byte, so length / 3 bytes is always enough
    final byte[] escapedBytes = new byte[length / 3];
    int pos = firstPercent;
    while (pos < length) {
        // gather one maximal run of adjacent %XX escapes
        int byteCount = 0;
        while (pos + 2 < length
                && uriComponent.charAt(pos) == '%'
                && isHexDigit(uriComponent.charAt(pos + 1))
                && isHexDigit(uriComponent.charAt(pos + 2))) {
            final int high = hexValue(uriComponent.charAt(pos + 1));
            final int low = hexValue(uriComponent.charAt(pos + 2));
            escapedBytes[byteCount++] = (byte) ((high << 4) | low);
            pos += 3;
        }
        if (byteCount > 0) {
            // the run is decoded as a whole so multi-byte UTF-8 characters are reassembled
            decoded.append(new String(escapedBytes, 0, byteCount, UTF_8));
        }
        if (pos < length) {
            // the character that ended the run (possibly a stray '%') is kept literally
            decoded.append(uriComponent.charAt(pos));
            pos++;
        }
    }
    return decoded.toString();
}

/**
 * Tells whether a character is an ASCII hexadecimal digit ({@code 0-9}, {@code A-F} or
 * {@code a-f}).
 */
private static boolean isHexDigit(final char c) {
    if (c >= '0' && c <= '9') {
        return true;
    }
    if (c >= 'A' && c <= 'F') {
        return true;
    }
    return c >= 'a' && c <= 'f';
}

/**
 * Converts an ASCII hexadecimal digit to its numeric value (0-15). The result is only
 * meaningful for characters accepted by {@link #isHexDigit(char)}.
 */
private static int hexValue(final char c) {
    if ('0' <= c && c <= '9') {
        return c - '0';
    }
    final boolean upperCase = 'A' <= c && c <= 'F';
    return 10 + (upperCase ? c - 'A' : c - 'a');
}

public static String iriToURI(String uriPart) {
String result = urlEncodeUtf8(uriPart);
result = result.replaceAll("%23", "#");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,4 +192,99 @@ void encodeForURIPathComponentUtf8() {
encoded = URIUtils.encodeForURI("\uD802\uDD07");
assertEquals("%F0%90%A4%87", encoded);
}

/**
 * Checks that decodeForURI yields a literal plus sign both for the escape "%2B" and for an
 * unescaped '+', rather than a space. This is the regression described in
 * eXist-db/exist#1824 and #44, attributed there to URLDecoder's form-encoding rules.
 */
@Test
void decodeForURIPlusIsLiteral() {
    // "%2B" is the percent-encoded form of '+'
    final String fromEscape = URIUtils.decodeForURI("1%2B2");
    assertEquals("1+2", fromEscape);

    // input without any escape: the '+' must come back untouched
    final String fromBarePlus = URIUtils.decodeForURI("a+b");
    assertEquals("a+b", fromBarePlus);
}

/**
 * Checks decoding of escaped spaces and escaped percent signs, including an escaped percent
 * that is followed by text resembling another escape.
 */
@Test
void decodeForURISpaceAndPercent() {
    // each row is { encoded input, expected decoded value }
    final String[][] cases = {
        // "%20" is a space
        {"hello%20world", "hello world"},
        // "%25" is a percent sign
        {"99%25", "99%"},
        // "%252F" is the encoding of the literal name "%2F" and must not be decoded twice
        {"%252F", "%2F"},
        // two consecutive escaped percent signs
        {"99%25%25100", "99%%100"},
    };
    for (final String[] inputAndExpected : cases) {
        assertEquals(inputAndExpected[1], URIUtils.decodeForURI(inputAndExpected[0]));
    }
}

/**
 * Checks that a string made only of letters, digits and "-._~" is returned unchanged.
 */
@Test
void decodeForURIUnreservedUnchanged() {
    final String decoded = URIUtils.decodeForURI("ABCabc019-._~");
    assertEquals("ABCabc019-._~", decoded);
}

/**
 * Checks that runs of escapes are reassembled into 2-, 3- and 4-byte UTF-8 characters.
 */
@Test
void decodeForURIUtf8() {
    // yen sign: two UTF-8 bytes
    final String twoBytes = URIUtils.decodeForURI("%C2%A5");
    assertEquals("\u00A5", twoBytes);

    // samaritan letter tsasdiy: three UTF-8 bytes
    final String threeBytes = URIUtils.decodeForURI("%E0%A0%91");
    assertEquals("\u0811", threeBytes);

    // phoenician letter het: four UTF-8 bytes, a surrogate pair in Java
    final String fourBytes = URIUtils.decodeForURI("%F0%90%A4%87");
    assertEquals("\uD802\uDD07", fourBytes);
}

/**
 * Checks that decodeForURI undoes encodeForURI for a sample of names covering unreserved
 * characters, reserved and special characters, and non-ASCII text. The xmldb URI functions
 * depend on this round-trip.
 */
@Test
void decodeForURIRoundTripsEncodeForURI() {
    final String[] names = {
        "plain", "dash-case", "file.ext", "snake_case", "~home",
        "hello world", "1+2", "99%", "%2F", "99%%100",
        "a:b", "x/y", "Goodbye?", "#comment", "[predicate", "predicate]", "adam@work",
        "Hello!", "$100", "Jack&Jill", "it's", "(comment", "comment)", "1*2", "x,y", "a;b", "n=1",
        "caf\u00E9", "\u041F\u0440\u0438\u0432\u0435\u0442", "\u6587\u66F8", "\u00A5", "\u0811", "\uD802\uDD07"
    };
    for (int idx = 0; idx < names.length; idx++) {
        final String original = names[idx];
        final String encoded = URIUtils.encodeForURI(original);
        final String decoded = URIUtils.decodeForURI(encoded);
        assertEquals(original, decoded, "encode/decode round-trip failed for: " + original);
    }
}

/**
 * Checks that decodeForURI neither throws nor drops characters for input that was not
 * produced by encodeForURI, since xmldb:decode and xmldb:decode-uri accept arbitrary user
 * strings. Every case is one that {@code new java.net.URI(s).getPath()} is stated to
 * mishandle, either by throwing URISyntaxException or by cutting the path at '?' or '#'.
 */
@Test
void decodeForURIRobustOnMalformedAndReservedInput() {
    // each row is { input, expected decoded value }
    final String[][] cases = {
        // trailing '%' with nothing after it stays as-is
        {"100%", "100%"},
        // escape cut short after one hex digit stays as-is
        {"a%2", "a%2"},
        // '%' followed by non-hex characters stays as-is
        {"a%ZZb", "a%ZZb"},
        // an unencoded space is an ordinary character
        {"a b", "a b"},
        // '?' and '#' do not start a query or fragment here
        {"a?b", "a?b"},
        {"a#b", "a#b"},
        // braces are ordinary characters
        {"a{b}c", "a{b}c"},
        // a valid escape is still decoded when followed by such characters
        {"a%20b?c", "a b?c"},
    };
    for (final String[] inputAndExpected : cases) {
        assertEquals(inputAndExpected[1], URIUtils.decodeForURI(inputAndExpected[0]));
    }
}
}
62 changes: 62 additions & 0 deletions exist-core/src/test/xquery/xmldb/uri-encoding-tests.xql
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
(:
: eXist-db Open Source Native XML Database
: Copyright (C) 2001 The eXist-db Authors
:
: info@exist-db.org
: http://www.exist-db.org
:
: This library is free software; you can redistribute it and/or
: modify it under the terms of the GNU Lesser General Public
: License as published by the Free Software Foundation; either
: version 2.1 of the License, or (at your option) any later version.
:
: This library is distributed in the hope that it will be useful,
: but WITHOUT ANY WARRANTY; without even the implied warranty of
: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
: Lesser General Public License for more details.
:
: You should have received a copy of the GNU Lesser General Public
: License along with this library; if not, write to the Free Software
: Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
:)
xquery version "3.1";

module namespace t="http://exist-db.org/testsuite/xmldb-uri-encoding";

declare namespace test="http://exist-db.org/xquery/xqsuite";

(: ~
: Regression tests for xmldb:decode / xmldb:decode-uri percent-decoding.
: eXist-db/exist#1824 and #44: a '+' must be decoded as a literal plus sign (RFC 3986),
: not turned into a space (which is application/x-www-form-urlencoded behavior).
:)

(: "%2B" is the percent-encoded form of '+' and must decode to a literal plus sign. :)
declare
    %test:assertEquals("a+b")
function t:decode-uri-plus-is-literal-when-encoded() {
    let $encoded := xs:anyURI("a%2Bb")
    return
        xmldb:decode-uri($encoded)
};

(: An unescaped '+' must pass through unchanged instead of becoming a space. :)
declare
    %test:assertEquals("a+b")
function t:decode-uri-bare-plus-is-literal() {
    let $unescaped := xs:anyURI("a+b")
    return
        xmldb:decode-uri($unescaped)
};

(: Same check as for xmldb:decode-uri, but through xmldb:decode with a plain string. :)
declare
    %test:assertEquals("a+b")
function t:decode-plus-is-literal-when-encoded() {
    let $encoded := "a%2Bb"
    return
        xmldb:decode($encoded)
};

(: "%20" is the percent-encoded form of a space. :)
declare
    %test:assertEquals("a b")
function t:decode-uri-percent-20-is-space() {
    let $encoded := xs:anyURI("a%20b")
    return
        xmldb:decode-uri($encoded)
};

(: A file name mixing escaped spaces, parentheses and a plus sign with unescaped text. :)
declare
    %test:assertEquals("My Report (2024)+final.xml")
function t:decode-uri-mixed() {
    let $encoded := xs:anyURI("My%20Report%20%282024%29%2Bfinal.xml")
    return
        xmldb:decode-uri($encoded)
};
Loading