Skip to content

Commit d0d1fbb

Browse files
joewizclaude
andcommitted
[bugfix] http-client: byte-safe multipart response parsing with nesting
A multipart response was split with String.split("--" + boundary) over a UTF-8 decode of the body, which corrupts any binary part (a non-UTF-8 byte sequence does not survive the String round-trip), and a nested multipart part was not parsed recursively. splitMultipart, extractPartContentType, and extractPartBody now operate on the raw bytes: the body is split at the boundary using a byte-level search, each part's header region is decoded as ISO-8859-1 (a total byte->char map) only to read the Content-Type, and the part body is sliced as bytes -- so a binary part round-trips byte-for-byte. addBody now recurses into a part whose media type is multipart/*, so nested multipart responses surface their leaf parts. SendRequestFunctionTest gains multipartBinaryPartIsByteSafe (a 0xFF 0xFE part round-trips as base64Binary "//4=") and multipartNestedIsParsedRecursively (outer[ multipart[A,B], C ] -> A,B,C). The two new endpoints are registered in a helper to keep startHttpServer's complexity unchanged. Per-part header reproduction in the http:multipart descriptor remains a follow-up. Part of #6512 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 4f238d8 commit d0d1fbb

2 files changed

Lines changed: 165 additions & 26 deletions

File tree

extensions/modules/http-client/src/main/java/org/exist/xquery/modules/httpclient/ResponseHandler.java

Lines changed: 90 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import java.net.http.HttpResponse;
4747
import java.nio.charset.Charset;
4848
import java.nio.charset.StandardCharsets;
49+
import java.util.Arrays;
4950
import java.util.List;
5051
import java.util.Map;
5152

@@ -184,7 +185,10 @@ private static DocumentImpl buildResponseElement(final HttpResponse<byte[]> resp
184185

185186
private static void addBody(final ValueSequence result, final byte[] bodyBytes,
186187
final String contentType, final BasicFunction callerFunc) throws XPathException {
187-
if (ContentTypeHelper.isXml(contentType)) {
188+
if (ContentTypeHelper.extractMediaType(contentType).startsWith("multipart/")) {
189+
// a nested multipart part: recurse so its sub-parts are typed individually
190+
addMultipartBodies(result, bodyBytes, contentType, callerFunc);
191+
} else if (ContentTypeHelper.isXml(contentType)) {
188192
result.add(parseXml(bodyBytes, callerFunc));
189193
} else if (ContentTypeHelper.isHtml(contentType)) {
190194
result.add(parseHtml(bodyBytes, contentType, callerFunc));
@@ -270,48 +274,108 @@ private static String extractBoundary(final String contentType) {
270274
return boundary.trim();
271275
}
272276

277+
/**
278+
* Splits a multipart body into its parts, operating on the raw bytes so a binary part is not
279+
* corrupted (a String round-trip through a charset would mangle non-text bytes). The CRLF that
280+
* terminates each delimiter line, and the CRLF that precedes the next delimiter, belong to the
281+
* boundary and are stripped; the preamble, the closing {@code --boundary--}, and the epilogue are
282+
* discarded.
283+
*/
273284
private static byte[][] splitMultipart(final byte[] body, final String boundary) {
274-
final String bodyStr = new String(body, StandardCharsets.UTF_8);
275-
final String delimiter = "--" + boundary;
276-
final String[] rawParts = bodyStr.split(delimiter);
285+
final byte[] delim = ("--" + boundary).getBytes(StandardCharsets.US_ASCII);
277286
final List<byte[]> parts = new java.util.ArrayList<>();
278-
for (final String part : rawParts) {
279-
final String trimmed = part.trim();
280-
if (trimmed.isEmpty() || trimmed.equals("--")) {
281-
continue; // Skip preamble and closing delimiter
287+
int pos = indexOf(body, delim, 0);
288+
// a delimiter followed by "--" is the closing delimiter and ends the multipart
289+
while (pos >= 0 && !isClosingDelimiter(body, pos + delim.length)) {
290+
// the CRLF ending the delimiter line, and the CRLF before the next delimiter, belong
291+
// to the boundary, not the part
292+
final int partStart = skipLineEnding(body, pos + delim.length);
293+
final int nextDelim = indexOf(body, delim, partStart);
294+
final int partEnd = trimTrailingLineEnding(body, nextDelim < 0 ? body.length : nextDelim, partStart);
295+
if (partEnd > partStart) {
296+
parts.add(Arrays.copyOfRange(body, partStart, partEnd));
282297
}
283-
parts.add(part.getBytes(StandardCharsets.UTF_8));
298+
pos = nextDelim;
284299
}
285300
return parts.toArray(new byte[0][]);
286301
}
287302

303+
private static boolean isClosingDelimiter(final byte[] body, final int afterDelim) {
304+
return afterDelim + 1 < body.length && body[afterDelim] == '-' && body[afterDelim + 1] == '-';
305+
}
306+
307+
private static int skipLineEnding(final byte[] body, final int pos) {
308+
int p = pos;
309+
if (p < body.length && body[p] == '\r') {
310+
p++;
311+
}
312+
if (p < body.length && body[p] == '\n') {
313+
p++;
314+
}
315+
return p;
316+
}
317+
318+
private static int trimTrailingLineEnding(final byte[] body, final int end, final int floor) {
319+
int e = end;
320+
if (e > floor && body[e - 1] == '\n') {
321+
e--;
322+
}
323+
if (e > floor && body[e - 1] == '\r') {
324+
e--;
325+
}
326+
return e;
327+
}
328+
329+
/**
330+
* The index of the first body byte in a part (just past the blank line separating the part
331+
* headers from the part body), or 0 if the part has no header section.
332+
*/
333+
private static int partHeaderEnd(final byte[] part) {
334+
final int crlfcrlf = indexOf(part, new byte[]{'\r', '\n', '\r', '\n'}, 0);
335+
if (crlfcrlf >= 0) {
336+
return crlfcrlf + 4;
337+
}
338+
final int lflf = indexOf(part, new byte[]{'\n', '\n'}, 0);
339+
if (lflf >= 0) {
340+
return lflf + 2;
341+
}
342+
return 0;
343+
}
344+
288345
private static String extractPartContentType(final byte[] part) {
289-
final String partStr = new String(part, StandardCharsets.UTF_8);
290-
final String[] lines = partStr.split("\r?\n");
291-
for (final String line : lines) {
292-
if (line.toLowerCase().startsWith("content-type:")) {
346+
// Part headers are ASCII; decode only the header region (ISO-8859-1 is a total byte->char map)
347+
final String headers = new String(part, 0, partHeaderEnd(part), StandardCharsets.ISO_8859_1);
348+
for (final String line : headers.split("\r?\n")) {
349+
if (line.regionMatches(true, 0, "content-type:", 0, 13)) {
293350
return line.substring(13).trim();
294351
}
295352
}
296353
return "text/plain";
297354
}
298355

299356
private static byte[] extractPartBody(final byte[] part) {
300-
final String partStr = new String(part, StandardCharsets.UTF_8);
301-
// Body starts after the first blank line (double CRLF or double LF)
302-
int bodyStart = partStr.indexOf("\r\n\r\n");
303-
if (bodyStart >= 0) {
304-
bodyStart += 4;
305-
} else {
306-
bodyStart = partStr.indexOf("\n\n");
307-
if (bodyStart >= 0) {
308-
bodyStart += 2;
309-
} else {
310-
return new byte[0];
357+
return Arrays.copyOfRange(part, partHeaderEnd(part), part.length);
358+
}
359+
360+
/**
361+
* Finds the first occurrence of {@code target} in {@code data} at or after {@code from}.
362+
*/
363+
private static int indexOf(final byte[] data, final byte[] target, final int from) {
364+
for (int i = Math.max(from, 0); i <= data.length - target.length; i++) {
365+
if (matchesAt(data, target, i)) {
366+
return i;
367+
}
368+
}
369+
return -1;
370+
}
371+
372+
private static boolean matchesAt(final byte[] data, final byte[] target, final int offset) {
373+
for (int j = 0; j < target.length; j++) {
374+
if (data[offset + j] != target[j]) {
375+
return false;
311376
}
312377
}
313-
final String bodyStr = partStr.substring(bodyStart);
314-
return bodyStr.getBytes(StandardCharsets.UTF_8);
378+
return true;
315379
}
316380

317381
/**

extensions/modules/http-client/src/test/java/org/exist/xquery/modules/httpclient/SendRequestFunctionTest.java

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,9 @@ public static void startHttpServer() throws IOException {
269269
}
270270
});
271271

272+
// Multipart byte-safety + nesting endpoints (extracted to keep startHttpServer's complexity down)
273+
registerMultipartTestEndpoints(httpServer);
274+
272275
// OPTIONS endpoint
273276
httpServer.createContext("/options", exchange -> {
274277
exchange.getResponseHeaders().set("Allow", "GET, POST, OPTIONS");
@@ -299,6 +302,45 @@ public static void stopHttpServer() {
299302
}
300303
}
301304

305+
/**
306+
* Registers the multipart response endpoints used by the byte-safety and nesting tests, kept out
307+
* of startHttpServer so that method's complexity is not inflated.
308+
*/
309+
private static void registerMultipartTestEndpoints(final HttpServer server) {
310+
// Multipart response with a binary part (bytes 0xFF 0xFE, base64 "//4=") that is not valid
311+
// UTF-8 — corrupted by the old String-based splitter
312+
server.createContext("/multipart-binary", exchange -> {
313+
final String boundary = "bnd";
314+
final java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream();
315+
baos.writeBytes(("--" + boundary + "\r\nContent-Type: application/octet-stream\r\n\r\n")
316+
.getBytes(StandardCharsets.US_ASCII));
317+
baos.writeBytes(new byte[]{(byte) 0xFF, (byte) 0xFE});
318+
baos.writeBytes(("\r\n--" + boundary + "\r\nContent-Type: text/plain\r\n\r\nhello\r\n--"
319+
+ boundary + "--\r\n").getBytes(StandardCharsets.US_ASCII));
320+
final byte[] bodyBytes = baos.toByteArray();
321+
exchange.getResponseHeaders().set("Content-Type", "multipart/mixed; boundary=" + boundary);
322+
exchange.sendResponseHeaders(200, bodyBytes.length);
323+
try (OutputStream os = exchange.getResponseBody()) {
324+
os.write(bodyBytes);
325+
}
326+
});
327+
328+
// Nested multipart: outer multipart/mixed whose first part is itself a multipart/mixed
329+
server.createContext("/multipart-nested", exchange -> {
330+
final String inner = "--inner\r\nContent-Type: text/plain\r\n\r\nA\r\n"
331+
+ "--inner\r\nContent-Type: text/plain\r\n\r\nB\r\n--inner--\r\n";
332+
final String body = "--outer\r\nContent-Type: multipart/mixed; boundary=inner\r\n\r\n"
333+
+ inner + "\r\n"
334+
+ "--outer\r\nContent-Type: text/plain\r\n\r\nC\r\n--outer--\r\n";
335+
exchange.getResponseHeaders().set("Content-Type", "multipart/mixed; boundary=outer");
336+
final byte[] bodyBytes = body.getBytes(StandardCharsets.UTF_8);
337+
exchange.sendResponseHeaders(200, bodyBytes.length);
338+
try (OutputStream os = exchange.getResponseBody()) {
339+
os.write(bodyBytes);
340+
}
341+
});
342+
}
343+
302344
private static void sendResponse(HttpExchange exchange, int status, String contentType,
303345
String body) throws IOException {
304346
byte[] bodyBytes = body.getBytes(StandardCharsets.UTF_8);
@@ -1410,4 +1452,37 @@ public void multipartResponseContainsMultibodyElements() throws XMLDBException {
14101452
assertEquals("Multipart response element should contain http:multipart",
14111453
"true", result.getResource(0).getContent().toString());
14121454
}
1455+
1456+
/**
1457+
* A binary multipart part is returned byte-for-byte (base64Binary), not corrupted by a UTF-8
1458+
* String round-trip. The first part is the two bytes 0xFF 0xFE (base64 "//4="); the second is text.
1459+
*/
1460+
@Test
1461+
public void multipartBinaryPartIsByteSafe() throws XMLDBException {
1462+
final ResourceSet result = existEmbeddedServer.executeQuery(
1463+
HTTP_NS +
1464+
"let $r := http:send-request(\n" +
1465+
" <http:request method='GET' href='" + baseUrl() + "/multipart-binary'/>)\n" +
1466+
"return $r[2] instance of xs:base64Binary\n" +
1467+
" and string($r[2]) = '//4='\n" +
1468+
" and $r[3] = 'hello'");
1469+
assertEquals("binary multipart part should round-trip byte-for-byte",
1470+
"true", result.getResource(0).getContent().toString());
1471+
}
1472+
1473+
/**
1474+
* A nested multipart part is parsed recursively, so its sub-parts surface as individual items:
1475+
* outer = [ multipart[ "A", "B" ], "C" ] yields the three leaf text items A, B, C.
1476+
*/
1477+
@Test
1478+
public void multipartNestedIsParsedRecursively() throws XMLDBException {
1479+
final ResourceSet result = existEmbeddedServer.executeQuery(
1480+
HTTP_NS +
1481+
"let $r := http:send-request(\n" +
1482+
" <http:request method='GET' href='" + baseUrl() + "/multipart-nested'/>)\n" +
1483+
"let $bodies := $r[position() gt 1]\n" +
1484+
"return string-join($bodies, ',')");
1485+
assertEquals("nested multipart should yield the leaf parts A,B,C",
1486+
"A,B,C", result.getResource(0).getContent().toString());
1487+
}
14131488
}

0 commit comments

Comments
 (0)