spiceai · lukekim · May 14, 2026 · May 14, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spiceio"
-version = "0.5.2"
+version = "0.5.3"
 edition = "2024"
 description = "S3-compatible API proxy to SMB file shares"
 license = "Apache-2.0"

diff --git a/benches/protocol_bench.rs b/benches/protocol_bench.rs
@@ -242,6 +242,42 @@ fn bench_pipelined_read_decode(c: &mut Criterion) {
     group.finish();
 }
 
+/// Bench the zero-copy `decode_read_response_from_msg` path used after the
+/// pipelined-read optimization, without the per-response body `to_vec()` that
+/// `bench_pipelined_read_decode` pays. Owned messages are cloned in the untimed
+/// `iter_batched` setup so the timed routine does no per-message memcpy.
+fn bench_pipelined_read_decode_zerocopy(c: &mut Criterion) {
+    let mut group = c.benchmark_group("pipelined_read_decode_zerocopy");
+    let cases = [(8usize, 65536usize), (64, 65536), (64, 8192)];
+    for (depth, chunk_size) in cases {
+        let base_msg_id = 1_000u64;
+        let messages: Vec<Vec<u8>> = (0..depth)
+            .map(|i| build_read_response_msg(base_msg_id + i as u64, chunk_size))
+            .collect();
+        group.throughput(criterion::Throughput::Bytes((depth * chunk_size) as u64));
+        group.bench_with_input(
+            criterion::BenchmarkId::from_parameter(format!("d{depth}_c{chunk_size}")),
+            &messages,
+            |b, messages| {
+                b.iter_batched(
+                    || messages.clone(),
+                    |owned| {
+                        let mut slots: Vec<Option<bytes::Bytes>> = vec![None; owned.len()];
+                        for msg in owned {
+                            let id = Header::decode(black_box(&msg)).unwrap().message_id;
+                            let slot = id.wrapping_sub(base_msg_id) as usize;
+                            slots[slot] = decode_read_response_from_msg(msg);
+                        }
+                        slots
+                    },
+                    criterion::BatchSize::LargeInput,
+                );
+            },
+        );
+    }
+    group.finish();
+}
+
 /// Bench the CPU-bound per-batch work of `pipelined_write`: header construction
 /// (with credit charge), `encode_write_request`, and `build_request` framing.
 /// This is the inner loop of WAL pipelined writes before any I/O happens.
@@ -279,6 +315,47 @@ fn bench_pipelined_write_encode(c: &mut Criterion) {
     group.finish();
 }
 
+/// Coalesced counterpart of `bench_pipelined_write_encode`: every packet is
+/// framed straight into one shared `BytesMut`, the way `pipelined_write` does
+/// post-optimization. Captures the win from eliminating per-packet
+/// allocations and from using a single contiguous buffer.
+fn bench_pipelined_write_encode_coalesced(c: &mut Criterion) {
+    use bytes::BufMut;
+    const WRITE_REQUEST_FIXED: usize = 48;
+    let file_id = [1u8; 16];
+    let mut group = c.benchmark_group("pipelined_write_encode_coalesced");
+    for (depth, chunk_size) in [(8usize, 65536usize), (64, 65536), (64, 1024 * 1024)] {
+        let payload = vec![0u8; chunk_size];
+        group.throughput(criterion::Throughput::Bytes((depth * chunk_size) as u64));
+        group.bench_with_input(
+            criterion::BenchmarkId::from_parameter(format!("d{depth}_c{chunk_size}")),
+            &payload,
+            |b, chunk| {
+                b.iter(|| {
+                    // Every packet in the batch has the same size: header,
+                    // fixed write-request part, then the payload itself.
+                    let packet_len = SMB2_HEADER_SIZE + WRITE_REQUEST_FIXED + chunk.len();
+                    let mut buf = BytesMut::with_capacity(depth * (4 + packet_len));
+                    let mut offset = 0u64;
+                    for msg_id in 0..depth {
+                        let mut hdr = Header::new(Command::Write, msg_id as u64)
+                            .with_credit_charge(chunk.len() as u32);
+                        hdr.tree_id = 42;
+                        hdr.session_id = 0xdead_beef;
+                        // 4-byte prefix: the packet length masked to its low 24 bits.
+                        buf.put_u32((packet_len as u32) & 0x00FF_FFFF);
+                        hdr.encode(&mut buf);
+                        encode_write_request(&mut buf, &file_id, offset, black_box(chunk));
+                        offset += chunk.len() as u64;
+                    }
+                    buf
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
 fn bench_parse_directory_entries(c: &mut Criterion) {
     // Build 50 entries
     let mut data = Vec::new();
@@ -321,7 +398,9 @@ criterion_group!(
     bench_build_request,
     bench_parse_compound_response,
     bench_pipelined_read_decode,
+    bench_pipelined_read_decode_zerocopy,
     bench_pipelined_write_encode,
+    bench_pipelined_write_encode_coalesced,
     bench_parse_directory_entries,
 );
 criterion_main!(benches);
diff --git a/scripts/bench-live.sh b/scripts/bench-live.sh
@@ -5,7 +5,15 @@ set -euo pipefail
 #
 # Usage: SPICEIO_SMB_USER=user SPICEIO_SMB_PASS=pass ./scripts/bench-live.sh
 #
-# Runs write and read throughput tests at various file sizes.
+# Runs write and read throughput tests at various file sizes, plus
+# concurrent multi-stream tests intended to saturate a 10G link.
+#
+# Environment knobs:
+#   BENCH_CONCURRENCY    parallel streams in the concurrent tests (default 8)
+#   BENCH_MOUNT_BASELINE 1 to also benchmark a raw mount_smbfs mount of the
+#                          same share — gives a hard ceiling on what the link
+#                          can do, so we can see spiceio's translation overhead
+#
 # Requires: aws cli, dd, curl, bc, perl (Time::HiRes).
 
 SMB_SERVER="${SPICEIO_SMB_SERVER:-192.168.3.148}"
@@ -15,6 +23,8 @@ SMB_DOMAIN="${SPICEIO_SMB_DOMAIN:-}"
 REGION="${SPICEIO_REGION:-us-east-1}"
 BUCKET="${SPICEIO_BUCKET:-bench}"
 BIND="${SPICEIO_BIND:-127.0.0.1:18334}"
+CONCURRENCY="${BENCH_CONCURRENCY:-8}"
+MOUNT_BASELINE="${BENCH_MOUNT_BASELINE:-0}"
 
 : "${SPICEIO_SMB_USER:?SPICEIO_SMB_USER is required}"
 : "${SPICEIO_SMB_PASS:?SPICEIO_SMB_PASS is required}"
@@ -32,6 +42,7 @@ fi
 
 # ── Cleanup ─────────────────────────────────────────────────────────────
 SPICEIO_PID=""
+MOUNT_POINT=""
 cleanup() {
     echo ""
     echo "[bench] cleaning up..."
@@ -40,6 +51,10 @@ cleanup() {
         kill "$SPICEIO_PID" 2>/dev/null || true
         wait "$SPICEIO_PID" 2>/dev/null || true
     fi
+    if [[ -n "$MOUNT_POINT" && -d "$MOUNT_POINT" ]]; then
+        umount "$MOUNT_POINT" 2>/dev/null || true
+        rmdir "$MOUNT_POINT" 2>/dev/null || true
+    fi
     rm -f /tmp/spiceio-bench-*
 }
 trap cleanup EXIT
@@ -122,6 +137,114 @@ bench_multi_write() {
     rm -f "$file"
 }
 
+# Concurrent single-file PUT: N parallel uploads of `size_bytes` each.
+# Reports the aggregate rate in MiB/s and in decimal Gbit/s (10^9 bit/s), the
+# unit a 10G link is rated in — this is the test that exercises the NAS pipe.
+bench_concurrent_write() {
+    local concurrency=$1 size_bytes=$2 label=$3
+    local total=$((concurrency * size_bytes))
+    local file="/tmp/spiceio-bench-cwrite-${label}"
+    gen_file "$file" "$size_bytes"
+
+    local start end elapsed mbps gbps
+    start=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
+    local pids=()
+    for i in $(seq 1 "$concurrency"); do
+        $AWS s3 cp "$file" "s3://${BUCKET}/${PREFIX}/cw-${label}-${i}" --quiet 2>/dev/null &
+        pids+=($!)
+    done
+    for pid in "${pids[@]}"; do
+        wait "$pid"
+    done
+    end=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
+    elapsed=$(echo "$end - $start" | bc -l)
+    mbps=$(echo "$total / $elapsed / 1048576" | bc -l)
+    gbps=$(echo "$total * 8 / $elapsed / 1000000000" | bc -l)
+    printf "  PUT x%-3d %-5s  %6.2fs  %7.1f MiB/s  (%.2f Gbit/s)\n" \
+        "$concurrency" "$label" "$elapsed" "$mbps" "$gbps"
+    rm -f "$file"
+}
+
+# Concurrent GET: N parallel downloads of the cw-<label>-<i> objects that
+# bench_concurrent_write uploaded for the same label; reports the aggregate rate.
+bench_concurrent_read() {
+    local concurrency=$1 size_bytes=$2 label=$3
+    local total=$((concurrency * size_bytes))
+    local start end elapsed mbps gbps
+    start=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
+    local pids=()
+    for i in $(seq 1 "$concurrency"); do
+        $AWS s3 cp "s3://${BUCKET}/${PREFIX}/cw-${label}-${i}" "/tmp/spiceio-bench-cread-${label}-${i}" \
+            --quiet 2>/dev/null &
+        pids+=($!)
+    done
+    for pid in "${pids[@]}"; do wait "$pid"; done
+    end=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
+    elapsed=$(echo "$end - $start" | bc -l)
+    mbps=$(echo "$total / $elapsed / 1048576" | bc -l)
+    # Decimal Gbit/s (10^9 bit/s) so the figure is comparable to a 10G link rating.
+    gbps=$(echo "$total * 8 / $elapsed / 1000000000" | bc -l)
+    printf "  GET x%-3d %-5s  %6.2fs  %7.1f MiB/s  (%.2f Gbit/s)\n" \
+        "$concurrency" "$label" "$elapsed" "$mbps" "$gbps"
+    rm -f /tmp/spiceio-bench-cread-${label}-*
+}
+
+# Optional raw-SMB baseline via mount_smbfs. Mounts the same share locally
+# and times plain `cp` writes/reads of 100M and 500M files through it. Gives
+# a reference ceiling for the link, so spiceio's translation overhead can be
+# attributed. Skipped with a notice when the mount fails.
+bench_mount_baseline() {
+    local user="$SPICEIO_SMB_USER" pass="$SPICEIO_SMB_PASS"
+    local server="$SMB_SERVER" share="$SMB_SHARE"
+
+    MOUNT_POINT="/tmp/spiceio-bench-mount-$$"
+    mkdir -p "$MOUNT_POINT"
+    # Percent-encode the password with core Perl only: URI::Escape is not a
+    # core module, and a failed substitution would abort the run under `set -e`.
+    local escaped_pass
+    escaped_pass=$(printf '%s' "$pass" | perl -pe 's/([^A-Za-z0-9_.~-])/sprintf("%%%02X", ord($1))/ge')
+    if ! mount_smbfs -N "//${user}:${escaped_pass}@${server}/${share}" "$MOUNT_POINT" 2>/dev/null; then
+        echo "  (mount_smbfs failed — skipping baseline)"
+        rmdir "$MOUNT_POINT" 2>/dev/null || true
+        MOUNT_POINT=""
+        return
+    fi
+
+    local target="${MOUNT_POINT}/${PREFIX}-mount-baseline"
+    mkdir -p "$target"
+    local idx label sizes labels start end elapsed mbps
+    sizes=(104857600 524288000)
+    labels=("100M" "500M")
+    for idx in "${!sizes[@]}"; do
+        local size_bytes=${sizes[$idx]}
+        label=${labels[$idx]}
+        local file="/tmp/spiceio-bench-mountin-${label}"
+        gen_file "$file" "$size_bytes"
+
+        start=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
+        cp "$file" "${target}/${label}"
+        end=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
+        elapsed=$(echo "$end - $start" | bc -l)
+        mbps=$(echo "$size_bytes / $elapsed / 1048576" | bc -l)
+        printf "  PUT mount  %-5s  %6.2fs  %7.1f MiB/s\n" "$label" "$elapsed" "$mbps"
+
+        start=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
+        cp "${target}/${label}" "${file}.out"
+        end=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
+        elapsed=$(echo "$end - $start" | bc -l)
+        mbps=$(echo "$size_bytes / $elapsed / 1048576" | bc -l)
+        printf "  GET mount  %-5s  %6.2fs  %7.1f MiB/s\n" "$label" "$elapsed" "$mbps"
+        rm -f "$file" "${file}.out"
+    done
+
+    rm -rf "$target" 2>/dev/null || true
+    # Clear MOUNT_POINT only after a successful unmount so the EXIT trap retries otherwise.
+    if umount "$MOUNT_POINT" 2>/dev/null; then
+        rmdir "$MOUNT_POINT" 2>/dev/null || true
+        MOUNT_POINT=""
+    fi
+}
+
 # ── Run benchmarks ──────────────────────────────────────────────────────
 echo ""
 echo "═══════════════════════════════════════════════════════════════"
@@ -156,11 +279,24 @@ bench_multi_write 100  1048576   "1M"
 bench_multi_write  20 10485760   "10M"
 bench_multi_write  10 52428800   "50M"
 
-# Total: 1685 (write) + 1685 (read) + 800 (multi-write) = 4170 MiB transferred
+# Concurrent multi-stream tests. A single upload tops out at one TCP
+# connection's worth of pipe; the aggregate of several concurrent uploads
+# is what actually saturates a 10G link.
+echo ""
+echo "── Concurrent write throughput (x${CONCURRENCY} parallel) ──"
+bench_concurrent_write "$CONCURRENCY" 104857600  "100M"
+bench_concurrent_write "$CONCURRENCY" 524288000  "500M"
+
 echo ""
-echo "── Aggregate ──"
-echo "  Total written: 2485 MiB  (single-file + multi-file)"
-echo "  Total read:    1685 MiB"
-echo "  Total I/O:     4170 MiB"
+echo "── Concurrent read throughput (x${CONCURRENCY} parallel) ──"
+bench_concurrent_read "$CONCURRENCY" 104857600  "100M"
+bench_concurrent_read "$CONCURRENCY" 524288000  "500M"
+
+if [[ "$MOUNT_BASELINE" == "1" ]]; then
+    echo ""
+    echo "── Raw mount_smbfs baseline (link ceiling) ──"
+    bench_mount_baseline
+fi
+
 echo ""
 echo "═══════════════════════════════════════════════════════════════"
diff --git a/src/s3/router.rs b/src/s3/router.rs
@@ -591,8 +591,15 @@ async fn handle_get_object(
 
     let content_length = end - start + 1;
 
-    // Build response with streaming body
-    let (body, tx) = SpiceioBody::channel(4);
+    // Build response with streaming body.
+    //
+    // Channel capacity is sized to match the SMB pipeline depth so a full
+    // batch of reads can dump into the channel without blocking the producer.
+    // That lets the SMB-reading task immediately issue the next pipelined
+    // batch (incurring its round-trip) while the HTTP-sending task drains
+    // the previous batch into the wire — back-to-back batches overlap, which
+    // is the difference between filling and starving the 10G link.
+    let (body, tx) = SpiceioBody::channel(crate::smb::ops::READ_PIPELINE_DEPTH);
     let chunk_size = handle.max_chunk;
 
     // Spawn background task to stream pipelined SMB reads into the channel.