Skip to content

Commit 5736b5f

Browse files
authored
Coalesced pipelined SMB I/O for higher 10G throughput (#22)
Reworks the SMB pipelined-read and pipelined-write paths to build all batch packets into one contiguous BytesMut, sign each in-place, and emit a single write_all per batch — eliminating 64 per-packet to_vec allocations and collapsing 64 write_all syscalls per batch into 1. Adds a zero-copy read response decoder that slices an owned Vec into Bytes without the prior body.to_vec() — saves ~4 MiB of memcpy per 64-deep batch at 64 KiB chunks. Sizes the GetObject streaming channel to READ_PIPELINE_DEPTH so a full pipeline batch can dump into the channel without blocking, letting back-to-back SMB batches overlap HTTP draining. Extends bench-live.sh with concurrent multi-stream PUT/GET (BENCH_CONCURRENCY) and an optional raw mount_smbfs baseline (BENCH_MOUNT_BASELINE) to quantify the spiceio translation overhead against the link ceiling. Adds matching protocol micro-benches. Microbench (pipelined_write_encode, d64 x 64 KiB): 154 us -> 49 us, ~3.1x faster on the CPU side, on top of the 64 -> 1 syscall reduction.
1 parent 21d4dac commit 5736b5f

8 files changed

Lines changed: 357 additions & 45 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spiceio"
3-
version = "0.5.2"
3+
version = "0.5.3"
44
edition = "2024"
55
description = "S3-compatible API proxy to SMB file shares"
66
license = "Apache-2.0"

benches/protocol_bench.rs

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,42 @@ fn bench_pipelined_read_decode(c: &mut Criterion) {
242242
group.finish();
243243
}
244244

245+
/// Bench the zero-copy `decode_read_response_from_msg` path used after the
246+
/// pipelined-read optimization. Compared to `bench_pipelined_read_decode` this
247+
/// avoids the per-response body `to_vec()` — for a 64-deep 64 KiB batch that's
248+
/// ~4 MiB of memcpy per batch eliminated.
249+
fn bench_pipelined_read_decode_zerocopy(c: &mut Criterion) {
250+
let mut group = c.benchmark_group("pipelined_read_decode_zerocopy");
251+
let cases = [(8usize, 65536usize), (64, 65536), (64, 8192)];
252+
for (depth, chunk_size) in cases {
253+
let base_msg_id = 1_000u64;
254+
let messages: Vec<Vec<u8>> = (0..depth)
255+
.map(|i| build_read_response_msg(base_msg_id + i as u64, chunk_size))
256+
.collect();
257+
group.throughput(criterion::Throughput::Bytes((depth * chunk_size) as u64));
258+
group.bench_with_input(
259+
criterion::BenchmarkId::from_parameter(format!("d{depth}_c{chunk_size}")),
260+
&messages,
261+
|b, messages| {
262+
b.iter(|| {
263+
let n = messages.len();
264+
let mut slots: Vec<Option<bytes::Bytes>> = (0..n).map(|_| None).collect();
265+
for msg in messages.iter() {
266+
let header = Header::decode(black_box(msg)).unwrap();
267+
let slot = header.message_id.wrapping_sub(base_msg_id) as usize;
268+
// Clone to simulate ownership transfer from the read
269+
// path — the production code reads directly into a
270+
// fresh Vec each response.
271+
slots[slot] = decode_read_response_from_msg(msg.clone());
272+
}
273+
slots
274+
});
275+
},
276+
);
277+
}
278+
group.finish();
279+
}
280+
245281
/// Bench the CPU-bound per-batch work of `pipelined_write`: header construction
246282
/// (with credit charge), `encode_write_request`, and `build_request` framing.
247283
/// This is the inner loop of WAL pipelined writes before any I/O happens.
@@ -279,6 +315,47 @@ fn bench_pipelined_write_encode(c: &mut Criterion) {
279315
group.finish();
280316
}
281317

318+
/// Bench the coalesced equivalent: build all packets directly into a single
319+
/// `BytesMut`, the way `pipelined_write` does post-optimization. Comparable
320+
/// to `bench_pipelined_write_encode` — captures the win from eliminating
321+
/// per-packet allocations and from a single contiguous buffer.
322+
fn bench_pipelined_write_encode_coalesced(c: &mut Criterion) {
323+
use bytes::BufMut;
324+
let mut group = c.benchmark_group("pipelined_write_encode_coalesced");
325+
let file_id = [1u8; 16];
326+
let cases = [(8usize, 65536usize), (64, 65536), (64, 1024 * 1024)];
327+
const WRITE_REQUEST_FIXED: usize = 48;
328+
for (depth, chunk_size) in cases {
329+
let chunk = vec![0u8; chunk_size];
330+
group.throughput(criterion::Throughput::Bytes((depth * chunk_size) as u64));
331+
group.bench_with_input(
332+
criterion::BenchmarkId::from_parameter(format!("d{depth}_c{chunk_size}")),
333+
&chunk,
334+
|b, chunk| {
335+
b.iter(|| {
336+
let total_bytes =
337+
depth * (4 + SMB2_HEADER_SIZE + WRITE_REQUEST_FIXED + chunk.len());
338+
let mut buf = BytesMut::with_capacity(total_bytes);
339+
let mut offset = 0u64;
340+
for i in 0..depth {
341+
let mut hdr = Header::new(Command::Write, i as u64)
342+
.with_credit_charge(chunk.len() as u32);
343+
hdr.tree_id = 42;
344+
hdr.session_id = 0xdead_beef;
345+
let packet_total = SMB2_HEADER_SIZE + WRITE_REQUEST_FIXED + chunk.len();
346+
buf.put_u32((packet_total as u32) & 0x00FF_FFFF);
347+
hdr.encode(&mut buf);
348+
encode_write_request(&mut buf, &file_id, offset, black_box(chunk));
349+
offset += chunk.len() as u64;
350+
}
351+
buf
352+
});
353+
},
354+
);
355+
}
356+
group.finish();
357+
}
358+
282359
fn bench_parse_directory_entries(c: &mut Criterion) {
283360
// Build 50 entries
284361
let mut data = Vec::new();
@@ -321,7 +398,9 @@ criterion_group!(
321398
bench_build_request,
322399
bench_parse_compound_response,
323400
bench_pipelined_read_decode,
401+
bench_pipelined_read_decode_zerocopy,
324402
bench_pipelined_write_encode,
403+
bench_pipelined_write_encode_coalesced,
325404
bench_parse_directory_entries,
326405
);
327406
criterion_main!(benches);

scripts/bench-live.sh

Lines changed: 142 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,15 @@ set -euo pipefail
55
#
66
# Usage: SPICEIO_SMB_USER=user SPICEIO_SMB_PASS=pass ./scripts/bench-live.sh
77
#
8-
# Runs write and read throughput tests at various file sizes.
8+
# Runs write and read throughput tests at various file sizes, plus
9+
# concurrent multi-stream tests intended to saturate a 10G link.
10+
#
11+
# Environment knobs:
12+
# BENCH_CONCURRENCY parallel streams in the concurrent tests (default 8)
13+
# BENCH_MOUNT_BASELINE 1 to also benchmark a raw mount_smbfs mount of the
14+
# same share — gives a hard ceiling on what the link
15+
# can do, so we can see spiceio's translation overhead
16+
#
917
# Requires: aws cli, dd, curl, bc, perl (Time::HiRes).
1018

1119
SMB_SERVER="${SPICEIO_SMB_SERVER:-192.168.3.148}"
@@ -15,6 +23,8 @@ SMB_DOMAIN="${SPICEIO_SMB_DOMAIN:-}"
1523
REGION="${SPICEIO_REGION:-us-east-1}"
1624
BUCKET="${SPICEIO_BUCKET:-bench}"
1725
BIND="${SPICEIO_BIND:-127.0.0.1:18334}"
26+
CONCURRENCY="${BENCH_CONCURRENCY:-8}"
27+
MOUNT_BASELINE="${BENCH_MOUNT_BASELINE:-0}"
1828

1929
: "${SPICEIO_SMB_USER:?SPICEIO_SMB_USER is required}"
2030
: "${SPICEIO_SMB_PASS:?SPICEIO_SMB_PASS is required}"
@@ -32,6 +42,7 @@ fi
3242

3343
# ── Cleanup ─────────────────────────────────────────────────────────────
3444
SPICEIO_PID=""
45+
MOUNT_POINT=""
3546
cleanup() {
3647
echo ""
3748
echo "[bench] cleaning up..."
@@ -40,6 +51,10 @@ cleanup() {
4051
kill "$SPICEIO_PID" 2>/dev/null || true
4152
wait "$SPICEIO_PID" 2>/dev/null || true
4253
fi
54+
if [[ -n "$MOUNT_POINT" && -d "$MOUNT_POINT" ]]; then
55+
umount "$MOUNT_POINT" 2>/dev/null || true
56+
rmdir "$MOUNT_POINT" 2>/dev/null || true
57+
fi
4358
rm -f /tmp/spiceio-bench-*
4459
}
4560
trap cleanup EXIT
@@ -122,6 +137,114 @@ bench_multi_write() {
122137
rm -f "$file"
123138
}
124139

140+
# Concurrent single-file PUT: N parallel uploads of `size_bytes`-each.
141+
# Aggregate throughput is what hits the link — this is the test that
142+
# meaningfully exercises a 10G NAS pipe.
143+
bench_concurrent_write() {
144+
local concurrency=$1 size_bytes=$2 label=$3
145+
local total=$((concurrency * size_bytes))
146+
local file="/tmp/spiceio-bench-cwrite-${label}"
147+
gen_file "$file" "$size_bytes"
148+
149+
local start end elapsed mbps
150+
start=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
151+
local pids=()
152+
for i in $(seq 1 "$concurrency"); do
153+
$AWS s3 cp "$file" "s3://${BUCKET}/${PREFIX}/cw-${label}-${i}" --quiet 2>/dev/null &
154+
pids+=($!)
155+
done
156+
for pid in "${pids[@]}"; do
157+
wait "$pid"
158+
done
159+
end=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
160+
elapsed=$(echo "$end - $start" | bc -l)
161+
mbps=$(echo "$total / $elapsed / 1048576" | bc -l)
162+
printf " PUT x%-3d %-5s %6.2fs %7.1f MiB/s (%.2f Gbit/s)\n" \
163+
"$concurrency" "$label" "$elapsed" "$mbps" \
164+
"$(echo "$mbps * 8 / 1024" | bc -l)"
165+
rm -f "$file"
166+
}
167+
168+
bench_concurrent_read() {
169+
local concurrency=$1 size_bytes=$2 label=$3
170+
local total=$((concurrency * size_bytes))
171+
172+
local start end elapsed mbps
173+
start=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
174+
local pids=()
175+
for i in $(seq 1 "$concurrency"); do
176+
$AWS s3 cp "s3://${BUCKET}/${PREFIX}/cw-${label}-${i}" "/tmp/spiceio-bench-cread-${label}-${i}" \
177+
--quiet 2>/dev/null &
178+
pids+=($!)
179+
done
180+
for pid in "${pids[@]}"; do
181+
wait "$pid"
182+
done
183+
end=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
184+
elapsed=$(echo "$end - $start" | bc -l)
185+
mbps=$(echo "$total / $elapsed / 1048576" | bc -l)
186+
printf " GET x%-3d %-5s %6.2fs %7.1f MiB/s (%.2f Gbit/s)\n" \
187+
"$concurrency" "$label" "$elapsed" "$mbps" \
188+
"$(echo "$mbps * 8 / 1024" | bc -l)"
189+
rm -f /tmp/spiceio-bench-cread-${label}-*
190+
}
191+
192+
# Optional raw-SMB baseline via mount_smbfs. Mounts the same share locally
193+
# and runs the same dd-based write/read tests. Establishes the hard
194+
# ceiling for what the link can do, so we can attribute spiceio's
195+
# translation overhead.
196+
bench_mount_baseline() {
197+
local user="$SPICEIO_SMB_USER"
198+
local pass="$SPICEIO_SMB_PASS"
199+
local server="$SMB_SERVER"
200+
local share="$SMB_SHARE"
201+
202+
MOUNT_POINT="/tmp/spiceio-bench-mount-$$"
203+
mkdir -p "$MOUNT_POINT"
204+
local escaped_pass
205+
escaped_pass=$(printf '%s' "$pass" | perl -MURI::Escape -ne 'print uri_escape($_)')
206+
if ! mount_smbfs -N "//${user}:${escaped_pass}@${server}/${share}" "$MOUNT_POINT" 2>/dev/null; then
207+
echo " (mount_smbfs failed — skipping baseline)"
208+
rmdir "$MOUNT_POINT" 2>/dev/null
209+
MOUNT_POINT=""
210+
return
211+
fi
212+
213+
local target="${MOUNT_POINT}/${PREFIX}-mount-baseline"
214+
mkdir -p "$target"
215+
216+
local label sizes labels
217+
sizes=(104857600 524288000)
218+
labels=("100M" "500M")
219+
for idx in "${!sizes[@]}"; do
220+
local size_bytes=${sizes[$idx]}
221+
label=${labels[$idx]}
222+
local file="/tmp/spiceio-bench-mountin-${label}"
223+
gen_file "$file" "$size_bytes"
224+
225+
local start end elapsed mbps
226+
start=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
227+
cp "$file" "${target}/${label}"
228+
end=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
229+
elapsed=$(echo "$end - $start" | bc -l)
230+
mbps=$(echo "$size_bytes / $elapsed / 1048576" | bc -l)
231+
printf " PUT mount %-5s %6.2fs %7.1f MiB/s\n" "$label" "$elapsed" "$mbps"
232+
233+
start=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
234+
cp "${target}/${label}" "${file}.out"
235+
end=$(perl -MTime::HiRes=time -e 'printf "%.6f\n", time')
236+
elapsed=$(echo "$end - $start" | bc -l)
237+
mbps=$(echo "$size_bytes / $elapsed / 1048576" | bc -l)
238+
printf " GET mount %-5s %6.2fs %7.1f MiB/s\n" "$label" "$elapsed" "$mbps"
239+
rm -f "$file" "${file}.out"
240+
done
241+
242+
rm -rf "$target" 2>/dev/null
243+
umount "$MOUNT_POINT" 2>/dev/null
244+
rmdir "$MOUNT_POINT" 2>/dev/null
245+
MOUNT_POINT=""
246+
}
247+
125248
# ── Run benchmarks ──────────────────────────────────────────────────────
126249
echo ""
127250
echo "═══════════════════════════════════════════════════════════════"
@@ -156,11 +279,24 @@ bench_multi_write 100 1048576 "1M"
156279
bench_multi_write 20 10485760 "10M"
157280
bench_multi_write 10 52428800 "50M"
158281

159-
# Total: 1685 (write) + 1685 (read) + 800 (multi-write) = 4170 MiB transferred
282+
# Concurrent single-stream tests. Single-stream uploads top out at one TCP
283+
# connection's worth of pipe; aggregate concurrent uploads is the test
284+
# that actually saturates a 10G link.
285+
echo ""
286+
echo "── Concurrent write throughput (x${CONCURRENCY} parallel) ──"
287+
bench_concurrent_write "$CONCURRENCY" 104857600 "100M"
288+
bench_concurrent_write "$CONCURRENCY" 524288000 "500M"
289+
160290
echo ""
161-
echo "── Aggregate ──"
162-
echo " Total written: 2485 MiB (single-file + multi-file)"
163-
echo " Total read: 1685 MiB"
164-
echo " Total I/O: 4170 MiB"
291+
echo "── Concurrent read throughput (x${CONCURRENCY} parallel) ──"
292+
bench_concurrent_read "$CONCURRENCY" 104857600 "100M"
293+
bench_concurrent_read "$CONCURRENCY" 524288000 "500M"
294+
295+
if [[ "$MOUNT_BASELINE" == "1" ]]; then
296+
echo ""
297+
echo "── Raw mount_smbfs baseline (link ceiling) ──"
298+
bench_mount_baseline
299+
fi
300+
165301
echo ""
166302
echo "═══════════════════════════════════════════════════════════════"

src/s3/router.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -591,8 +591,15 @@ async fn handle_get_object(
591591

592592
let content_length = end - start + 1;
593593

594-
// Build response with streaming body
595-
let (body, tx) = SpiceioBody::channel(4);
594+
// Build response with streaming body.
595+
//
596+
// Channel capacity is sized to match the SMB pipeline depth so a full
597+
// batch of reads can dump into the channel without blocking the producer.
598+
// That lets the SMB-reading task immediately issue the next pipelined
599+
// batch (incurring its round-trip) while the HTTP-sending task drains
600+
// the previous batch into the wire — back-to-back batches overlap, which
601+
// is the difference between filling and starving the 10G link.
602+
let (body, tx) = SpiceioBody::channel(crate::smb::ops::READ_PIPELINE_DEPTH);
596603
let chunk_size = handle.max_chunk;
597604

598605
// Spawn background task to stream pipelined SMB reads into the channel.

0 commit comments

Comments
 (0)