Skip to content

Commit a6bb5ff

Browse files
committed
Fix sharing-violation 500s + carry forward review fixes (v0.5.4)
The previous PR squash-merged with four Copilot review threads unaddressed (auto-merge fired before the fixes landed). Brings those forward, and adds the actual bug they were a symptom of — sharing violations leaking through io_to_s3_error's catch-all arm as 500 InternalError with noisy `error:` log lines. Sharing violations are an expected transient under concurrent writes (the SMB server refuses a file held by another handle). The typed io::ErrorKind::ResourceBusy mapping was already correct; the HTTP layer just had no arm for it. Now maps to 503 SlowDown — the standard retryable S3 status — and logs via slog! instead of serr!, so SDKs auto-retry instead of seeing a fatal 500 and CI stops painting these red. Adds an io_to_s3_error unit test covering all four kinds (NotFound, PermissionDenied, ResourceBusy, fallback) so this regression has a test gate next time. Adds a stderr guard to the three CI integration scripts (test-sccache, test-extended, stress-concurrent) that fails the run if spiceio emits `[spiceio] error:` lines. Expected transients log through slog! without the prefix, so any hit means a new failure mode is leaking through the 500 InternalError arm. Would have caught the original regression. Review fixes carried forward from edc1579: - decode_read_response_from_msg rejects data_offset < SMB2_HEADER_SIZE so a malformed server response cannot slice into the SMB2 header. Adds two unit tests. - GetObject streaming channel capped by an 8 MiB per-request memory budget on top of READ_PIPELINE_DEPTH; at 1 MiB chunks the channel falls from 64 to 8 instead of buffering up to 64 MiB. - bench-live.sh mount_smbfs baseline no longer embeds the SMB password in argv (leaked via shell history / ps) and no longer shells out to perl -MURI::Escape (not in minimal Perl installs). Uses macOS Keychain credentials; skips clearly if not configured.
1 parent 5736b5f commit a6bb5ff

8 files changed

Lines changed: 196 additions & 19 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spiceio"
3-
version = "0.5.3"
3+
version = "0.5.4"
44
edition = "2024"
55
description = "S3-compatible API proxy to SMB file shares"
66
license = "Apache-2.0"

scripts/bench-live.sh

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,18 +193,27 @@ bench_concurrent_read() {
193193
# and runs the same dd-based write/read tests. Establishes the hard
194194
# ceiling for what the link can do, so we can attribute spiceio's
195195
# translation overhead.
196+
#
197+
# The mount uses the macOS Keychain for credentials rather than embedding
198+
# the password in the mount URL — embedded passwords leak via shell
199+
# history and `ps` argv listings, and shelling out to `perl -MURI::Escape`
200+
# adds a non-portable dependency. To prep the keychain once:
201+
# mount_smbfs //user@server/share /tmp/mnt # interactive; offers
202+
# "Remember in keychain"
203+
# umount /tmp/mnt
204+
# If the keychain entry is missing, the baseline is skipped.
196205
bench_mount_baseline() {
197206
local user="$SPICEIO_SMB_USER"
198-
local pass="$SPICEIO_SMB_PASS"
199207
local server="$SMB_SERVER"
200208
local share="$SMB_SHARE"
201209

202210
MOUNT_POINT="/tmp/spiceio-bench-mount-$$"
203211
mkdir -p "$MOUNT_POINT"
204-
local escaped_pass
205-
escaped_pass=$(printf '%s' "$pass" | perl -MURI::Escape -ne 'print uri_escape($_)')
206-
if ! mount_smbfs -N "//${user}:${escaped_pass}@${server}/${share}" "$MOUNT_POINT" 2>/dev/null; then
207-
echo " (mount_smbfs failed — skipping baseline)"
212+
# -N: no interactive password prompt. mount_smbfs will pull credentials
213+
# from the macOS Keychain if available; we deliberately do not pass the
214+
# password in argv so it doesn't leak via process listings.
215+
if ! mount_smbfs -N "//${user}@${server}/${share}" "$MOUNT_POINT" 2>/dev/null; then
216+
echo " (mount_smbfs failed — set up keychain credentials first; skipping baseline)"
208217
rmdir "$MOUNT_POINT" 2>/dev/null
209218
MOUNT_POINT=""
210219
return

scripts/stress-concurrent.sh

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ FAIL=0
2828
CONCURRENCY="${SPICEIO_STRESS_CONCURRENCY:-16}"
2929
CURL_TIMEOUT="${SPICEIO_STRESS_TIMEOUT:-30}"
3030

31+
# Capture spiceio stderr for a post-run guard. We still tee to the script's
32+
# stderr so CI logs stream live; the file is only used to fail the build if
33+
# any unexpected `[spiceio] error:` lines slip through (e.g., something we
34+
# thought was an expected transient turns out to be logged as an error).
35+
SPICEIO_STDERR="${TMPDIR_BASE}/spiceio-stderr.log"
36+
3137
# ── Cleanup ────────────────────────────────────────────────────────────────
3238

3339
SPICEIO_PID=""
@@ -95,7 +101,7 @@ SPICEIO_SMB_DOMAIN="$SMB_DOMAIN" \
95101
SPICEIO_SMB_SHARE="$SMB_SHARE" \
96102
SPICEIO_BUCKET="$BUCKET" \
97103
SPICEIO_REGION="$REGION" \
98-
"$SPICEIO_BIN" &
104+
"$SPICEIO_BIN" 2> >(tee "$SPICEIO_STDERR" >&2) &
99105
SPICEIO_PID=$!
100106

101107
for i in $(seq 1 30); do
@@ -394,6 +400,28 @@ for i in $(seq 1 "$LARGE_N"); do
394400
done
395401
echo " integrity: $((PASS - PREV_PASS))/${LARGE_N} verified"
396402

403+
# ════════════════════════════════════════════════════════════════════════════
404+
# Stderr guard
405+
# ════════════════════════════════════════════════════════════════════════════
406+
#
407+
# Any `[spiceio] error:` lines indicate spiceio surfaced something at error
408+
# level — that should not happen during a clean stress run. Expected
409+
# transients (sharing violations, NotFound, etc.) are mapped to typed
410+
# `io::ErrorKind`s and logged via `slog!` (without the "error:" prefix).
411+
# A hit here means we regressed: a new failure mode is leaking through the
412+
# generic 500 InternalError arm of `io_to_s3_error`. Fail the build so
413+
# someone investigates instead of merging it silently again.
414+
415+
# Give the tee subshell a moment to flush after we killed spiceio in cleanup
416+
# (which runs on EXIT, so we have to flush sync here, before cleanup fires).
417+
sync 2>/dev/null || true
418+
if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then
419+
echo ""
420+
echo "FAIL: spiceio emitted unexpected error log lines during stress run:"
421+
grep '\[spiceio\] error:' "$SPICEIO_STDERR" | head -20
422+
FAIL=$((FAIL + 1))
423+
fi
424+
397425
# ════════════════════════════════════════════════════════════════════════════
398426
# Summary
399427
# ════════════════════════════════════════════════════════════════════════════

scripts/test-extended.sh

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ FAIL=0
3131
CONCURRENCY="${SPICEIO_EXT_CONCURRENCY:-8}"
3232
CURL_TIMEOUT="${SPICEIO_EXT_TIMEOUT:-60}"
3333

34+
# Capture spiceio stderr so the post-run guard can flag unexpected
35+
# `[spiceio] error:` lines. We tee back to stderr so CI still streams live.
36+
SPICEIO_STDERR="${TMPDIR_BASE}/spiceio-stderr.log"
37+
3438
# ── Cleanup ────────────────────────────────────────────────────────────────
3539

3640
SPICEIO_PID=""
@@ -113,7 +117,7 @@ SPICEIO_SMB_DOMAIN="$SMB_DOMAIN" \
113117
SPICEIO_SMB_SHARE="$SMB_SHARE" \
114118
SPICEIO_BUCKET="$BUCKET" \
115119
SPICEIO_REGION="$REGION" \
116-
"$SPICEIO_BIN" &
120+
"$SPICEIO_BIN" 2> >(tee "$SPICEIO_STDERR" >&2) &
117121
SPICEIO_PID=$!
118122

119123
for i in $(seq 1 60); do
@@ -427,6 +431,23 @@ ORIG_MD5=$(md5 -q "${SC_DIR}/src")
427431
GOT_MD5=$(md5 -q "${SC_DIR}/dl-after")
428432
assert_eq "full GET after cancellation succeeds" "$ORIG_MD5" "$GOT_MD5"
429433

434+
# ════════════════════════════════════════════════════════════════════════════
435+
# Stderr guard
436+
# ════════════════════════════════════════════════════════════════════════════
437+
#
438+
# Expected transients (sharing violations, NotFound, etc.) are mapped to
439+
# typed `io::ErrorKind`s and logged via `slog!` without the "error:" prefix.
440+
# A `[spiceio] error:` hit here means a new failure mode is leaking through
441+
# the generic 500 InternalError arm of `io_to_s3_error` — fail the build.
442+
443+
sync 2>/dev/null || true
444+
if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then
445+
echo ""
446+
echo "FAIL: spiceio emitted unexpected error log lines during extended run:"
447+
grep '\[spiceio\] error:' "$SPICEIO_STDERR" | head -20
448+
FAIL=$((FAIL + 1))
449+
fi
450+
430451
# ════════════════════════════════════════════════════════════════════════════
431452
# Summary
432453
# ════════════════════════════════════════════════════════════════════════════

scripts/test-sccache.sh

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ TEST_PREFIX="spiceio-test-$$"
2424
PASS=0
2525
FAIL=0
2626

27+
# Capture spiceio stderr so the post-run guard can flag unexpected
28+
# `[spiceio] error:` lines. We tee back to stderr so CI still streams live.
29+
SPICEIO_STDERR=$(mktemp /tmp/spiceio-sccache-stderr.XXXXXX)
30+
2731
# ── Cleanup on exit ─────────────────────────────────────────────────────────
2832

2933
SPICEIO_PID=""
@@ -43,6 +47,7 @@ cleanup() {
4347
wait "$SPICEIO_PID" 2>/dev/null || true
4448
fi
4549
rm -rf /tmp/spiceio-test-*
50+
rm -f "$SPICEIO_STDERR"
4651
}
4752
trap cleanup EXIT
4853

@@ -116,7 +121,7 @@ SPICEIO_SMB_DOMAIN="$SMB_DOMAIN" \
116121
SPICEIO_SMB_SHARE="$SMB_SHARE" \
117122
SPICEIO_BUCKET="$BUCKET" \
118123
SPICEIO_REGION="$REGION" \
119-
"$SPICEIO_BIN" &
124+
"$SPICEIO_BIN" 2> >(tee -a "$SPICEIO_STDERR" >&2) &
120125
SPICEIO_PID=$!
121126

122127
echo "[test] waiting for spiceio on ${BIND}..."
@@ -281,7 +286,7 @@ SPICEIO_SMB_SHARE="$SMB_SHARE" \
281286
SPICEIO_BUCKET="$BUCKET" \
282287
SPICEIO_REGION="$REGION" \
283288
SPICEIO_LOG_FILE="$SPICEIO_LOG2" \
284-
"$SPICEIO_BIN" &
289+
"$SPICEIO_BIN" 2> >(tee -a "$SPICEIO_STDERR" >&2) &
285290
SPICEIO_PID2=$!
286291

287292
echo "[test] waiting for second spiceio instance..."
@@ -404,3 +409,18 @@ else
404409
echo "[test] FAIL: expected cache hits > 0 (got ${CACHE_HITS:-0}) and write errors == 0 (got ${WRITE_ERRORS:-0})"
405410
exit 1
406411
fi
412+
413+
# ── Stderr guard ────────────────────────────────────────────────────────────
414+
#
415+
# Any `[spiceio] error:` line means a code path is leaking through the
416+
# generic 500 InternalError arm of `io_to_s3_error`. Expected transients
417+
# (NotFound on HEAD probes, sharing violations, etc.) are mapped to typed
418+
# `io::ErrorKind`s and logged via `slog!` without the "error:" prefix.
419+
420+
sync 2>/dev/null || true
421+
if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then
422+
echo ""
423+
echo "[test] FAIL: spiceio emitted unexpected error log lines:"
424+
grep '\[spiceio\] error:' "$SPICEIO_STDERR" | head -20
425+
exit 1
426+
fi

src/s3/router.rs

Lines changed: 68 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -593,14 +593,23 @@ async fn handle_get_object(
593593

594594
// Build response with streaming body.
595595
//
596-
// Channel capacity is sized to match the SMB pipeline depth so a full
597-
// batch of reads can dump into the channel without blocking the producer.
598-
// That lets the SMB-reading task immediately issue the next pipelined
599-
// batch (incurring its round-trip) while the HTTP-sending task drains
600-
// the previous batch into the wire — back-to-back batches overlap, which
601-
// is the difference between filling and starving the 10G link.
602-
let (body, tx) = SpiceioBody::channel(crate::smb::ops::READ_PIPELINE_DEPTH);
596+
// Channel capacity is sized so a full SMB pipeline batch can dump into
597+
// the channel without blocking the producer — that lets the SMB-reading
598+
// task immediately issue the next pipelined round-trip while the
599+
// HTTP-sending task drains the previous batch, overlapping back-to-back
600+
// batches.
601+
//
602+
// But we also cap by a per-request memory budget: with a configured
603+
// `SPICEIO_SMB_MAX_IO` of 1 MiB and `READ_PIPELINE_DEPTH = 64`, an
604+
// uncapped channel would buffer up to 64 MiB per concurrent GetObject.
605+
// The budget keeps per-request memory bounded — at default 64 KiB
606+
// chunks it stays at the full pipeline depth (4 MiB), and at 1 MiB
607+
// chunks it falls to 8 (still room to overlap).
608+
const STREAM_CHANNEL_MAX_BYTES: usize = 8 * 1024 * 1024;
603609
let chunk_size = handle.max_chunk;
610+
let channel_cap = (STREAM_CHANNEL_MAX_BYTES / chunk_size as usize)
611+
.clamp(1, crate::smb::ops::READ_PIPELINE_DEPTH);
612+
let (body, tx) = SpiceioBody::channel(channel_cap);
604613

605614
// Spawn background task to stream pipelined SMB reads into the channel.
606615
// Sends batches of read requests to fill the network pipe, then pushes
@@ -1452,6 +1461,19 @@ fn io_to_s3_error(e: &io::Error) -> Response<SpiceioBody> {
14521461
crate::serr!("[spiceio] access denied: {e}");
14531462
error_response(StatusCode::FORBIDDEN, "AccessDenied", "Access Denied")
14541463
}
1464+
// Sharing violations are an expected transient under concurrent
1465+
// writes (the underlying SMB server refused to open a file held by
1466+
// another handle). Map to 503 SlowDown — the standard S3 retryable
1467+
// status — and log at info level instead of `serr!`, since these
1468+
// are not bugs and shouldn't paint CI logs red.
1469+
io::ErrorKind::ResourceBusy => {
1470+
crate::slog!("[spiceio] busy (retry): {e}");
1471+
error_response(
1472+
StatusCode::SERVICE_UNAVAILABLE,
1473+
"SlowDown",
1474+
"Please reduce your request rate.",
1475+
)
1476+
}
14551477
_ => {
14561478
crate::serr!("[spiceio] error: {e}");
14571479
error_response(
@@ -1510,3 +1532,42 @@ async fn collect_body(req: Request<Incoming>) -> Bytes {
15101532
}
15111533
}
15121534
}
1535+
1536+
#[cfg(test)]
1537+
mod tests {
1538+
use super::*;
1539+
1540+
/// The whole reason this file exists: unit tests in `smb::client` already
1541+
/// verify `STATUS_SHARING_VIOLATION → io::ErrorKind::ResourceBusy`. What
1542+
/// was missing was the next hop — that the HTTP layer maps that kind to
1543+
/// a retryable S3 status, not 500 InternalError. Without this assertion,
1544+
/// the prior regression (sharing violations leaking as 500 with noisy
1545+
/// `error:` logs) had no test to catch it.
1546+
#[test]
1547+
fn io_to_s3_error_maps_resource_busy_to_slow_down() {
1548+
let err = io::Error::new(io::ErrorKind::ResourceBusy, "sharing violation: foo");
1549+
let resp = io_to_s3_error(&err);
1550+
assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
1551+
}
1552+
1553+
#[test]
1554+
fn io_to_s3_error_maps_not_found_to_404() {
1555+
let err = io::Error::new(io::ErrorKind::NotFound, "missing");
1556+
let resp = io_to_s3_error(&err);
1557+
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
1558+
}
1559+
1560+
#[test]
1561+
fn io_to_s3_error_maps_permission_denied_to_403() {
1562+
let err = io::Error::new(io::ErrorKind::PermissionDenied, "nope");
1563+
let resp = io_to_s3_error(&err);
1564+
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
1565+
}
1566+
1567+
#[test]
1568+
fn io_to_s3_error_other_falls_back_to_500() {
1569+
let err = io::Error::other("unexpected");
1570+
let resp = io_to_s3_error(&err);
1571+
assert_eq!(resp.status(), StatusCode::INTERNAL_SERVER_ERROR);
1572+
}
1573+
}

src/smb/protocol.rs

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,11 @@ pub fn decode_read_response_owned(body: Vec<u8>) -> Option<Bytes> {
491491
/// body) and returns a `Bytes` slice referencing the response payload. Avoids
492492
/// the extra body-copy that `decode_read_response_owned` would require if the
493493
/// caller had to split body off first.
494+
///
495+
/// Treats `data_offset` as an absolute offset from the start of the SMB2
496+
/// message and rejects offsets that fall inside the 64-byte header, so a
497+
/// malformed server response cannot trick us into returning header bytes as
498+
/// the read payload.
494499
pub fn decode_read_response_from_msg(msg: Vec<u8>) -> Option<Bytes> {
495500
if msg.len() < SMB2_HEADER_SIZE + 17 {
496501
return None;
@@ -499,7 +504,12 @@ pub fn decode_read_response_from_msg(msg: Vec<u8>) -> Option<Bytes> {
499504
let data_offset = u16::from_le_bytes(body[2..4].try_into().unwrap()) as usize;
500505
let data_length = u32::from_le_bytes(body[4..8].try_into().unwrap()) as usize;
501506

502-
// `data_offset` is from the start of the SMB2 message, not the body.
507+
// `data_offset` is from the start of the SMB2 message. It must not point
508+
// inside the header (or before it) — a malformed server response with an
509+
// offset < SMB2_HEADER_SIZE would otherwise slice into header bytes.
510+
if data_offset < SMB2_HEADER_SIZE {
511+
return None;
512+
}
503513
let start = data_offset;
504514
let end = start.checked_add(data_length)?;
505515
if end > msg.len() {
@@ -926,6 +936,34 @@ mod tests {
926936
assert!(decode_read_response_from_msg(msg).is_none());
927937
}
928938

939+
#[test]
940+
fn decode_read_response_from_msg_rejects_offset_inside_header() {
941+
// A malformed server response with data_offset < SMB2_HEADER_SIZE
942+
// would otherwise have us slice into the SMB2 header bytes and return
943+
// them as payload. The decoder must reject that case.
944+
let mut msg = vec![0u8; SMB2_HEADER_SIZE + 32];
945+
// Seed the header with a sentinel so we'd notice if it ever leaked
946+
// back as payload.
947+
for (i, b) in msg[..SMB2_HEADER_SIZE].iter_mut().enumerate() {
948+
*b = 0xA0 | (i as u8 & 0x0F);
949+
}
950+
let body = &mut msg[SMB2_HEADER_SIZE..];
951+
// data_offset = 16 (inside the header, well before SMB2_HEADER_SIZE).
952+
body[2..4].copy_from_slice(&16u16.to_le_bytes());
953+
body[4..8].copy_from_slice(&8u32.to_le_bytes());
954+
assert!(decode_read_response_from_msg(msg).is_none());
955+
}
956+
957+
#[test]
958+
fn decode_read_response_from_msg_rejects_offset_zero() {
959+
// data_offset = 0 is also inside the header.
960+
let mut msg = vec![0u8; SMB2_HEADER_SIZE + 32];
961+
let body = &mut msg[SMB2_HEADER_SIZE..];
962+
body[2..4].copy_from_slice(&0u16.to_le_bytes());
963+
body[4..8].copy_from_slice(&4u32.to_le_bytes());
964+
assert!(decode_read_response_from_msg(msg).is_none());
965+
}
966+
929967
#[test]
930968
fn decode_write_response_valid() {
931969
let mut body = vec![0u8; 16];

0 commit comments

Comments
 (0)