From a6bb5ff06bf699c9450cffc6c9aebcf9be9e77fb Mon Sep 17 00:00:00 2001 From: Luke Kim <80174+lukekim@users.noreply.github.com> Date: Thu, 14 May 2026 18:00:28 +0900 Subject: [PATCH 1/2] Fix sharing-violation 500s + carry forward review fixes (v0.5.4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous PR squash-merged with four Copilot review threads unaddressed (auto-merge fired before the fixes landed). Brings those forward, and adds the actual bug they were a symptom of — sharing violations leaking through io_to_s3_error's catch-all arm as 500 InternalError with noisy `error:` log lines. Sharing violations are an expected transient under concurrent writes (the SMB server refuses a file held by another handle). The typed io::ErrorKind::ResourceBusy mapping was already correct; the HTTP layer just had no arm for it. Now maps to 503 SlowDown — the standard retryable S3 status — and logs via slog! instead of serr!, so SDKs auto-retry instead of seeing a fatal 500 and CI stops painting these red. Adds an io_to_s3_error unit test covering all four kinds (NotFound, PermissionDenied, ResourceBusy, fallback) so this regression has a test gate next time. Adds a stderr guard to the three CI integration scripts (test-sccache, test-extended, stress-concurrent) that fails the run if spiceio emits `[spiceio] error:` lines. Expected transients log through slog! without the prefix, so any hit means a new failure mode is leaking through the 500 InternalError arm. Would have caught the original regression. Review fixes carried forward from edc1579: - decode_read_response_from_msg rejects data_offset < SMB2_HEADER_SIZE so a malformed server response cannot slice into the SMB2 header. Adds two unit tests. - GetObject streaming channel capped by an 8 MiB per-request memory budget on top of READ_PIPELINE_DEPTH; at 1 MiB chunks the channel falls from 64 to 8 instead of buffering up to 64 MiB. - bench-live.sh mount_smbfs baseline no longer embeds the SMB password in argv (leaked via shell history / ps) and no longer shells out to perl -MURI::Escape (not in minimal Perl installs). Uses macOS Keychain credentials; skips clearly if not configured. --- Cargo.lock | 2 +- Cargo.toml | 2 +- scripts/bench-live.sh | 19 ++++++--- scripts/stress-concurrent.sh | 30 ++++++++++++++- scripts/test-extended.sh | 23 ++++++++++- scripts/test-sccache.sh | 24 +++++++++++- src/s3/router.rs | 75 ++++++++++++++++++++++++++++++++---- src/smb/protocol.rs | 40 ++++++++++++++++++- 8 files changed, 196 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 203f56b..8692c36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -656,7 +656,7 @@ dependencies = [ [[package]] name = "spiceio" -version = "0.5.3" +version = "0.5.4" dependencies = [ "bytes", "criterion", diff --git a/Cargo.toml b/Cargo.toml index ebaa17d..87a537f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spiceio" -version = "0.5.3" +version = "0.5.4" edition = "2024" description = "S3-compatible API proxy to SMB file shares" license = "Apache-2.0" diff --git a/scripts/bench-live.sh b/scripts/bench-live.sh index c83640d..627de1c 100755 --- a/scripts/bench-live.sh +++ b/scripts/bench-live.sh @@ -193,18 +193,27 @@ bench_concurrent_read() { # and runs the same dd-based write/read tests. Establishes the hard # ceiling for what the link can do, so we can attribute spiceio's # translation overhead. +# +# The mount uses the macOS Keychain for credentials rather than embedding +# the password in the mount URL — embedded passwords leak via shell +# history and `ps` argv listings, and shelling out to `perl -MURI::Escape` +# adds a non-portable dependency. To prep the keychain once: +# mount_smbfs //user@server/share /tmp/mnt # interactive; offers +# "Remember in keychain" +# umount /tmp/mnt +# If the keychain entry is missing, the baseline is skipped. bench_mount_baseline() { local user="$SPICEIO_SMB_USER" - local pass="$SPICEIO_SMB_PASS" local server="$SMB_SERVER" local share="$SMB_SHARE" MOUNT_POINT="/tmp/spiceio-bench-mount-$$" mkdir -p "$MOUNT_POINT" - local escaped_pass - escaped_pass=$(printf '%s' "$pass" | perl -MURI::Escape -ne 'print uri_escape($_)') - if ! mount_smbfs -N "//${user}:${escaped_pass}@${server}/${share}" "$MOUNT_POINT" 2>/dev/null; then - echo " (mount_smbfs failed — skipping baseline)" + # -N: no interactive password prompt. mount_smbfs will pull credentials + # from the macOS Keychain if available; we deliberately do not pass the + # password in argv so it doesn't leak via process listings. + if ! mount_smbfs -N "//${user}@${server}/${share}" "$MOUNT_POINT" 2>/dev/null; then + echo " (mount_smbfs failed — set up keychain credentials first; skipping baseline)" rmdir "$MOUNT_POINT" 2>/dev/null MOUNT_POINT="" return diff --git a/scripts/stress-concurrent.sh b/scripts/stress-concurrent.sh index e2a3617..b05fb00 100755 --- a/scripts/stress-concurrent.sh +++ b/scripts/stress-concurrent.sh @@ -28,6 +28,12 @@ FAIL=0 CONCURRENCY="${SPICEIO_STRESS_CONCURRENCY:-16}" CURL_TIMEOUT="${SPICEIO_STRESS_TIMEOUT:-30}" +# Capture spiceio stderr for a post-run guard. We still tee to the script's +# stderr so CI logs stream live; the file is only used to fail the build if +# any unexpected `[spiceio] error:` lines slip through (e.g., something we +# thought was an expected transient turns out to be logged as an error). +SPICEIO_STDERR="${TMPDIR_BASE}/spiceio-stderr.log" + # ── Cleanup ──────────────────────────────────────────────────────────────── SPICEIO_PID="" @@ -95,7 +101,7 @@ SPICEIO_SMB_DOMAIN="$SMB_DOMAIN" \ SPICEIO_SMB_SHARE="$SMB_SHARE" \ SPICEIO_BUCKET="$BUCKET" \ SPICEIO_REGION="$REGION" \ -"$SPICEIO_BIN" & +"$SPICEIO_BIN" 2> >(tee "$SPICEIO_STDERR" >&2) & SPICEIO_PID=$! for i in $(seq 1 30); do @@ -394,6 +400,28 @@ for i in $(seq 1 "$LARGE_N"); do done echo " integrity: $((PASS - PREV_PASS))/${LARGE_N} verified" +# ════════════════════════════════════════════════════════════════════════════ +# Stderr guard +# ════════════════════════════════════════════════════════════════════════════ +# +# Any `[spiceio] error:` lines indicate spiceio surfaced something at error +# level — that should not happen during a clean stress run. Expected +# transients (sharing violations, NotFound, etc.) are mapped to typed +# `io::ErrorKind`s and logged via `slog!` (without the "error:" prefix). +# A hit here means we regressed: a new failure mode is leaking through the +# generic 500 InternalError arm of `io_to_s3_error`. Fail the build so +# someone investigates instead of merging it silently again. + +# Give the tee subshell a moment to flush after we killed spiceio in cleanup +# (which runs on EXIT, so we have to flush sync here, before cleanup fires). +sync 2>/dev/null || true +if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then + echo "" + echo "FAIL: spiceio emitted unexpected error log lines during stress run:" + grep '\[spiceio\] error:' "$SPICEIO_STDERR" | head -20 + FAIL=$((FAIL + 1)) +fi + # ════════════════════════════════════════════════════════════════════════════ # Summary # ════════════════════════════════════════════════════════════════════════════ diff --git a/scripts/test-extended.sh b/scripts/test-extended.sh index 609a152..2fc8118 100755 --- a/scripts/test-extended.sh +++ b/scripts/test-extended.sh @@ -31,6 +31,10 @@ FAIL=0 CONCURRENCY="${SPICEIO_EXT_CONCURRENCY:-8}" CURL_TIMEOUT="${SPICEIO_EXT_TIMEOUT:-60}" +# Capture spiceio stderr so the post-run guard can flag unexpected +# `[spiceio] error:` lines. We tee back to stderr so CI still streams live. +SPICEIO_STDERR="${TMPDIR_BASE}/spiceio-stderr.log" + # ── Cleanup ──────────────────────────────────────────────────────────────── SPICEIO_PID="" @@ -113,7 +117,7 @@ SPICEIO_SMB_DOMAIN="$SMB_DOMAIN" \ SPICEIO_SMB_SHARE="$SMB_SHARE" \ SPICEIO_BUCKET="$BUCKET" \ SPICEIO_REGION="$REGION" \ -"$SPICEIO_BIN" & +"$SPICEIO_BIN" 2> >(tee "$SPICEIO_STDERR" >&2) & SPICEIO_PID=$! for i in $(seq 1 60); do @@ -427,6 +431,23 @@ ORIG_MD5=$(md5 -q "${SC_DIR}/src") GOT_MD5=$(md5 -q "${SC_DIR}/dl-after") assert_eq "full GET after cancellation succeeds" "$ORIG_MD5" "$GOT_MD5" +# ════════════════════════════════════════════════════════════════════════════ +# Stderr guard +# ════════════════════════════════════════════════════════════════════════════ +# +# Expected transients (sharing violations, NotFound, etc.) are mapped to +# typed `io::ErrorKind`s and logged via `slog!` without the "error:" prefix. +# A `[spiceio] error:` hit here means a new failure mode is leaking through +# the generic 500 InternalError arm of `io_to_s3_error` — fail the build. + +sync 2>/dev/null || true +if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then + echo "" + echo "FAIL: spiceio emitted unexpected error log lines during extended run:" + grep '\[spiceio\] error:' "$SPICEIO_STDERR" | head -20 + FAIL=$((FAIL + 1)) +fi + # ════════════════════════════════════════════════════════════════════════════ # Summary # ════════════════════════════════════════════════════════════════════════════ diff --git a/scripts/test-sccache.sh b/scripts/test-sccache.sh index 6404bb4..11f0f94 100755 --- a/scripts/test-sccache.sh +++ b/scripts/test-sccache.sh @@ -24,6 +24,10 @@ TEST_PREFIX="spiceio-test-$$" PASS=0 FAIL=0 +# Capture spiceio stderr so the post-run guard can flag unexpected +# `[spiceio] error:` lines. We tee back to stderr so CI still streams live. +SPICEIO_STDERR=$(mktemp /tmp/spiceio-sccache-stderr.XXXXXX) + # ── Cleanup on exit ───────────────────────────────────────────────────────── SPICEIO_PID="" @@ -43,6 +47,7 @@ cleanup() { wait "$SPICEIO_PID" 2>/dev/null || true fi rm -rf /tmp/spiceio-test-* + rm -f "$SPICEIO_STDERR" } trap cleanup EXIT @@ -116,7 +121,7 @@ SPICEIO_SMB_DOMAIN="$SMB_DOMAIN" \ SPICEIO_SMB_SHARE="$SMB_SHARE" \ SPICEIO_BUCKET="$BUCKET" \ SPICEIO_REGION="$REGION" \ -"$SPICEIO_BIN" & +"$SPICEIO_BIN" 2> >(tee -a "$SPICEIO_STDERR" >&2) & SPICEIO_PID=$! echo "[test] waiting for spiceio on ${BIND}..." @@ -281,7 +286,7 @@ SPICEIO_SMB_SHARE="$SMB_SHARE" \ SPICEIO_BUCKET="$BUCKET" \ SPICEIO_REGION="$REGION" \ SPICEIO_LOG_FILE="$SPICEIO_LOG2" \ -"$SPICEIO_BIN" & +"$SPICEIO_BIN" 2> >(tee -a "$SPICEIO_STDERR" >&2) & SPICEIO_PID2=$! echo "[test] waiting for second spiceio instance..." @@ -404,3 +409,18 @@ else echo "[test] FAIL: expected cache hits > 0 (got ${CACHE_HITS:-0}) and write errors == 0 (got ${WRITE_ERRORS:-0})" exit 1 fi + +# ── Stderr guard ──────────────────────────────────────────────────────────── +# +# Any `[spiceio] error:` line means a code path is leaking through the +# generic 500 InternalError arm of `io_to_s3_error`. Expected transients +# (NotFound on HEAD probes, sharing violations, etc.) are mapped to typed +# `io::ErrorKind`s and logged via `slog!` without the "error:" prefix. + +sync 2>/dev/null || true +if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then + echo "" + echo "[test] FAIL: spiceio emitted unexpected error log lines:" + grep '\[spiceio\] error:' "$SPICEIO_STDERR" | head -20 + exit 1 +fi diff --git a/src/s3/router.rs b/src/s3/router.rs index bd5f97c..1b7dfa8 100644 --- a/src/s3/router.rs +++ b/src/s3/router.rs @@ -593,14 +593,23 @@ async fn handle_get_object( // Build response with streaming body. // - // Channel capacity is sized to match the SMB pipeline depth so a full - // batch of reads can dump into the channel without blocking the producer. - // That lets the SMB-reading task immediately issue the next pipelined - // batch (incurring its round-trip) while the HTTP-sending task drains - // the previous batch into the wire — back-to-back batches overlap, which - // is the difference between filling and starving the 10G link. - let (body, tx) = SpiceioBody::channel(crate::smb::ops::READ_PIPELINE_DEPTH); + // Channel capacity is sized so a full SMB pipeline batch can dump into + // the channel without blocking the producer — that lets the SMB-reading + // task immediately issue the next pipelined round-trip while the + // HTTP-sending task drains the previous batch, overlapping back-to-back + // batches. + // + // But we also cap by a per-request memory budget: with a configured + // `SPICEIO_SMB_MAX_IO` of 1 MiB and `READ_PIPELINE_DEPTH = 64`, an + // uncapped channel would buffer up to 64 MiB per concurrent GetObject. + // The budget keeps per-request memory bounded — at default 64 KiB + // chunks it stays at the full pipeline depth (4 MiB), and at 1 MiB + // chunks it falls to 8 (still room to overlap). + const STREAM_CHANNEL_MAX_BYTES: usize = 8 * 1024 * 1024; let chunk_size = handle.max_chunk; + let channel_cap = (STREAM_CHANNEL_MAX_BYTES / chunk_size as usize) + .clamp(1, crate::smb::ops::READ_PIPELINE_DEPTH); + let (body, tx) = SpiceioBody::channel(channel_cap); // Spawn background task to stream pipelined SMB reads into the channel. // Sends batches of read requests to fill the network pipe, then pushes @@ -1452,6 +1461,19 @@ fn io_to_s3_error(e: &io::Error) -> Response { crate::serr!("[spiceio] access denied: {e}"); error_response(StatusCode::FORBIDDEN, "AccessDenied", "Access Denied") } + // Sharing violations are an expected transient under concurrent + // writes (the underlying SMB server refused to open a file held by + // another handle). Map to 503 SlowDown — the standard S3 retryable + // status — and log at info level instead of `serr!`, since these + // are not bugs and shouldn't paint CI logs red. + io::ErrorKind::ResourceBusy => { + crate::slog!("[spiceio] busy (retry): {e}"); + error_response( + StatusCode::SERVICE_UNAVAILABLE, + "SlowDown", + "Please reduce your request rate.", + ) + } _ => { crate::serr!("[spiceio] error: {e}"); error_response( @@ -1510,3 +1532,42 @@ async fn collect_body(req: Request) -> Bytes { } } } + +#[cfg(test)] +mod tests { + use super::*; + + /// The whole reason this file exists: unit tests in `smb::client` already + /// verify `STATUS_SHARING_VIOLATION → io::ErrorKind::ResourceBusy`. What + /// was missing was the next hop — that the HTTP layer maps that kind to + /// a retryable S3 status, not 500 InternalError. Without this assertion, + /// the prior regression (sharing violations leaking as 500 with noisy + /// `error:` logs) had no test to catch it. + #[test] + fn io_to_s3_error_maps_resource_busy_to_slow_down() { + let err = io::Error::new(io::ErrorKind::ResourceBusy, "sharing violation: foo"); + let resp = io_to_s3_error(&err); + assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE); + } + + #[test] + fn io_to_s3_error_maps_not_found_to_404() { + let err = io::Error::new(io::ErrorKind::NotFound, "missing"); + let resp = io_to_s3_error(&err); + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + } + + #[test] + fn io_to_s3_error_maps_permission_denied_to_403() { + let err = io::Error::new(io::ErrorKind::PermissionDenied, "nope"); + let resp = io_to_s3_error(&err); + assert_eq!(resp.status(), StatusCode::FORBIDDEN); + } + + #[test] + fn io_to_s3_error_other_falls_back_to_500() { + let err = io::Error::other("unexpected"); + let resp = io_to_s3_error(&err); + assert_eq!(resp.status(), StatusCode::INTERNAL_SERVER_ERROR); + } +} diff --git a/src/smb/protocol.rs b/src/smb/protocol.rs index d6bbe87..b00ef9f 100644 --- a/src/smb/protocol.rs +++ b/src/smb/protocol.rs @@ -491,6 +491,11 @@ pub fn decode_read_response_owned(body: Vec) -> Option { /// body) and returns a `Bytes` slice referencing the response payload. Avoids /// the extra body-copy that `decode_read_response_owned` would require if the /// caller had to split body off first. +/// +/// Treats `data_offset` as an absolute offset from the start of the SMB2 +/// message and rejects offsets that fall inside the 64-byte header, so a +/// malformed server response cannot trick us into returning header bytes as +/// the read payload. pub fn decode_read_response_from_msg(msg: Vec) -> Option { if msg.len() < SMB2_HEADER_SIZE + 17 { return None; @@ -499,7 +504,12 @@ pub fn decode_read_response_from_msg(msg: Vec) -> Option { let data_offset = u16::from_le_bytes(body[2..4].try_into().unwrap()) as usize; let data_length = u32::from_le_bytes(body[4..8].try_into().unwrap()) as usize; - // `data_offset` is from the start of the SMB2 message, not the body. + // `data_offset` is from the start of the SMB2 message. It must not point + // inside the header (or before it) — a malformed server response with an + // offset < SMB2_HEADER_SIZE would otherwise slice into header bytes. + if data_offset < SMB2_HEADER_SIZE { + return None; + } let start = data_offset; let end = start.checked_add(data_length)?; if end > msg.len() { @@ -926,6 +936,34 @@ mod tests { assert!(decode_read_response_from_msg(msg).is_none()); } + #[test] + fn decode_read_response_from_msg_rejects_offset_inside_header() { + // A malformed server response with data_offset < SMB2_HEADER_SIZE + // would otherwise have us slice into the SMB2 header bytes and return + // them as payload. The decoder must reject that case. + let mut msg = vec![0u8; SMB2_HEADER_SIZE + 32]; + // Seed the header with a sentinel so we'd notice if it ever leaked + // back as payload. + for (i, b) in msg[..SMB2_HEADER_SIZE].iter_mut().enumerate() { + *b = 0xA0 | (i as u8 & 0x0F); + } + let body = &mut msg[SMB2_HEADER_SIZE..]; + // data_offset = 16 (inside the header, well before SMB2_HEADER_SIZE). + body[2..4].copy_from_slice(&16u16.to_le_bytes()); + body[4..8].copy_from_slice(&8u32.to_le_bytes()); + assert!(decode_read_response_from_msg(msg).is_none()); + } + + #[test] + fn decode_read_response_from_msg_rejects_offset_zero() { + // data_offset = 0 is also inside the header. + let mut msg = vec![0u8; SMB2_HEADER_SIZE + 32]; + let body = &mut msg[SMB2_HEADER_SIZE..]; + body[2..4].copy_from_slice(&0u16.to_le_bytes()); + body[4..8].copy_from_slice(&4u32.to_le_bytes()); + assert!(decode_read_response_from_msg(msg).is_none()); + } + #[test] fn decode_write_response_valid() { let mut body = vec![0u8; 16]; From 77a5aa5e8e7edc4583db4d3ba8faf0a59c315ed8 Mon Sep 17 00:00:00 2001 From: Luke Kim <80174+lukekim@users.noreply.github.com> Date: Thu, 14 May 2026 18:10:35 +0900 Subject: [PATCH 2/2] Address PR review feedback - stream_channel_capacity: extracted from the inline GetObject channel sizing into a tested helper. Floors chunk_size at 1 before the divide-by, so an SMB-negotiated max_read_size of 0 cannot panic the streaming task with a divide-by-zero. Four new unit tests cover the zero-chunk guard plus the default 64 KiB, large 1 MiB, and budget- exceeding cases. - Integration test stderr guards: kill+wait the spiceio PID(s) explicitly before grepping the captured stderr. The previous `sync` was a no-op for the tee process-substitution pipe, and the cleanup trap hadn't fired yet so spiceio's stderr stream was still open. test-sccache.sh has two spiceio instances; the guard now kills both. Misleading comment removed. --- scripts/stress-concurrent.sh | 18 +++++++-- scripts/test-extended.sh | 12 +++++- scripts/test-sccache.sh | 17 ++++++++- src/s3/router.rs | 74 +++++++++++++++++++++++++++++++----- 4 files changed, 107 insertions(+), 14 deletions(-) diff --git a/scripts/stress-concurrent.sh b/scripts/stress-concurrent.sh index b05fb00..f4ff0ef 100755 --- a/scripts/stress-concurrent.sh +++ b/scripts/stress-concurrent.sh @@ -412,9 +412,21 @@ echo " integrity: $((PASS - PREV_PASS))/${LARGE_N} verified" # generic 500 InternalError arm of `io_to_s3_error`. Fail the build so # someone investigates instead of merging it silently again. -# Give the tee subshell a moment to flush after we killed spiceio in cleanup -# (which runs on EXIT, so we have to flush sync here, before cleanup fires). -sync 2>/dev/null || true +# Stop spiceio *before* grepping the captured stderr. spiceio's stderr is +# piped through `tee` running in a process substitution; the tee process +# only finishes draining and closes the capture file when spiceio closes its +# stderr fd. The cleanup trap runs on EXIT, so it would fire too late for +# the guard below — we have to kill+wait here. Set SPICEIO_PID empty so the +# trap's own kill becomes a no-op. +if [[ -n "$SPICEIO_PID" ]]; then + kill "$SPICEIO_PID" 2>/dev/null || true + wait "$SPICEIO_PID" 2>/dev/null || true + SPICEIO_PID="" +fi +# tee in process substitution isn't directly waitable from bash, so give it +# a brief moment to drain after spiceio closed its stderr. +sleep 0.2 + if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then echo "" echo "FAIL: spiceio emitted unexpected error log lines during stress run:" diff --git a/scripts/test-extended.sh b/scripts/test-extended.sh index 2fc8118..f403d34 100755 --- a/scripts/test-extended.sh +++ b/scripts/test-extended.sh @@ -440,7 +440,17 @@ assert_eq "full GET after cancellation succeeds" "$ORIG_MD5" "$GOT_MD5" # A `[spiceio] error:` hit here means a new failure mode is leaking through # the generic 500 InternalError arm of `io_to_s3_error` — fail the build. -sync 2>/dev/null || true +# Stop spiceio *before* grepping the captured stderr. tee in the process +# substitution only closes the capture file when spiceio's stderr fd closes, +# so we have to kill+wait here rather than rely on the EXIT trap. Clear +# SPICEIO_PID so the trap's own kill becomes a no-op. +if [[ -n "$SPICEIO_PID" ]]; then + kill "$SPICEIO_PID" 2>/dev/null || true + wait "$SPICEIO_PID" 2>/dev/null || true + SPICEIO_PID="" +fi +sleep 0.2 # tee in <(...) isn't directly waitable; let it drain. + if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then echo "" echo "FAIL: spiceio emitted unexpected error log lines during extended run:" diff --git a/scripts/test-sccache.sh b/scripts/test-sccache.sh index 11f0f94..d63f15e 100755 --- a/scripts/test-sccache.sh +++ b/scripts/test-sccache.sh @@ -417,7 +417,22 @@ fi # (NotFound on HEAD probes, sharing violations, etc.) are mapped to typed # `io::ErrorKind`s and logged via `slog!` without the "error:" prefix. -sync 2>/dev/null || true +# Stop both spiceio instances *before* grepping the captured stderr. tee in +# the process substitutions only closes the capture file when each spiceio +# closes its stderr fd, so we have to kill+wait here rather than rely on +# the EXIT trap. Clear PIDs so the trap's own kills become no-ops. +if [[ -n "$SPICEIO_PID2" ]]; then + kill "$SPICEIO_PID2" 2>/dev/null || true + wait "$SPICEIO_PID2" 2>/dev/null || true + SPICEIO_PID2="" +fi +if [[ -n "$SPICEIO_PID" ]]; then + kill "$SPICEIO_PID" 2>/dev/null || true + wait "$SPICEIO_PID" 2>/dev/null || true + SPICEIO_PID="" +fi +sleep 0.2 # tee in <(...) isn't directly waitable; let it drain. + if [[ -s "$SPICEIO_STDERR" ]] && grep -q '\[spiceio\] error:' "$SPICEIO_STDERR"; then echo "" echo "[test] FAIL: spiceio emitted unexpected error log lines:" diff --git a/src/s3/router.rs b/src/s3/router.rs index 1b7dfa8..da9de45 100644 --- a/src/s3/router.rs +++ b/src/s3/router.rs @@ -413,6 +413,24 @@ async fn handle_list_objects( // ── GetObject (streaming, with Range + Conditional) ───────────────────────── +/// Compute the SMB-reads→HTTP-writes channel capacity for a streaming +/// GetObject given the SMB-negotiated chunk size. +/// +/// Sized to hold one full SMB read pipeline batch (so back-to-back batches +/// can overlap) but capped by an 8 MiB per-request memory budget so a +/// configured `SPICEIO_SMB_MAX_IO` of 1 MiB doesn't blow up to 64 MiB of +/// buffering per concurrent stream. +/// +/// Guards against `chunk_size = 0`: `handle.max_chunk` ultimately comes from +/// the SMB server's `max_read_size` in the negotiate response. If a server +/// (or a misconfiguration) yields 0, naive `BUDGET / chunk_size` would +/// divide-by-zero panic and crash the streaming task. We floor at 1 here. +fn stream_channel_capacity(chunk_size: u32) -> usize { + const STREAM_CHANNEL_MAX_BYTES: usize = 8 * 1024 * 1024; + let chunk_size_for_cap = (chunk_size as usize).max(1); + (STREAM_CHANNEL_MAX_BYTES / chunk_size_for_cap).clamp(1, crate::smb::ops::READ_PIPELINE_DEPTH) +} + async fn handle_get_object( hdrs: &http::HeaderMap, share: &ShareSession, @@ -597,18 +615,13 @@ async fn handle_get_object( // the channel without blocking the producer — that lets the SMB-reading // task immediately issue the next pipelined round-trip while the // HTTP-sending task drains the previous batch, overlapping back-to-back - // batches. - // - // But we also cap by a per-request memory budget: with a configured + // batches. We also cap by a per-request memory budget: with a configured // `SPICEIO_SMB_MAX_IO` of 1 MiB and `READ_PIPELINE_DEPTH = 64`, an // uncapped channel would buffer up to 64 MiB per concurrent GetObject. - // The budget keeps per-request memory bounded — at default 64 KiB - // chunks it stays at the full pipeline depth (4 MiB), and at 1 MiB - // chunks it falls to 8 (still room to overlap). - const STREAM_CHANNEL_MAX_BYTES: usize = 8 * 1024 * 1024; + // At default 64 KiB chunks the channel stays at the full pipeline depth + // (4 MiB); at 1 MiB chunks it falls to 8 (still room to overlap). let chunk_size = handle.max_chunk; - let channel_cap = (STREAM_CHANNEL_MAX_BYTES / chunk_size as usize) - .clamp(1, crate::smb::ops::READ_PIPELINE_DEPTH); + let channel_cap = stream_channel_capacity(chunk_size); let (body, tx) = SpiceioBody::channel(channel_cap); // Spawn background task to stream pipelined SMB reads into the channel. @@ -1570,4 +1583,47 @@ mod tests { let resp = io_to_s3_error(&err); assert_eq!(resp.status(), StatusCode::INTERNAL_SERVER_ERROR); } + + // ── stream_channel_capacity ───────────────────────────────────────────── + + #[test] + fn stream_channel_capacity_handles_zero_chunk() { + // The SMB negotiate response is the source of `max_chunk`; an + // unusual server (or a misconfiguration) could yield 0. The naive + // `BUDGET / 0` would panic and crash the streaming task — the + // floor-at-1 inside `stream_channel_capacity` keeps it safe. The + // exact value doesn't matter (a 0-byte chunk is nonsense anyway); + // we only assert (a) we don't panic and (b) the result is a sane + // pipeline-depth-bounded slot count. + let cap = stream_channel_capacity(0); + assert!( + (1..=crate::smb::ops::READ_PIPELINE_DEPTH).contains(&cap), + "expected 1..={} got {cap}", + crate::smb::ops::READ_PIPELINE_DEPTH + ); + } + + #[test] + fn stream_channel_capacity_default_chunk_uses_full_pipeline() { + // 64 KiB chunks → 8 MiB / 64 KiB = 128, clamped to the pipeline + // depth so a full SMB read batch can dump into the channel. + assert_eq!( + stream_channel_capacity(65536), + crate::smb::ops::READ_PIPELINE_DEPTH + ); + } + + #[test] + fn stream_channel_capacity_large_chunk_falls_below_pipeline() { + // 1 MiB chunks → 8 MiB / 1 MiB = 8, well below pipeline depth. + // This is the case the per-request memory budget exists to handle. + assert_eq!(stream_channel_capacity(1024 * 1024), 8); + } + + #[test] + fn stream_channel_capacity_huge_chunk_floors_at_one() { + // A chunk bigger than the entire budget still yields a 1-slot + // channel, never 0 — the producer can always make progress. + assert_eq!(stream_channel_capacity(16 * 1024 * 1024), 1); + } }