Skip to content

Commit d3d3740

Browse files
committed
Harden SMB startup, quieter protocol errors, and add tests/benches
Three resilience changes plus the tests and benchmarks for them. Setup action: dump the spiceio log on failure (both unexpected exit and timeout). The log was being written to RUNNER_TEMP but never echoed, so when startup stalled past the grace window there was no way to diagnose which phase hung. Also doubled the grace from 30s to 60s — the previous good run took ~12s, leaving only ~2.5x headroom for a slow server day. TCP connect timeout: TcpStream::connect had no spiceio-level timeout, so a server dropping SYNs left the OS waiting 75-90s and stalled pool init past any sensible CI window. Wrapped in tokio::time::timeout(15s) with explicit TimedOut error. Pool connection retry: extracted retry_with_backoff helper used by SmbPool::connect with a 250ms/750ms/2s schedule (4 attempts). A flaky connection during startup no longer takes down the whole pool init. Quieter protocol-layer logging: smb_status_to_io_error was emitting an error log for every SMB status, including expected ones (NotFound on HEAD probes, SharingViolation during WAL cleanup). Removed the unconditional log; mapped statuses return their typed io::Error silently and the catchall arm still logs for truly unknown statuses. Added STATUS_SHARING_VIOLATION (0xC0000043) -> ErrorKind::ResourceBusy. Tests (+21, now 142 total): - smb_status_to_io_error: full mapping coverage including the new ResourceBusy case, unknown-status fallback, STATUS_SUCCESS panic guard, path preservation - retry_with_backoff: first-attempt success, success after transient failures, exhaustion preserving last error, empty-backoff edge case, elapsed-time floor from the schedule, and the structural invariant on CONNECT_RETRY_BACKOFF - parse_compound_response (moved from client.rs to protocol.rs as pub): single/multi-message, empty, truncated header, malformed next_command Benches (+3 in protocol_bench.rs): - parse_compound_response over n=2,4,8 chained messages - pipelined_read_decode at (depth, chunk_size) = (8,64K), (64,64K), (64,8K) — the GetObject hot-path inner loop with throughput reporting - pipelined_write_encode at (depth, chunk_size) = (8,64K), (64,64K), (64,1M) — the WAL pipelined-write inner loop
1 parent 18e298c commit d3d3740

5 files changed

Lines changed: 517 additions & 52 deletions

File tree

.github/actions/setup/action.yml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,12 @@ runs:
132132
# auto-increment if the requested port was busy).
133133
echo "Waiting for spiceio on ${SPICEIO_BIND}..."
134134
ENDPOINT=""
135-
for i in $(seq 1 30); do
135+
for i in $(seq 1 60); do
136136
if ! kill -0 "$PID" 2>/dev/null; then
137137
echo "::error::spiceio exited unexpectedly"
138+
echo "::group::spiceio log"
139+
cat "$SPICEIO_LOG" 2>/dev/null || echo "(log file missing)"
140+
echo "::endgroup::"
138141
exit 1
139142
fi
140143
ENDPOINT=$(grep 'listening on' "$SPICEIO_LOG" 2>/dev/null | grep -o 'http://[^ ]*' | tail -1 || true)
@@ -146,6 +149,10 @@ runs:
146149
fi
147150
sleep 1
148151
done
149-
echo "::error::spiceio failed to start within 30s"
152+
echo "::error::spiceio failed to start within 60s"
153+
echo "::group::spiceio log"
154+
cat "$SPICEIO_LOG" 2>/dev/null || echo "(log file missing)"
155+
echo "::endgroup::"
156+
kill "$PID" 2>/dev/null || true
150157
exit 1
151158

benches/protocol_bench.rs

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,133 @@ fn bench_encode_set_info_rename(c: &mut Criterion) {
152152
group.finish();
153153
}
154154

155+
// ── Parser benches for new public API ────────────────────────────────────────
156+
157+
/// Bench parsing of a compound response (chained SMB2 messages in one frame).
158+
/// Compound responses are the wire format for create+read+close and similar
159+
/// batched operations — relevant CPU cost when the S3 layer issues compounds.
160+
fn bench_parse_compound_response(c: &mut Criterion) {
161+
let mut group = c.benchmark_group("parse_compound_response");
162+
// Body size of 16 bytes is representative of close/create-response payloads.
163+
let body_len = 16usize;
164+
let entry_size = SMB2_HEADER_SIZE + body_len;
165+
for n in [2usize, 4, 8] {
166+
let mut data = vec![0u8; entry_size * n];
167+
for i in 0..n {
168+
let mut hdr = Header::new(Command::Create, i as u64);
169+
hdr.next_command = if i + 1 < n { entry_size as u32 } else { 0 };
170+
let mut buf = BytesMut::with_capacity(SMB2_HEADER_SIZE);
171+
hdr.encode(&mut buf);
172+
let start = i * entry_size;
173+
data[start..start + SMB2_HEADER_SIZE].copy_from_slice(&buf);
174+
for b in &mut data[start + SMB2_HEADER_SIZE..start + entry_size] {
175+
*b = 0xAB;
176+
}
177+
}
178+
group.bench_with_input(
179+
criterion::BenchmarkId::from_parameter(n),
180+
&data,
181+
|b, data| b.iter(|| parse_compound_response(black_box(data))),
182+
);
183+
}
184+
group.finish();
185+
}
186+
187+
/// Build one framed SMB2 read response message (header + read response body +
188+
/// data) ready for `Header::decode` + `decode_read_response_owned`.
189+
fn build_read_response_msg(msg_id: u64, data_len: usize) -> Vec<u8> {
190+
let body_len = 16 + data_len;
191+
let mut msg = vec![0u8; SMB2_HEADER_SIZE + body_len];
192+
193+
let mut hdr_buf = BytesMut::with_capacity(SMB2_HEADER_SIZE);
194+
let mut hdr = Header::new(Command::Read, msg_id);
195+
hdr.status = 0;
196+
hdr.encode(&mut hdr_buf);
197+
msg[..SMB2_HEADER_SIZE].copy_from_slice(&hdr_buf);
198+
199+
let body = &mut msg[SMB2_HEADER_SIZE..];
200+
// StructureSize = 17
201+
body[0..2].copy_from_slice(&17u16.to_le_bytes());
202+
// DataOffset (from start of SMB2 message)
203+
let data_offset = (SMB2_HEADER_SIZE + 16) as u16;
204+
body[2..4].copy_from_slice(&data_offset.to_le_bytes());
205+
// DataLength
206+
body[4..8].copy_from_slice(&(data_len as u32).to_le_bytes());
207+
// Remaining 8 bytes (DataRemaining + Flags) stay zero. Data bytes stay zero.
208+
msg
209+
}
210+
211+
/// Bench the CPU-bound per-batch work of `pipelined_read`: header decode,
212+
/// slot computation from message_id, and `decode_read_response_owned`. This
213+
/// is the inner loop of GetObject streaming once the wire bytes are in.
214+
fn bench_pipelined_read_decode(c: &mut Criterion) {
215+
let mut group = c.benchmark_group("pipelined_read_decode");
216+
// (depth, chunk_size) — depth=64 matches PIPELINE_DEPTH in ops.rs.
217+
let cases = [(8usize, 65536usize), (64, 65536), (64, 8192)];
218+
for (depth, chunk_size) in cases {
219+
let base_msg_id = 1_000u64;
220+
let messages: Vec<Vec<u8>> = (0..depth)
221+
.map(|i| build_read_response_msg(base_msg_id + i as u64, chunk_size))
222+
.collect();
223+
group.throughput(criterion::Throughput::Bytes((depth * chunk_size) as u64));
224+
group.bench_with_input(
225+
criterion::BenchmarkId::from_parameter(format!("d{depth}_c{chunk_size}")),
226+
&messages,
227+
|b, messages| {
228+
b.iter(|| {
229+
let n = messages.len();
230+
let mut slots: Vec<Option<bytes::Bytes>> = (0..n).map(|_| None).collect();
231+
for msg in messages.iter() {
232+
let header = Header::decode(black_box(msg)).unwrap();
233+
let slot = header.message_id.wrapping_sub(base_msg_id) as usize;
234+
let body = msg[SMB2_HEADER_SIZE..].to_vec();
235+
slots[slot] = decode_read_response_owned(body);
236+
}
237+
slots
238+
});
239+
},
240+
);
241+
}
242+
group.finish();
243+
}
244+
245+
/// Bench the CPU-bound per-batch work of `pipelined_write`: header construction
246+
/// (with credit charge), `encode_write_request`, and `build_request` framing.
247+
/// This is the inner loop of WAL pipelined writes before any I/O happens.
248+
fn bench_pipelined_write_encode(c: &mut Criterion) {
249+
let mut group = c.benchmark_group("pipelined_write_encode");
250+
let file_id = [1u8; 16];
251+
// (depth, chunk_size) — depth=64 matches WRITE_PIPELINE_DEPTH in ops.rs.
252+
let cases = [(8usize, 65536usize), (64, 65536), (64, 1024 * 1024)];
253+
for (depth, chunk_size) in cases {
254+
let chunk = vec![0u8; chunk_size];
255+
group.throughput(criterion::Throughput::Bytes((depth * chunk_size) as u64));
256+
group.bench_with_input(
257+
criterion::BenchmarkId::from_parameter(format!("d{depth}_c{chunk_size}")),
258+
&chunk,
259+
|b, chunk| {
260+
b.iter(|| {
261+
let mut packets = Vec::with_capacity(depth);
262+
let mut offset = 0u64;
263+
for i in 0..depth {
264+
let mut hdr = Header::new(Command::Write, i as u64)
265+
.with_credit_charge(chunk.len() as u32);
266+
hdr.tree_id = 42;
267+
hdr.session_id = 0xdead_beef;
268+
let packet = build_request(&hdr, |buf| {
269+
encode_write_request(buf, &file_id, offset, black_box(chunk));
270+
});
271+
packets.push(packet);
272+
offset += chunk.len() as u64;
273+
}
274+
packets
275+
});
276+
},
277+
);
278+
}
279+
group.finish();
280+
}
281+
155282
fn bench_parse_directory_entries(c: &mut Criterion) {
156283
// Build 50 entries
157284
let mut data = Vec::new();
@@ -192,6 +319,9 @@ criterion_group!(
192319
bench_decode_read_response,
193320
bench_decode_read_response_owned,
194321
bench_build_request,
322+
bench_parse_compound_response,
323+
bench_pipelined_read_decode,
324+
bench_pipelined_write_encode,
195325
bench_parse_directory_entries,
196326
);
197327
criterion_main!(benches);

src/smb/client.rs

Lines changed: 108 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ use bytes::{BufMut, BytesMut};
1515
/// the SMB server is slow or unresponsive under heavy load.
1616
const SMB_READ_TIMEOUT: Duration = Duration::from_secs(30);
1717

18+
/// Timeout for the initial TCP handshake to the SMB server. Without this,
19+
/// a server that drops SYNs leaves the OS waiting ~75-90s, which stalls
20+
/// pool initialization past any sensible CI window.
21+
const SMB_CONNECT_TIMEOUT: Duration = Duration::from_secs(15);
22+
1823
use super::auth;
1924
use super::protocol::*;
2025

@@ -71,16 +76,25 @@ impl SmbClient {
7176
/// Connect to the SMB server and authenticate.
7277
pub async fn connect(config: SmbConfig) -> io::Result<Arc<Self>> {
7378
let addr = format!("{}:{}", config.server, config.port);
74-
let stream = match TcpStream::connect(&addr).await {
75-
Ok(s) => {
76-
crate::slog!("[spiceio] smb tcp connected: {addr}");
77-
s
78-
}
79-
Err(e) => {
80-
crate::serr!("[spiceio] smb tcp connect failed: {addr}: {e}");
81-
return Err(e);
82-
}
83-
};
79+
let stream =
80+
match tokio::time::timeout(SMB_CONNECT_TIMEOUT, TcpStream::connect(&addr)).await {
81+
Ok(Ok(s)) => {
82+
crate::slog!("[spiceio] smb tcp connected: {addr}");
83+
s
84+
}
85+
Ok(Err(e)) => {
86+
crate::serr!("[spiceio] smb tcp connect failed: {addr}: {e}");
87+
return Err(e);
88+
}
89+
Err(_) => {
90+
let msg = format!(
91+
"smb tcp connect timed out after {}s: {addr}",
92+
SMB_CONNECT_TIMEOUT.as_secs()
93+
);
94+
crate::serr!("[spiceio] {msg}");
95+
return Err(io::Error::new(io::ErrorKind::TimedOut, msg));
96+
}
97+
};
8498
stream.set_nodelay(true)?;
8599

86100
// Enlarge socket buffers to 1 MB for large read/write throughput.
@@ -1193,8 +1207,10 @@ fn update_preauth_hash(hash: &mut [u8; 64], message: &[u8]) {
11931207
}
11941208

11951209
fn smb_status_to_io_error(status: u32, path: &str) -> io::Error {
1196-
crate::serr!("[spiceio] smb error 0x{status:08X}: {path}");
1197-
// Map raw status codes directly to avoid losing info through NtStatus enum
1210+
// Map raw status codes directly to avoid losing info through NtStatus enum.
1211+
// We deliberately do NOT log here — many of these are expected (NotFound on
1212+
// HEAD probes, SharingViolation during cleanup) and the io::Error string
1213+
// carries the status code for any caller that wants to surface it.
11981214
match status {
11991215
0xC000_000F // STATUS_NO_SUCH_FILE
12001216
| 0xC000_0034 // STATUS_OBJECT_NAME_NOT_FOUND
@@ -1212,7 +1228,15 @@ fn smb_status_to_io_error(status: u32, path: &str) -> io::Error {
12121228
format!("already exists: {path}"),
12131229
),
12141230

1215-
_ => io::Error::other(format!("SMB error 0x{status:08X} for {path}")),
1231+
0xC000_0043 => io::Error::new( // STATUS_SHARING_VIOLATION
1232+
io::ErrorKind::ResourceBusy,
1233+
format!("sharing violation: {path}"),
1234+
),
1235+
1236+
_ => {
1237+
crate::serr!("[spiceio] smb error 0x{status:08X}: {path}");
1238+
io::Error::other(format!("SMB error 0x{status:08X} for {path}"))
1239+
}
12161240
}
12171241
}
12181242

@@ -1232,45 +1256,81 @@ fn sign_message(msg: &mut [u8], key: &[u8; 16]) {
12321256
msg[SIGNATURE_OFFSET..SIGNATURE_OFFSET + 16].copy_from_slice(&signature);
12331257
}
12341258

1235-
/// Parse a compound response (multiple SMB2 messages in one frame).
1236-
fn parse_compound_response(msg: &[u8]) -> Vec<(Header, Vec<u8>)> {
1237-
let mut results = Vec::new();
1238-
let mut offset = 0;
1259+
// Need this for from_raw_fd
12391260

1240-
loop {
1241-
if offset + SMB2_HEADER_SIZE > msg.len() {
1242-
break;
1243-
}
1244-
let header = match Header::decode(&msg[offset..]) {
1245-
Some(h) => h,
1246-
None => break,
1247-
};
1261+
#[cfg(test)]
1262+
mod tests {
1263+
use super::*;
12481264

1249-
let next = header.next_command as usize;
1250-
let body_start = offset + SMB2_HEADER_SIZE;
1251-
let body_end = if next > 0 {
1252-
let end = offset + next;
1253-
if end > msg.len() || end < body_start {
1254-
break;
1255-
}
1256-
end
1257-
} else {
1258-
msg.len()
1259-
};
1260-
if body_start > body_end || body_end > msg.len() {
1261-
break;
1262-
}
1265+
fn assert_kind_and_path(err: &io::Error, kind: io::ErrorKind, needle: &str) {
1266+
assert_eq!(err.kind(), kind, "wrong kind for {err}");
1267+
let s = err.to_string();
1268+
assert!(s.contains(needle), "expected {needle:?} in {s:?}");
1269+
}
12631270

1264-
let body = msg[body_start..body_end].to_vec();
1265-
results.push((header, body));
1271+
#[test]
1272+
fn maps_no_such_file_to_not_found() {
1273+
let e = smb_status_to_io_error(0xC000_000F, "a\\b");
1274+
assert_kind_and_path(&e, io::ErrorKind::NotFound, "a\\b");
1275+
}
12661276

1267-
if next == 0 {
1268-
break;
1269-
}
1270-
offset += next;
1277+
#[test]
1278+
fn maps_object_name_not_found_to_not_found() {
1279+
let e = smb_status_to_io_error(0xC000_0034, "missing.txt");
1280+
assert_kind_and_path(&e, io::ErrorKind::NotFound, "missing.txt");
12711281
}
12721282

1273-
results
1274-
}
1283+
#[test]
1284+
fn maps_object_path_not_found_to_not_found() {
1285+
let e = smb_status_to_io_error(0xC000_003A, "dir\\file");
1286+
assert_kind_and_path(&e, io::ErrorKind::NotFound, "dir\\file");
1287+
}
12751288

1276-
// Need this for from_raw_fd
1289+
#[test]
1290+
fn maps_object_name_invalid_to_not_found() {
1291+
let e = smb_status_to_io_error(0xC000_0033, "bad?name");
1292+
assert_kind_and_path(&e, io::ErrorKind::NotFound, "bad?name");
1293+
}
1294+
1295+
#[test]
1296+
fn maps_access_denied_to_permission_denied() {
1297+
let e = smb_status_to_io_error(0xC000_0022, "secret");
1298+
assert_kind_and_path(&e, io::ErrorKind::PermissionDenied, "secret");
1299+
}
1300+
1301+
#[test]
1302+
fn maps_name_collision_to_already_exists() {
1303+
let e = smb_status_to_io_error(0xC000_0035, "dup");
1304+
assert_kind_and_path(&e, io::ErrorKind::AlreadyExists, "dup");
1305+
}
1306+
1307+
#[test]
1308+
fn maps_sharing_violation_to_resource_busy() {
1309+
let e = smb_status_to_io_error(0xC000_0043, ".spiceio-wal\\01-0000");
1310+
assert_kind_and_path(&e, io::ErrorKind::ResourceBusy, ".spiceio-wal\\01-0000");
1311+
}
1312+
1313+
#[test]
1314+
fn unknown_status_falls_back_to_other_and_includes_hex() {
1315+
let e = smb_status_to_io_error(0xC000_00BB, "x");
1316+
assert_eq!(e.kind(), io::ErrorKind::Other);
1317+
let s = e.to_string();
1318+
assert!(s.contains("0xC00000BB"), "expected hex in: {s}");
1319+
assert!(s.contains("x"), "expected path in: {s}");
1320+
}
1321+
1322+
#[test]
1323+
fn success_status_zero_falls_through_to_other() {
1324+
// STATUS_SUCCESS is not really an error, but the mapper must never panic.
1325+
let e = smb_status_to_io_error(0x0000_0000, "ok");
1326+
assert_eq!(e.kind(), io::ErrorKind::Other);
1327+
}
1328+
1329+
#[test]
1330+
fn error_path_is_preserved_verbatim() {
1331+
// Path containing backslashes, dots, and the WAL prefix must round-trip.
1332+
let path = ".spiceio-wal\\01778725545751751000-0000";
1333+
let e = smb_status_to_io_error(0xC000_0043, path);
1334+
assert!(e.to_string().contains(path));
1335+
}
1336+
}

0 commit comments

Comments
 (0)