Skip to content

Commit edb799c

Browse files
vmkit: keep stdout a clean ack channel for Linux guests (#776)
Found while live-testing #775 (booting a real Linux guest): driving a Linux guest interactively was unreliable because the guest serial console shares stdout with the command/ack channel, so boot logs interleaved with acks and the driver never got a real frame. - **Rust**: route the guest serial console to **stderr**; stdout becomes ack-only (the Python `Driver` already discards stderr, so boot noise is dropped during normal driving, still visible running the binary directly). - **Python (defensive)**: `_read_ack` skips any non-ack line (the binary always writes `ok`/`err`) so a stray stdout line can never be mistaken for an ack, on any platform. - **Test**: smoke test now feeds interleaved console noise + an `err` ack through a fake pipe and asserts the real ack is recovered. Validated: `nix build .#mcp` (compiles the Rust change) and `nix run .#lint` pass; ack-skip logic covered by a deterministic fake-pipe test. _Authored with Claude (Opus 4.8)._
1 parent 46796fe commit edb799c

3 files changed

Lines changed: 60 additions & 30 deletions

File tree

packages/mcp/default.nix

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,20 @@ let
847847
dd._proc = types.SimpleNamespace(
848848
poll=lambda: None, returncode=None, stdin=_Sink(), stdout=rfile
849849
)
850+
# Guest serial console can share stdout (Linux guest): non-ack lines are
851+
# skipped, the real ok/err ack is returned.
852+
wfile.write("[ OK ] Reached target Initrd Root Device.\n")
853+
wfile.write("Starting File System Check...\n")
854+
wfile.write("\n")
855+
wfile.write("ok size 1280 800\n")
856+
assert dd._send_locked("size") == "ok size 1280 800"
857+
wfile.write("[ OK ] Started Service.\n")
858+
wfile.write("err size guest framebuffer not available yet\n")
859+
try:
860+
dd._send_locked("size")
861+
raise AssertionError("err ack should raise")
862+
except vmkit.VmkitError as exc:
863+
assert "framebuffer not available" in str(exc), exc
850864
try:
851865
dd._send_locked("shot /tmp/x", ack_timeout=0.2)
852866
raise AssertionError("bounded read should have timed out")

packages/mcp/src/vmkit/vmkit/__init__.py

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -799,47 +799,57 @@ def _send_locked(self, command: str, ack_timeout: float | None = None) -> str:
799799
if proc.poll() is not None:
800800
raise VmkitError(f"driver process exited with code {proc.returncode}")
801801
# Drain acks left pending by an earlier timed-out read so this command
802-
# reads its OWN ack, never a stale one.
802+
# reads its OWN ack, never a stale one. The pending ack is already in
803+
# flight, so this waits (unbounded) for it.
803804
while self._pending_acks > 0:
804-
stale = proc.stdout.readline()
805-
if stale == "":
806-
raise VmkitError("driver closed its output while draining a pending ack")
807-
if stale.rstrip("\n"):
808-
self._pending_acks -= 1
805+
self._read_ack(proc, None)
806+
self._pending_acks -= 1
809807
line = command.rstrip("\n")
810808
try:
811809
proc.stdin.write(line + "\n")
812810
proc.stdin.flush()
813811
except (BrokenPipeError, OSError) as exc:
814812
raise VmkitError(f"driver process closed its input: {exc}") from exc
815813
deadline = None if ack_timeout is None else time.monotonic() + ack_timeout
816-
# stderr is discarded, so the next stdout line is this command's ack;
817-
# skip a stray blank line all the same.
814+
ack = self._read_ack(proc, deadline)
815+
if ack is None:
816+
# The ack is still coming; mark it so the next command drains it
817+
# instead of mistaking it for its own (no desync).
818+
self._pending_acks += 1
819+
raise VmkitError(
820+
f"timed out after {ack_timeout}s waiting for ack to {command!r}"
821+
)
822+
if ack.startswith("err"):
823+
raise VmkitError(f"command {command!r} failed: {ack}")
824+
return ack
825+
826+
def _read_ack(self, proc: "subprocess.Popen[str]", deadline: float | None) -> str | None:
827+
"""Return the next ack line, skipping guest-console noise; ``None`` on
828+
deadline.
829+
830+
The binary writes every ack as ``ok``/``err`` (optionally followed by a
831+
detail), one per command. A Linux guest's serial console can share this
832+
stdout, so any line that is not an ack (boot logs, blank lines) is
833+
skipped rather than mistaken for one. Raises :class:`VmkitError` if the
834+
process closes its output.
835+
"""
818836
while True:
819837
if deadline is not None:
820838
remaining = deadline - time.monotonic()
821839
if remaining <= 0:
822-
# The ack is still coming; mark it so the next command drains
823-
# it instead of mistaking it for its own (no desync).
824-
self._pending_acks += 1
825-
raise VmkitError(
826-
f"timed out after {ack_timeout}s waiting for ack to {command!r}"
827-
)
840+
return None
828841
if not select.select([proc.stdout], [], [], remaining)[0]:
829842
continue
830-
ack = proc.stdout.readline()
831-
if ack == "":
832-
rc = proc.poll()
843+
raw = proc.stdout.readline()
844+
if raw == "":
833845
raise VmkitError(
834-
f"driver process gave no ack for {command!r} "
835-
f"(process exited with code {rc})"
846+
f"driver process closed its output (exited with code {proc.poll()})"
836847
)
837-
ack = ack.rstrip("\n")
838-
if ack != "":
839-
break
840-
if ack.startswith("err"):
841-
raise VmkitError(f"command {command!r} failed: {ack}")
842-
return ack
848+
ack = raw.strip()
849+
if ack in ("ok", "err") or ack.startswith(("ok ", "err ")):
850+
return ack
851+
# Otherwise: guest console output on the shared stdout, or a blank
852+
# line. Skip it and keep reading for the real ack.
843853

844854
def key(self, name: str, count: int = 1) -> str:
845855
"""Press a named key (``return``, ``tab``, arrows, ``f1``..``f12``, a

packages/vmkit/src/linuxguest.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ pub fn drive_linux(drive: DriveLinux) -> Result<(), Error> {
122122

123123
/// Build the Linux GUI guest configuration: generic platform booted by EFI off
124124
/// the raw disk, one virtio-gpu scanout, USB keyboard + pointing device, a
125-
/// serial console to stdout (the guest must use `console=hvc0`), and entropy +
125+
/// serial console to stderr (the guest must use `console=hvc0`), and entropy +
126126
/// balloon. Shared by the screenshot ([`boot_linux_gui`]) and interactive
127127
/// ([`drive_linux`]) paths.
128128
fn build_linux_gui_config(
@@ -191,16 +191,22 @@ fn build_linux_gui_config(
191191
let keyboard = unsafe { VZUSBKeyboardConfiguration::new() };
192192
let pointing = unsafe { VZUSBScreenCoordinatePointingDeviceConfiguration::new() };
193193

194-
// Guest serial console -> our stdout for boot debugging. VZ rejects a null
195-
// read handle, so give it the (unwritten) read end of a fresh pipe.
194+
// Guest serial console -> our stderr for boot debugging. stdout is the
195+
// command/ack channel (see `crate::drive::run_commands`); routing the guest
196+
// console there too would interleave kernel/systemd boot lines with acks and
197+
// break the interactive `drive-linux` protocol. stderr keeps stdout a clean
198+
// ack-only channel (the Python `Driver` reads acks from stdout and discards
199+
// stderr) while console logs stay visible when the binary is run directly.
200+
// VZ rejects a null read handle, so give it the (unwritten) read end of a
201+
// fresh pipe.
196202
let pipe = NSPipe::pipe();
197203
let read_handle = pipe.fileHandleForReading();
198-
let stdout_handle = NSFileHandle::fileHandleWithStandardOutput();
204+
let stderr_handle = NSFileHandle::fileHandleWithStandardError();
199205
let serial_attachment = unsafe {
200206
VZFileHandleSerialPortAttachment::initWithFileHandleForReading_fileHandleForWriting(
201207
VZFileHandleSerialPortAttachment::alloc(),
202208
Some(&read_handle),
203-
Some(&stdout_handle),
209+
Some(&stderr_handle),
204210
)
205211
};
206212
let serial = unsafe { VZVirtioConsoleDeviceSerialPortConfiguration::new() };

0 commit comments

Comments
 (0)