Skip to content

Commit 4583b5d

Browse files
committed
fix(cli): validate background forward cleanup pid
Signed-off-by: Shiju <shiju@nvidia.com>
1 parent 301909f commit 4583b5d

2 files changed

Lines changed: 194 additions & 32 deletions

File tree

crates/openshell-cli/src/ssh.rs

Lines changed: 65 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ use nix::sys::signal::{SaFlags, SigAction, SigHandler, SigSet, Signal, sigaction
1010
#[cfg(unix)]
1111
use nix::unistd::Pid;
1212
use openshell_core::ObjectId;
13+
#[cfg(unix)]
14+
use openshell_core::forward::pid_matches_forward;
1315
use openshell_core::forward::{
1416
ForwardSpec, build_proxy_command, find_ssh_forward_pid, format_gateway_url,
1517
resolve_ssh_gateway, shell_escape, validate_ssh_session_response, write_forward_pid,
@@ -31,11 +33,15 @@ use tokio::net::TcpStream;
3133
use tokio::process::{Child, Command as TokioCommand};
3234
use tokio_stream::wrappers::ReceiverStream;
3335

34-
/// Time budget for a forward to become healthy after `ssh` starts: covers both
35-
/// backgrounded-PID discovery and listener readiness, in foreground and
36-
/// background.
37-
const FORWARD_STARTUP_GRACE_PERIOD: Duration = Duration::from_secs(2);
38-
/// Delay between listener/PID probes within the grace period.
36+
/// Time budget for finding the OpenSSH background process after `ssh -f`
37+
/// returns. PID discovery is separate from listener readiness so missing
38+
/// process tracking still fails quickly.
39+
const FORWARD_PID_DISCOVERY_TIMEOUT: Duration = Duration::from_secs(2);
40+
/// Time budget for the local listener to become reachable after `ssh` starts.
41+
/// This is a user-visible readiness deadline for both foreground and background
42+
/// forwards, not a soft cleanup grace period.
43+
const FORWARD_LISTENER_READINESS_TIMEOUT: Duration = Duration::from_secs(10);
44+
/// Delay between listener/PID probes within the configured timeout.
3945
const FORWARD_LISTENER_PROBE_INTERVAL: Duration = Duration::from_millis(50);
4046
/// Per-attempt connect timeout, so one hung probe cannot consume the whole
4147
/// grace period.
@@ -378,11 +384,11 @@ pub async fn sandbox_forward(
378384
)
379385
})?;
380386

381-
if let Err(err) = wait_for_forward_listener(spec, FORWARD_STARTUP_GRACE_PERIOD)
387+
if let Err(err) = wait_for_forward_listener(spec, FORWARD_LISTENER_READINESS_TIMEOUT)
382388
.await
383389
.wrap_err("ssh process started but local forward listener was not reachable")
384390
{
385-
terminate_forward_pid(pid);
391+
terminate_forward_pid(pid, port, &session.sandbox_id);
386392
return Err(err);
387393
}
388394

@@ -412,7 +418,7 @@ pub async fn sandbox_forward(
412418
/// session) means forwarding never came up, so it errors instead of waiting
413419
/// out the grace period.
414420
async fn wait_for_foreground_forward_start(child: &mut Child, spec: &ForwardSpec) -> Result<()> {
415-
let listener = wait_for_forward_listener(spec, FORWARD_STARTUP_GRACE_PERIOD);
421+
let listener = wait_for_forward_listener(spec, FORWARD_LISTENER_READINESS_TIMEOUT);
416422
tokio::pin!(listener);
417423
tokio::select! {
418424
result = &mut listener => result,
@@ -439,7 +445,7 @@ async fn wait_for_foreground_forward_start(child: &mut Child, spec: &ForwardSpec
439445
/// so the PID is unknown when `command.status()` returns and must be discovered
440446
/// afterward. Returns `None` if it never appears within the grace period.
441447
async fn wait_for_ssh_forward_pid(sandbox_id: &str, port: u16) -> Option<u32> {
442-
let deadline = tokio::time::Instant::now() + FORWARD_STARTUP_GRACE_PERIOD;
448+
let deadline = tokio::time::Instant::now() + FORWARD_PID_DISCOVERY_TIMEOUT;
443449
loop {
444450
if let Some(pid) = find_ssh_forward_pid(sandbox_id, port) {
445451
return Some(pid);
@@ -513,20 +519,26 @@ fn forward_probe_host(spec: &ForwardSpec) -> &str {
513519
/// are ignored: the process may already be exiting, and the caller surfaces the
514520
/// original listener error regardless.
515521
#[cfg(unix)]
516-
fn terminate_forward_pid(pid: u32) {
522+
fn terminate_forward_pid(pid: u32, port: u16, sandbox_id: &str) {
517523
let Ok(raw_pid) = i32::try_from(pid) else {
518524
return;
519525
};
520526
if raw_pid <= 0 {
521527
return;
522528
}
529+
if !pid_matches_forward(pid, port, Some(sandbox_id)) {
530+
// The PID came from a process-table scan, not a file we own. Re-check
531+
// immediately before signaling so a stale or spoofed match is left
532+
// untouched instead of terminating an unrelated process.
533+
return;
534+
}
523535

524536
let _ = nix::sys::signal::kill(Pid::from_raw(raw_pid), Signal::SIGTERM);
525537
}
526538

527539
/// Non-Unix builds cannot manage OpenSSH process IDs with Unix signals.
528540
#[cfg(not(unix))]
529-
fn terminate_forward_pid(_pid: u32) {}
541+
fn terminate_forward_pid(_pid: u32, _port: u16, _sandbox_id: &str) {}
530542

531543
fn foreground_forward_started_message(name: &str, spec: &ForwardSpec) -> String {
532544
format!(
@@ -1793,6 +1805,48 @@ mod tests {
17931805
assert!(text.contains("local forward listener did not open"));
17941806
}
17951807

1808+
#[cfg(unix)]
1809+
#[test]
1810+
fn terminate_forward_pid_skips_process_that_no_longer_matches_forward() {
1811+
let dir = tempfile::tempdir().unwrap();
1812+
let terminated_path = dir.path().join("terminated");
1813+
let mut child = Command::new("python3")
1814+
.arg("-c")
1815+
.arg(
1816+
r#"
1817+
import pathlib
1818+
import signal
1819+
import sys
1820+
import time
1821+
1822+
terminated_path = pathlib.Path(sys.argv[1])
1823+
1824+
def stop(_signum, _frame):
1825+
terminated_path.write_text("terminated")
1826+
raise SystemExit(0)
1827+
1828+
signal.signal(signal.SIGTERM, stop)
1829+
1830+
while True:
1831+
time.sleep(1)
1832+
"#,
1833+
)
1834+
.arg(&terminated_path)
1835+
.spawn()
1836+
.unwrap();
1837+
std::thread::sleep(Duration::from_millis(100));
1838+
1839+
terminate_forward_pid(child.id(), 43210, "id-spoofed-forward");
1840+
std::thread::sleep(Duration::from_millis(200));
1841+
1842+
assert!(
1843+
!terminated_path.exists(),
1844+
"mismatched process should not receive SIGTERM"
1845+
);
1846+
let _ = child.kill();
1847+
let _ = child.wait();
1848+
}
1849+
17961850
#[test]
17971851
fn split_sandbox_path_separates_parent_and_basename() {
17981852
assert_eq!(

crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs

Lines changed: 129 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -727,10 +727,27 @@ fn install_fake_pgrep_no_match(dir: &TempDir) -> std::path::PathBuf {
727727
install_executable_script(dir, "pgrep", "#!/bin/sh\nexit 1\n")
728728
}
729729

730-
async fn wait_for_file(path: &std::path::Path, timeout: Duration) -> bool {
730+
async fn wait_for_process_exit(pid: u32, timeout: Duration) -> bool {
731731
let deadline = Instant::now() + timeout;
732732
loop {
733-
if path.exists() {
733+
let output = std::process::Command::new("ps")
734+
.arg("-o")
735+
.arg("stat=")
736+
.arg("-p")
737+
.arg(pid.to_string())
738+
.stderr(std::process::Stdio::null())
739+
.output();
740+
let alive = output.is_ok_and(|output| {
741+
if !output.status.success() {
742+
return false;
743+
}
744+
let stat = String::from_utf8_lossy(&output.stdout);
745+
// Linux can leave the orphaned fake forward as a short-lived zombie
746+
// until the container's init process reaps it. A zombie has already
747+
// exited, so it satisfies this cleanup assertion.
748+
!stat.trim_start().starts_with('Z')
749+
});
750+
if !alive {
734751
return true;
735752
}
736753
if Instant::now() >= deadline {
@@ -844,39 +861,87 @@ exit 1
844861
ssh_path
845862
}
846863

847-
fn install_fake_unreachable_forwarding_ssh(dir: &TempDir) -> std::path::PathBuf {
864+
struct FakeUnreachableForward {
865+
log_path: std::path::PathBuf,
866+
pid_path: std::path::PathBuf,
867+
}
868+
869+
fn install_fake_unreachable_forwarding_ssh(dir: &TempDir) -> FakeUnreachableForward {
870+
let log_path = dir.path().join("fake-forward.log");
848871
let pid_path = dir.path().join("fake-forward.pid");
849-
let terminated_path = dir.path().join("fake-forward.terminated");
872+
let ready_path = dir.path().join("fake-forward.ready");
850873
install_executable_script(
851874
dir,
852875
"ssh",
853876
r#"#!/bin/sh
854877
set -eu
855878
856-
nohup python3 -c '
879+
forward=""
880+
sandbox_id=""
881+
previous=""
882+
883+
for arg in "$@"; do
884+
if [ "$previous" = "-L" ]; then
885+
forward="$arg"
886+
previous=""
887+
continue
888+
fi
889+
890+
if [ "$previous" = "-o" ]; then
891+
case "$arg" in
892+
ProxyCommand=*)
893+
sandbox_id="$(printf '%s\n' "$arg" | sed -n 's/.*--sandbox-id \([^ ]*\).*/\1/p')"
894+
;;
895+
esac
896+
previous=""
897+
continue
898+
fi
899+
900+
case "$arg" in
901+
-L|-o)
902+
previous="$arg"
903+
;;
904+
esac
905+
done
906+
907+
if [ -z "$forward" ] || [ -z "$sandbox_id" ]; then
908+
exit 1
909+
fi
910+
911+
trap '' HUP
912+
python3 -c '
857913
import pathlib
858914
import signal
859915
import sys
860916
import time
861917
862-
terminated_path = pathlib.Path(sys.argv[1])
863-
864-
def stop(_signum, _frame):
865-
terminated_path.write_text("terminated")
866-
raise SystemExit(0)
918+
ready_path = pathlib.Path(sys.argv[1])
867919
868-
signal.signal(signal.SIGTERM, stop)
869-
signal.signal(signal.SIGINT, stop)
870920
signal.signal(signal.SIGHUP, signal.SIG_IGN)
871921
922+
ready_path.write_text("ready")
923+
872924
while True:
873925
time.sleep(1)
874-
' '@TERMINATED_PATH@' >/dev/null 2>&1 &
875-
echo $! > '@PID_PATH@'
926+
' '@READY_PATH@' ssh ssh-proxy --sandbox-id "$sandbox_id" -L "$forward" >'@LOG_PATH@' 2>&1 &
927+
pid="$!"
928+
i=0
929+
while [ "$i" -lt 100 ]; do
930+
if [ -e '@READY_PATH@' ]; then
931+
break
932+
fi
933+
i=$((i + 1))
934+
sleep 0.05
935+
done
936+
if [ ! -e '@READY_PATH@' ]; then
937+
exit 1
938+
fi
939+
echo "$pid" > '@PID_PATH@'
876940
877941
exit 0
878942
"#
879-
.replace("@TERMINATED_PATH@", &terminated_path.display().to_string())
943+
.replace("@LOG_PATH@", &log_path.display().to_string())
944+
.replace("@READY_PATH@", &ready_path.display().to_string())
880945
.replace("@PID_PATH@", &pid_path.display().to_string()),
881946
);
882947

@@ -896,7 +961,7 @@ exit 1
896961
),
897962
);
898963

899-
terminated_path
964+
FakeUnreachableForward { log_path, pid_path }
900965
}
901966

902967
fn test_env(fake_ssh_dir: &TempDir, xdg_dir: &TempDir) -> EnvVarGuard {
@@ -1553,14 +1618,37 @@ async fn sandbox_forward_background_fails_when_pid_is_not_discoverable() {
15531618
);
15541619
}
15551620

1621+
#[tokio::test]
1622+
async fn sandbox_forward_foreground_fails_when_ssh_exits_before_listener_opens() {
1623+
let server = run_server().await;
1624+
let fake_ssh_dir = tempfile::tempdir().unwrap();
1625+
let xdg_dir = tempfile::tempdir().unwrap();
1626+
let _env = test_env(&fake_ssh_dir, &xdg_dir);
1627+
let tls = test_tls(&server);
1628+
install_fake_ssh(&fake_ssh_dir);
1629+
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
1630+
let forward_port = listener.local_addr().unwrap().port();
1631+
drop(listener);
1632+
1633+
let spec = openshell_core::forward::ForwardSpec::new(forward_port);
1634+
let err = run::sandbox_forward(&server.endpoint, "foreground-forward", &spec, false, &tls)
1635+
.await
1636+
.expect_err("foreground forward should fail when ssh exits before listener readiness");
1637+
let msg = format!("{err}");
1638+
assert!(
1639+
msg.contains("ssh exited before local forward listener opened"),
1640+
"error should explain that ssh exited before listener readiness, got: {msg}",
1641+
);
1642+
}
1643+
15561644
#[tokio::test]
15571645
async fn sandbox_forward_background_terminates_discovered_pid_when_listener_never_opens() {
15581646
let server = run_server().await;
15591647
let fake_ssh_dir = tempfile::tempdir().unwrap();
15601648
let xdg_dir = tempfile::tempdir().unwrap();
15611649
let _env = test_env(&fake_ssh_dir, &xdg_dir);
15621650
let tls = test_tls(&server);
1563-
let terminated_path = install_fake_unreachable_forwarding_ssh(&fake_ssh_dir);
1651+
let fake_forward = install_fake_unreachable_forwarding_ssh(&fake_ssh_dir);
15641652
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
15651653
let forward_port = listener.local_addr().unwrap().port();
15661654
drop(listener);
@@ -1578,10 +1666,30 @@ async fn sandbox_forward_background_terminates_discovered_pid_when_listener_neve
15781666
openshell_core::forward::read_forward_pid("unreachable-forward", forward_port).is_none(),
15791667
"unreachable background forwards must not write a PID file",
15801668
);
1581-
assert!(
1582-
wait_for_file(&terminated_path, Duration::from_secs(2)).await,
1583-
"discovered background SSH process should be terminated after listener failure",
1584-
);
1669+
let pid = fs::read_to_string(&fake_forward.pid_path)
1670+
.expect("fake forward should record a PID")
1671+
.trim()
1672+
.parse::<u32>()
1673+
.expect("fake forward PID should be numeric");
1674+
if !wait_for_process_exit(pid, Duration::from_secs(2)).await {
1675+
let log = fs::read_to_string(&fake_forward.log_path).unwrap_or_default();
1676+
let command = std::process::Command::new("ps")
1677+
.arg("-ww")
1678+
.arg("-o")
1679+
.arg("command=")
1680+
.arg("-p")
1681+
.arg(pid.to_string())
1682+
.output()
1683+
.ok()
1684+
.map(|output| String::from_utf8_lossy(&output.stdout).to_string())
1685+
.unwrap_or_default();
1686+
panic!(
1687+
"discovered background SSH process should exit after listener failure cleanup; pid={}, command={}, log={}",
1688+
pid,
1689+
command.trim(),
1690+
log.trim(),
1691+
);
1692+
}
15851693
}
15861694

15871695
#[tokio::test]

0 commit comments

Comments
 (0)