Skip to content

Commit e36083a

Browse files
committed
fix(mcp): use PGID to kill busy server processes
- Track process group IDs for all MCP servers on Unix - Send SIGKILL to PGID if process mutex is unreachable - Prevent zombie processes during cleanup and shutdown - Remove PGID entries when servers are terminated
1 parent e747c4d commit e36083a

1 file changed

Lines changed: 93 additions & 4 deletions

File tree

src/mcp/process.rs

Lines changed: 93 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ lazy_static::lazy_static! {
8888
static ref SERVER_IN_FLIGHT: Arc<RwLock<HashMap<String, InFlightHandle>>> =
8989
Arc::new(RwLock::new(HashMap::new()));
9090

91+
// Reference counts for shared MCP server processes.
92+
// Each call to ensure_server_running() increments the count; release_server() decrements it.
93+
// cleanup_server_process() only kills the OS process when the count reaches zero,
94+
// preventing one session from tearing down a server that another session is still using.
95+
// stop_all_servers() bypasses ref counts — it is only called on process shutdown.
9196
// Reference counts for shared MCP server processes.
9297
// Each call to ensure_server_running() increments the count; release_server() decrements it.
9398
// cleanup_server_process() only kills the OS process when the count reaches zero,
@@ -96,6 +101,13 @@ lazy_static::lazy_static! {
96101
static ref SERVER_REF_COUNTS: Arc<RwLock<HashMap<String, usize>>> =
97102
Arc::new(RwLock::new(HashMap::new()));
98103

104+
// Process group IDs for SIGKILL fallback when ServerProcess mutex is locked.
105+
// Stored OUTSIDE ServerProcess so we can kill processes even when busy.
106+
// Unix-only: used to send SIGKILL to -pgid when try_lock() fails.
107+
#[cfg(unix)]
108+
static ref SERVER_PGIDS: Arc<RwLock<HashMap<String, libc::pid_t>>> =
109+
Arc::new(RwLock::new(HashMap::new()));
110+
99111
/// Recent stderr lines per server — background reader threads push lines here
100112
/// so that initialization/runtime errors can be surfaced to the user.
101113
static ref SERVER_STDERR: Arc<RwLock<HashMap<String, StderrBuffer>>> =
@@ -699,6 +711,13 @@ async fn start_server_process(server: &McpServerConfig) -> Result<String> {
699711
anyhow::anyhow!("Failed to start MCP server '{}': {}", server.name(), e)
700712
})?;
701713

714+
// Store PGID for SIGKILL fallback when mutex is locked
715+
#[cfg(unix)]
716+
{
717+
let mut pgids = SERVER_PGIDS.write().unwrap();
718+
pgids.insert(server.name().to_string(), child.id() as libc::pid_t);
719+
}
720+
702721
// Add to the registry
703722
{
704723
let mut processes = SERVER_PROCESSES.write().unwrap();
@@ -756,6 +775,13 @@ async fn start_server_process(server: &McpServerConfig) -> Result<String> {
756775
anyhow::anyhow!("Failed to start MCP server '{}': {}", server.name(), e)
757776
})?;
758777

778+
// Store PGID for SIGKILL fallback when mutex is locked
779+
#[cfg(unix)]
780+
{
781+
let mut pgids = SERVER_PGIDS.write().unwrap();
782+
pgids.insert(server.name().to_string(), child.id() as libc::pid_t);
783+
}
784+
759785
// Get the stdin/stdout handles
760786
let child_stdin = child.stdin.take().ok_or_else(|| {
761787
anyhow::anyhow!("Failed to open stdin for MCP server: {}", server.name())
@@ -1603,15 +1629,44 @@ pub fn stop_all_servers() -> Result<()> {
16031629
}
16041630
}
16051631
Err(_) => {
1606-
crate::log_debug!("Could not acquire lock for server '{}', may be busy", name);
1607-
// For busy processes, we'll just remove them from registry
1608-
// The process cleanup will happen when the lock is released
1632+
crate::log_debug!(
1633+
"Could not acquire lock for server '{}', using PGID for SIGKILL",
1634+
name
1635+
);
1636+
// For busy processes, use PGID to send SIGKILL directly
1637+
#[cfg(unix)]
1638+
{
1639+
let pgids = SERVER_PGIDS.read().unwrap();
1640+
if let Some(pgid) = pgids.get(name) {
1641+
crate::log_debug!(
1642+
"Sending SIGKILL to process group {} for server '{}'",
1643+
pgid,
1644+
name
1645+
);
1646+
unsafe {
1647+
libc::kill(-*pgid, libc::SIGKILL);
1648+
}
1649+
} else {
1650+
crate::log_debug!("No PGID found for server '{}', process may leak", name);
1651+
}
1652+
}
1653+
#[cfg(not(unix))]
1654+
{
1655+
crate::log_debug!("No PGID fallback on non-Unix for server '{}'", name);
1656+
}
16091657
}
16101658
}
16111659
}
16121660

16131661
processes.clear();
16141662

1663+
// Clear PGIDs (Unix only)
1664+
#[cfg(unix)]
1665+
{
1666+
let mut pgids = SERVER_PGIDS.write().unwrap();
1667+
pgids.clear();
1668+
}
1669+
16151670
// Clear all in-flight handles
16161671
{
16171672
let mut in_flight_map = SERVER_IN_FLIGHT.write().unwrap();
@@ -1682,15 +1737,49 @@ pub fn cleanup_server_process(server_name: &str) -> Result<()> {
16821737
}
16831738
Err(_) => {
16841739
crate::log_debug!(
1685-
"Could not acquire lock for server '{}' during cleanup",
1740+
"Could not acquire lock for server '{}' during cleanup, using PGID for SIGKILL",
16861741
server_name
16871742
);
1743+
// For busy processes, use PGID to send SIGKILL directly
1744+
#[cfg(unix)]
1745+
{
1746+
let pgids = SERVER_PGIDS.read().unwrap();
1747+
if let Some(pgid) = pgids.get(server_name) {
1748+
crate::log_debug!(
1749+
"Sending SIGKILL to process group {} for server '{}' during cleanup",
1750+
pgid,
1751+
server_name
1752+
);
1753+
unsafe {
1754+
libc::kill(-*pgid, libc::SIGKILL);
1755+
}
1756+
} else {
1757+
crate::log_debug!(
1758+
"No PGID found for server '{}' during cleanup, process may leak",
1759+
server_name
1760+
);
1761+
}
1762+
}
1763+
#[cfg(not(unix))]
1764+
{
1765+
crate::log_debug!(
1766+
"No PGID fallback on non-Unix for server '{}' during cleanup",
1767+
server_name
1768+
);
1769+
}
16881770
}
16891771
}
16901772

16911773
// Clear function cache for this server
16921774
crate::mcp::server::clear_function_cache_for_server(server_name);
16931775

1776+
// Clean up PGID (Unix only)
1777+
#[cfg(unix)]
1778+
{
1779+
let mut pgids = SERVER_PGIDS.write().unwrap();
1780+
pgids.remove(server_name);
1781+
}
1782+
16941783
// Clean up in-flight handle slot
16951784
{
16961785
let mut in_flight_map = SERVER_IN_FLIGHT.write().unwrap();

0 commit comments

Comments
 (0)