@@ -88,6 +88,11 @@ lazy_static::lazy_static! {
8888 static ref SERVER_IN_FLIGHT : Arc <RwLock <HashMap <String , InFlightHandle >>> =
8989 Arc :: new( RwLock :: new( HashMap :: new( ) ) ) ;
9090
91+ // Reference counts for shared MCP server processes.
92+ // Each call to ensure_server_running() increments the count; release_server() decrements it.
93+ // cleanup_server_process() only kills the OS process when the count reaches zero,
94+ // preventing one session from tearing down a server that another session is still using.
95+ // stop_all_servers() bypasses ref counts — it is only called on process shutdown.
9196 // Reference counts for shared MCP server processes.
9297 // Each call to ensure_server_running() increments the count; release_server() decrements it.
9398 // cleanup_server_process() only kills the OS process when the count reaches zero,
@@ -96,6 +101,13 @@ lazy_static::lazy_static! {
96101 static ref SERVER_REF_COUNTS : Arc <RwLock <HashMap <String , usize >>> =
97102 Arc :: new( RwLock :: new( HashMap :: new( ) ) ) ;
98103
104+ // Process group IDs for SIGKILL fallback when ServerProcess mutex is locked.
105+ // Stored OUTSIDE ServerProcess so we can kill processes even when busy.
106+ // Unix-only: used to send SIGKILL to -pgid when try_lock() fails.
107+ #[ cfg( unix) ]
108+ static ref SERVER_PGIDS : Arc <RwLock <HashMap <String , libc:: pid_t>>> =
109+ Arc :: new( RwLock :: new( HashMap :: new( ) ) ) ;
110+
99111 /// Recent stderr lines per server — background reader threads push lines here
100112 /// so that initialization/runtime errors can be surfaced to the user.
101113 static ref SERVER_STDERR : Arc <RwLock <HashMap <String , StderrBuffer >>> =
@@ -699,6 +711,13 @@ async fn start_server_process(server: &McpServerConfig) -> Result<String> {
699711 anyhow:: anyhow!( "Failed to start MCP server '{}': {}" , server. name( ) , e)
700712 } ) ?;
701713
714+ // Store PGID for SIGKILL fallback when mutex is locked
715+ #[ cfg( unix) ]
716+ {
717+ let mut pgids = SERVER_PGIDS . write ( ) . unwrap ( ) ;
718+ pgids. insert ( server. name ( ) . to_string ( ) , child. id ( ) as libc:: pid_t ) ;
719+ }
720+
702721 // Add to the registry
703722 {
704723 let mut processes = SERVER_PROCESSES . write ( ) . unwrap ( ) ;
@@ -756,6 +775,13 @@ async fn start_server_process(server: &McpServerConfig) -> Result<String> {
756775 anyhow:: anyhow!( "Failed to start MCP server '{}': {}" , server. name( ) , e)
757776 } ) ?;
758777
778+ // Store PGID for SIGKILL fallback when mutex is locked
779+ #[ cfg( unix) ]
780+ {
781+ let mut pgids = SERVER_PGIDS . write ( ) . unwrap ( ) ;
782+ pgids. insert ( server. name ( ) . to_string ( ) , child. id ( ) as libc:: pid_t ) ;
783+ }
784+
759785 // Get the stdin/stdout handles
760786 let child_stdin = child. stdin . take ( ) . ok_or_else ( || {
761787 anyhow:: anyhow!( "Failed to open stdin for MCP server: {}" , server. name( ) )
@@ -1603,15 +1629,44 @@ pub fn stop_all_servers() -> Result<()> {
16031629 }
16041630 }
16051631 Err ( _) => {
1606- crate :: log_debug!( "Could not acquire lock for server '{}', may be busy" , name) ;
1607- // For busy processes, we'll just remove them from registry
1608- // The process cleanup will happen when the lock is released
1632+ crate :: log_debug!(
1633+ "Could not acquire lock for server '{}', using PGID for SIGKILL" ,
1634+ name
1635+ ) ;
1636+ // For busy processes, use PGID to send SIGKILL directly
1637+ #[ cfg( unix) ]
1638+ {
1639+ let pgids = SERVER_PGIDS . read ( ) . unwrap ( ) ;
1640+ if let Some ( pgid) = pgids. get ( name) {
1641+ crate :: log_debug!(
1642+ "Sending SIGKILL to process group {} for server '{}'" ,
1643+ pgid,
1644+ name
1645+ ) ;
1646+ unsafe {
1647+ libc:: kill ( -* pgid, libc:: SIGKILL ) ;
1648+ }
1649+ } else {
1650+ crate :: log_debug!( "No PGID found for server '{}', process may leak" , name) ;
1651+ }
1652+ }
1653+ #[ cfg( not( unix) ) ]
1654+ {
1655+ crate :: log_debug!( "No PGID fallback on non-Unix for server '{}'" , name) ;
1656+ }
16091657 }
16101658 }
16111659 }
16121660
16131661 processes. clear ( ) ;
16141662
1663+ // Clear PGIDs (Unix only)
1664+ #[ cfg( unix) ]
1665+ {
1666+ let mut pgids = SERVER_PGIDS . write ( ) . unwrap ( ) ;
1667+ pgids. clear ( ) ;
1668+ }
1669+
16151670 // Clear all in-flight handles
16161671 {
16171672 let mut in_flight_map = SERVER_IN_FLIGHT . write ( ) . unwrap ( ) ;
@@ -1682,15 +1737,49 @@ pub fn cleanup_server_process(server_name: &str) -> Result<()> {
16821737 }
16831738 Err ( _) => {
16841739 crate :: log_debug!(
1685- "Could not acquire lock for server '{}' during cleanup" ,
1740+ "Could not acquire lock for server '{}' during cleanup, using PGID for SIGKILL " ,
16861741 server_name
16871742 ) ;
1743+ // For busy processes, use PGID to send SIGKILL directly
1744+ #[ cfg( unix) ]
1745+ {
1746+ let pgids = SERVER_PGIDS . read ( ) . unwrap ( ) ;
1747+ if let Some ( pgid) = pgids. get ( server_name) {
1748+ crate :: log_debug!(
1749+ "Sending SIGKILL to process group {} for server '{}' during cleanup" ,
1750+ pgid,
1751+ server_name
1752+ ) ;
1753+ unsafe {
1754+ libc:: kill ( -* pgid, libc:: SIGKILL ) ;
1755+ }
1756+ } else {
1757+ crate :: log_debug!(
1758+ "No PGID found for server '{}' during cleanup, process may leak" ,
1759+ server_name
1760+ ) ;
1761+ }
1762+ }
1763+ #[ cfg( not( unix) ) ]
1764+ {
1765+ crate :: log_debug!(
1766+ "No PGID fallback on non-Unix for server '{}' during cleanup" ,
1767+ server_name
1768+ ) ;
1769+ }
16881770 }
16891771 }
16901772
16911773 // Clear function cache for this server
16921774 crate :: mcp:: server:: clear_function_cache_for_server ( server_name) ;
16931775
1776+ // Clean up PGID (Unix only)
1777+ #[ cfg( unix) ]
1778+ {
1779+ let mut pgids = SERVER_PGIDS . write ( ) . unwrap ( ) ;
1780+ pgids. remove ( server_name) ;
1781+ }
1782+
16941783 // Clean up in-flight handle slot
16951784 {
16961785 let mut in_flight_map = SERVER_IN_FLIGHT . write ( ) . unwrap ( ) ;
0 commit comments