@@ -1120,6 +1120,35 @@ static void service_notify_stop(svc_t *svc)
11201120 }
11211121}
11221122
1123+ /*
1124+ * Drop a daemon-owned (pid:!) pidfile if it still names the just-reaped
1125+ * PID and that PID is gone. The liveness check guards against reuse.
1126+ */
1127+ static void service_clean_pidfile (svc_t * svc , pid_t reaped )
1128+ {
1129+ pid_t pid ;
1130+ char * fn ;
1131+
1132+ if (reaped <= 1 )
1133+ return ;
1134+
1135+ fn = pid_file (svc );
1136+ if (!fn )
1137+ return ;
1138+
1139+ pid = pid_file_read (fn );
1140+ if (pid != reaped || pid_alive (pid ))
1141+ return ;
1142+
1143+ if (remove (fn ) && errno != ENOENT ) {
1144+ logit (LOG_CRIT , "Failed removing stale service %s pidfile %s" ,
1145+ svc_ident (svc , NULL , 0 ), fn );
1146+ return ;
1147+ }
1148+
1149+ dbg ("Removed stale service %s pidfile %s" , svc_ident (svc , NULL , 0 ), fn );
1150+ }
1151+
11231152/*
11241153 * Clean up any lingering state from dead/killed services
11251154 */
@@ -1137,6 +1166,8 @@ static void service_cleanup(svc_t *svc)
11371166 if (remove (fn ) && errno != ENOENT )
11381167 logit (LOG_CRIT , "Failed removing service %s pidfile %s" ,
11391168 svc_ident (svc , NULL , 0 ), fn );
1169+ } else if (svc -> pidfile [0 ] == '!' ) {
1170+ service_clean_pidfile (svc , svc -> pid );
11401171 }
11411172
11421173 /*
@@ -2405,7 +2436,10 @@ void service_monitor(pid_t lost, int status)
24052436 if (svc_is_forking (svc )) {
24062437 /* Likely start script exiting */
24072438 if (svc_is_starting (svc )) {
2408- svc -> pid = 0 ; /* Expect no more activity from this one */
2439+ /* Daemon died before clearing 'starting'; drop any stale pidfile. */
2440+ service_clean_pidfile (svc , lost );
2441+ svc -> oldpid = lost ; /* So service_retry() logs the real PID */
2442+ svc -> pid = 0 ; /* Expect no more activity from this one */
24092443 goto cont ;
24102444 }
24112445
@@ -2794,13 +2828,18 @@ static void service_retry(svc_t *svc)
27942828 timeout = ((* restart_cnt ) <= (svc -> restart_max / 2 )) ? 2000 : 5000 ;
27952829 /* If a longer timeout was specified in the conf, use that instead. */
27962830 svc -> restart_tmo = max (svc -> restart_tmo , timeout );
2797- logit (LOG_CONSOLE |LOG_WARNING , "Service %s[%d] died (%s%d), restarting (retry in %d msec) (attempt: %d/%d)" ,
2798- svc_ident (svc , NULL , 0 ), svc -> oldpid ,
2799- WIFEXITED (svc -> status ) ? "with exit status: " : "by signal: " ,
2800- WIFEXITED (svc -> status ) ? WEXITSTATUS (svc -> status ) : WTERMSIG (svc -> status ),
2801- svc -> restart_tmo ,
2802- * restart_cnt ,
2803- svc -> restart_max );
2831+ if (WIFEXITED (svc -> status ))
2832+ logit (LOG_CONSOLE |LOG_WARNING ,
2833+ "Service %s[%d] died (exit status: %d), restarting (retry in %d msec) (attempt: %d/%d)" ,
2834+ svc_ident (svc , NULL , 0 ), svc -> oldpid , WEXITSTATUS (svc -> status ),
2835+ svc -> restart_tmo , * restart_cnt , svc -> restart_max );
2836+ else
2837+ logit (LOG_CONSOLE |LOG_WARNING ,
2838+ "Service %s[%d] died (killed by %s%s), restarting (retry in %d msec) (attempt: %d/%d)" ,
2839+ svc_ident (svc , NULL , 0 ), svc -> oldpid ,
2840+ sig_name (WTERMSIG (svc -> status )),
2841+ WCOREDUMP (svc -> status ) ? ", core dumped" : "" ,
2842+ svc -> restart_tmo , * restart_cnt , svc -> restart_max );
28042843
28052844 svc_unblock (svc );
28062845 service_step (svc );
0 commit comments