Skip to content

Commit a3eac3a

Browse files
authored
Merge pull request #489 from finit-project/stale-pidfile
Handle stale pid file for crashing daemons
2 parents e74dff9 + 7d09e34 commit a3eac3a

8 files changed

Lines changed: 141 additions & 13 deletions

File tree

doc/ChangeLog.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,31 @@ Change Log
33

44
All relevant changes are documented in this file.
55

6+
[Unreleased]
7+
------------
8+
9+
### Changes
10+
11+
- Restart log now spells out the signal name and flags core dumps,
12+
e.g. `killed by SIGKILL` or `killed by SIGSEGV, core dumped`, in
13+
place of the bare numeric `by signal: N`. Gives operators a much
14+
stronger breadcrumb when a daemon dies unexpectedly
15+
16+
### Fixes
17+
18+
- Remove stale daemon-owned (`pid:!`) pidfiles after unclean exits.
19+
When a daemon dies via SIGKILL/OOM/segfault, or exits early during
20+
startup, its pidfile lingers and can prevent the next instance from
21+
starting -- `dbus-daemon` for example refuses to start when its
22+
pidfile already exists, so Finit's restart loop gives up. Finit now
23+
drops the file when it still names the just-reaped PID and that PID
24+
is no longer alive (the liveness check guards against PID reuse).
25+
This is a controlled exception to the long-standing rule that Finit
26+
does not touch service-owned pidfiles
27+
- Fix misspelled `SIGUNKOWN` returned by `sig_name()` for unknown
28+
signal numbers, now spelled correctly as `SIGUNKNOWN`. Surfaced by
29+
the new restart log above
30+
631
[4.17][] - 2026-04-28
732
---------------------
833

doc/config/services.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,14 @@ basename of the binary to guess the PID file to watch for the PID:
3131
`/var/run/ntpd.pid`. If Finit guesses wrong, you have to submit the
3232
full `pid:!/path/to/file.pid`.
3333

34+
With `pid:!/path`, the file belongs to the service: Finit reads it
35+
but does not create or remove it. The one exception is *stale*
36+
cleanup — if the service dies without removing its own pidfile
37+
(SIGKILL, OOM, segfault), and the file still names the just-reaped
38+
PID, Finit removes it before the next retry. This prevents daemons
39+
that refuse to start on an existing pidfile (e.g. `dbus-daemon`)
40+
from getting stuck in a crash-restart loop.
41+
3442
**Example:**
3543

3644
In the case of `ospfd` (below), we omit the `-d` flag (daemonize) to

src/pid.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,10 @@ int pid_file_set(svc_t *svc, char *file, int not)
201201
* pid:!foo --> !/run/foo.pid
202202
* pid:!/run/foo.pid --> !/run/foo.pid
203203
*
204-
* Note, nothing is created or removed by Finit in this latter form.
204+
* Nothing is created or removed by Finit in this latter form, with one
205+
* exception: a verifiably stale pidfile (still names the just-reaped
206+
* PID, and that PID is no longer alive) is removed so the next instance
207+
* can start. See service_clean_pidfile() in src/service.c.
205208
*/
206209
int pid_file_parse(svc_t *svc, char *arg)
207210
{

src/service.c

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,35 @@ static void service_notify_stop(svc_t *svc)
11201120
}
11211121
}
11221122

1123+
/*
1124+
* Drop a daemon-owned (pid:!) pidfile if it still names the just-reaped
1125+
* PID and that PID is gone. The liveness check guards against reuse.
1126+
*/
1127+
static void service_clean_pidfile(svc_t *svc, pid_t reaped)
1128+
{
1129+
pid_t pid;
1130+
char *fn;
1131+
1132+
if (reaped <= 1)
1133+
return;
1134+
1135+
fn = pid_file(svc);
1136+
if (!fn)
1137+
return;
1138+
1139+
pid = pid_file_read(fn);
1140+
if (pid != reaped || pid_alive(pid))
1141+
return;
1142+
1143+
if (remove(fn) && errno != ENOENT) {
1144+
logit(LOG_CRIT, "Failed removing stale service %s pidfile %s",
1145+
svc_ident(svc, NULL, 0), fn);
1146+
return;
1147+
}
1148+
1149+
dbg("Removed stale service %s pidfile %s", svc_ident(svc, NULL, 0), fn);
1150+
}
1151+
11231152
/*
11241153
* Clean up any lingering state from dead/killed services
11251154
*/
@@ -1137,6 +1166,8 @@ static void service_cleanup(svc_t *svc)
11371166
if (remove(fn) && errno != ENOENT)
11381167
logit(LOG_CRIT, "Failed removing service %s pidfile %s",
11391168
svc_ident(svc, NULL, 0), fn);
1169+
} else if (svc->pidfile[0] == '!') {
1170+
service_clean_pidfile(svc, svc->pid);
11401171
}
11411172

11421173
/*
@@ -2405,7 +2436,10 @@ void service_monitor(pid_t lost, int status)
24052436
if (svc_is_forking(svc)) {
24062437
/* Likely start script exiting */
24072438
if (svc_is_starting(svc)) {
2408-
svc->pid = 0; /* Expect no more activity from this one */
2439+
/* Daemon died before clearing 'starting'; drop any stale pidfile. */
2440+
service_clean_pidfile(svc, lost);
2441+
svc->oldpid = lost; /* So service_retry() logs the real PID */
2442+
svc->pid = 0; /* Expect no more activity from this one */
24092443
goto cont;
24102444
}
24112445

@@ -2794,13 +2828,18 @@ static void service_retry(svc_t *svc)
27942828
timeout = ((*restart_cnt) <= (svc->restart_max / 2)) ? 2000 : 5000;
27952829
/* If a longer timeout was specified in the conf, use that instead. */
27962830
svc->restart_tmo = max(svc->restart_tmo, timeout);
2797-
logit(LOG_CONSOLE|LOG_WARNING, "Service %s[%d] died (%s%d), restarting (retry in %d msec) (attempt: %d/%d)",
2798-
svc_ident(svc, NULL, 0), svc->oldpid,
2799-
WIFEXITED(svc->status) ? "with exit status: " : "by signal: ",
2800-
WIFEXITED(svc->status) ? WEXITSTATUS(svc->status) : WTERMSIG(svc->status),
2801-
svc->restart_tmo,
2802-
*restart_cnt,
2803-
svc->restart_max);
2831+
if (WIFEXITED(svc->status))
2832+
logit(LOG_CONSOLE|LOG_WARNING,
2833+
"Service %s[%d] died (exit status: %d), restarting (retry in %d msec) (attempt: %d/%d)",
2834+
svc_ident(svc, NULL, 0), svc->oldpid, WEXITSTATUS(svc->status),
2835+
svc->restart_tmo, *restart_cnt, svc->restart_max);
2836+
else
2837+
logit(LOG_CONSOLE|LOG_WARNING,
2838+
"Service %s[%d] died (killed by %s%s), restarting (retry in %d msec) (attempt: %d/%d)",
2839+
svc_ident(svc, NULL, 0), svc->oldpid,
2840+
sig_name(WTERMSIG(svc->status)),
2841+
WCOREDUMP(svc->status) ? ", core dumped" : "",
2842+
svc->restart_tmo, *restart_cnt, svc->restart_max);
28042843

28052844
svc_unblock(svc);
28062845
service_step(svc);

src/sig.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,7 @@ const char *sig_name(int signo)
630630
return signames[i].name;
631631
}
632632

633-
return "SIGUNKOWN";
633+
return "SIGUNKNOWN";
634634
}
635635

636636
/*

test/Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ EXTRA_DIST += global-envs.sh
4141
EXTRA_DIST += initctl-status-subset.sh
4242
EXTRA_DIST += notify.sh
4343
EXTRA_DIST += pidfile.sh
44+
EXTRA_DIST += stale-pidfile.sh
4445
EXTRA_DIST += pre-post-serv.sh
4546
EXTRA_DIST += pre-fail.sh
4647
EXTRA_DIST += process-depends.sh
@@ -84,6 +85,7 @@ TESTS += global-envs.sh
8485
TESTS += initctl-status-subset.sh
8586
TESTS += notify.sh
8687
TESTS += pidfile.sh
88+
TESTS += stale-pidfile.sh
8789
TESTS += pre-post-serv.sh
8890
TESTS += pre-fail.sh
8991
TESTS += process-depends.sh

test/src/serv.c

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ volatile sig_atomic_t reloading = 1;
3434
volatile sig_atomic_t running = 1;
3535
static char *ident = PROGNM;
3636
static char fn[80];
37+
static int exclusive = 0; /* -x: refuse to start if pidfile exists (dbus-style) */
3738

3839
static void verify_env(char *arg)
3940
{
@@ -122,7 +123,7 @@ static void writefn(char *fn, int val)
122123

123124
static int checkfn(char *fn)
124125
{
125-
return !access(fn, R_OK);
126+
return !access(fn, F_OK);
126127
}
127128

128129
static void mine(char *fn)
@@ -133,12 +134,18 @@ static void mine(char *fn)
133134

134135
static void pidfile(char *pidfn)
135136
{
137+
static int once = 0;
138+
136139
if (!pidfn) {
137140
if (fn[0] == 0)
138141
snprintf(fn, sizeof(fn), "%s%s.pid", _PATH_VARRUN, ident);
139142
pidfn = fn;
140143
}
141144

145+
if (exclusive && !once && checkfn(pidfn))
146+
errx(EX_SOFTWARE, "PID file %s exists, refusing to start", pidfn);
147+
once = 1;
148+
142149
if (!checkfn(pidfn)) {
143150
pid_t pid;
144151

@@ -148,7 +155,7 @@ static void pidfile(char *pidfn)
148155
atexit(cleanup);
149156
} else {
150157
inf("Touching PID file %s", pidfn);
151-
utimensat(0, fn, NULL, 0);
158+
utimensat(AT_FDCWD, pidfn, NULL, 0);
152159
}
153160
}
154161

@@ -182,6 +189,7 @@ static int usage(int rc)
182189
" -p Create PID file despite running in foreground\n"
183190
" -P FILE Create PID file using FILE\n"
184191
" -r SVC Call initctl to restart service SVC (self)\n"
192+
" -x Refuse to start if PID file already exists (dbus-style)\n"
185193
"\n"
186194
"By default this program daemonizes itself to the background, and,\n"
187195
"when it's done setting up its signal handler(s), creates a PID file\n"
@@ -212,7 +220,7 @@ int main(int argc, char *argv[])
212220
char cmd[80];
213221
int c;
214222

215-
while ((c = getopt(argc, argv, "cCe:E:f:F:hi:nN:pP:r:")) != EOF) {
223+
while ((c = getopt(argc, argv, "cCe:E:f:F:hi:nN:pP:r:x")) != EOF) {
216224
switch (c) {
217225
case 'c':
218226
do_crash = 1;
@@ -249,12 +257,17 @@ int main(int argc, char *argv[])
249257
break;
250258
case 'P':
251259
pidfn = optarg;
260+
if ((size_t)snprintf(fn, sizeof(fn), "%s", optarg) >= sizeof(fn))
261+
errx(EX_USAGE, "-P path too long (max %zu)", sizeof(fn) - 1);
252262
do_pidfile++;
253263
break;
254264
case 'r':
255265
snprintf(cmd, sizeof(cmd), "initctl restart %s", optarg);
256266
do_restart = 1;
257267
break;
268+
case 'x':
269+
exclusive = 1;
270+
break;
258271
default:
259272
return usage(1);
260273
}

test/stale-pidfile.sh

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/bin/sh
2+
# Regression: Finit must remove a daemon-owned (pid:!) stale pidfile
3+
# left behind by an unclean exit (SIGKILL), so the next instance can
4+
# start. Simulates the dbus-daemon pattern: 'serv -x' refuses to
5+
# start if its pidfile already exists.
6+
7+
set -eu
8+
9+
TEST_DIR=$(dirname "$0")
10+
PIDFN="/run/serv.pid"
11+
12+
test_teardown()
13+
{
14+
say "Running test teardown."
15+
run "rm -f $FINIT_CONF $PIDFN"
16+
}
17+
18+
# shellcheck source=/dev/null
19+
. "$TEST_DIR/lib/setup.sh"
20+
21+
say "Add service stanza '$FINIT_CONF' with pid:!$PIDFN"
22+
run "echo 'service pid:!$PIDFN serv -np -P $PIDFN -x' > $FINIT_CONF"
23+
run "initctl reload"
24+
25+
retry "assert_num_children 1 serv"
26+
assert_is_pidfile "serv" "$PIDFN"
27+
28+
PID=$(texec cat "$PIDFN")
29+
say "SIGKILL serv ($PID) -- leaves stale pidfile"
30+
run "kill -9 $PID"
31+
32+
# Without the fix, 'serv -x' refuses to start on each retry until
33+
# Finit hits restart_max and marks the service crashed. With the fix
34+
# Finit removes the stale pidfile and the next instance comes up.
35+
retry "assert_pidiff serv $PID"
36+
retry "assert_num_children 1 serv"
37+
38+
sep

0 commit comments

Comments
 (0)