Skip to content

Commit 4a960a1

Browse files
committed
Attempt to gracefully shutdown services first, then hard kill
1 parent b44b7a1 commit 4a960a1

9 files changed

Lines changed: 273 additions & 38 deletions

File tree

BUILD.bazel

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
1+
load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "string_flag")
22

33
alias(
44
name = "gazelle",
@@ -11,6 +11,12 @@ cc_binary(
1111
visibility = ["//visibility:public"],
1212
)
1313

14+
cc_test(
15+
name = "exit0_test",
16+
srcs = [":exit0.c"],
17+
visibility = ["//visibility:public"],
18+
)
19+
1420
bool_flag(
1521
name = "allow_configuring_tmpdir",
1622
build_setting_default = False,
@@ -36,3 +42,16 @@ bool_flag(
3642
build_setting_default = False,
3743
visibility = ["//visibility:public"],
3844
)
45+
46+
# Default value for shutdown_timeout in itest_service rules.
47+
string_flag(
48+
name = "shutdown_timeout",
49+
build_setting_default = "1s",
50+
visibility = ["//visibility:public"],
51+
)
52+
53+
bool_flag(
54+
name = "error_on_forceful_shutdown",
55+
build_setting_default = False,
56+
visibility = ["//visibility:public"],
57+
)

docs/itest.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@ This can be used in conjunction with the `/v0/port` API to let other tools inter
4242
<pre>
4343
load("@rules_itest//private:itest.bzl", "itest_service")
4444

45-
itest_service(<a href="#itest_service-name">name</a>, <a href="#itest_service-deps">deps</a>, <a href="#itest_service-data">data</a>, <a href="#itest_service-autoassign_port">autoassign_port</a>, <a href="#itest_service-env">env</a>, <a href="#itest_service-exe">exe</a>, <a href="#itest_service-expected_start_duration">expected_start_duration</a>, <a href="#itest_service-health_check">health_check</a>,
46-
<a href="#itest_service-health_check_args">health_check_args</a>, <a href="#itest_service-health_check_interval">health_check_interval</a>, <a href="#itest_service-health_check_timeout">health_check_timeout</a>, <a href="#itest_service-hot_reloadable">hot_reloadable</a>,
47-
<a href="#itest_service-http_health_check_address">http_health_check_address</a>, <a href="#itest_service-named_ports">named_ports</a>, <a href="#itest_service-so_reuseport_aware">so_reuseport_aware</a>)
45+
itest_service(<a href="#itest_service-name">name</a>, <a href="#itest_service-deps">deps</a>, <a href="#itest_service-data">data</a>, <a href="#itest_service-autoassign_port">autoassign_port</a>, <a href="#itest_service-env">env</a>, <a href="#itest_service-error_on_forceful_shutdown">error_on_forceful_shutdown</a>, <a href="#itest_service-exe">exe</a>,
46+
<a href="#itest_service-expected_start_duration">expected_start_duration</a>, <a href="#itest_service-health_check">health_check</a>, <a href="#itest_service-health_check_args">health_check_args</a>, <a href="#itest_service-health_check_interval">health_check_interval</a>,
47+
<a href="#itest_service-health_check_timeout">health_check_timeout</a>, <a href="#itest_service-hot_reloadable">hot_reloadable</a>, <a href="#itest_service-http_health_check_address">http_health_check_address</a>, <a href="#itest_service-named_ports">named_ports</a>,
48+
<a href="#itest_service-shutdown_signal">shutdown_signal</a>, <a href="#itest_service-shutdown_timeout">shutdown_timeout</a>, <a href="#itest_service-so_reuseport_aware">so_reuseport_aware</a>)
4849
</pre>
4950

5051
An itest_service is a binary that is intended to run for the duration of the integration test. Examples include databases, HTTP/RPC servers, queue consumers, external service mocks, etc.
@@ -61,6 +62,7 @@ All [common binary attributes](https://bazel.build/reference/be/common-definitio
6162
| <a id="itest_service-data"></a>data | - | <a href="https://bazel.build/concepts/labels">List of labels</a> | optional | `[]` |
6263
| <a id="itest_service-autoassign_port"></a>autoassign_port | If true, the service manager will pick a free port and assign it to the service. The port will be interpolated into `$${PORT}` in the service's `http_health_check_address` and `args`. It will also be exported under the target's fully qualified label in the service-port mapping.<br><br>The assigned ports for all services are available for substitution in `http_health_check_address` and `args` (in case one service needs the address for another one.) For example, the following substitution: `args = ["-client-addr", "127.0.0.1:$${@@//label/for:service}"]`<br><br>The service-port mapping is a JSON string -> int map propagated through the `ASSIGNED_PORTS` env var. For example, a port can be retrieved with the following JS code: `JSON.parse(process.env["ASSIGNED_PORTS"])["@@//label/for:service"]`.<br><br>Alternately, the env will also contain the location of a binary that can return the port, for contexts without a readily-accessible JSON parser. For example, the following Bash command: `PORT=$($GET_ASSIGNED_PORT_BIN @@//label/for:service)` | Boolean | optional | `False` |
6364
| <a id="itest_service-env"></a>env | The service manager will merge these variables into the environment when spawning the underlying binary. | <a href="https://bazel.build/rules/lib/dict">Dictionary: String -> String</a> | optional | `{}` |
65+
| <a id="itest_service-error_on_forceful_shutdown"></a>error_on_forceful_shutdown | If set to True, the service manager will fail the service_test if the service had to be forcefully killed if the signal was not SIGKILL and after the shutdown timeout elapsed.<br><br>This needs to be False to have coverage of your services but don't want a them to be graceful at shutdown | Boolean | optional | `True` |
6466
| <a id="itest_service-exe"></a>exe | The binary target to run. | <a href="https://bazel.build/concepts/labels">Label</a> | required | |
6567
| <a id="itest_service-expected_start_duration"></a>expected_start_duration | How long the service expected to take before passing a healthcheck. Any failing health checks before this duration elapses will not be logged. | String | optional | `"0s"` |
6668
| <a id="itest_service-health_check"></a>health_check | If set, the service manager will execute this binary to check if the service came up in a healthy state. This check will be retried until it exits with a 0 exit code. When used in conjunction with autoassigned ports, use one of the methods described in `autoassign_port` to locate the service. | <a href="https://bazel.build/concepts/labels">Label</a> | optional | `None` |
@@ -70,6 +72,8 @@ All [common binary attributes](https://bazel.build/reference/be/common-definitio
7072
| <a id="itest_service-hot_reloadable"></a>hot_reloadable | If set to True, the service manager will propagate ibazel's reload notification over stdin instead of restarting the service. See the ruleset docstring for more info on using ibazel | Boolean | optional | `False` |
7173
| <a id="itest_service-http_health_check_address"></a>http_health_check_address | If set, the service manager will send an HTTP request to this address to check if the service came up in a healthy state. This check will be retried until it returns a 200 HTTP code. When used in conjunction with autoassigned ports, `$${@@//label/for:service:port_name}` can be used in the address. Example: `http_health_check_address = "http://127.0.0.1:$${@@//label/for:service:port_name}",` | String | optional | `""` |
7274
| <a id="itest_service-named_ports"></a>named_ports | For each element of the list, the service manager will pick a free port and assign it to the service. The port's fully-qualified name is the service's fully-qualified label and the port name, separated by a colon. For example, a port assigned with `named_ports = ["http_port"]` will be assigned a fully-qualified name of `@@//label/for:service:http_port`.<br><br>Named ports are accessible through the service-port mapping. For more details, see `autoassign_port`. | List of strings | optional | `[]` |
75+
| <a id="itest_service-shutdown_signal"></a>shutdown_signal | The signal to send to the service when it needs to be shut down. Valid values are: SIGTERM and SIGKILL. SIGTERM is necessary to have proper coverage of services which needs to be gracefully terminated | String | optional | `"SIGTERM"` |
76+
| <a id="itest_service-shutdown_timeout"></a>shutdown_timeout | The duration to wait after sending the shutdown signal before forcefully killing the service. The syntax is based on common time duration with a number, followed by the time unit. For example, `200ms`, `1s`, `2m`, `3h`, `4d`. | String | optional | `"1s"` |
7377
| <a id="itest_service-so_reuseport_aware"></a>so_reuseport_aware | If set, the service manager will not release the autoassigned port. The service binary must use SO_REUSEPORT when binding it. This reduces the possibility of port collisions when running many service_tests in parallel, or when code binds port 0 without being aware of the port assignment mechanism.<br><br>Must only be set when `autoassign_port` is enabled or `named_ports` are used. | Boolean | optional | `False` |
7478

7579

private/itest.bzl

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,6 @@ def _itest_binary_impl(ctx, extra_service_spec_kwargs, extra_exe_runfiles = []):
138138
for arg in ctx.attr.args
139139
]
140140

141-
142141
if version_file:
143142
extra_service_spec_kwargs["version_file"] = to_rlocation_path(ctx, version_file)
144143

@@ -198,6 +197,9 @@ def _itest_service_impl(ctx):
198197
"expected_start_duration": ctx.attr.expected_start_duration,
199198
"health_check_interval": ctx.attr.health_check_interval,
200199
"health_check_timeout": ctx.attr.health_check_timeout,
200+
"shutdown_signal": ctx.attr.shutdown_signal,
201+
"shutdown_timeout": str(ctx.attr.shutdown_timeout[BuildSettingInfo].value),
202+
"error_on_forceful_shutdown": bool(ctx.attr.error_on_forceful_shutdown[BuildSettingInfo].value),
201203
}
202204
extra_exe_runfiles = []
203205

@@ -278,6 +280,21 @@ _itest_service_attrs = _itest_binary_attrs | {
278280
This check will be retried until it returns a 200 HTTP code. When used in conjunction with autoassigned ports, `$${@@//label/for:service:port_name}` can be used in the address.
279281
Example: `http_health_check_address = "http://127.0.0.1:$${@@//label/for:service:port_name}",`""",
280282
),
283+
"shutdown_signal": attr.string(
284+
default = "SIGTERM",
285+
doc = "The signal to send to the service when it needs to be shut down. Valid values are: SIGTERM and SIGKILL. SIGTERM is necessary to have proper coverage of services which needs to be gracefully terminated",
286+
values = ["SIGTERM", "SIGKILL"],
287+
),
288+
"shutdown_timeout": attr.label(
289+
default = "//:shutdown_timeout",
290+
doc = "The duration to wait after sending the shutdown signal before forcefully killing the service. The syntax is based on common time duration with a number, followed by the time unit. For example, `200ms`, `1s`, `2m`, `3h`, `4d`.",
291+
),
292+
"error_on_forceful_shutdown": attr.label(
293+
default = "//:error_on_forceful_shutdown",
294+
doc = """If set to True, the service manager will fail the service_test if the service had to be forcefully killed if the signal was not SIGKILL and after the shutdown timeout elapsed.
295+
296+
This needs to be False to have coverage of your services but don't want a them to be graceful at shutdown""",
297+
),
281298
}
282299

283300
itest_service = rule(

runner/runner.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99
"reflect"
1010
"runtime"
1111
"sync"
12-
"syscall"
1312
"time"
1413

1514
"rules_itest/logger"
@@ -100,12 +99,11 @@ func (r *Runner) StartAll(serviceErrCh chan error) ([]topological.Task, error) {
10099

101100
func (r *Runner) StopAll() (map[string]*os.ProcessState, error) {
102101
tasks := allTasks(r.serviceInstances, func(ctx context.Context, service *ServiceInstance) error {
103-
if service.Type == "group" {
102+
if service.Type == "group" || service.Type == "task" {
104103
return nil
105104
}
106105
log.Printf("Stopping %s\n", colorize(service.VersionedServiceSpec))
107-
service.Stop(syscall.SIGKILL)
108-
return nil
106+
return service.Stop(nil)
109107
})
110108
stopper := topological.NewReversedRunner(tasks)
111109
err := stopper.Run(r.ctx)
@@ -185,10 +183,10 @@ func (r *Runner) UpdateSpecs(serviceSpecs ServiceSpecs, ibazelCmd []byte) error
185183

186184
for _, label := range updateActions.toStopLabels {
187185
serviceInstance := r.serviceInstances[label]
188-
if serviceInstance.Type == "group" {
186+
if serviceInstance.Type == "group" || serviceInstance.Type == "task" {
189187
continue
190188
}
191-
serviceInstance.Stop(syscall.SIGKILL)
189+
serviceInstance.Stop(nil)
192190
delete(r.serviceInstances, label)
193191
}
194192

runner/service_instance.go

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package runner
22

33
import (
44
"context"
5+
"errors"
56
"io"
67
"log"
78
"net/http"
@@ -34,6 +35,8 @@ type ServiceInstance struct {
3435
done bool
3536
}
3637

38+
var forcedKillError = errors.New("process forcefully killed with SIGKILL, to circumvent this error, set shutdown_signal to SIGKILL")
39+
3740
func (s *ServiceInstance) Start(ctx context.Context) error {
3841
s.mu.Lock()
3942
defer s.mu.Unlock()
@@ -197,7 +200,7 @@ func (s *ServiceInstance) Error() error {
197200
return s.runErr
198201
}
199202

200-
func (s *ServiceInstance) Stop(sig syscall.Signal) error {
203+
func (s *ServiceInstance) Stop(sig *syscall.Signal) error {
201204
func() {
202205
s.mu.Lock()
203206
defer s.mu.Unlock()
@@ -208,13 +211,74 @@ func (s *ServiceInstance) Stop(sig syscall.Signal) error {
208211
return nil
209212
}
210213

211-
err := killGroup(s.cmd, sig)
214+
if s.cmd.ProcessState.ExitCode() == 0 {
215+
return nil
216+
}
217+
218+
var signal syscall.Signal
219+
if sig == nil {
220+
if s.ShutdownSignal == "SIGKILL" {
221+
signal = syscall.SIGKILL
222+
} else if s.ShutdownSignal == "SIGTERM" {
223+
signal = syscall.SIGTERM
224+
} else {
225+
// Default to SIGKILL if unspecified or unrecognized. In case we add new values to itest.bzl but forget to add it here
226+
signal = syscall.SIGKILL
227+
}
228+
} else {
229+
signal = *sig
230+
}
231+
232+
err := killGroup(s.cmd, signal)
212233
if err != nil {
213234
return err
214235
}
215236

216-
for !s.isDone() {
217-
time.Sleep(5 * time.Millisecond)
237+
if signal == syscall.SIGKILL {
238+
log.Printf("Sent SIGKILL to %s\n", s.Colorize(s.Label))
239+
for !s.isDone() {
240+
time.Sleep(5 * time.Millisecond)
241+
}
242+
} else {
243+
log.Printf("Sent SIGTERM to %s\n", s.Colorize(s.Label))
244+
245+
shutdownTimeout, err := time.ParseDuration(s.VersionedServiceSpec.ShutdownTimeout)
246+
if err != nil {
247+
log.Printf("failed to parse health check timeout, falling back to no timeout: %v", err)
248+
return err
249+
}
250+
251+
log.Printf("Sent SIGTERM to %s, waiting for %s to stop gracefully\n", s.Colorize(s.Label), s.VersionedServiceSpec.ShutdownTimeout)
252+
253+
waitFor := time.After(shutdownTimeout)
254+
for {
255+
// Check if the process has exited
256+
if s.isDone() {
257+
break
258+
}
259+
260+
select {
261+
case <-waitFor:
262+
log.Printf("%s did not exit within %s, sending SIGKILL. If you are trying to collect coverage, you will most likely miss stats, try increasing the shutdown timeout.\n", s.Colorize(s.Label), shutdownTimeout)
263+
264+
err := killGroup(s.cmd, syscall.SIGKILL)
265+
if err != nil {
266+
return err
267+
}
268+
269+
for !s.isDone() {
270+
time.Sleep(5 * time.Millisecond)
271+
}
272+
273+
if s.ErrorOnForcefulShutdown {
274+
return forcedKillError
275+
}
276+
277+
return nil
278+
default:
279+
time.Sleep(5 * time.Millisecond)
280+
}
281+
}
218282
}
219283

220284
return nil

svcctl/svcctl.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,21 @@ func handleStart(ctx context.Context, r *runner.Runner, serviceErrCh chan error,
8383
w.WriteHeader(http.StatusOK)
8484
}
8585

86+
func Ptr[T any](v T) *T {
87+
return &v
88+
}
89+
8690
func handleKill(ctx context.Context, r *runner.Runner, _ chan error, w http.ResponseWriter, req *http.Request) {
87-
sig := syscall.SIGKILL
91+
var sig *syscall.Signal // Use the default defined signal configured on the service.
8892
params := req.URL.Query()
8993
signal := params.Get("signal")
9094
if signal != "" {
9195
// Currently only SIGTERM and SIGKILL are supported.
9296
switch signal {
9397
case "SIGTERM":
94-
sig = syscall.SIGTERM
98+
sig = Ptr(syscall.SIGTERM)
9599
case "SIGKILL":
96-
sig = syscall.SIGKILL
100+
sig = Ptr(syscall.SIGKILL)
97101
default:
98102
http.Error(w, "unsupported signal", http.StatusBadRequest)
99103
}

svclib/types.go

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,34 @@
11
package svclib
22

3-
import "rules_itest/logger"
3+
import (
4+
"rules_itest/logger"
5+
)
46

57
// Created by Starlark
68
type ServiceSpec struct {
79
// Type can be "service", "task", or "group".
8-
Type string `json:"type"`
9-
Label string `json:"label"`
10-
Args []string `json:"args"`
11-
Env map[string]string `json:"env"`
12-
Exe string `json:"exe"`
13-
HttpHealthCheckAddress string `json:"http_health_check_address"`
14-
ExpectedStartDuration string `json:"expected_start_duration"`
15-
HealthCheck string `json:"health_check"`
16-
HealthCheckLabel string `json:"health_check_label"`
17-
HealthCheckArgs []string `json:"health_check_args"`
18-
HealthCheckInterval string `json:"health_check_interval"`
19-
HealthCheckTimeout string `json:"health_check_timeout"`
20-
VersionFile string `json:"version_file"`
21-
Deps []string `json:"deps"`
22-
AutoassignPort bool `json:"autoassign_port"`
23-
SoReuseportAware bool `json:"so_reuseport_aware"`
24-
NamedPorts []string `json:"named_ports"`
25-
HotReloadable bool `json:"hot_reloadable"`
26-
PortAliases map[string]string `json:"port_aliases"`
10+
Type string `json:"type"`
11+
Label string `json:"label"`
12+
Args []string `json:"args"`
13+
Env map[string]string `json:"env"`
14+
Exe string `json:"exe"`
15+
HttpHealthCheckAddress string `json:"http_health_check_address"`
16+
ExpectedStartDuration string `json:"expected_start_duration"`
17+
HealthCheck string `json:"health_check"`
18+
HealthCheckLabel string `json:"health_check_label"`
19+
HealthCheckArgs []string `json:"health_check_args"`
20+
HealthCheckInterval string `json:"health_check_interval"`
21+
HealthCheckTimeout string `json:"health_check_timeout"`
22+
VersionFile string `json:"version_file"`
23+
Deps []string `json:"deps"`
24+
AutoassignPort bool `json:"autoassign_port"`
25+
SoReuseportAware bool `json:"so_reuseport_aware"`
26+
NamedPorts []string `json:"named_ports"`
27+
HotReloadable bool `json:"hot_reloadable"`
28+
PortAliases map[string]string `json:"port_aliases"`
29+
ShutdownSignal string `json:"shutdown_signal"`
30+
ShutdownTimeout string `json:"shutdown_timeout"`
31+
ErrorOnForcefulShutdown bool `json:"error_on_forceful_shutdown"`
2732
}
2833

2934
// Our internal representation.

0 commit comments

Comments
 (0)