Skip to content

Commit cd3415f

Browse files
authored
Add better shutdown handling for tessera (#432)
Before this change, it was possible for Tessera to get in the way of the process shutting down in two ways: 1) The context in the Appender did not respond to signals, so if it ran into an irrecoverable error in `integrateEntriesJob`, it would retry indefinitely and the process would never finish shutting down. 2) The appender Shutdown function also runs in an indefinite loop, and if Tessera got into a corrupt state where it could never finish appending the sequenced entries, the shutdown loop would never terminate. This change adds a new timeout parameter specifically for Tessera to give it a maximum time to finish processing entries. In the event it is stuck looping trying to process an unprocessable entry, it will now terminate in response to SIGINT/SIGTERM. The default value of the server timeout, which is shared by the HTTP and gRPC servers, is reduced to 20s, which in a correctly functioning system with a checkpoint period of 10s should be long enough to finish processing all in-flight requests. The new Tessera timeout is 30s. This means that in a badly behaving system where Tessera did not integrate entries in time to send the response to clients, it still has 30s to try to finish committing entries to the log and empty the Spanner queue. In the event that Tessera encounters an irrecoverable error and cannot finish processing entries, the maximum time to shut down is 50s by default. This is reduced from 60s (server idle timeout) + indefinite (Tessera loop). The Pod termination grace period should be extended to 50-60s accordingly. Signed-off-by: Colleen Murphy <[email protected]>
1 parent 999b9fe commit cd3415f

File tree

4 files changed

+13
-8
lines changed

4 files changed

+13
-8
lines changed

cmd/rekor-server/app/serve.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ var serveCmd = &cobra.Command{
157157
server.NewHTTPConfig(
158158
server.WithHTTPPort(viper.GetInt("http-port")),
159159
server.WithHTTPHost(viper.GetString("http-address")),
160-
server.WithHTTPTimeout(viper.GetDuration("timeout")),
160+
server.WithHTTPTimeout(viper.GetDuration("server-timeout")),
161161
server.WithHTTPMaxRequestBodySize(viper.GetInt("max-request-body-size")),
162162
server.WithHTTPMetricsPort(viper.GetInt("http-metrics-port")),
163163
server.WithHTTPTLSCredentials(viper.GetString("http-tls-cert-file"), viper.GetString("http-tls-key-file")),
@@ -166,11 +166,12 @@ var serveCmd = &cobra.Command{
166166
server.NewGRPCConfig(
167167
server.WithGRPCPort(viper.GetInt("grpc-port")),
168168
server.WithGRPCHost(viper.GetString("grpc-address")),
169-
server.WithGRPCTimeout(viper.GetDuration("timeout")),
169+
server.WithGRPCTimeout(viper.GetDuration("server-timeout")),
170170
server.WithGRPCMaxMessageSize(viper.GetInt("max-request-body-size")),
171171
server.WithGRPCLogLevel(logLevel, viper.GetBool("request-response-logging")),
172172
server.WithTLSCredentials(viper.GetString("grpc-tls-cert-file"), viper.GetString("grpc-tls-key-file")),
173173
),
174+
viper.GetDuration("tlog-timeout"),
174175
rekorServer,
175176
shutdownFn,
176177
)
@@ -185,7 +186,7 @@ func init() {
185186
serveCmd.Flags().Int("http-metrics-port", 2112, "HTTP port to bind metrics to")
186187
serveCmd.Flags().Int("grpc-port", 3001, "GRPC port to bind to")
187188
serveCmd.Flags().String("grpc-address", "127.0.0.1", "GRPC address to bind to")
188-
serveCmd.Flags().Duration("timeout", 60*time.Second, "timeout")
189+
serveCmd.Flags().Duration("server-timeout", 20*time.Second, "timeout settings for gRPC and HTTP connections")
189190
serveCmd.Flags().Int("max-request-body-size", 4*1024*1024, "maximum request body size in bytes")
190191
serveCmd.Flags().String("log-level", "info", "log level for the process. options are [debug, info, warn, error]")
191192
serveCmd.Flags().Bool("request-response-logging", false, "enables logging of request and response content; log-level must be 'debug' for this to take effect")
@@ -218,6 +219,7 @@ func init() {
218219
serveCmd.Flags().Duration("batch-max-age", tessera.DefaultBatchMaxAge, "the maximum amount of time a batch of entries will wait before being sent to the sequencer")
219220
serveCmd.Flags().Duration("checkpoint-interval", tessera.DefaultCheckpointInterval, "the frequency at which a checkpoint will be published")
220221
serveCmd.Flags().Uint("pushback-max-outstanding", tessera.DefaultPushbackMaxOutstanding, "the maximum number of 'in-flight' add requests")
222+
serveCmd.Flags().Duration("tlog-timeout", 30*time.Second, "timeout for terminating the tiles log queue")
221223

222224
// antispam configs
223225
serveCmd.Flags().Bool("persistent-antispam", false, "whether to enable persistent antispam measures; only available for GCP storage backend and not supported by the Spanner storage emulator")

internal/server/serve.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@ import (
1919
"log/slog"
2020
"os"
2121
"sync"
22+
"time"
2223
)
2324

2425
// Serve starts the grpc server and its http proxy.
25-
func Serve(ctx context.Context, hc *HTTPConfig, gc *GRPCConfig, s rekorServer, tesseraShutdownFn func(context.Context) error) {
26+
func Serve(ctx context.Context, hc *HTTPConfig, gc *GRPCConfig, tesseraTimeout time.Duration, s rekorServer, tesseraShutdownFn func(context.Context) error) {
2627
var wg sync.WaitGroup
2728

2829
if hc.port == 0 || gc.port == 0 {
@@ -46,7 +47,9 @@ func Serve(ctx context.Context, hc *HTTPConfig, gc *GRPCConfig, s rekorServer, t
4647
wg.Wait()
4748

4849
slog.Info("shutting down Tessera sequencer")
49-
if err := tesseraShutdownFn(ctx); err != nil {
50+
tesseraCtx, cancel := context.WithTimeout(ctx, tesseraTimeout)
51+
defer cancel()
52+
if err := tesseraShutdownFn(tesseraCtx); err != nil {
5053
slog.Error("error shutting down Tessera", "error", err)
5154
}
5255
slog.Info("stopped Tessera sequencer")

internal/server/serve_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ func TestServe(t *testing.T) {
3232
}
3333
go func() {
3434
pid.Store(uint64(syscall.Getpid())) // Process IDs are positive ints
35-
Serve(context.Background(), NewHTTPConfig(), NewGRPCConfig(), nil, shutdownFn)
35+
Serve(context.Background(), NewHTTPConfig(), NewGRPCConfig(), 1*time.Second, nil, shutdownFn)
3636
wg.Done()
3737
}()
3838
// One for Serve returning, one for shutdown function being invoked

internal/server/service_mock.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ func (ms *MockServer) Start(t *testing.T) {
6868
// Start the server
6969
ms.wg = &sync.WaitGroup{}
7070
go func() {
71-
Serve(context.Background(), ms.hc, ms.gc, s, shutdownFn)
71+
Serve(context.Background(), ms.hc, ms.gc, 1*time.Second, s, shutdownFn)
7272
ms.wg.Done()
7373
}()
7474
ms.wg.Add(1)
@@ -116,7 +116,7 @@ func (ms *MockServer) StartTLS(t *testing.T) {
116116
// Start the server
117117
ms.wg = &sync.WaitGroup{}
118118
go func() {
119-
Serve(context.Background(), ms.hc, ms.gc, s, shutdownFn)
119+
Serve(context.Background(), ms.hc, ms.gc, 1*time.Second, s, shutdownFn)
120120
ms.wg.Done()
121121
}()
122122
ms.wg.Add(1)

0 commit comments

Comments
 (0)