
Commit 56bf6aa

Allow watchdog to exit earlier

This change is a mirror image of openfaas/of-watchdog#125. It has been tested in of-watchdog and allows the graceful shutdown sequence to exit earlier than it previously did, once all active connections have completed. It also adds an in-flight HTTP metric.

Signed-off-by: Alex Ellis (OpenFaaS Ltd) <[email protected]>

1 parent: f294c84
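
To put the change in concrete terms (an illustrative example, not measured figures): previously the watchdog slept for a full write_timeout after SIGTERM, called Shutdown with no deadline, and then slept for another write_timeout before exiting, so even an idle function took roughly twice write_timeout to stop. Now it waits one healthcheck_interval for the orchestrator to stop routing traffic, then calls Shutdown with a write_timeout deadline, which returns as soon as all in-flight requests have completed. With write_timeout=10s and the default healthcheck_interval (which falls back to write_timeout), an idle function exits after about 10s instead of about 20s, and setting healthcheck_interval=5s brings that down to about 5s.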

File tree

11 files changed: +734 -35 lines

.github/workflows/build.yaml

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ jobs:
   build:
     strategy:
       matrix:
-        go-version: [ 1.15.x ]
+        go-version: [ 1.16.x ]
         os: [ ubuntu-latest ]
     runs-on: ${{ matrix.os }}
     steps:

.github/workflows/publish.yaml

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ jobs:
   publish:
     strategy:
       matrix:
-        go-version: [ 1.15.x ]
+        go-version: [ 1.16.x ]
         os: [ ubuntu-latest ]
     runs-on: ${{ matrix.os }}
     steps:

README.md

Lines changed: 9 additions & 0 deletions

@@ -98,12 +98,21 @@ The watchdog can be configured through environmental variables. You must always
 | `content_type` | Force a specific Content-Type response for all responses |
 | `write_timeout` | HTTP timeout for writing a response body from your function (in seconds) |
 | `read_timeout` | HTTP timeout for reading the payload from the client caller (in seconds) |
+| `healthcheck_interval` | Interval (in seconds) for HTTP healthcheck by container orchestrator i.e. kubelet. Used for graceful shutdowns. |
 | `suppress_lock` | The watchdog will attempt to write a lockfile to /tmp/ for swarm healthchecks - set this to true to disable behaviour. |
 | `exec_timeout` | Hard timeout for process exec'd for each incoming request (in seconds). Disabled if set to 0 |
 | `write_debug` | Write all output, error messages, and additional information to the logs. Default is false |
 | `combine_output` | True by default - combines stdout/stderr in function response, when set to false `stderr` is written to the container logs and stdout is used for function response |
 | `max_inflight` | Limit the maximum number of requests in flight |

+## Metrics
+
+| Name                           | Description                  | Type      |
+|--------------------------------|------------------------------|-----------|
+| http_requests_total            | Total number of requests     | Counter   |
+| http_request_duration_seconds  | Duration of requests         | Histogram |
+| http_requests_in_flight        | Number of requests in-flight | Gauge     |
+
 ## Advanced / tuning

 ### (New) of-watchdog and HTTP mode

main.go

Lines changed: 31 additions & 24 deletions

@@ -21,6 +21,7 @@ import (

 	"github.com/openfaas/classic-watchdog/metrics"
 	"github.com/openfaas/classic-watchdog/types"
+	"github.com/prometheus/client_golang/prometheus/testutil"
 )

 var (
@@ -67,6 +68,7 @@ func main() {

 	readTimeout := config.readTimeout
 	writeTimeout := config.writeTimeout
+	healthcheckInterval := config.healthcheckInterval

 	s := &http.Server{
 		Addr: fmt.Sprintf(":%d", config.port),
@@ -77,10 +79,11 @@ func main() {

 	httpMetrics := metrics.NewHttp()

-	log.Printf("Timeouts: read: %s, write: %s hard: %s.\n",
+	log.Printf("Timeouts: read: %s write: %s hard: %s health: %s.\n",
 		readTimeout,
 		writeTimeout,
-		config.execTimeout)
+		config.execTimeout,
+		healthcheckInterval)
 	log.Printf("Listening on port: %d\n", config.port)

 	http.HandleFunc("/_/health", makeHealthHandler())
@@ -93,24 +96,14 @@ func main() {

 	go metricsServer.Serve(cancel)

-	shutdownTimeout := config.writeTimeout
-	listenUntilShutdown(shutdownTimeout, s, config.suppressLock)
-}
-
-func markUnhealthy() error {
-	atomic.StoreInt32(&acceptingConnections, 0)
-
-	path := filepath.Join(os.TempDir(), ".lock")
-	log.Printf("Removing lock-file : %s\n", path)
-	removeErr := os.Remove(path)
-	return removeErr
+	listenUntilShutdown(s, healthcheckInterval, writeTimeout, config.suppressLock, &httpMetrics)
 }

 // listenUntilShutdown will listen for HTTP requests until SIGTERM
 // is sent at which point the code will wait `shutdownTimeout` before
 // closing off connections and a futher `shutdownTimeout` before
 // exiting
-func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server, suppressLock bool) {
+func listenUntilShutdown(s *http.Server, healthcheckInterval time.Duration, writeTimeout time.Duration, suppressLock bool, httpMetrics *metrics.Http) {

 	idleConnsClosed := make(chan struct{})
 	go func() {
@@ -119,24 +112,29 @@ func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server, suppress

 		<-sig

-		log.Printf("SIGTERM received.. shutting down server in %s\n", shutdownTimeout.String())
+		log.Printf("SIGTERM: no new connections in %s\n", healthcheckInterval.String())

-		healthErr := markUnhealthy()
-
-		if healthErr != nil {
-			log.Printf("Unable to mark unhealthy during shutdown: %s\n", healthErr.Error())
+		if err := markUnhealthy(); err != nil {
+			log.Printf("Unable to mark server as unhealthy: %s\n", err.Error())
 		}

-		<-time.Tick(shutdownTimeout)
+		<-time.Tick(healthcheckInterval)
+
+		connections := int64(testutil.ToFloat64(httpMetrics.InFlight))
+		log.Printf("No new connections allowed, draining: %d requests\n", connections)

-		if err := s.Shutdown(context.Background()); err != nil {
-			// Error from closing listeners, or context timeout:
+		// The maximum time to wait for active connections whilst shutting down is
+		// equivalent to the maximum execution time i.e. writeTimeout.
+		ctx, cancel := context.WithTimeout(context.Background(), writeTimeout)
+		defer cancel()
+
+		if err := s.Shutdown(ctx); err != nil {
 			log.Printf("Error in Shutdown: %v", err)
 		}

-		log.Printf("No new connections allowed. Exiting in: %s\n", shutdownTimeout.String())
+		connections = int64(testutil.ToFloat64(httpMetrics.InFlight))

-		<-time.Tick(shutdownTimeout)
+		log.Printf("Exiting. Active connections: %d\n", connections)

 		close(idleConnsClosed)
 	}()
@@ -164,6 +162,15 @@ func listenUntilShutdown(shutdownTimeout time.Duration, s *http.Server, suppress
 	<-idleConnsClosed
 }

+func markUnhealthy() error {
+	atomic.StoreInt32(&acceptingConnections, 0)
+
+	path := filepath.Join(os.TempDir(), ".lock")
+	log.Printf("Removing lock-file : %s\n", path)
+	removeErr := os.Remove(path)
+	return removeErr
+}
+
 func printVersion() {
 	sha := "unknown"
 	if len(GitCommit) > 0 {
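
Taken together, the new sequence in listenUntilShutdown is: receive SIGTERM, remove the lock file so healthchecks fail, wait one healthcheck_interval for the orchestrator to drain traffic away, then call Shutdown with a write_timeout deadline, which returns as soon as the remaining connections finish. The sketch below is a minimal stand-alone illustration of that pattern; the durations, port and handler are stand-ins, not the watchdog's own code:

```go
package main

import (
	"context"
	"log"
	"net/http"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func main() {
	healthcheckInterval := 5 * time.Second // stand-in for healthcheck_interval
	writeTimeout := 10 * time.Second       // stand-in for write_timeout

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("OK"))
	})
	s := &http.Server{Addr: ":8080", WriteTimeout: writeTimeout}

	idleConnsClosed := make(chan struct{})
	go func() {
		sig := make(chan os.Signal, 1)
		signal.Notify(sig, syscall.SIGTERM)
		<-sig

		// 1. In the watchdog this is where the lock file is removed so healthchecks
		//    fail; here we simply pause for one healthcheck interval so the
		//    orchestrator can stop routing new traffic to this instance.
		time.Sleep(healthcheckInterval)

		// 2. Drain in-flight requests, but never wait longer than writeTimeout.
		//    Shutdown returns as soon as all connections are idle, which is what
		//    lets the process exit earlier than a fixed sleep would.
		ctx, cancel := context.WithTimeout(context.Background(), writeTimeout)
		defer cancel()
		if err := s.Shutdown(ctx); err != nil {
			log.Printf("Error in Shutdown: %v", err)
		}

		close(idleConnsClosed)
	}()

	if err := s.ListenAndServe(); err != http.ErrServerClosed {
		log.Printf("ListenAndServe: %v", err)
	}
	<-idleConnsClosed
}
```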

metrics/http.go

Lines changed: 11 additions & 1 deletion

@@ -8,10 +8,11 @@ import (
 type Http struct {
 	RequestsTotal            *prometheus.CounterVec
 	RequestDurationHistogram *prometheus.HistogramVec
+	InFlight                 prometheus.Gauge
 }

 func NewHttp() Http {
-	return Http{
+	h := Http{
 		RequestsTotal: promauto.NewCounterVec(prometheus.CounterOpts{
 			Subsystem: "http",
 			Name:      "requests_total",
@@ -23,5 +24,14 @@ func NewHttp() Http {
 			Help:    "Seconds spent serving HTTP requests.",
 			Buckets: prometheus.DefBuckets,
 		}, []string{"code", "method"}),
+		InFlight: promauto.NewGauge(prometheus.GaugeOpts{
+			Subsystem: "http",
+			Name:      "requests_in_flight",
+			Help:      "total HTTP requests in-flight",
+		}),
 	}
+
+	// Default to 0 for queries during graceful shutdown.
+	h.InFlight.Set(0)
+	return h
 }
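
main.go samples this gauge with testutil.ToFloat64 during shutdown. The snippet below is a rough sketch of the same call outside the watchdog, assuming the metrics package is importable under the path used in main.go; it only shows that the gauge can be read back directly as a float:

```go
package main

import (
	"fmt"

	"github.com/openfaas/classic-watchdog/metrics"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	httpMetrics := metrics.NewHttp()

	httpMetrics.InFlight.Inc() // a request has started
	fmt.Println(int64(testutil.ToFloat64(httpMetrics.InFlight))) // 1

	httpMetrics.InFlight.Dec() // the request has completed
	fmt.Println(int64(testutil.ToFloat64(httpMetrics.InFlight))) // 0
}
```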

metrics/metrics.go

Lines changed: 12 additions & 8 deletions

@@ -48,18 +48,22 @@ func (m *MetricsServer) Serve(cancel chan bool) {
 	}()

 	go func() {
-		select {
-		case <-cancel:
-			log.Printf("metrics server shutdown\n")
-
-			m.s.Shutdown(context.Background())
-		}
+		<-cancel
+		log.Printf("metrics server shutdown\n")
+		m.s.Shutdown(context.Background())
 	}()
 }

 // InstrumentHandler returns a handler which records HTTP requests
 // as they are made
 func InstrumentHandler(next http.Handler, _http Http) http.HandlerFunc {
-	return promhttp.InstrumentHandlerCounter(_http.RequestsTotal,
-		promhttp.InstrumentHandlerDuration(_http.RequestDurationHistogram, next))
+	return func(w http.ResponseWriter, r *http.Request) {
+		then := promhttp.InstrumentHandlerCounter(_http.RequestsTotal,
+			promhttp.InstrumentHandlerDuration(_http.RequestDurationHistogram, next))
+
+		_http.InFlight.Inc()
+		defer _http.InFlight.Dec()
+
+		then(w, r)
+	}
 }
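
InstrumentHandler is what the watchdog wraps its request handler with, so the counter, the duration histogram and now the in-flight gauge are all maintained in one place: the gauge is incremented on entry and decremented (via defer) once the response has been written. A rough usage sketch, assuming only the exported names shown in the diff; the /hello route and handler are illustrative:

```go
package main

import (
	"log"
	"net/http"

	"github.com/openfaas/classic-watchdog/metrics"
)

func main() {
	httpMetrics := metrics.NewHttp()

	hello := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("hello"))
	})

	// Every request through this handler bumps http_requests_in_flight while it
	// is being served, alongside the existing counter and histogram.
	http.Handle("/hello", metrics.InstrumentHandler(hello, httpMetrics))

	log.Fatal(http.ListenAndServe(":8080", nil))
}
```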

readconfig.go

Lines changed: 5 additions & 0 deletions

@@ -66,6 +66,7 @@ func (ReadConfig) Read(hasEnv HasEnv) WatchdogConfig {

 	cfg.readTimeout = parseIntOrDurationValue(hasEnv.Getenv("read_timeout"), time.Second*5)
 	cfg.writeTimeout = parseIntOrDurationValue(hasEnv.Getenv("write_timeout"), time.Second*5)
+	cfg.healthcheckInterval = parseIntOrDurationValue(hasEnv.Getenv("healthcheck_interval"), cfg.writeTimeout)

 	cfg.execTimeout = parseIntOrDurationValue(hasEnv.Getenv("exec_timeout"), time.Second*0)
 	cfg.port = parseIntValue(hasEnv.Getenv("port"), 8080)
@@ -106,6 +107,10 @@ type WatchdogConfig struct {
 	// HTTP write timeout
 	writeTimeout time.Duration

+	// healthcheckInterval is the interval that an external service runs its health checks to
+	// detect health and remove the watchdog from its pool of endpoints
+	healthcheckInterval time.Duration
+
 	// faasProcess is the process to exec
 	faasProcess string

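
healthcheck_interval goes through the same parseIntOrDurationValue helper as the other timeouts and defaults to the parsed write_timeout when it is unset. The stand-in below illustrates the value formats this is assumed to accept (a bare integer read as seconds, or a Go duration string); it is not the watchdog's own parser:

```go
package main

import (
	"fmt"
	"strconv"
	"time"
)

// parseIntOrDuration mirrors the assumed behaviour: a bare integer is read as
// seconds, otherwise the value is parsed as a Go duration, and an empty or
// invalid value falls back to the supplied default.
func parseIntOrDuration(val string, fallback time.Duration) time.Duration {
	if n, err := strconv.Atoi(val); err == nil {
		return time.Duration(n) * time.Second
	}
	if d, err := time.ParseDuration(val); err == nil {
		return d
	}
	return fallback
}

func main() {
	writeTimeout := parseIntOrDuration("10", 5*time.Second)

	fmt.Println(parseIntOrDuration("15", writeTimeout))    // 15s
	fmt.Println(parseIntOrDuration("1m30s", writeTimeout)) // 1m30s
	fmt.Println(parseIntOrDuration("", writeTimeout))      // falls back to write_timeout (10s)
}
```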

vendor/github.com/prometheus/client_golang/prometheus/testutil/lint.go

Lines changed: 46 additions & 0 deletions (vendored/generated file; diff not rendered)
