Skip to content

Commit c2078b8

Browse files
authored
Add optimistic mode (#123)
1 parent 64f8e79 commit c2078b8

File tree

2 files changed

+110
-57
lines changed

2 files changed

+110
-57
lines changed

http-tracer.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ func InternalTrace(req *http.Request, resp *http.Response, reqTime, respTime tim
228228
}
229229

230230
// Trace gets trace of http request
231-
func Trace(f http.HandlerFunc, logBody bool, w http.ResponseWriter, r *http.Request, endpoint string) TraceInfo {
231+
func Trace(f http.HandlerFunc, logBody bool, w http.ResponseWriter, r *http.Request, backend *Backend) TraceInfo {
232232
// Setup a http request body recorder
233233
reqHeaders := r.Header.Clone()
234234
reqHeaders.Set("Host", r.Host)
@@ -270,7 +270,7 @@ func Trace(f http.HandlerFunc, logBody bool, w http.ResponseWriter, r *http.Requ
270270

271271
t.ReqInfo = rq
272272
t.RespInfo = rs
273-
t.NodeName = endpoint
273+
t.NodeName = backend.endpoint
274274
t.CallStats = traceCallStats{
275275
Latency: rs.Time.Sub(rw.StartTime),
276276
Rx: reqBodyRecorder.Size(),
@@ -286,7 +286,7 @@ func Trace(f http.HandlerFunc, logBody bool, w http.ResponseWriter, r *http.Requ
286286

287287
// Log only the headers.
288288
func httpTraceHdrs(f http.HandlerFunc, w http.ResponseWriter, r *http.Request, backend *Backend) {
289-
trace := Trace(f, false, w, r, backend.endpoint)
289+
trace := Trace(f, false, w, r, backend)
290290
doTrace(trace, backend)
291291
}
292292

main.go

Lines changed: 107 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ type logMessage struct {
132132
// Endpoint of backend
133133
Endpoint string `json:"Endpoint"`
134134
// Error message
135-
Error error `json:"Error,omitempty"`
135+
Error string `json:"Error,omitempty"`
136136
// Status of endpoint
137137
Status string `json:"Status,omitempty"`
138138
// Downtime so far
@@ -141,7 +141,7 @@ type logMessage struct {
141141
}
142142

143143
func (l logMessage) String() string {
144-
if l.Error == nil {
144+
if l.Error == "" {
145145
if l.DowntimeDuration > 0 {
146146
return fmt.Sprintf("%s%2s: %s %s is %s Downtime duration: %s",
147147
console.Colorize("LogMsgType", l.Type), "",
@@ -157,6 +157,7 @@ func (l logMessage) String() string {
157157

158158
// Backend entity to which requests gets load balanced.
159159
type Backend struct {
160+
ctxt context.Context
160161
siteNumber int
161162
endpoint string
162163
proxy *reverse.Proxy
@@ -165,6 +166,7 @@ type Backend struct {
165166
healthCheckURL string
166167
healthCheckDuration time.Duration
167168
healthCheckTimeout time.Duration
169+
healthOptimistic bool
168170
Stats *BackendStats
169171
}
170172

@@ -173,12 +175,58 @@ const (
173175
online
174176
)
175177

176-
func (b *Backend) setOffline() {
177-
atomic.StoreInt32(&b.up, offline)
178+
func (b *Backend) setOffline(msg string) (swapped bool) {
179+
if atomic.SwapInt32(&b.up, offline) == offline {
180+
return false
181+
}
182+
183+
now := time.Now().UTC()
184+
185+
b.Stats.Lock()
186+
b.Stats.DowntimeStart = now
187+
b.Stats.UpSince = time.Time{}
188+
b.Stats.Unlock()
189+
190+
if b.healthOptimistic {
191+
go b.healthCheck(false)
192+
}
193+
194+
if globalLoggingEnabled {
195+
logMsg(logMessage{
196+
Endpoint: b.endpoint,
197+
Status: "down",
198+
Error: msg,
199+
})
200+
}
201+
202+
return true
178203
}
179204

180-
func (b *Backend) setOnline() {
181-
atomic.StoreInt32(&b.up, online)
205+
func (b *Backend) setOnline() (swapped bool) {
206+
if atomic.SwapInt32(&b.up, online) == online {
207+
return false
208+
}
209+
now := time.Now().UTC()
210+
211+
b.Stats.Lock()
212+
b.Stats.UpSince = now
213+
if !b.Stats.DowntimeStart.IsZero() {
214+
downtime := now.Sub(b.Stats.DowntimeStart)
215+
b.Stats.LastDowntime = downtime
216+
b.Stats.CumDowntime += downtime
217+
}
218+
b.Stats.DowntimeStart = time.Time{}
219+
b.Stats.Unlock()
220+
221+
if globalLoggingEnabled {
222+
logMsg(logMessage{
223+
Endpoint: b.endpoint,
224+
Status: "up",
225+
DowntimeDuration: b.Stats.LastDowntime,
226+
})
227+
}
228+
229+
return true
182230
}
183231

184232
// Online returns true if backend is up
@@ -247,10 +295,7 @@ func (b *Backend) ErrorHandler(w http.ResponseWriter, r *http.Request, err error
247295
}
248296
}
249297
if offline {
250-
if globalLoggingEnabled {
251-
logMsg(logMessage{Endpoint: b.endpoint, Status: "down", Error: err})
252-
}
253-
b.setOffline()
298+
b.setOffline(err.Error())
254299
}
255300

256301
writeErrorResponse(w, r, err)
@@ -312,18 +357,27 @@ func getHealthCheckURL(endpoint, healthCheckPath string, healthCheckPort int) (s
312357
}
313358

314359
// healthCheck - background routine which checks if a backend is up or down.
315-
func (b *Backend) healthCheck(ctxt context.Context) {
360+
func (b *Backend) healthCheck(immediate bool) {
361+
if immediate {
362+
if err := b.doHealthCheck(); err != nil {
363+
console.Errorln(err)
364+
} else if b.healthOptimistic && b.Online() {
365+
return
366+
}
367+
}
368+
316369
rng := rand.New(rand.NewSource(time.Now().UnixNano()))
317370
timer := time.NewTimer(b.healthCheckDuration)
318371
defer timer.Stop()
319372
for {
320373
select {
321-
case <-ctxt.Done():
374+
case <-b.ctxt.Done():
322375
return
323376
case <-timer.C:
324-
err := b.doHealthCheck()
325-
if err != nil {
377+
if err := b.doHealthCheck(); err != nil {
326378
console.Errorln(err)
379+
} else if b.healthOptimistic && b.Online() {
380+
return
327381
}
328382
// Add random jitter to call
329383
timer.Reset(b.healthCheckDuration + time.Duration(rng.Int63n(int64(b.healthCheckDuration))))
@@ -353,33 +407,16 @@ func (b *Backend) doHealthCheck() error {
353407
resp, err := b.httpClient.Do(req)
354408
respTime := time.Now().UTC()
355409
drainBody(resp)
356-
if err != nil || (err == nil && resp.StatusCode != http.StatusOK) {
357-
if globalLoggingEnabled && (!b.Online() || b.Stats.UpSince.IsZero()) {
358-
logMsg(logMessage{Endpoint: b.endpoint, Status: "down", Error: err})
359-
}
360-
// observed an error, take the backend down.
361-
b.setOffline()
362-
if b.Stats.DowntimeStart.IsZero() {
363-
b.Stats.DowntimeStart = time.Now().UTC()
364-
}
365-
} else {
366-
var downtimeEnd time.Time
367-
if !b.Stats.DowntimeStart.IsZero() {
368-
now := time.Now().UTC()
369-
b.updateDowntime(now.Sub(b.Stats.DowntimeStart))
370-
downtimeEnd = now
371-
}
372-
if globalLoggingEnabled && !b.Online() && !b.Stats.UpSince.IsZero() {
373-
logMsg(logMessage{
374-
Endpoint: b.endpoint,
375-
Status: "up",
376-
DowntimeDuration: downtimeEnd.Sub(b.Stats.DowntimeStart),
377-
})
378-
}
379-
b.Stats.UpSince = time.Now().UTC()
380-
b.Stats.DowntimeStart = time.Time{}
410+
411+
switch {
412+
case err != nil:
413+
b.setOffline(err.Error())
414+
case resp.StatusCode != http.StatusOK:
415+
b.setOffline(fmt.Sprintf("response status %d", resp.StatusCode))
416+
default:
381417
b.setOnline()
382418
}
419+
383420
if globalTrace != "application" {
384421
if resp != nil {
385422
traceHealthCheckReq(req, resp, reqTime, respTime, b, err)
@@ -389,13 +426,6 @@ func (b *Backend) doHealthCheck() error {
389426
return nil
390427
}
391428

392-
func (b *Backend) updateDowntime(downtime time.Duration) {
393-
b.Stats.Lock()
394-
defer b.Stats.Unlock()
395-
b.Stats.LastDowntime = downtime
396-
b.Stats.CumDowntime += downtime
397-
}
398-
399429
// updateCallStats updates the cumulative stats for each call to backend
400430
func (b *Backend) updateCallStats(t shortTraceMsg) {
401431
b.Stats.Lock()
@@ -438,6 +468,7 @@ type healthCheckOptions struct {
438468
healthCheckPort int
439469
healthCheckDuration time.Duration
440470
healthCheckTimeout time.Duration
471+
healthOptimistic bool
441472
}
442473

443474
func (m *multisite) renewSite(ctx *cli.Context, tlsMaxVersion uint16, opts healthCheckOptions) {
@@ -503,6 +534,14 @@ func (m *multisite) populate() {
503534
minLatency = fmt.Sprintf("%2s", b.Stats.MinLatency.Round(time.Microsecond))
504535
maxLatency = fmt.Sprintf("%2s", b.Stats.MaxLatency.Round(time.Microsecond))
505536
}
537+
cumDowntime := b.Stats.CumDowntime
538+
lastDowntime := b.Stats.LastDowntime
539+
if !b.Online() {
540+
// show current downtime and cumulative downtime including
541+
// the current downtime
542+
lastDowntime = time.Now().UTC().Sub(b.Stats.DowntimeStart)
543+
cumDowntime += lastDowntime
544+
}
506545
cellText[i*len(site.backends)+j+1] = []string{
507546
humanize.Ordinal(b.siteNumber),
508547
b.endpoint,
@@ -511,8 +550,8 @@ func (m *multisite) populate() {
511550
strconv.FormatInt(b.Stats.TotCallFailures, 10),
512551
humanize.IBytes(uint64(b.Stats.Rx)),
513552
humanize.IBytes(uint64(b.Stats.Tx)),
514-
b.Stats.CumDowntime.Round(time.Microsecond).String(),
515-
b.Stats.LastDowntime.Round(time.Microsecond).String(),
553+
cumDowntime.Round(time.Microsecond).String(),
554+
lastDowntime.Round(time.Microsecond).String(),
516555
minLatency,
517556
maxLatency,
518557
}
@@ -886,8 +925,8 @@ func configureSite(ctxt context.Context, ctx *cli.Context, siteNum int, siteStrs
886925
var backends []*Backend
887926
var prevScheme string
888927
var transport http.RoundTripper
889-
var connStats []*ConnStats
890928
var hostName string
929+
891930
if len(endpoints) == 1 && ctx.GlobalBool("rr-dns-mode") {
892931
console.Infof("RR DNS mode enabled, using %s as hostname", endpoints[0])
893932
// guess it is LB config address
@@ -906,6 +945,15 @@ func configureSite(ctxt context.Context, ctx *cli.Context, siteNum int, siteStrs
906945
endpoints = append(endpoints, strings.Replace(target.String(), hostName, ip, 1))
907946
}
908947
}
948+
949+
var connStats []*ConnStats
950+
for _, endpoint := range endpoints {
951+
endpoint = strings.TrimSuffix(endpoint, slashSeparator)
952+
connStats = append(connStats, newConnStats(endpoint))
953+
}
954+
globalConnStats.Store(&connStats)
955+
956+
healthOptimistic := ctx.GlobalBool("health-optimistic")
909957
for _, endpoint := range endpoints {
910958
endpoint = strings.TrimSuffix(endpoint, slashSeparator)
911959
target, err := url.Parse(endpoint)
@@ -956,15 +1004,13 @@ func configureSite(ctxt context.Context, ctx *cli.Context, siteNum int, siteStrs
9561004
if err != nil {
9571005
console.Fatalln(err)
9581006
}
959-
backend := &Backend{siteNum, endpoint, proxy, &http.Client{
1007+
backend := &Backend{ctxt, siteNum, endpoint, proxy, &http.Client{
9601008
Transport: proxy.Transport,
961-
}, 0, healthCheckURL, opts.healthCheckDuration, opts.healthCheckTimeout, &stats}
962-
go backend.healthCheck(ctxt)
1009+
}, 0, healthCheckURL, opts.healthCheckDuration, opts.healthCheckTimeout, healthOptimistic, &stats}
9631010
proxy.ErrorHandler = backend.ErrorHandler
9641011
backends = append(backends, backend)
965-
connStats = append(connStats, newConnStats(endpoint))
1012+
go backend.healthCheck(true)
9661013
}
967-
globalConnStats.Store(&connStats)
9681014
return &site{
9691015
backends: backends,
9701016
}
@@ -993,6 +1039,7 @@ func sidekickMain(ctx *cli.Context) {
9931039
})
9941040
log2.SetReportCaller(true)
9951041

1042+
healthOptimistic := ctx.GlobalBool("health-optimistic")
9961043
healthCheckPath := ctx.GlobalString("health-path")
9971044
healthReadCheckPath := ctx.GlobalString("read-health-path")
9981045
healthCheckPort := ctx.GlobalInt("health-port")
@@ -1082,6 +1129,7 @@ func sidekickMain(ctx *cli.Context) {
10821129
healthCheckPort,
10831130
healthCheckDuration,
10841131
healthCheckTimeout,
1132+
healthOptimistic,
10851133
})
10861134
m.displayUI(!globalConsoleDisplay)
10871135

@@ -1179,6 +1227,7 @@ func sidekickMain(ctx *cli.Context) {
11791227
healthCheckPort,
11801228
healthCheckDuration,
11811229
healthCheckTimeout,
1230+
healthOptimistic,
11821231
})
11831232
default:
11841233
console.Infof("caught signal '%s'\n", signal)
@@ -1240,6 +1289,10 @@ func main() {
12401289
Name: "rr-dns-mode",
12411290
Usage: "enable round-robin DNS mode",
12421291
},
1292+
cli.BoolFlag{
1293+
Name: "health-optimistic",
1294+
Usage: "only perform health requests when nodes are down",
1295+
},
12431296
cli.StringFlag{
12441297
Name: "auto-tls-host",
12451298
Usage: "enable auto TLS mode for the specified host",

0 commit comments

Comments
 (0)