@@ -132,7 +132,7 @@ type logMessage struct {
132132 // Endpoint of backend
133133 Endpoint string `json:"Endpoint"`
134134 // Error message
135- Error error `json:"Error,omitempty"`
135+ Error string `json:"Error,omitempty"`
136136 // Status of endpoint
137137 Status string `json:"Status,omitempty"`
138138 // Downtime so far
@@ -141,7 +141,7 @@ type logMessage struct {
141141}
142142
143143func (l logMessage ) String () string {
144- if l .Error == nil {
144+ if l .Error == "" {
145145 if l .DowntimeDuration > 0 {
146146 return fmt .Sprintf ("%s%2s: %s %s is %s Downtime duration: %s" ,
147147 console .Colorize ("LogMsgType" , l .Type ), "" ,
@@ -157,6 +157,7 @@ func (l logMessage) String() string {
157157
158158// Backend entity to which requests gets load balanced.
159159type Backend struct {
160+ ctxt context.Context
160161 siteNumber int
161162 endpoint string
162163 proxy * reverse.Proxy
@@ -165,6 +166,7 @@ type Backend struct {
165166 healthCheckURL string
166167 healthCheckDuration time.Duration
167168 healthCheckTimeout time.Duration
169+ healthOptimistic bool
168170 Stats * BackendStats
169171}
170172
@@ -173,12 +175,58 @@ const (
173175 online
174176)
175177
176- func (b * Backend ) setOffline () {
177- atomic .StoreInt32 (& b .up , offline )
178+ func (b * Backend ) setOffline (msg string ) (swapped bool ) {
179+ if atomic .SwapInt32 (& b .up , offline ) == offline {
180+ return false
181+ }
182+
183+ now := time .Now ().UTC ()
184+
185+ b .Stats .Lock ()
186+ b .Stats .DowntimeStart = now
187+ b .Stats .UpSince = time.Time {}
188+ b .Stats .Unlock ()
189+
190+ if b .healthOptimistic {
191+ go b .healthCheck (false )
192+ }
193+
194+ if globalLoggingEnabled {
195+ logMsg (logMessage {
196+ Endpoint : b .endpoint ,
197+ Status : "down" ,
198+ Error : msg ,
199+ })
200+ }
201+
202+ return true
178203}
179204
180- func (b * Backend ) setOnline () {
181- atomic .StoreInt32 (& b .up , online )
205+ func (b * Backend ) setOnline () (swapped bool ) {
206+ if atomic .SwapInt32 (& b .up , online ) == online {
207+ return false
208+ }
209+ now := time .Now ().UTC ()
210+
211+ b .Stats .Lock ()
212+ b .Stats .UpSince = now
213+ if ! b .Stats .DowntimeStart .IsZero () {
214+ downtime := now .Sub (b .Stats .DowntimeStart )
215+ b .Stats .LastDowntime = downtime
216+ b .Stats .CumDowntime += downtime
217+ }
218+ b .Stats .DowntimeStart = time.Time {}
219+ b .Stats .Unlock ()
220+
221+ if globalLoggingEnabled {
222+ logMsg (logMessage {
223+ Endpoint : b .endpoint ,
224+ Status : "up" ,
225+ DowntimeDuration : b .Stats .LastDowntime ,
226+ })
227+ }
228+
229+ return true
182230}
183231
184232// Online returns true if backend is up
@@ -247,10 +295,7 @@ func (b *Backend) ErrorHandler(w http.ResponseWriter, r *http.Request, err error
247295 }
248296 }
249297 if offline {
250- if globalLoggingEnabled {
251- logMsg (logMessage {Endpoint : b .endpoint , Status : "down" , Error : err })
252- }
253- b .setOffline ()
298+ b .setOffline (err .Error ())
254299 }
255300
256301 writeErrorResponse (w , r , err )
@@ -312,18 +357,27 @@ func getHealthCheckURL(endpoint, healthCheckPath string, healthCheckPort int) (s
312357}
313358
314359// healthCheck - background routine which checks if a backend is up or down.
315- func (b * Backend ) healthCheck (ctxt context.Context ) {
360+ func (b * Backend ) healthCheck (immediate bool ) {
361+ if immediate {
362+ if err := b .doHealthCheck (); err != nil {
363+ console .Errorln (err )
364+ } else if b .healthOptimistic && b .Online () {
365+ return
366+ }
367+ }
368+
316369 rng := rand .New (rand .NewSource (time .Now ().UnixNano ()))
317370 timer := time .NewTimer (b .healthCheckDuration )
318371 defer timer .Stop ()
319372 for {
320373 select {
321- case <- ctxt .Done ():
374+ case <- b . ctxt .Done ():
322375 return
323376 case <- timer .C :
324- err := b .doHealthCheck ()
325- if err != nil {
377+ if err := b .doHealthCheck (); err != nil {
326378 console .Errorln (err )
379+ } else if b .healthOptimistic && b .Online () {
380+ return
327381 }
328382 // Add random jitter to call
329383 timer .Reset (b .healthCheckDuration + time .Duration (rng .Int63n (int64 (b .healthCheckDuration ))))
@@ -353,33 +407,16 @@ func (b *Backend) doHealthCheck() error {
353407 resp , err := b .httpClient .Do (req )
354408 respTime := time .Now ().UTC ()
355409 drainBody (resp )
356- if err != nil || (err == nil && resp .StatusCode != http .StatusOK ) {
357- if globalLoggingEnabled && (! b .Online () || b .Stats .UpSince .IsZero ()) {
358- logMsg (logMessage {Endpoint : b .endpoint , Status : "down" , Error : err })
359- }
360- // observed an error, take the backend down.
361- b .setOffline ()
362- if b .Stats .DowntimeStart .IsZero () {
363- b .Stats .DowntimeStart = time .Now ().UTC ()
364- }
365- } else {
366- var downtimeEnd time.Time
367- if ! b .Stats .DowntimeStart .IsZero () {
368- now := time .Now ().UTC ()
369- b .updateDowntime (now .Sub (b .Stats .DowntimeStart ))
370- downtimeEnd = now
371- }
372- if globalLoggingEnabled && ! b .Online () && ! b .Stats .UpSince .IsZero () {
373- logMsg (logMessage {
374- Endpoint : b .endpoint ,
375- Status : "up" ,
376- DowntimeDuration : downtimeEnd .Sub (b .Stats .DowntimeStart ),
377- })
378- }
379- b .Stats .UpSince = time .Now ().UTC ()
380- b .Stats .DowntimeStart = time.Time {}
410+
411+ switch {
412+ case err != nil :
413+ b .setOffline (err .Error ())
414+ case resp .StatusCode != http .StatusOK :
415+ b .setOffline (fmt .Sprintf ("response status %d" , resp .StatusCode ))
416+ default :
381417 b .setOnline ()
382418 }
419+
383420 if globalTrace != "application" {
384421 if resp != nil {
385422 traceHealthCheckReq (req , resp , reqTime , respTime , b , err )
@@ -389,13 +426,6 @@ func (b *Backend) doHealthCheck() error {
389426 return nil
390427}
391428
392- func (b * Backend ) updateDowntime (downtime time.Duration ) {
393- b .Stats .Lock ()
394- defer b .Stats .Unlock ()
395- b .Stats .LastDowntime = downtime
396- b .Stats .CumDowntime += downtime
397- }
398-
399429// updateCallStats updates the cumulative stats for each call to backend
400430func (b * Backend ) updateCallStats (t shortTraceMsg ) {
401431 b .Stats .Lock ()
@@ -438,6 +468,7 @@ type healthCheckOptions struct {
438468 healthCheckPort int
439469 healthCheckDuration time.Duration
440470 healthCheckTimeout time.Duration
471+ healthOptimistic bool
441472}
442473
443474func (m * multisite ) renewSite (ctx * cli.Context , tlsMaxVersion uint16 , opts healthCheckOptions ) {
@@ -503,6 +534,14 @@ func (m *multisite) populate() {
503534 minLatency = fmt .Sprintf ("%2s" , b .Stats .MinLatency .Round (time .Microsecond ))
504535 maxLatency = fmt .Sprintf ("%2s" , b .Stats .MaxLatency .Round (time .Microsecond ))
505536 }
537+ cumDowntime := b .Stats .CumDowntime
538+ lastDowntime := b .Stats .LastDowntime
539+ if ! b .Online () {
540+ // show current downtime and cumulative downtime including
541+ // the current downtime
542+ lastDowntime = time .Now ().UTC ().Sub (b .Stats .DowntimeStart )
543+ cumDowntime += lastDowntime
544+ }
506545 cellText [i * len (site .backends )+ j + 1 ] = []string {
507546 humanize .Ordinal (b .siteNumber ),
508547 b .endpoint ,
@@ -511,8 +550,8 @@ func (m *multisite) populate() {
511550 strconv .FormatInt (b .Stats .TotCallFailures , 10 ),
512551 humanize .IBytes (uint64 (b .Stats .Rx )),
513552 humanize .IBytes (uint64 (b .Stats .Tx )),
514- b . Stats . CumDowntime .Round (time .Microsecond ).String (),
515- b . Stats . LastDowntime .Round (time .Microsecond ).String (),
553+ cumDowntime .Round (time .Microsecond ).String (),
554+ lastDowntime .Round (time .Microsecond ).String (),
516555 minLatency ,
517556 maxLatency ,
518557 }
@@ -886,8 +925,8 @@ func configureSite(ctxt context.Context, ctx *cli.Context, siteNum int, siteStrs
886925 var backends []* Backend
887926 var prevScheme string
888927 var transport http.RoundTripper
889- var connStats []* ConnStats
890928 var hostName string
929+
891930 if len (endpoints ) == 1 && ctx .GlobalBool ("rr-dns-mode" ) {
892931 console .Infof ("RR DNS mode enabled, using %s as hostname" , endpoints [0 ])
893932 // guess it is LB config address
@@ -906,6 +945,15 @@ func configureSite(ctxt context.Context, ctx *cli.Context, siteNum int, siteStrs
906945 endpoints = append (endpoints , strings .Replace (target .String (), hostName , ip , 1 ))
907946 }
908947 }
948+
949+ var connStats []* ConnStats
950+ for _ , endpoint := range endpoints {
951+ endpoint = strings .TrimSuffix (endpoint , slashSeparator )
952+ connStats = append (connStats , newConnStats (endpoint ))
953+ }
954+ globalConnStats .Store (& connStats )
955+
956+ healthOptimistic := ctx .GlobalBool ("health-optimistic" )
909957 for _ , endpoint := range endpoints {
910958 endpoint = strings .TrimSuffix (endpoint , slashSeparator )
911959 target , err := url .Parse (endpoint )
@@ -956,15 +1004,13 @@ func configureSite(ctxt context.Context, ctx *cli.Context, siteNum int, siteStrs
9561004 if err != nil {
9571005 console .Fatalln (err )
9581006 }
959- backend := & Backend {siteNum , endpoint , proxy , & http.Client {
1007+ backend := & Backend {ctxt , siteNum , endpoint , proxy , & http.Client {
9601008 Transport : proxy .Transport ,
961- }, 0 , healthCheckURL , opts .healthCheckDuration , opts .healthCheckTimeout , & stats }
962- go backend .healthCheck (ctxt )
1009+ }, 0 , healthCheckURL , opts .healthCheckDuration , opts .healthCheckTimeout , healthOptimistic , & stats }
9631010 proxy .ErrorHandler = backend .ErrorHandler
9641011 backends = append (backends , backend )
965- connStats = append ( connStats , newConnStats ( endpoint ) )
1012+ go backend . healthCheck ( true )
9661013 }
967- globalConnStats .Store (& connStats )
9681014 return & site {
9691015 backends : backends ,
9701016 }
@@ -993,6 +1039,7 @@ func sidekickMain(ctx *cli.Context) {
9931039 })
9941040 log2 .SetReportCaller (true )
9951041
1042+ healthOptimistic := ctx .GlobalBool ("health-optimistic" )
9961043 healthCheckPath := ctx .GlobalString ("health-path" )
9971044 healthReadCheckPath := ctx .GlobalString ("read-health-path" )
9981045 healthCheckPort := ctx .GlobalInt ("health-port" )
@@ -1082,6 +1129,7 @@ func sidekickMain(ctx *cli.Context) {
10821129 healthCheckPort ,
10831130 healthCheckDuration ,
10841131 healthCheckTimeout ,
1132+ healthOptimistic ,
10851133 })
10861134 m .displayUI (! globalConsoleDisplay )
10871135
@@ -1179,6 +1227,7 @@ func sidekickMain(ctx *cli.Context) {
11791227 healthCheckPort ,
11801228 healthCheckDuration ,
11811229 healthCheckTimeout ,
1230+ healthOptimistic ,
11821231 })
11831232 default :
11841233 console .Infof ("caught signal '%s'\n " , signal )
@@ -1240,6 +1289,10 @@ func main() {
12401289 Name : "rr-dns-mode" ,
12411290 Usage : "enable round-robin DNS mode" ,
12421291 },
1292+ cli.BoolFlag {
1293+ Name : "health-optimistic" ,
1294+ Usage : "only perform health requests when nodes are down" ,
1295+ },
12431296 cli.StringFlag {
12441297 Name : "auto-tls-host" ,
12451298 Usage : "enable auto TLS mode for the specified host" ,
0 commit comments