
Commit 3f7591f

avereha/prioritylimiter (#328)
* Add priority limiter

  From time to time, simple requests are slow. Extra logging helped us understand that the slowdown happens when a complex request fills up all the limiter slots in carbon API.

  How does this change solve the problem? Why is this the best approach?

  To prevent this, the limiter now uses a priority queue, with the priority set to the number of subrequests: fewer subrequests mean higher priority. That way, all simple requests get the highest priority (0) in the carbon API. Simple requests become faster; complex requests become slower. Requests with equal priority are ordered by uuid, which lets us lower the number of active requests.

  Possible future improvements:
  - now that we know how many requests are waiting for the limiter and how large each request is, we can start rejecting complex requests when more than X requests are pending
  - we can propagate the priority in a header to the zipper

* carbonapi: add metrics for number of active/waiting requests

* carbonapi: register prometheus metrics
1 parent 868da3e commit 3f7591f

File tree

8 files changed: +486 -31 lines
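For orientation, here is a minimal standalone sketch (not part of the commit) of how a backend is expected to use the limiter API this change introduces (prioritylimiter.New, Enter, Leave). The surrounding main function, the variable values, and the example uuid are hypothetical; the limiter calls themselves match the new package below.

```go
package main

import (
	"context"
	"fmt"

	"github.com/bookingcom/carbonapi/pkg/prioritylimiter"
)

func main() {
	// Allow at most 10 concurrent upstream requests.
	limiter := prioritylimiter.New(10)

	ctx := context.Background()
	subrequestCount := 42  // a "complex" request: many subrequests
	uuid := "example-uuid" // tie-breaker for requests of equal priority

	// Lower priority value = served sooner; simple requests use 0.
	if err := limiter.Enter(ctx, subrequestCount, uuid); err != nil {
		fmt.Println("request cancelled while waiting:", err)
		return
	}
	defer limiter.Leave()

	// ... perform the upstream request here ...
	fmt.Println("slot acquired, doing work")
}
```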

app/carbonapi/app.go

Lines changed: 8 additions & 2 deletions
@@ -85,7 +85,9 @@ func New(config cfg.API, logger *zap.Logger, buildVersion string) (*App, error)
 	app.requestBlocker.ReloadRules()
 
 	// TODO(gmagnusson): Setup backends
-	backend, err := initBackend(app.config, logger)
+	backend, err := initBackend(app.config, logger,
+		app.prometheusMetrics.ActiveUpstreamRequests,
+		app.prometheusMetrics.WaitingUpstreamRequests)
 	if err != nil {
 		logger.Fatal("couldn't initialize backends", zap.Error(err))
 	}
@@ -155,6 +157,8 @@ func (app *App) registerPrometheusMetrics(logger *zap.Logger) *http.Server {
 	prometheus.MustRegister(app.prometheusMetrics.FindDurationLinComplex)
 	prometheus.MustRegister(app.prometheusMetrics.TimeInQueueExp)
 	prometheus.MustRegister(app.prometheusMetrics.TimeInQueueLin)
+	prometheus.MustRegister(app.prometheusMetrics.ActiveUpstreamRequests)
+	prometheus.MustRegister(app.prometheusMetrics.WaitingUpstreamRequests)
 
 	writeTimeout := app.config.Timeouts.Global
 	if writeTimeout < 30*time.Second {
@@ -476,7 +480,7 @@ func (app *App) bucketRequestTimes(req *http.Request, t time.Duration) {
 	}
 }
 
-func initBackend(config cfg.API, logger *zap.Logger) (backend.Backend, error) {
+func initBackend(config cfg.API, logger *zap.Logger, activeUpstreamRequests, waitingUpstreamRequests prometheus.Gauge) (backend.Backend, error) {
 	client := &http.Client{}
 	client.Transport = &http.Transport{
 		MaxIdleConnsPerHost: config.MaxIdleConnsPerHost,
@@ -500,6 +504,8 @@ func initBackend(config cfg.API, logger *zap.Logger) (backend.Backend, error) {
 		Limit:              config.ConcurrencyLimitPerServer,
 		PathCacheExpirySec: uint32(config.ExpireDelaySec),
 		Logger:             logger,
+		ActiveRequests:     activeUpstreamRequests,
+		WaitingRequests:    waitingUpstreamRequests,
 	})
 
 	if err != nil {

app/carbonapi/http_handlers.go

Lines changed: 6 additions & 2 deletions
@@ -358,12 +358,16 @@ func (app *App) getTargetData(ctx context.Context, target string, exp parser.Exp
 			metricErrs = append(metricErrs, dataTypes.ErrMetricsNotFound)
 			continue
 		}
-
+		renderRequestContext := ctx
+		subrequestCount := len(renderRequests)
+		if subrequestCount > 1 {
+			renderRequestContext = util.WithPriority(ctx, subrequestCount)
+		}
 		// TODO(dgryski): group the render requests into batches
 		rch := make(chan renderResponse, len(renderRequests))
 		for _, m := range renderRequests {
 			// TODO (grzkv) Refactor to enable premature cancel
-			go app.sendRenderRequest(ctx, rch, m, mfetch.From, mfetch.Until, toLog)
+			go app.sendRenderRequest(renderRequestContext, rch, m, mfetch.From, mfetch.Until, toLog)
 		}
 
 		errs := make([]error, 0)
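The util.WithPriority and util.GetPriority/util.GetUUID helpers referenced above and in pkg/backend/net are not shown in this diff. A minimal sketch of how such context plumbing could look, assuming the priority is stored as a context value; the key type and the zero default are assumptions, not the actual util implementation:

```go
package util

import "context"

type key int

const priorityKey key = 0

// WithPriority returns a context annotated with the request priority
// (here: the number of subrequests; lower value = higher priority).
func WithPriority(ctx context.Context, priority int) context.Context {
	return context.WithValue(ctx, priorityKey, priority)
}

// GetPriority returns the priority stored in the context, or 0
// (the highest priority) if none was set.
func GetPriority(ctx context.Context) int {
	if p, ok := ctx.Value(priorityKey).(int); ok {
		return p
	}
	return 0
}
```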

app/carbonapi/metrics.go

Lines changed: 15 additions & 0 deletions
@@ -27,6 +27,8 @@ type PrometheusMetrics struct {
 	FindDurationLinComplex  prometheus.Histogram
 	TimeInQueueExp          prometheus.Histogram
 	TimeInQueueLin          prometheus.Histogram
+	ActiveUpstreamRequests  prometheus.Gauge
+	WaitingUpstreamRequests prometheus.Gauge
 }
 
 func newPrometheusMetrics(config cfg.API) PrometheusMetrics {
@@ -194,6 +196,18 @@ func newPrometheusMetrics(config cfg.API) PrometheusMetrics {
 				config.Zipper.Common.Monitoring.TimeInQueueLinHistogram.BucketsNum),
 			},
 		),
+		ActiveUpstreamRequests: prometheus.NewGauge(
+			prometheus.GaugeOpts{
+				Name: "active_upstream_requests",
+				Help: "Number of in-flight upstream requests",
+			},
+		),
+		WaitingUpstreamRequests: prometheus.NewGauge(
+			prometheus.GaugeOpts{
+				Name: "waiting_upstream_requests",
+				Help: "Number of upstream requests waiting on the limiter",
+			},
+		),
 	}
 }
 
@@ -242,6 +256,7 @@ var apiMetrics = struct {
 	FindCacheHits:       expvar.NewInt("find_cache_hits"),
 	FindCacheMisses:     expvar.NewInt("find_cache_misses"),
 	FindCacheOverheadNS: expvar.NewInt("find_cache_overhead_ns"),
+
 }
 
 // TODO (grzkv): Move to Prometheus, as these are not runtime metrics.

pkg/backend/net/net.go

Lines changed: 15 additions & 22 deletions
@@ -13,7 +13,10 @@ import (
 	"strings"
 	"time"
 
+	"github.com/bookingcom/carbonapi/pkg/prioritylimiter"
 	"github.com/bookingcom/carbonapi/pkg/types"
+	"github.com/prometheus/client_golang/prometheus"
+
 	"github.com/bookingcom/carbonapi/pkg/types/encoding/carbonapi_v2"
 	"github.com/bookingcom/carbonapi/util"
 
@@ -47,7 +50,7 @@ type Backend struct {
 	cluster string
 	client  *http.Client
 	timeout time.Duration
-	limiter chan struct{}
+	limiter *prioritylimiter.Limiter
 	logger  *zap.Logger
 	cache   *expirecache.Cache
 	cacheExpirySec int32
@@ -69,6 +72,8 @@ type Config struct {
 	Limit              int         // Set limit of concurrent requests to backend. Defaults to no limit.
 	PathCacheExpirySec uint32      // Set time in seconds before items in path cache expire. Defaults to 10 minutes.
 	Logger             *zap.Logger // Logger to use. Defaults to a no-op logger.
+	ActiveRequests     prometheus.Gauge
+	WaitingRequests    prometheus.Gauge
 }
 
 var fmtProto = []string{"protobuf"}
@@ -108,7 +113,11 @@ func New(cfg Config) (*Backend, error) {
 	}
 
 	if cfg.Limit > 0 {
-		b.limiter = make(chan struct{}, cfg.Limit)
+		if cfg.ActiveRequests != nil && cfg.WaitingRequests != nil {
+			b.limiter = prioritylimiter.New(cfg.Limit, prioritylimiter.WithMetrics(cfg.ActiveRequests, cfg.WaitingRequests))
+		} else {
+			b.limiter = prioritylimiter.New(cfg.Limit)
+		}
 	}
 
 	if cfg.Logger != nil {
@@ -155,32 +164,16 @@ func (b Backend) enter(ctx context.Context) error {
 	if b.limiter == nil {
 		return nil
 	}
-
-	select {
-	case <-ctx.Done():
-		return ctx.Err()
-
-	case b.limiter <- struct{}{}:
-		// fallthrough
-	}
-
-	return nil
+	priority := util.GetPriority(ctx)
+	uuid := util.GetUUID(ctx)
+	return b.limiter.Enter(ctx, priority, uuid)
 }
 
 func (b Backend) leave() error {
 	if b.limiter == nil {
 		return nil
 	}
-
-	select {
-	case <-b.limiter:
-		// fallthrough
-	default:
-		// this should never happen, but let's not block forever if it does
-		return errors.New("Unable to return value to limiter")
-	}
-
-	return nil
+	return b.limiter.Leave()
 }
 
 func (b Backend) setTimeout(ctx context.Context) (context.Context, context.CancelFunc) {
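The metrics are optional, as the branch in New above shows. For reference, a small standalone sketch (not from the commit) of that optional-metrics wiring, using the gauge names defined in app/carbonapi/metrics.go; the main function and the limit value are illustrative.

```go
package main

import (
	"github.com/bookingcom/carbonapi/pkg/prioritylimiter"
	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Gauges equivalent to the ones added in app/carbonapi/metrics.go.
	active := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "active_upstream_requests",
		Help: "Number of in-flight upstream requests",
	})
	waiting := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "waiting_upstream_requests",
		Help: "Number of upstream requests waiting on the limiter",
	})
	prometheus.MustRegister(active, waiting)

	// Mirrors the branch in pkg/backend/net.New: pass both gauges,
	// or create the limiter without metrics at all.
	limiter := prioritylimiter.New(10, prioritylimiter.WithMetrics(active, waiting))
	_ = limiter
}
```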

pkg/backend/net/net_test.go

Lines changed: 0 additions & 3 deletions
@@ -198,9 +198,6 @@ func TestCallTimeoutLeavesLimiter(t *testing.T) {
 		t.Error("Expected to time out")
 	}
 
-	if len(b.limiter) != 0 {
-		t.Error("Expected limiter to be empty")
-	}
 }
 
 func TestDo(t *testing.T) {
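The removed check relied on the limiter being a buffered channel, so len() no longer applies. With the priority limiter, an equivalent assertion could use the new Active() accessor; this fragment is a suggestion only and is not part of the commit.

```go
// Hypothetical replacement inside TestCallTimeoutLeavesLimiter:
// count in-flight requests via Active() instead of channel length.
if b.limiter != nil && b.limiter.Active() != 0 {
	t.Error("Expected limiter to be empty")
}
```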
Lines changed: 212 additions & 0 deletions
@@ -0,0 +1,212 @@
package prioritylimiter

import (
	"container/heap"
	"context"
	"errors"
	"sync/atomic"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

const (
	indexStateActive    = -1
	indexStateNew       = -2
	indexStateCancelled = -3
)

type request struct {
	priority int // lower value means higher priority
	canEnter chan struct{}
	index    int
	uuid     string
}

type requests []*request

// Limiter does two things:
// a) limits the number of concurrent requests going upstream
// b) prioritizes the "waiting" requests
// For prioritization we use two variables:
// "priority": the request complexity; less complexity == more priority
// "uuid": requests of equal complexity are processed ordered by uuid, to minimize the number of "active" requests
type Limiter struct {
	requests      requests
	limiter       chan struct{}
	wantToEnter   chan *request
	cancelRequest chan *request
	loopCount     uint32
	activeGauge   prometheus.Gauge
	waitingGauge  prometheus.Gauge
}

type LimiterOption func(*Limiter)

// New creates a new limiter that allows a maximum of "limit" requests to Enter
func New(limit int, options ...LimiterOption) *Limiter {
	ret := &Limiter{
		limiter:       make(chan struct{}, limit),
		wantToEnter:   make(chan *request),
		cancelRequest: make(chan *request),
		loopCount:     0,
	}
	for _, option := range options {
		option(ret)
	}

	go ret.loop()

	return ret
}

// WithMetrics adds prometheus metrics to the Limiter instance
func WithMetrics(activeGauge, waitingGauge prometheus.Gauge) LimiterOption {
	return func(l *Limiter) {
		l.activeGauge = activeGauge
		l.waitingGauge = waitingGauge
	}
}

// Enter blocks this request until its turn comes
func (l *Limiter) Enter(ctx context.Context, priority int, uuid string) error {
	canEnter := make(chan struct{})

	req := &request{
		priority: priority,
		canEnter: canEnter,
		uuid:     uuid,
		index:    indexStateNew,
	}

	l.wantToEnter <- req

	select {
	// Check first if the ctx is not closed
	case <-ctx.Done():
		l.cancelRequest <- req
		return ctx.Err()
	default:
		select {
		case <-ctx.Done():
			l.cancelRequest <- req
			return ctx.Err()
		case <-canEnter:
			return nil
		}
	}
}

// Leave marks a request as complete
func (l *Limiter) Leave() error {
	select {
	case <-l.limiter:
		// fallthrough
	default:
		// this should never happen, but let's not block forever if it does
		return errors.New("Unable to return value to limiter")
	}
	return nil
}

// Active returns the number of in-progress requests
func (l *Limiter) Active() int {
	return len(l.limiter)
}

func (l *Limiter) loop() {
	for {
		if len(l.requests) == 0 {
			select {
			case req := <-l.wantToEnter:
				if req.index != indexStateCancelled {
					heap.Push(&l.requests, req)
				}
			case req := <-l.cancelRequest:
				index := req.index
				if index >= 0 {
					heap.Remove(&l.requests, index)
				}
				if index == indexStateActive {
					// If we are receiving a cancel request at this point,
					// it means Enter() returned with an error, and the caller will not Leave()
					l.Leave()
				}
				req.index = indexStateCancelled
			}
		} else {
			select {
			case req := <-l.wantToEnter:
				if req.index != indexStateCancelled {
					heap.Push(&l.requests, req)
				}
			case req := <-l.cancelRequest:
				index := req.index
				if index >= 0 {
					heap.Remove(&l.requests, index)
				}
				if index == indexStateActive {
					// If we are receiving a cancel request at this point,
					// it means Enter() returned with an error, and the caller will not Leave()
					l.Leave()
				}
				req.index = indexStateCancelled
			case l.limiter <- struct{}{}:
				req := heap.Pop(&l.requests).(*request)
				close(req.canEnter)
			}
		}
		atomic.AddUint32(&l.loopCount, 1)
		if l.activeGauge != nil {
			l.activeGauge.Set(float64(len(l.limiter)))
		}
		if l.waitingGauge != nil {
			l.waitingGauge.Set(float64(len(l.requests)))
		}
	}
}

// used in tests to ensure that loop() processed all the pending messages
func (l *Limiter) waitLoopCount(i int) {
	for {
		count := int(atomic.LoadUint32(&l.loopCount))
		if count >= i {
			return
		}
		time.Sleep(time.Millisecond * 50)
	}
}

func (r requests) Len() int {
	return len(r)
}

func (r requests) Less(i, j int) bool {
	if r[i].priority == r[j].priority {
		return r[i].uuid < r[j].uuid
	}
	return r[i].priority < r[j].priority
}

func (r requests) Swap(i, j int) {
	r[i], r[j] = r[j], r[i]
	r[i].index = i
	r[j].index = j
}

func (r *requests) Push(x interface{}) {
	req := x.(*request)
	idx := len(*r)
	req.index = idx
	*r = append(*r, req)
}

func (r *requests) Pop() interface{} {
	old := *r
	n := len(old)
	req := old[n-1]
	req.index = indexStateActive
	old[n-1] = nil // avoid memory leak
	*r = old[0 : n-1]
	return req
}
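A hypothetical in-package test (not part of the commit) illustrating the ordering the requests heap produces: the lowest priority value is popped first, and uuid breaks ties between requests of equal priority.

```go
package prioritylimiter

import (
	"container/heap"
	"testing"
)

// Sketch of a test that checks the heap ordering: lower priority
// value first, then uuid for equal priorities.
func TestRequestOrdering(t *testing.T) {
	reqs := &requests{}
	heap.Push(reqs, &request{priority: 3, uuid: "b"})
	heap.Push(reqs, &request{priority: 0, uuid: "z"})
	heap.Push(reqs, &request{priority: 3, uuid: "a"})

	// Priority 0 ("z") goes first; the two priority-3 requests
	// come out ordered by uuid ("a" before "b").
	want := []string{"z", "a", "b"}
	for _, uuid := range want {
		got := heap.Pop(reqs).(*request).uuid
		if got != uuid {
			t.Fatalf("expected %q, got %q", uuid, got)
		}
	}
}
```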
