From 0adedca0a4e4ad2234ec90a54e37f6e4c4694597 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 25 Jul 2024 12:00:06 +0800 Subject: [PATCH] controller: fix limiter cannot work well in high concurrency scenario (#8436) (#8439) close tikv/pd#8435 controller: Fix limiter not functioning well in high concurrency scenarios - In high concurrency scenarios, time may appear rollback because the `now` value passed from outside. high mutext completion leading to more non-sequential execution orders. - Time rollback allows advancing more tokens, which can cause the issue. even result in no limit for the controller. - Fix the problem by avoiding time rollback; instead of acquiring time again within the lock to fix it, as this might incur high costs when frequently acquiring time. Signed-off-by: nolouch Co-authored-by: nolouch --- client/resource_group/controller/limiter.go | 16 +++- .../resource_group/controller/limiter_test.go | 79 +++++++++++++++++++ 2 files changed, 91 insertions(+), 4 deletions(-) diff --git a/client/resource_group/controller/limiter.go b/client/resource_group/controller/limiter.go index 1d250c91214..576475008af 100644 --- a/client/resource_group/controller/limiter.go +++ b/client/resource_group/controller/limiter.go @@ -334,7 +334,7 @@ func (lim *Limiter) Reconfigure(now time.Time, ) { lim.mu.Lock() defer lim.mu.Unlock() - logControllerTrace("[resource group controller] before reconfigure", zap.Float64("old-tokens", lim.tokens), zap.Float64("old-rate", float64(lim.limit)), zap.Float64("old-notify-threshold", args.NotifyThreshold), zap.Int64("old-burst", lim.burst)) + logControllerTrace("[resource group controller] before reconfigure", zap.String("name", lim.name), zap.Float64("old-tokens", lim.tokens), zap.Float64("old-rate", float64(lim.limit)), zap.Float64("old-notify-threshold", args.NotifyThreshold), zap.Int64("old-burst", lim.burst)) if args.NewBurst < 0 { lim.last = now lim.tokens = args.NewTokens @@ -350,7 +350,7 @@ func (lim *Limiter) Reconfigure(now time.Time, opt(lim) } lim.maybeNotify() - logControllerTrace("[resource group controller] after reconfigure", zap.Float64("tokens", lim.tokens), zap.Float64("rate", float64(lim.limit)), zap.Float64("notify-threshold", args.NotifyThreshold), zap.Int64("burst", lim.burst)) + logControllerTrace("[resource group controller] after reconfigure", zap.String("name", lim.name), zap.Float64("tokens", lim.tokens), zap.Float64("rate", float64(lim.limit)), zap.Float64("notify-threshold", args.NotifyThreshold), zap.Int64("burst", lim.burst)) } // AvailableTokens decreases the amount of tokens currently available. @@ -361,6 +361,14 @@ func (lim *Limiter) AvailableTokens(now time.Time) float64 { return tokens } +func (lim *Limiter) updateLast(t time.Time) { + // make sure lim.last is monotonic + // see issue: https://github.com/tikv/pd/issues/8435. + if lim.last.Before(t) { + lim.last = t + } +} + const reserveWarnLogInterval = 10 * time.Millisecond // reserveN is a helper method for Reserve. @@ -404,7 +412,7 @@ func (lim *Limiter) reserveN(now time.Time, n float64, maxFutureReserve time.Dur } // Update state if ok { - lim.last = now + lim.updateLast(now) lim.tokens = tokens lim.maybeNotify() } else { @@ -422,7 +430,7 @@ func (lim *Limiter) reserveN(now time.Time, n float64, maxFutureReserve time.Dur zap.Int("remaining-notify-times", lim.remainingNotifyTimes), zap.String("name", lim.name)) } - lim.last = last + lim.updateLast(last) if lim.limit == 0 { lim.notify() } else if lim.remainingNotifyTimes > 0 { diff --git a/client/resource_group/controller/limiter_test.go b/client/resource_group/controller/limiter_test.go index 8854d4b2803..6dfe4d1a32e 100644 --- a/client/resource_group/controller/limiter_test.go +++ b/client/resource_group/controller/limiter_test.go @@ -20,8 +20,10 @@ package controller import ( "context" + "fmt" "math" "sync" + "sync/atomic" "testing" "time" @@ -203,3 +205,80 @@ func TestCancelErrorOfReservation(t *testing.T) { re.Error(err) re.Contains(err.Error(), "context canceled") } + +func TestQPS(t *testing.T) { + re := require.New(t) + cases := []struct { + concurrency int + reserveN int64 + ruPerSec int64 + }{ + {1000, 10, 400000}, + } + + for _, tc := range cases { + t.Run(fmt.Sprintf("concurrency=%d,reserveN=%d,limit=%d", tc.concurrency, tc.reserveN, tc.ruPerSec), func(t *testing.T) { + qps, ruSec, waitTime := testQPSCase(tc.concurrency, tc.reserveN, tc.ruPerSec) + t.Log(fmt.Printf("QPS: %.2f, RU: %.2f, new request need wait %s\n", qps, ruSec, waitTime)) + re.LessOrEqual(math.Abs(float64(tc.ruPerSec)-ruSec), float64(100)*float64(tc.reserveN)) + re.LessOrEqual(math.Abs(float64(tc.ruPerSec)/float64(tc.reserveN)-qps), float64(100)) + }) + } +} + +const testCaseRunTime = 4 * time.Second + +func testQPSCase(concurrency int, reserveN int64, limit int64) (qps float64, ru float64, needWait time.Duration) { + nc := make(chan notifyMsg, 1) + lim := NewLimiter(time.Now(), Limit(limit), limit, float64(limit), nc) + ctx, cancel := context.WithCancel(context.Background()) + + var wg sync.WaitGroup + var totalRequests int64 + start := time.Now() + + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case <-ctx.Done(): + return + default: + } + r := lim.Reserve(context.Background(), 30*time.Second, time.Now(), float64(reserveN)) + if r.OK() { + delay := r.DelayFrom(time.Now()) + <-time.After(delay) + } else { + panic("r not ok") + } + atomic.AddInt64(&totalRequests, 1) + } + }() + } + var vQPS atomic.Value + var wait time.Duration + ch := make(chan struct{}) + go func() { + var windowRequests int64 + for { + elapsed := time.Since(start) + if elapsed >= testCaseRunTime { + close(ch) + break + } + windowRequests = atomic.SwapInt64(&totalRequests, 0) + vQPS.Store(float64(windowRequests)) + r := lim.Reserve(ctx, 30*time.Second, time.Now(), float64(reserveN)) + wait = r.Delay() + time.Sleep(1 * time.Second) + } + }() + <-ch + cancel() + wg.Wait() + qps = vQPS.Load().(float64) + return qps, qps * float64(reserveN), wait +}