rpc/lib/client: add jitter for exponential backoff of WSClient
Fixes https://github.com/tendermint/tendermint/issues/751. Adds jitter to our exponential backoff to mitigate a self-DDoS vector. The jitter is a randomly picked fraction of a second added to the exponential delay of (1<<attempts) == 2**attempts seconds, so that each client waits a slightly different amount of time before reconnecting. Without the jitter, every client would retry at the same instant, which could recreate the very conditions that caused the original dial to the WSServer to fail, e.g.: * Network outage * File descriptor exhaustion * False positives from firewalls
This commit is contained in:
parent
6a5254c475
commit
5504920ba3
|
@ -4,7 +4,7 @@ import (
|
|||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"math/rand"
|
||||
"net"
|
||||
"net/http"
|
||||
"sync"
|
||||
|
@ -254,11 +254,13 @@ func (c *WSClient) reconnect() error {
|
|||
c.mtx.Unlock()
|
||||
}()
|
||||
|
||||
_1sAsNs := float64(time.Second.Nanoseconds())
|
||||
for {
|
||||
c.Logger.Info("reconnecting", "attempt", attempt+1)
|
||||
jitter := time.Duration(rand.Float64() * _1sAsNs)
|
||||
backoffDuration := jitter + ((1 << uint(attempt)) * time.Second)
|
||||
|
||||
d := time.Duration(math.Exp2(float64(attempt)))
|
||||
time.Sleep(d * time.Second)
|
||||
c.Logger.Info("reconnecting", "attempt", attempt+1, "backoff_duration", backoffDuration)
|
||||
time.Sleep(backoffDuration)
|
||||
|
||||
err := c.dial()
|
||||
if err != nil {
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
package rpcclient
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"regexp"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
@ -191,3 +194,55 @@ func callWgDoneOnResult(t *testing.T, c *WSClient, wg *sync.WaitGroup) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWSClientReconnectWithJitter(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skipf("This is a potentially long test")
|
||||
}
|
||||
|
||||
n := 8
|
||||
maxReconnectAttempts := 3
|
||||
// Max wait time is ceil(1+0.999) + ceil(2+0.999) + ceil(4+0.999) + ceil(...) = 2 + 3 + 5 = 10s + ...
|
||||
maxSleepTime := time.Second * time.Duration(((1<<uint(maxReconnectAttempts))-1)+maxReconnectAttempts)
|
||||
|
||||
var errNotConnected = errors.New("not connected")
|
||||
clientMap := make(map[int]*WSClient)
|
||||
buf := new(bytes.Buffer)
|
||||
logger := log.NewTMLogger(buf)
|
||||
for i := 0; i < n; i++ {
|
||||
c := NewWSClient("tcp://foo", "/websocket")
|
||||
c.Dialer = func(string, string) (net.Conn, error) {
|
||||
return nil, errNotConnected
|
||||
}
|
||||
c.SetLogger(logger)
|
||||
c.maxReconnectAttempts = maxReconnectAttempts
|
||||
// Not invoking defer c.Stop() because
|
||||
// after all the reconnect attempts have been
|
||||
// exhausted, c.Stop is implicitly invoked.
|
||||
clientMap[i] = c
|
||||
// Trigger the reconnect routine that performs exponential backoff.
|
||||
go c.reconnect()
|
||||
}
|
||||
|
||||
stopCount := 0
|
||||
time.Sleep(maxSleepTime)
|
||||
for key, c := range clientMap {
|
||||
if !c.IsActive() {
|
||||
delete(clientMap, key)
|
||||
stopCount += 1
|
||||
}
|
||||
}
|
||||
require.Equal(t, stopCount, n, "expecting all clients to have been stopped")
|
||||
|
||||
// Next we have to examine the logs to ensure that no single time was repeated
|
||||
backoffDurRegexp := regexp.MustCompile(`backoff_duration=(.+)`)
|
||||
matches := backoffDurRegexp.FindAll(buf.Bytes(), -1)
|
||||
seenMap := make(map[string]int)
|
||||
for i, match := range matches {
|
||||
if origIndex, seen := seenMap[string(match)]; seen {
|
||||
t.Errorf("Match #%d (%q) was seen originally at log entry #%d", i, match, origIndex)
|
||||
} else {
|
||||
seenMap[string(match)] = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue