rpc/lib/client: add jitter for exponential backoff of WSClient

Fixes https://github.com/tendermint/tendermint/issues/751.

Adds jitter to our exponential backoff to mitigate a self-inflicted
DDoS vector. The jitter is a randomly picked fraction of a second
that is added to each exponential backoff delay of
(1<<attempts) == 2**attempts seconds, so that each client waits a
random extra buffer time before it tries to reconnect, instead of
all clients reconnecting at once, which might even bring back the
conditions that caused the dial to the WSServer to fail in the
first place, e.g.:
* Network outage
* File descriptor exhaustion
* False positives from firewalls
etc
This commit is contained in:
Emmanuel Odeke 2017-10-24 02:00:17 -07:00
parent 6a5254c475
commit 5504920ba3
No known key found for this signature in database
GPG Key ID: 1CA47A292F89DD40
2 changed files with 61 additions and 4 deletions

View File

@ -4,7 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"math/rand"
"net"
"net/http"
"sync"
@ -254,11 +254,13 @@ func (c *WSClient) reconnect() error {
c.mtx.Unlock()
}()
_1sAsNs := float64(time.Second.Nanoseconds())
for {
c.Logger.Info("reconnecting", "attempt", attempt+1)
jitter := time.Duration(rand.Float64() * _1sAsNs)
backoffDuration := jitter + ((1 << uint(attempt)) * time.Second)
d := time.Duration(math.Exp2(float64(attempt)))
time.Sleep(d * time.Second)
c.Logger.Info("reconnecting", "attempt", attempt+1, "backoff_duration", backoffDuration)
time.Sleep(backoffDuration)
err := c.dial()
if err != nil {

View File

@ -1,11 +1,14 @@
package rpcclient
import (
"bytes"
"context"
"encoding/json"
"errors"
"net"
"net/http"
"net/http/httptest"
"regexp"
"sync"
"testing"
"time"
@ -191,3 +194,55 @@ func callWgDoneOnResult(t *testing.T, c *WSClient, wg *sync.WaitGroup) {
}
}
}
// TestWSClientReconnectWithJitter creates several clients whose dialer always
// fails, kicks off reconnect on each of them concurrently, waits for the
// backoff schedule to elapse and then verifies that:
//   - every client reports itself as inactive, and
//   - no two logged "backoff_duration" values are identical, i.e. the random
//     jitter really did spread the retries apart.
//
// The test sleeps for several seconds, so it is skipped in -short mode.
func TestWSClientReconnectWithJitter(t *testing.T) {
	if testing.Short() {
		t.Skip("This is a potentially long test")
	}

	n := 8
	maxReconnectAttempts := 3

	// Upper bound on the cumulative backoff: attempt i sleeps 2^i seconds plus
	// strictly less than 1s of jitter, so the total is below
	// (2^maxReconnectAttempts - 1) + maxReconnectAttempts seconds,
	// e.g. (1 + 2 + 4) + 3 = 10s for 3 attempts.
	// NOTE(review): this assumes reconnect gives up after exactly
	// maxReconnectAttempts dials — confirm against reconnect's loop bound.
	maxSleepTime := time.Second * time.Duration(((1<<uint(maxReconnectAttempts))-1)+maxReconnectAttempts)

	var errNotConnected = errors.New("not connected")
	clientMap := make(map[int]*WSClient)

	// All clients log into the same buffer so that the backoff durations can
	// be compared across clients afterwards.
	buf := new(bytes.Buffer)
	logger := log.NewTMLogger(buf)

	for i := 0; i < n; i++ {
		c := NewWSClient("tcp://foo", "/websocket")
		// Force every dial to fail so that reconnect keeps backing off.
		c.Dialer = func(string, string) (net.Conn, error) {
			return nil, errNotConnected
		}
		c.SetLogger(logger)
		c.maxReconnectAttempts = maxReconnectAttempts
		// Not invoking defer c.Stop() because
		// after all the reconnect attempts have been
		// exhausted, c.Stop is implicitly invoked.
		clientMap[i] = c
		// Trigger the reconnect routine that performs exponential backoff.
		go c.reconnect()
	}

	stopCount := 0
	time.Sleep(maxSleepTime)
	for key, c := range clientMap {
		if !c.IsActive() {
			delete(clientMap, key)
			stopCount++
		}
	}
	// testify expects (expected, actual); the original had them swapped, which
	// would mislabel the values in a failure report.
	require.Equal(t, n, stopCount, "expecting all clients to have been stopped")

	// Next we have to examine the logs to ensure that no single time was repeated.
	// NOTE(review): buf is read here while reconnect goroutines may still be
	// logging if they outlive maxSleepTime — confirm the logger/buffer access
	// is synchronized or that all goroutines have finished by now.
	backoffDurRegexp := regexp.MustCompile(`backoff_duration=(.+)`)
	matches := backoffDurRegexp.FindAll(buf.Bytes(), -1)
	seenMap := make(map[string]int)
	for i, match := range matches {
		key := string(match)
		if origIndex, seen := seenMap[key]; seen {
			t.Errorf("Match #%d (%q) was seen originally at log entry #%d", i, match, origIndex)
			continue
		}
		seenMap[key] = i
	}
}