Merge pull request #955 from tendermint/939-p2p-exponential-backoff-on-reconnect
p2p: exponential backoff on reconnect. closes #939
This commit is contained in:
commit
24a9491203
107
p2p/switch.go
107
p2p/switch.go
|
@ -2,6 +2,7 @@ package p2p
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"net"
|
"net"
|
||||||
"time"
|
"time"
|
||||||
|
@ -14,8 +15,19 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
reconnectAttempts = 30
|
// wait a random amount of time from this interval
|
||||||
reconnectInterval = 3 * time.Second
|
// before dialing seeds or reconnecting to help prevent DoS
|
||||||
|
dialRandomizerIntervalMilliseconds = 3000
|
||||||
|
|
||||||
|
// repeatedly try to reconnect for a few minutes
|
||||||
|
// ie. 5 * 20 = 100s
|
||||||
|
reconnectAttempts = 20
|
||||||
|
reconnectInterval = 5 * time.Second
|
||||||
|
|
||||||
|
// then move into exponential backoff mode for roughly 8 hours
|
||||||
|
// ie. 3**0 + 3**1 + ... + 3**9 = 29524s, about 8.2hrs (plus random jitter)
|
||||||
|
reconnectBackOffAttempts = 10
|
||||||
|
reconnectBackOffBaseSeconds = 3
|
||||||
)
|
)
|
||||||
|
|
||||||
type Reactor interface {
|
type Reactor interface {
|
||||||
|
@ -74,6 +86,8 @@ type Switch struct {
|
||||||
|
|
||||||
filterConnByAddr func(net.Addr) error
|
filterConnByAddr func(net.Addr) error
|
||||||
filterConnByPubKey func(crypto.PubKeyEd25519) error
|
filterConnByPubKey func(crypto.PubKeyEd25519) error
|
||||||
|
|
||||||
|
rng *rand.Rand // seed for randomizing dial times and orders
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -92,6 +106,10 @@ func NewSwitch(config *cfg.P2PConfig) *Switch {
|
||||||
nodeInfo: nil,
|
nodeInfo: nil,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ensure we have a completely nondeterministic PRNG. cmn.RandInt64() draws
|
||||||
|
// from a seed that's initialized with OS entropy on process start.
|
||||||
|
sw.rng = rand.New(rand.NewSource(cmn.RandInt64()))
|
||||||
|
|
||||||
// TODO: collapse the peerConfig into the config ?
|
// TODO: collapse the peerConfig into the config ?
|
||||||
sw.peerConfig.MConfig.flushThrottle = time.Duration(config.FlushThrottleTimeout) * time.Millisecond
|
sw.peerConfig.MConfig.flushThrottle = time.Duration(config.FlushThrottleTimeout) * time.Millisecond
|
||||||
sw.peerConfig.MConfig.SendRate = config.SendRate
|
sw.peerConfig.MConfig.SendRate = config.SendRate
|
||||||
|
@ -317,15 +335,11 @@ func (sw *Switch) DialSeeds(addrBook *AddrBook, seeds []string) error {
|
||||||
addrBook.Save()
|
addrBook.Save()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure we have a completely nondeterministic PRNG. cmn.RandInt64() draws
|
|
||||||
// from a seed that's initialized with OS entropy on process start.
|
|
||||||
rng := rand.New(rand.NewSource(cmn.RandInt64()))
|
|
||||||
|
|
||||||
// permute the list, dial them in random order.
|
// permute the list, dial them in random order.
|
||||||
perm := rng.Perm(len(netAddrs))
|
perm := sw.rng.Perm(len(netAddrs))
|
||||||
for i := 0; i < len(perm); i++ {
|
for i := 0; i < len(perm); i++ {
|
||||||
go func(i int) {
|
go func(i int) {
|
||||||
time.Sleep(time.Duration(rng.Int63n(3000)) * time.Millisecond)
|
sw.randomSleep(0)
|
||||||
j := perm[i]
|
j := perm[i]
|
||||||
sw.dialSeed(netAddrs[j])
|
sw.dialSeed(netAddrs[j])
|
||||||
}(i)
|
}(i)
|
||||||
|
@ -333,6 +347,12 @@ func (sw *Switch) DialSeeds(addrBook *AddrBook, seeds []string) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sleep for interval plus some random amount of ms on [0, dialRandomizerIntervalMilliseconds]
|
||||||
|
func (sw *Switch) randomSleep(interval time.Duration) {
|
||||||
|
r := time.Duration(sw.rng.Int63n(dialRandomizerIntervalMilliseconds)) * time.Millisecond
|
||||||
|
time.Sleep(r + interval)
|
||||||
|
}
|
||||||
|
|
||||||
func (sw *Switch) dialSeed(addr *NetAddress) {
|
func (sw *Switch) dialSeed(addr *NetAddress) {
|
||||||
peer, err := sw.DialPeerWithAddress(addr, true)
|
peer, err := sw.DialPeerWithAddress(addr, true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -413,36 +433,61 @@ func (sw *Switch) Peers() IPeerSet {
|
||||||
// If the peer is persistent, it will attempt to reconnect.
|
// If the peer is persistent, it will attempt to reconnect.
|
||||||
// TODO: make record depending on reason.
|
// TODO: make record depending on reason.
|
||||||
func (sw *Switch) StopPeerForError(peer Peer, reason interface{}) {
|
func (sw *Switch) StopPeerForError(peer Peer, reason interface{}) {
|
||||||
addr, _ := NewNetAddressString(peer.NodeInfo().RemoteAddr)
|
|
||||||
sw.Logger.Error("Stopping peer for error", "peer", peer, "err", reason)
|
sw.Logger.Error("Stopping peer for error", "peer", peer, "err", reason)
|
||||||
sw.stopAndRemovePeer(peer, reason)
|
sw.stopAndRemovePeer(peer, reason)
|
||||||
|
|
||||||
if peer.IsPersistent() {
|
if peer.IsPersistent() {
|
||||||
go func() {
|
go sw.reconnectToPeer(peer)
|
||||||
sw.Logger.Info("Reconnecting to peer", "peer", peer)
|
|
||||||
for i := 1; i < reconnectAttempts; i++ {
|
|
||||||
if !sw.IsRunning() {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
peer, err := sw.DialPeerWithAddress(addr, true)
|
|
||||||
if err != nil {
|
|
||||||
if i == reconnectAttempts {
|
|
||||||
sw.Logger.Info("Error reconnecting to peer. Giving up", "tries", i, "err", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err)
|
|
||||||
time.Sleep(reconnectInterval)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
sw.Logger.Info("Reconnected to peer", "peer", peer)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reconnectToPeer tries to reconnect to the peer, first repeatedly
|
||||||
|
// with a fixed interval, then with exponential backoff.
|
||||||
|
// If no success after all that, it stops trying, and leaves it
|
||||||
|
// to the PEX/Addrbook to find the peer again
|
||||||
|
func (sw *Switch) reconnectToPeer(peer Peer) {
|
||||||
|
addr, _ := NewNetAddressString(peer.NodeInfo().RemoteAddr)
|
||||||
|
start := time.Now()
|
||||||
|
sw.Logger.Info("Reconnecting to peer", "peer", peer)
|
||||||
|
for i := 0; i < reconnectAttempts; i++ {
|
||||||
|
if !sw.IsRunning() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
peer, err := sw.DialPeerWithAddress(addr, true)
|
||||||
|
if err != nil {
|
||||||
|
sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err, "peer", peer)
|
||||||
|
// sleep a set amount
|
||||||
|
sw.randomSleep(reconnectInterval)
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
sw.Logger.Info("Reconnected to peer", "peer", peer)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sw.Logger.Error("Failed to reconnect to peer. Beginning exponential backoff",
|
||||||
|
"peer", peer, "elapsed", time.Since(start))
|
||||||
|
for i := 0; i < reconnectBackOffAttempts; i++ {
|
||||||
|
if !sw.IsRunning() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// sleep an exponentially increasing amount
|
||||||
|
sleepIntervalSeconds := math.Pow(reconnectBackOffBaseSeconds, float64(i))
|
||||||
|
sw.randomSleep(time.Duration(sleepIntervalSeconds) * time.Second)
|
||||||
|
peer, err := sw.DialPeerWithAddress(addr, true)
|
||||||
|
if err != nil {
|
||||||
|
sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err, "peer", peer)
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
sw.Logger.Info("Reconnected to peer", "peer", peer)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sw.Logger.Error("Failed to reconnect to peer. Giving up", "peer", peer, "elapsed", time.Since(start))
|
||||||
|
}
|
||||||
|
|
||||||
// StopPeerGracefully disconnects from a peer gracefully.
|
// StopPeerGracefully disconnects from a peer gracefully.
|
||||||
// TODO: handle graceful disconnects.
|
// TODO: handle graceful disconnects.
|
||||||
func (sw *Switch) StopPeerGracefully(peer Peer) {
|
func (sw *Switch) StopPeerGracefully(peer Peer) {
|
||||||
|
|
|
@ -272,10 +272,10 @@ func TestSwitchReconnectsToPersistentPeer(t *testing.T) {
|
||||||
// simulate failure by closing connection
|
// simulate failure by closing connection
|
||||||
peer.CloseConn()
|
peer.CloseConn()
|
||||||
|
|
||||||
// TODO: actually detect the disconnection and wait for reconnect
|
// TODO: remove sleep, detect the disconnection, wait for reconnect
|
||||||
npeers := sw.Peers().Size()
|
npeers := sw.Peers().Size()
|
||||||
for i := 0; i < 20; i++ {
|
for i := 0; i < 20; i++ {
|
||||||
time.Sleep(100 * time.Millisecond)
|
time.Sleep(250 * time.Millisecond)
|
||||||
npeers = sw.Peers().Size()
|
npeers = sw.Peers().Size()
|
||||||
if npeers > 0 {
|
if npeers > 0 {
|
||||||
break
|
break
|
||||||
|
|
Loading…
Reference in New Issue