node/p2p: Enforce connection to bootstrap node on startup

This commit is contained in:
tbjump 2023-06-27 17:43:54 +00:00 committed by tbjump
parent 372beb01fe
commit c8fca0f5b9
3 changed files with 50 additions and 2 deletions

View File

@ -833,8 +833,10 @@ func runNode(cmd *cobra.Command, args []string) {
logger.Info("Error resolving guardian-0.guardian. Trying again...")
time.Sleep(time.Second)
}
// TODO this is a hack. If this is not the bootstrap Guardian, we wait 5s such that the bootstrap Guardian has enough time to start.
logger.Info("This is not a bootstrap Guardian. Waiting another 10 seconds so the bootstrap guardian to come online.")
// TODO this is a hack. If this is not the bootstrap Guardian, we wait 10s such that the bootstrap Guardian has enough time to start.
// This may no longer be necessary because p2p.go now ensures that it can connect to at least one bootstrap peer and will
// exit the whole guardian if it is unable to. Sleeping here for a bit may reduce overall startup time by preventing unnecessary restarts, though.
logger.Info("This is not a bootstrap Guardian. Waiting another 10 seconds for the bootstrap guardian to come online.")
time.Sleep(time.Second * 10)
}
} else {

View File

@ -542,6 +542,9 @@ func testConsensus(t *testing.T, testCases []testCase, numGuardians int) {
for i := 0; i < numGuardians; i++ {
gRun := mockGuardianRunnable(gs, uint(i), obsDb)
err := supervisor.Run(ctx, fmt.Sprintf("g-%d", i), gRun)
if i == 0 && numGuardians > 1 {
time.Sleep(time.Second) // give the bootstrap guardian some time to start up
}
assert.NoError(t, err)
}
logger.Info("All Guardians initiated.")

View File

@ -255,6 +255,49 @@ func Run(
return fmt.Errorf("failed to subscribe topic: %w", err)
}
// Make sure we connect to at least 1 bootstrap node (this is particularly important in a local devnet and CI
// as peer discovery can take a long time).
// Count the number of successful connection attempts. If we fail to connect to any bootstrap peer, kill the service.
// TODO: Currently, returning from this function will lead to rootCtxCancel() being called by the defer statement above.
// The service will then be restarted by Tilt/Kubernetes.
successes := 0
// Are we a bootstrap node? If so, it's okay to not have any peers.
bootstrapNode := false
for _, addr := range strings.Split(bootstrapPeers, ",") {
if addr == "" {
continue
}
ma, err := multiaddr.NewMultiaddr(addr)
if err != nil {
logger.Error("Invalid bootstrap address", zap.String("peer", addr), zap.Error(err))
continue
}
pi, err := peer.AddrInfoFromP2pAddr(ma)
if err != nil {
logger.Error("Invalid bootstrap address", zap.String("peer", addr), zap.Error(err))
continue
}
if pi.ID == h.ID() {
logger.Info("We're a bootstrap node")
bootstrapNode = true
continue
}
if err = h.Connect(ctx, *pi); err != nil {
logger.Error("Failed to connect to bootstrap peer", zap.String("peer", addr), zap.Error(err))
} else {
successes += 1
}
}
if successes == 0 && !bootstrapNode {
return fmt.Errorf("failed to connect to any bootstrap peer")
}
logger.Info("Connected to bootstrap peers", zap.Int("num", successes))
logger.Info("Node has been started", zap.String("peer_id", h.ID().String()),
zap.String("addrs", fmt.Sprintf("%v", h.Addrs())))