node: poller timeout (#1670)

* node: poller timeout

Change-Id: Ia324f1ac482fa9c5bea2b501970f0b22b16e67ce

* Add a comment explaining readiness change

* Add comment explaining why we are using a timeout

* Retry if polling fails
This commit is contained in:
bruce-riley 2022-10-06 00:19:31 -05:00 committed by GitHub
parent 9657f41561
commit dd2b8e2bd2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 28 additions and 9 deletions

View File

@ -604,11 +604,12 @@ func runNode(cmd *cobra.Command, args []string) {
if *injectiveContract == "" {
logger.Fatal("Please specify --injectiveContract")
}
if *arbitrumRPC == "" {
logger.Fatal("Please specify --arbitrumRPC")
}
if *arbitrumContract == "" {
logger.Fatal("Please specify --arbitrumContract")
if *arbitrumRPC != "" {
if *arbitrumContract == "" {
logger.Fatal("If --arbitrumRPC is specified, then --arbitrumContract is required")
}
} else if *arbitrumContract != "" {
logger.Fatal("If --arbitrumContract is specified, then --arbitrumRPC is required")
}
if *xplaWS == "" {
logger.Fatal("Please specify --xplaWS")

View File

@ -64,7 +64,14 @@ func (b *BlockPollConnector) run(ctx context.Context) error {
timer.Stop()
return ctx.Err()
case <-timer.C:
lastBlock, err = b.pollBlocks(ctx, logger, lastBlock)
for count := 0; count < 3; count++ {
lastBlock, err = b.pollBlocks(ctx, logger, lastBlock)
if err == nil {
break
}
logger.Error("polling encountered an error", zap.Error(err))
}
if err != nil {
b.errFeed.Send("polling encountered an error")
}
@ -74,12 +81,19 @@ func (b *BlockPollConnector) run(ctx context.Context) error {
}
func (b *BlockPollConnector) pollBlocks(ctx context.Context, logger *zap.Logger, lastBlock *NewBlock) (lastPublishedBlock *NewBlock, retErr error) {
// Some of the testnet providers (like the one we are using for Arbitrum) limit how many transactions we can do. When that happens, the call hangs.
// Use a timeout so that the call will fail and the runnable will get restarted. This should not happen in mainnet, but if it does, we will need to
// investigate why the runnable is dying and fix the underlying problem.
timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
defer cancel()
lastPublishedBlock = lastBlock
// Fetch the latest block on the chain
// We could do this on every iteration such that if a new block is created while this function is being executed,
// it would automatically fetch new blocks but in order to reduce API load this will be done on the next iteration.
latestBlock, err := b.getBlock(ctx, logger, nil)
latestBlock, err := b.getBlock(timeout, logger, nil)
if err != nil {
logger.Error("failed to look up latest block",
zap.Uint64("lastSeenBlock", lastBlock.Number.Uint64()), zap.Error(err))
@ -93,7 +107,7 @@ func (b *BlockPollConnector) pollBlocks(ctx context.Context, logger *zap.Logger,
// Try to fetch the next block between lastBlock and latestBlock
nextBlockNumber := new(big.Int).Add(lastPublishedBlock.Number, big.NewInt(1))
block, err := b.getBlock(ctx, logger, nextBlockNumber)
block, err := b.getBlock(timeout, logger, nextBlockNumber)
if err != nil {
logger.Error("failed to fetch next block",
zap.Uint64("block", nextBlockNumber.Uint64()), zap.Error(err))
@ -101,7 +115,7 @@ func (b *BlockPollConnector) pollBlocks(ctx context.Context, logger *zap.Logger,
}
if b.finalizer != nil {
finalized, err := b.finalizer.IsBlockFinalized(ctx, block)
finalized, err := b.finalizer.IsBlockFinalized(timeout, block)
if err != nil {
logger.Error("failed to check block finalization",
zap.Uint64("block", block.Number.Uint64()), zap.Error(err))

View File

@ -634,6 +634,10 @@ func (w *Watcher) Run(ctx context.Context) error {
}
}()
// Now that the init is complete, peg readiness. That will also happen when we process a new head, but chains
// that wait for finality may take a while to receive the first block and we don't want to hold up the init.
readiness.SetReady(w.readiness)
select {
case <-ctx.Done():
return ctx.Err()