package evm

import (
	"context"
	"fmt"
	"math"
	"math/big"
	"sync"
	"sync/atomic"
	"time"

	"github.com/certusone/wormhole/node/pkg/watchers/evm/connectors"
	"github.com/certusone/wormhole/node/pkg/watchers/evm/connectors/ethabi"
	"github.com/certusone/wormhole/node/pkg/watchers/interfaces"

	"github.com/certusone/wormhole/node/pkg/p2p"
	gossipv1 "github.com/certusone/wormhole/node/pkg/proto/gossip/v1"
	"github.com/ethereum/go-ethereum/rpc"
	"github.com/prometheus/client_golang/prometheus/promauto"

	"github.com/prometheus/client_golang/prometheus"

	eth_common "github.com/ethereum/go-ethereum/common"
	eth_hexutil "github.com/ethereum/go-ethereum/common/hexutil"
	"go.uber.org/zap"

	"github.com/certusone/wormhole/node/pkg/common"
	"github.com/certusone/wormhole/node/pkg/query"
	"github.com/certusone/wormhole/node/pkg/readiness"
	"github.com/certusone/wormhole/node/pkg/supervisor"
	"github.com/wormhole-foundation/wormhole/sdk/vaa"
)

var (
	ethConnectionErrors = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "wormhole_eth_connection_errors_total",
			Help: "Total number of Ethereum connection errors (either during initial connection or while watching)",
		}, []string{"eth_network", "reason"})

	ethMessagesObserved = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "wormhole_eth_messages_observed_total",
			Help: "Total number of Eth messages observed (pre-confirmation)",
		}, []string{"eth_network"})
	ethMessagesOrphaned = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "wormhole_eth_messages_orphaned_total",
			Help: "Total number of Eth messages dropped (orphaned)",
		}, []string{"eth_network", "reason"})
	ethMessagesConfirmed = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "wormhole_eth_messages_confirmed_total",
			Help: "Total number of Eth messages verified (post-confirmation)",
		}, []string{"eth_network"})
	currentEthHeight = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "wormhole_eth_current_height",
			Help: "Current Ethereum block height",
		}, []string{"eth_network"})
	currentEthSafeHeight = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "wormhole_eth_current_safe_height",
			Help: "Current Ethereum safe block height",
		}, []string{"eth_network"})
	currentEthFinalizedHeight = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "wormhole_eth_current_finalized_height",
			Help: "Current Ethereum finalized block height",
		}, []string{"eth_network"})
	queryLatency = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "wormhole_eth_query_latency",
			Help: "Latency histogram for Ethereum calls (note that most interactions are streaming queries, NOT calls, and we cannot measure latency for those)",
		}, []string{"eth_network", "operation"})
)

type (
	Watcher struct {
		// Ethereum RPC url
		url string
		// Address of the Eth contract
		contract eth_common.Address
		// Human-readable name of the Eth network, for logging and monitoring.
		networkName string
		// Readiness component
		readinessSync readiness.Component
		// VAA ChainID of the network we're connecting to.
		chainID vaa.ChainID

		// Channel to send new messages to.
		msgC chan<- *common.MessagePublication

		// Channel to send guardian set changes to.
		// setC can be set to nil if no guardian set changes are needed.
		//
		// We currently only fetch the guardian set from one primary chain, which should
		// have this channel set; it should be nil on all others.
		//
		// The current primary chain is Ethereum (a mostly arbitrary decision because it
		// has the best API - we might want to switch the primary chain to Solana once
		// the governance mechanism lives there).
		setC chan<- *common.GuardianSet

		// Incoming re-observation requests from the network. Pre-filtered to only
		// include requests for our chainID.
		obsvReqC <-chan *gossipv1.ObservationRequest

		// Incoming query requests from the network. Pre-filtered to only
		// include requests for our chainID.
		queryReqC <-chan *query.PerChainQueryInternal

		// Outbound query responses to query requests
		queryResponseC chan<- *query.PerChainQueryResponseInternal

		// Message publications that have been observed but are waiting to reach the required consistency level.
		pending   map[pendingKey]*pendingMessage
		pendingMu sync.Mutex

		// 0 is a valid guardian set, so we need a nil value here
		currentGuardianSet *uint32

		// Interface to the chain specific ethereum library.
		ethConn       connectors.Connector
		unsafeDevMode bool

		latestBlockNumber          uint64
		latestSafeBlockNumber      uint64
		latestFinalizedBlockNumber uint64
		l1Finalizer                interfaces.L1Finalizer

		ccqConfig          query.PerChainConfig
		ccqMaxBlockNumber  *big.Int
		ccqTimestampCache  *BlocksByTimestamp
		ccqBackfillChannel chan *ccqBackfillRequest
		ccqBatchSize       int64
		ccqBackfillCache   bool
		ccqLogger          *zap.Logger
	}

	// pendingKey identifies a pending observation by transaction, block, emitter, and sequence.
	pendingKey struct {
		TxHash         eth_common.Hash
		BlockHash      eth_common.Hash
		EmitterAddress vaa.Address
		Sequence       uint64
	}

	// pendingMessage is an observed message publication along with the block height at which it was observed.
	pendingMessage struct {
		message *common.MessagePublication
		height  uint64
	}
)

// MaxWaitConfirmations is the maximum number of confirmations to wait before declaring a transaction abandoned.
const MaxWaitConfirmations = 60
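
// NewEthWatcher creates a new EVM watcher for the given chain and core contract address.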
func NewEthWatcher(
	url string,
	contract eth_common.Address,
	networkName string,
	chainID vaa.ChainID,
	msgC chan<- *common.MessagePublication,
	setC chan<- *common.GuardianSet,
	obsvReqC <-chan *gossipv1.ObservationRequest,
	queryReqC <-chan *query.PerChainQueryInternal,
	queryResponseC chan<- *query.PerChainQueryResponseInternal,
	unsafeDevMode bool,
	ccqBackfillCache bool,
) *Watcher {
	return &Watcher{
		url:                url,
		contract:           contract,
		networkName:        networkName,
		readinessSync:      common.MustConvertChainIdToReadinessSyncing(chainID),
		chainID:            chainID,
		msgC:               msgC,
		setC:               setC,
		obsvReqC:           obsvReqC,
		queryReqC:          queryReqC,
		queryResponseC:     queryResponseC,
		pending:            map[pendingKey]*pendingMessage{},
		unsafeDevMode:      unsafeDevMode,
		ccqConfig:          query.GetPerChainConfig(chainID),
		ccqMaxBlockNumber:  big.NewInt(0).SetUint64(math.MaxUint64),
		ccqBackfillCache:   ccqBackfillCache,
		ccqBackfillChannel: make(chan *ccqBackfillRequest, 50),
	}
}
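
// Run connects to the configured RPC endpoint, subscribes to message publication logs and new block
// headers, and processes observations until the context is canceled or a fatal error occurs.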
func (w *Watcher) Run(parentCtx context.Context) error {
	var err error
	logger := supervisor.Logger(parentCtx)
	w.ccqLogger = logger.With(zap.String("component", "ccqevm"))

	logger.Info("Starting watcher",
		zap.String("watcher_name", "evm"),
		zap.String("url", w.url),
		zap.String("contract", w.contract.String()),
		zap.String("networkName", w.networkName),
		zap.String("chainID", w.chainID.String()),
		zap.Bool("unsafeDevMode", w.unsafeDevMode),
	)

	// Later on we will spawn multiple goroutines through `RunWithScissors`, which catches panics.
	// If any of them panic, this function will return, causing this child context to be canceled
	// so that the other goroutines can free up their resources.
	ctx, watcherContextCancelFunc := context.WithCancel(parentCtx)
	defer watcherContextCancelFunc()

	// Initialize gossip metrics (we want to broadcast the address even if we're not yet syncing)
	p2p.DefaultRegistry.SetNetworkStats(w.chainID, &gossipv1.Heartbeat_Network{
		ContractAddress: w.contract.Hex(),
	})

	timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
	defer cancel()

	finalizedPollingSupported, safePollingSupported, err := w.getFinality(ctx)
	if err != nil {
		return fmt.Errorf("failed to determine finality: %w", err)
	}

	if finalizedPollingSupported {
		if safePollingSupported {
			logger.Info("polling for finalized and safe blocks")
		} else {
			logger.Info("polling for finalized blocks, will generate safe blocks")
		}
		baseConnector, err := connectors.NewEthereumBaseConnector(timeout, w.networkName, w.url, w.contract, logger)
		if err != nil {
			ethConnectionErrors.WithLabelValues(w.networkName, "dial_error").Inc()
			p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
			return fmt.Errorf("dialing eth client failed: %w", err)
		}
		w.ethConn = connectors.NewBatchPollConnector(ctx, logger, baseConnector, safePollingSupported, 1000*time.Millisecond)
	} else if w.chainID == vaa.ChainIDCelo {
		// When we are running in mainnet or testnet, we need to use the Celo ethereum library rather than go-ethereum.
		// However, in devnet, we currently run the standard ETH node for Celo, so we need to use the standard go-ethereum.
		w.ethConn, err = connectors.NewCeloConnector(timeout, w.networkName, w.url, w.contract, logger)
		if err != nil {
			ethConnectionErrors.WithLabelValues(w.networkName, "dial_error").Inc()
			p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
			return fmt.Errorf("dialing eth client failed: %w", err)
		}
	} else {
		// Everything else is instant finality.
		logger.Info("assuming instant finality")
		baseConnector, err := connectors.NewEthereumBaseConnector(timeout, w.networkName, w.url, w.contract, logger)
		if err != nil {
			ethConnectionErrors.WithLabelValues(w.networkName, "dial_error").Inc()
			p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
			return fmt.Errorf("dialing eth client failed: %w", err)
		}
		w.ethConn, err = connectors.NewInstantFinalityConnector(baseConnector, logger)
		if err != nil {
			ethConnectionErrors.WithLabelValues(w.networkName, "dial_error").Inc()
			p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
			return fmt.Errorf("failed to connect to instant finality chain: %w", err)
		}
	}

	if w.ccqConfig.TimestampCacheSupported {
		w.ccqTimestampCache = NewBlocksByTimestamp(BTS_MAX_BLOCKS, w.unsafeDevMode)
	}

	errC := make(chan error)

	// Subscribe to new message publications. We don't use a timeout here because the LogPollConnector
	// will keep running. Other connectors will use a timeout internally if appropriate.
	messageC := make(chan *ethabi.AbiLogMessagePublished, 2)
	messageSub, err := w.ethConn.WatchLogMessagePublished(ctx, errC, messageC)
	if err != nil {
		ethConnectionErrors.WithLabelValues(w.networkName, "subscribe_error").Inc()
		p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
		return fmt.Errorf("failed to subscribe to message publication events: %w", err)
	}
	defer messageSub.Unsubscribe()

	// Fetch initial guardian set
	if err := w.fetchAndUpdateGuardianSet(logger, ctx, w.ethConn); err != nil {
		return fmt.Errorf("failed to request guardian set: %v", err)
	}

	// Poll for guardian set.
	common.RunWithScissors(ctx, errC, "evm_fetch_guardian_set", func(ctx context.Context) error {
		t := time.NewTicker(15 * time.Second)
		defer t.Stop()
		for {
			select {
			case <-ctx.Done():
				return nil
			case <-t.C:
				if err := w.fetchAndUpdateGuardianSet(logger, ctx, w.ethConn); err != nil {
					errC <- fmt.Errorf("failed to request guardian set: %v", err)
					return nil
				}
			}
		}
	})
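
	// Handle re-observation requests received from the network.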
	common.RunWithScissors(ctx, errC, "evm_fetch_objs_req", func(ctx context.Context) error {
		for {
			select {
			case <-ctx.Done():
				return nil
			case r := <-w.obsvReqC:
				// This can't happen unless there is a programming error - the caller
				// is expected to send us only requests for our chainID.
				if vaa.ChainID(r.ChainId) != w.chainID {
					panic("invalid chain ID")
				}

				tx := eth_common.BytesToHash(r.TxHash)
				logger.Info("received observation request", zap.String("tx_hash", tx.Hex()))

				// SECURITY: Load the block number before requesting the transaction to avoid a
				// race condition where requesting the tx succeeds and is then dropped due to a fork,
				// but blockNumberU had already advanced beyond the required threshold.
				//
				// In the primary watcher flow, this is of no concern since we assume the node
				// always sends the head before it sends the logs (implicit synchronization
				// by relying on the same websocket connection).
				blockNumberU := atomic.LoadUint64(&w.latestFinalizedBlockNumber)
				safeBlockNumberU := atomic.LoadUint64(&w.latestSafeBlockNumber)

				timeout, cancel := context.WithTimeout(ctx, 5*time.Second)
				blockNumber, msgs, err := MessageEventsForTransaction(timeout, w.ethConn, w.contract, w.chainID, tx)
				cancel()

				if err != nil {
					logger.Error("failed to process observation request", zap.String("tx_hash", tx.Hex()), zap.Error(err))
					continue
				}

				for _, msg := range msgs {
					msg.IsReobservation = true
					if msg.ConsistencyLevel == vaa.ConsistencyLevelPublishImmediately {
						logger.Info("re-observed message publication transaction, publishing it immediately",
							zap.String("msgId", msg.MessageIDString()),
							zap.Stringer("txHash", msg.TxHash),
							zap.Uint64("current_block", blockNumberU),
							zap.Uint64("observed_block", blockNumber),
						)
						w.msgC <- msg
						continue
					}

					if msg.ConsistencyLevel == vaa.ConsistencyLevelSafe {
						if safeBlockNumberU == 0 {
							logger.Error("no safe block number available, ignoring observation request",
								zap.String("msgId", msg.MessageIDString()),
								zap.Stringer("txHash", msg.TxHash),
							)
							continue
						}

						if blockNumber <= safeBlockNumberU {
							logger.Info("re-observed message publication transaction",
								zap.String("msgId", msg.MessageIDString()),
								zap.Stringer("txHash", msg.TxHash),
								zap.Uint64("current_safe_block", safeBlockNumberU),
								zap.Uint64("observed_block", blockNumber),
							)
							w.msgC <- msg
						} else {
							logger.Info("ignoring re-observed message publication transaction",
								zap.String("msgId", msg.MessageIDString()),
								zap.Stringer("txHash", msg.TxHash),
								zap.Uint64("current_safe_block", safeBlockNumberU),
								zap.Uint64("observed_block", blockNumber),
							)
						}

						continue
					}

					if blockNumberU == 0 {
						logger.Error("no block number available, ignoring observation request",
							zap.String("msgId", msg.MessageIDString()),
							zap.Stringer("txHash", msg.TxHash),
						)
						continue
					}

					// SECURITY: In the recovery flow, we already know which transaction to
					// observe, and we can assume that it has reached the expected finality
					// level a long time ago. Therefore, the logic is much simpler than the
					// primary watcher, which has to wait for finality.
					//
					// Instead, we can simply check if the transaction's block number is in
					// the past by more than the expected confirmation number.
					//
					// Ensure that the current block number is larger than the message observation's block number.
					if blockNumber <= blockNumberU {
						logger.Info("re-observed message publication transaction",
							zap.String("msgId", msg.MessageIDString()),
							zap.Stringer("txHash", msg.TxHash),
							zap.Uint64("current_block", blockNumberU),
							zap.Uint64("observed_block", blockNumber),
						)
						w.msgC <- msg
					} else {
						logger.Info("ignoring re-observed message publication transaction",
							zap.String("msgId", msg.MessageIDString()),
							zap.Stringer("txHash", msg.TxHash),
							zap.Uint64("current_block", blockNumberU),
							zap.Uint64("observed_block", blockNumber),
						)
					}
				}
			}
		}
	})
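
	// If this chain supports cross-chain queries (CCQ), start the query handler.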
	if w.ccqConfig.QueriesSupported() {
		w.ccqStart(ctx, errC)
	}
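
	// Process new message publication events as they arrive on the log subscription.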
	common.RunWithScissors(ctx, errC, "evm_fetch_messages", func(ctx context.Context) error {
		for {
			select {
			case <-ctx.Done():
				return nil
			case err := <-messageSub.Err():
				ethConnectionErrors.WithLabelValues(w.networkName, "subscription_error").Inc()
				errC <- fmt.Errorf("error while processing message publication subscription: %w", err)
				p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
				return nil
			case ev := <-messageC:
				blockTime, err := w.getBlockTime(ctx, ev.Raw.BlockHash)
				if err != nil {
					ethConnectionErrors.WithLabelValues(w.networkName, "block_by_number_error").Inc()
					if canRetryGetBlockTime(err) {
						go w.waitForBlockTime(ctx, logger, errC, ev)
						continue
					}
					p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
					errC <- fmt.Errorf("failed to request timestamp for block %d, hash %s: %w", ev.Raw.BlockNumber, ev.Raw.BlockHash.String(), err)
					return nil
				}

				w.postMessage(logger, ev, blockTime)
			}
		}
	})

	// Watch headers
	headSink := make(chan *connectors.NewBlock, 100)
	headerSubscription, err := w.ethConn.SubscribeForBlocks(ctx, errC, headSink)
	if err != nil {
		ethConnectionErrors.WithLabelValues(w.networkName, "header_subscribe_error").Inc()
		p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
		return fmt.Errorf("failed to subscribe to header events: %w", err)
	}
	defer headerSubscription.Unsubscribe()

	common.RunWithScissors(ctx, errC, "evm_fetch_headers", func(ctx context.Context) error {
		stats := gossipv1.Heartbeat_Network{ContractAddress: w.contract.Hex()}
		for {
			select {
			case <-ctx.Done():
				return nil
			case err := <-headerSubscription.Err():
				logger.Error("error while processing header subscription", zap.Error(err))
				ethConnectionErrors.WithLabelValues(w.networkName, "header_subscription_error").Inc()
				errC <- fmt.Errorf("error while processing header subscription: %w", err)
				p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
				return nil
			case ev := <-headSink:
				// These two pointers should have been checked before the event was placed on the channel, but just being safe.
				if ev == nil {
					logger.Error("new header event is nil")
					continue
				}
				if ev.Number == nil {
					logger.Error("new header block number is nil", zap.Stringer("finality", ev.Finality))
					continue
				}

				start := time.Now()
				currentHash := ev.Hash
				logger.Debug("processing new header",
					zap.Stringer("current_block", ev.Number),
					zap.Uint64("block_time", ev.Time),
					zap.Stringer("current_blockhash", currentHash),
					zap.Stringer("finality", ev.Finality),
				)
				readiness.SetReady(w.readinessSync)

				blockNumberU := ev.Number.Uint64()
				if ev.Finality == connectors.Latest {
					atomic.StoreUint64(&w.latestBlockNumber, blockNumberU)
					currentEthHeight.WithLabelValues(w.networkName).Set(float64(blockNumberU))
					stats.Height = int64(blockNumberU)
					w.updateNetworkStats(&stats)
					w.ccqAddLatestBlock(ev)
					continue
				}

				// The only blocks that get here are safe and finalized.

				if ev.Finality == connectors.Safe {
					atomic.StoreUint64(&w.latestSafeBlockNumber, blockNumberU)
					currentEthSafeHeight.WithLabelValues(w.networkName).Set(float64(blockNumberU))
					stats.SafeHeight = int64(blockNumberU)
				} else {
					atomic.StoreUint64(&w.latestFinalizedBlockNumber, blockNumberU)
					currentEthFinalizedHeight.WithLabelValues(w.networkName).Set(float64(blockNumberU))
					stats.FinalizedHeight = int64(blockNumberU)
				}
				w.updateNetworkStats(&stats)
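
				// See which pending observations have now reached the required finality and can be published.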
				w.pendingMu.Lock()
				for key, pLock := range w.pending {
					// If this block is safe, only process messages wanting safe.
					// If it's not safe, only process messages wanting finalized.
					if (ev.Finality == connectors.Safe) != (pLock.message.ConsistencyLevel == vaa.ConsistencyLevelSafe) {
						continue
					}

					// Transaction is now ready
					if pLock.height <= blockNumberU {
						msm := time.Now()
						timeout, cancel := context.WithTimeout(ctx, 5*time.Second)
						tx, err := w.ethConn.TransactionReceipt(timeout, pLock.message.TxHash)
						queryLatency.WithLabelValues(w.networkName, "transaction_receipt").Observe(time.Since(msm).Seconds())
						cancel()

						// If the node returns an error after waiting expectedConfirmation blocks,
						// it means the chain reorged and the transaction was orphaned. The
						// TransactionReceipt call is using the same websocket connection as the
						// head notifications, so it's guaranteed to be atomic.
						//
						// Check multiple possible error cases - the node seems to return a
						// "not found" error most of the time, but it could conceivably also
						// return a nil tx or rpc.ErrNoResult.
						if tx == nil || err == rpc.ErrNoResult || (err != nil && err.Error() == "not found") {
							logger.Warn("tx was orphaned",
								zap.String("msgId", pLock.message.MessageIDString()),
								zap.Stringer("txHash", pLock.message.TxHash),
								zap.Stringer("blockHash", key.BlockHash),
								zap.Uint64("target_blockNum", pLock.height),
								zap.Stringer("current_blockNum", ev.Number),
								zap.Stringer("finality", ev.Finality),
								zap.Stringer("current_blockHash", currentHash),
								zap.Error(err))
							delete(w.pending, key)
							ethMessagesOrphaned.WithLabelValues(w.networkName, "not_found").Inc()
							continue
						}

						// This should never happen - if we got this far, it means that logs were emitted,
						// which is only possible if the transaction succeeded. We check it anyway just
						// in case the EVM implementation is buggy.
						if tx.Status != 1 {
							logger.Error("transaction receipt with non-success status",
								zap.String("msgId", pLock.message.MessageIDString()),
								zap.Stringer("txHash", pLock.message.TxHash),
								zap.Stringer("blockHash", key.BlockHash),
								zap.Uint64("target_blockNum", pLock.height),
								zap.Stringer("current_blockNum", ev.Number),
								zap.Stringer("finality", ev.Finality),
								zap.Stringer("current_blockHash", currentHash),
								zap.Error(err))
							delete(w.pending, key)
							ethMessagesOrphaned.WithLabelValues(w.networkName, "tx_failed").Inc()
							continue
						}

						// Any error other than "not found" is likely transient - we retry next block.
						if err != nil {
							if pLock.height+MaxWaitConfirmations <= blockNumberU {
								// An error from this "transient" case has persisted for more than MaxWaitConfirmations.
								logger.Info("observation timed out",
									zap.String("msgId", pLock.message.MessageIDString()),
									zap.Stringer("txHash", pLock.message.TxHash),
									zap.Stringer("blockHash", key.BlockHash),
									zap.Uint64("target_blockNum", pLock.height),
									zap.Stringer("current_blockNum", ev.Number),
									zap.Stringer("finality", ev.Finality),
									zap.Stringer("current_blockHash", currentHash),
								)
								ethMessagesOrphaned.WithLabelValues(w.networkName, "timeout").Inc()
								delete(w.pending, key)
							} else {
								logger.Warn("transaction could not be fetched",
									zap.String("msgId", pLock.message.MessageIDString()),
									zap.Stringer("txHash", pLock.message.TxHash),
									zap.Stringer("blockHash", key.BlockHash),
									zap.Uint64("target_blockNum", pLock.height),
									zap.Stringer("current_blockNum", ev.Number),
									zap.Stringer("finality", ev.Finality),
									zap.Stringer("current_blockHash", currentHash),
									zap.Error(err))
							}
							continue
						}

						// It's possible for a transaction to be orphaned and then included in a different block
						// but with the same tx hash. Drop the observation (it will be re-observed and needs to
						// wait for the full confirmation time again).
						if tx.BlockHash != key.BlockHash {
							logger.Info("tx got dropped and mined in a different block; the message should have been reobserved",
								zap.String("msgId", pLock.message.MessageIDString()),
								zap.Stringer("txHash", pLock.message.TxHash),
								zap.Stringer("blockHash", key.BlockHash),
								zap.Uint64("target_blockNum", pLock.height),
								zap.Stringer("current_blockNum", ev.Number),
								zap.Stringer("finality", ev.Finality),
								zap.Stringer("current_blockHash", currentHash),
							)
							delete(w.pending, key)
							ethMessagesOrphaned.WithLabelValues(w.networkName, "blockhash_mismatch").Inc()
							continue
						}

						logger.Info("observation confirmed",
							zap.String("msgId", pLock.message.MessageIDString()),
							zap.Stringer("txHash", pLock.message.TxHash),
							zap.Stringer("blockHash", key.BlockHash),
							zap.Uint64("target_blockNum", pLock.height),
							zap.Stringer("current_blockNum", ev.Number),
							zap.Stringer("finality", ev.Finality),
							zap.Stringer("current_blockHash", currentHash),
						)
						delete(w.pending, key)
						w.msgC <- pLock.message
						ethMessagesConfirmed.WithLabelValues(w.networkName).Inc()
					}
				}

				w.pendingMu.Unlock()
				logger.Debug("processed new header",
					zap.Stringer("current_block", ev.Number),
					zap.Stringer("finality", ev.Finality),
					zap.Stringer("current_blockhash", currentHash),
					zap.Duration("took", time.Since(start)),
				)
			}
		}
	})

	// Now that the init is complete, peg readiness. That will also happen when we process a new head, but chains
	// that wait for finality may take a while to receive the first block and we don't want to hold up the init.
	readiness.SetReady(w.readinessSync)

	select {
	case <-ctx.Done():
		return ctx.Err()
	case err := <-errC:
		return err
	}
}
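
// fetchAndUpdateGuardianSet fetches the current guardian set from the chain and, if the index has
// changed, publishes the new set on setC (when setC is non-nil).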
func (w *Watcher) fetchAndUpdateGuardianSet(
	logger *zap.Logger,
	ctx context.Context,
	ethConn connectors.Connector,
) error {
	msm := time.Now()
	logger.Debug("fetching guardian set")
	timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
	defer cancel()
	idx, gs, err := fetchCurrentGuardianSet(timeout, ethConn)
	if err != nil {
		ethConnectionErrors.WithLabelValues(w.networkName, "guardian_set_fetch_error").Inc()
		p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
		return err
	}

	queryLatency.WithLabelValues(w.networkName, "get_guardian_set").Observe(time.Since(msm).Seconds())

	if w.currentGuardianSet != nil && *(w.currentGuardianSet) == idx {
		return nil
	}

	logger.Info("updated guardian set found", zap.Any("value", gs), zap.Uint32("index", idx))

	w.currentGuardianSet = &idx

	if w.setC != nil {
		w.setC <- common.NewGuardianSet(gs.Keys, idx)
	}

	return nil
}

// fetchCurrentGuardianSet fetches the current guardian set index and guardian set from the chain.
func fetchCurrentGuardianSet(ctx context.Context, ethConn connectors.Connector) (uint32, *ethabi.StructsGuardianSet, error) {
	currentIndex, err := ethConn.GetCurrentGuardianSetIndex(ctx)
	if err != nil {
		return 0, nil, fmt.Errorf("error requesting current guardian set index: %w", err)
	}

	gs, err := ethConn.GetGuardianSet(ctx, currentIndex)
	if err != nil {
		return 0, nil, fmt.Errorf("error requesting current guardian set value: %w", err)
	}

	return currentIndex, &gs, nil
}

// getFinality determines whether the chain supports "finalized" and "safe" blocks. The supported chains
// are hard coded so that changing finality behavior requires a deliberate code change. It also queries
// the RPC node to make sure it actually supports the expected values, and returns an error if it doesn't.
// Note that we do not support using safe mode but not finalized mode.
func (w *Watcher) getFinality(ctx context.Context) (bool, bool, error) {
	finalized := false
	safe := false

	// Tilt supports polling for both finalized and safe.
	if w.unsafeDevMode {
		finalized = true
		safe = true

		// The following chains support polling for both finalized and safe.
	} else if w.chainID == vaa.ChainIDAcala ||
		w.chainID == vaa.ChainIDArbitrum ||
		w.chainID == vaa.ChainIDArbitrumSepolia ||
		w.chainID == vaa.ChainIDBase ||
		w.chainID == vaa.ChainIDBaseSepolia ||
		w.chainID == vaa.ChainIDBlast ||
		w.chainID == vaa.ChainIDBSC ||
		w.chainID == vaa.ChainIDEthereum ||
		w.chainID == vaa.ChainIDHolesky ||
		w.chainID == vaa.ChainIDKarura ||
		w.chainID == vaa.ChainIDMantle ||
		w.chainID == vaa.ChainIDMonadDevnet ||
		w.chainID == vaa.ChainIDMoonbeam ||
		w.chainID == vaa.ChainIDOptimism ||
		w.chainID == vaa.ChainIDOptimismSepolia ||
		w.chainID == vaa.ChainIDSepolia ||
		w.chainID == vaa.ChainIDSnaxchain ||
		w.chainID == vaa.ChainIDUnichain ||
		w.chainID == vaa.ChainIDWorldchain ||
		w.chainID == vaa.ChainIDXLayer {
		finalized = true
		safe = true

		// The following chains have their own specialized finalizers.
	} else if w.chainID == vaa.ChainIDCelo {
		return false, false, nil

		// Polygon now supports polling for finalized but not safe.
		// https://forum.polygon.technology/t/optimizing-decentralized-apps-ux-with-milestones-a-significantly-accelerated-finality-solution/13154
	} else if w.chainID == vaa.ChainIDPolygon ||
		w.chainID == vaa.ChainIDPolygonSepolia {
		finalized = true

		// As of 11/10/2023 Scroll supports polling for finalized but not safe.
	} else if w.chainID == vaa.ChainIDScroll {
		finalized = true

		// As of 9/06/2024 Linea supports polling for finalized but not safe.
	} else if w.chainID == vaa.ChainIDLinea {
		finalized = true

		// The following chains support instant finality.
	} else if w.chainID == vaa.ChainIDAvalanche ||
		w.chainID == vaa.ChainIDBerachain || // Berachain supports instant finality: https://docs.berachain.com/faq/
		w.chainID == vaa.ChainIDOasis ||
		w.chainID == vaa.ChainIDAurora ||
		w.chainID == vaa.ChainIDFantom ||
		w.chainID == vaa.ChainIDKlaytn {
		return false, false, nil

		// Anything else is undefined / not supported.
	} else {
		return false, false, fmt.Errorf("unsupported chain: %s", w.chainID.String())
	}

	// If finalized / safe should be supported, read the RPC to make sure they actually are.
	if finalized {
		timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
		defer cancel()

		c, err := rpc.DialContext(timeout, w.url)
		if err != nil {
			return false, false, fmt.Errorf("failed to connect to endpoint: %w", err)
		}

		type Marshaller struct {
			Number *eth_hexutil.Big
		}
		var m Marshaller

		err = c.CallContext(ctx, &m, "eth_getBlockByNumber", "finalized", false)
		if err != nil || m.Number == nil {
			return false, false, fmt.Errorf("finalized not supported by the node when it should be")
		}

		if safe {
			err = c.CallContext(ctx, &m, "eth_getBlockByNumber", "safe", false)
			if err != nil || m.Number == nil {
				return false, false, fmt.Errorf("safe not supported by the node when it should be")
			}
		}
	}

	return finalized, safe, nil
}

// SetL1Finalizer is used to set the layer one finalizer.
func (w *Watcher) SetL1Finalizer(l1Finalizer interfaces.L1Finalizer) {
	w.l1Finalizer = l1Finalizer
}

// GetLatestFinalizedBlockNumber() implements the L1Finalizer interface and allows other watchers to
// get the latest finalized block number from this watcher.
func (w *Watcher) GetLatestFinalizedBlockNumber() uint64 {
	return atomic.LoadUint64(&w.latestFinalizedBlockNumber)
}

// getLatestSafeBlockNumber() returns the latest safe block seen by this watcher.
func (w *Watcher) getLatestSafeBlockNumber() uint64 {
	return atomic.LoadUint64(&w.latestSafeBlockNumber)
}
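
// updateNetworkStats publishes the latest, safe, and finalized block heights for this chain to the
// p2p heartbeat registry.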
func (w *Watcher) updateNetworkStats(stats *gossipv1.Heartbeat_Network) {
	p2p.DefaultRegistry.SetNetworkStats(w.chainID, &gossipv1.Heartbeat_Network{
		Height:          stats.Height,
		SafeHeight:      stats.SafeHeight,
		FinalizedHeight: stats.FinalizedHeight,
		ContractAddress: w.contract.Hex(),
	})
}

// getBlockTime reads the timestamp of a block identified by its hash.
func (w *Watcher) getBlockTime(ctx context.Context, blockHash eth_common.Hash) (uint64, error) {
	msm := time.Now()
	timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
	blockTime, err := w.ethConn.TimeOfBlockByHash(timeout, blockHash)
	cancel()
	queryLatency.WithLabelValues(w.networkName, "block_by_number").Observe(time.Since(msm).Seconds())
	return blockTime, err
}

// postMessage creates a message object from a log event and adds it to the pending list for processing.
func (w *Watcher) postMessage(logger *zap.Logger, ev *ethabi.AbiLogMessagePublished, blockTime uint64) {
	message := &common.MessagePublication{
		TxHash:           ev.Raw.TxHash,
		Timestamp:        time.Unix(int64(blockTime), 0),
		Nonce:            ev.Nonce,
		Sequence:         ev.Sequence,
		EmitterChain:     w.chainID,
		EmitterAddress:   PadAddress(ev.Sender),
		Payload:          ev.Payload,
		ConsistencyLevel: ev.ConsistencyLevel,
	}

	ethMessagesObserved.WithLabelValues(w.networkName).Inc()

	if message.ConsistencyLevel == vaa.ConsistencyLevelPublishImmediately {
		logger.Info("found new message publication transaction, publishing it immediately",
			zap.String("msgId", message.MessageIDString()),
			zap.Stringer("txHash", message.TxHash),
			zap.Uint64("blockNum", ev.Raw.BlockNumber),
			zap.Uint64("latestFinalizedBlock", atomic.LoadUint64(&w.latestFinalizedBlockNumber)),
			zap.Stringer("blockHash", ev.Raw.BlockHash),
			zap.Uint64("blockTime", blockTime),
			zap.Uint32("Nonce", ev.Nonce),
			zap.Uint8("ConsistencyLevel", ev.ConsistencyLevel),
		)

		w.msgC <- message
		ethMessagesConfirmed.WithLabelValues(w.networkName).Inc()
		return
	}

	logger.Info("found new message publication transaction",
		zap.String("msgId", message.MessageIDString()),
		zap.Stringer("txHash", message.TxHash),
		zap.Uint64("blockNum", ev.Raw.BlockNumber),
		zap.Uint64("latestFinalizedBlock", atomic.LoadUint64(&w.latestFinalizedBlockNumber)),
		zap.Stringer("blockHash", ev.Raw.BlockHash),
		zap.Uint64("blockTime", blockTime),
		zap.Uint32("Nonce", ev.Nonce),
		zap.Uint8("ConsistencyLevel", ev.ConsistencyLevel),
	)

	key := pendingKey{
		TxHash:         message.TxHash,
		BlockHash:      ev.Raw.BlockHash,
		EmitterAddress: message.EmitterAddress,
		Sequence:       message.Sequence,
	}

	w.pendingMu.Lock()
	w.pending[key] = &pendingMessage{
		message: message,
		height:  ev.Raw.BlockNumber,
	}
	w.pendingMu.Unlock()
}

// blockNotFoundErrors is used by `canRetryGetBlockTime`. It is the set of errors returned by `getBlockTime`
// that should trigger a retry.
var blockNotFoundErrors = map[string]struct{}{
	"not found":                     {},
	"Unknown block":                 {},
	"cannot query unfinalized data": {}, // Seen on Avalanche
}

// canRetryGetBlockTime returns true if the error returned by getBlockTime warrants doing a retry.
func canRetryGetBlockTime(err error) bool {
	_, exists := blockNotFoundErrors[err.Error()]
	return exists
}

// waitForBlockTime is a goroutine that repeatedly attempts to read the block time for a single log event. It is used when the initial attempt to read
// the block time fails. If it is finally able to read the block time, it posts the event for processing. Otherwise, it will eventually give up.
func (w *Watcher) waitForBlockTime(ctx context.Context, logger *zap.Logger, errC chan error, ev *ethabi.AbiLogMessagePublished) {
	logger.Warn("found new message publication transaction but failed to look up block time, deferring processing",
		zap.String("msgId", msgIdFromLogEvent(w.chainID, ev)),
		zap.Stringer("txHash", ev.Raw.TxHash),
		zap.Uint64("blockNum", ev.Raw.BlockNumber),
		zap.Uint64("latestFinalizedBlock", atomic.LoadUint64(&w.latestFinalizedBlockNumber)),
		zap.Stringer("blockHash", ev.Raw.BlockHash),
		zap.Uint32("Nonce", ev.Nonce),
		zap.Uint8("ConsistencyLevel", ev.ConsistencyLevel),
	)

	const RetryInterval = 5 * time.Second
	const MaxRetries = 3
	start := time.Now()
	t := time.NewTimer(RetryInterval)
	defer t.Stop()
	retries := 1
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			blockTime, err := w.getBlockTime(ctx, ev.Raw.BlockHash)
			if err == nil {
				logger.Info("retry of block time query succeeded, posting transaction",
					zap.String("msgId", msgIdFromLogEvent(w.chainID, ev)),
					zap.Stringer("txHash", ev.Raw.TxHash),
					zap.Uint64("blockNum", ev.Raw.BlockNumber),
					zap.Stringer("blockHash", ev.Raw.BlockHash),
					zap.Uint64("blockTime", blockTime),
					zap.Uint32("Nonce", ev.Nonce),
					zap.Uint8("ConsistencyLevel", ev.ConsistencyLevel),
					zap.Stringer("startTime", start),
					zap.Int("retries", retries),
				)

				w.postMessage(logger, ev, blockTime)
				return
			}

			ethConnectionErrors.WithLabelValues(w.networkName, "block_by_number_error").Inc()
			if !canRetryGetBlockTime(err) {
				p2p.DefaultRegistry.AddErrorCount(w.chainID, 1)
				errC <- fmt.Errorf("failed to request timestamp for block %d, hash %s: %w", ev.Raw.BlockNumber, ev.Raw.BlockHash.String(), err)
				return
			}
			if retries >= MaxRetries {
				logger.Error("repeatedly failed to look up block time, giving up",
					zap.String("msgId", msgIdFromLogEvent(w.chainID, ev)),
					zap.Stringer("txHash", ev.Raw.TxHash),
					zap.Uint64("blockNum", ev.Raw.BlockNumber),
					zap.Stringer("blockHash", ev.Raw.BlockHash),
					zap.Uint32("Nonce", ev.Nonce),
					zap.Uint8("ConsistencyLevel", ev.ConsistencyLevel),
					zap.Stringer("startTime", start),
					zap.Int("retries", retries),
				)

				return
			}

			retries++
			t.Reset(RetryInterval)
		}
	}
}

// msgIdFromLogEvent formats the message ID (chain/emitterAddress/seqNo) from a log event.
func msgIdFromLogEvent(chainID vaa.ChainID, ev *ethabi.AbiLogMessagePublished) string {
	return fmt.Sprintf("%v/%v/%v", uint16(chainID), PadAddress(ev.Sender), ev.Sequence)
}