package near

import (
	"context"
	"fmt"
	"sync/atomic"
	"time"

	"github.com/certusone/wormhole/node/pkg/common"
	"github.com/certusone/wormhole/node/pkg/p2p"
	gossipv1 "github.com/certusone/wormhole/node/pkg/proto/gossip/v1"
	"github.com/certusone/wormhole/node/pkg/readiness"
	"github.com/certusone/wormhole/node/pkg/supervisor"
	"github.com/certusone/wormhole/node/pkg/watchers/near/nearapi"
	"github.com/mr-tron/base58"
	"github.com/wormhole-foundation/wormhole/sdk/vaa"
	"go.uber.org/zap"
)
var (
	// How long to initially wait between observing a transaction and attempting to process it.
	// To process a transaction successfully, all of its receipts need to be finalized, which typically happens about two blocks later.
	// Transaction processing is retried with exponential backoff, i.e. a transaction may stay in the queue for
	// roughly initialTxProcDelay * 2^(txProcRetry+1) time.
	initialTxProcDelay = time.Second * 3

	blockPollInterval = time.Millisecond * 200

	// This value should be set to the maximum number of transactions in a block such that they can all be processed in parallel.
	workerCountTxProcessing int = 100

	// This value should be set greater than the number of chunks in a NEAR block,
	// such that they can all be fetched in parallel.
	// We're currently seeing ~10 chunks/block, so 20 is a conservative choice.
	workerChunkFetching int = 20

	queueSize int = 10_000 // size of the queues for chunk processing as well as transaction processing

	// If the watcher falls behind by this many blocks, it starts over. This should be set proportional to queueSize
	// such that all transactions from maxFallBehindBlocks can easily fit into the queue.
	maxFallBehindBlocks uint = 200

	metricsInterval = time.Second * 10 // how often to report health metrics

	txProcRetry uint = 4 // how many times to retry processing a transaction

	// The maximum span of gaps in the NEAR blockchain we want to support.
	// Lower values yield better performance, but can lead to missed observations if NEAR has larger gaps.
	// During testing, gaps on NEAR were at most one block long.
	nearBlockchainMaxGaps = 5
)
type (
	transactionProcessingJob struct {
		txHash          string
		senderAccountId string
		creationTime    time.Time
		retryCounter    uint
		delay           time.Duration

		// set during processing
		hasWormholeMsg         bool   // whether this transaction emitted a Wormhole message
		wormholeMsgBlockHeight uint64 // highest block height of a Wormhole message in this transaction
	}

	Watcher struct {
		mainnet         bool
		wormholeAccount string // name of the Wormhole account on the NEAR blockchain
		nearRPC         string

		// external channels
		msgC     chan<- *common.MessagePublication   // validated (SECURITY: and only validated!) observations go into this channel
		obsvReqC <-chan *gossipv1.ObservationRequest // observation requests come in through this channel

		// internal queues
		transactionProcessingQueueCounter atomic.Int64 // number of jobs currently scheduled or waiting in transactionProcessingQueue
		transactionProcessingQueue        chan *transactionProcessingJob
		chunkProcessingQueue              chan nearapi.ChunkHeader

		// event channels
		eventChanBlockProcessedHeight chan uint64 // whenever a block is processed, its height is posted here
		eventChanTxProcessedDuration  chan time.Duration
		eventChan                     chan eventType // whenever a message is confirmed, an event is posted here

		// sub-components
		finalizer Finalizer
		nearAPI   nearapi.NearApi
	}
)
// NewWatcher creates a new watcher for the NEAR blockchain.
func NewWatcher(
	nearRPC string,
	wormholeContract string,
	msgC chan<- *common.MessagePublication,
	obsvReqC <-chan *gossipv1.ObservationRequest,
	mainnet bool,
) *Watcher {
	return &Watcher{
		mainnet:                       mainnet,
		wormholeAccount:               wormholeContract,
		nearRPC:                       nearRPC,
		msgC:                          msgC,
		obsvReqC:                      obsvReqC,
		transactionProcessingQueue:    make(chan *transactionProcessingJob),
		chunkProcessingQueue:          make(chan nearapi.ChunkHeader, queueSize),
		eventChanBlockProcessedHeight: make(chan uint64, 10),
		eventChanTxProcessedDuration:  make(chan time.Duration, 10),
		eventChan:                     make(chan eventType, 10),
	}
}
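// newTransactionProcessingJob creates a job for the given transaction hash and sender account,
// initialized with the default initial processing delay and a zeroed retry counter.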
func newTransactionProcessingJob(txHash string, senderAccountId string) *transactionProcessingJob {
	return &transactionProcessingJob{
		txHash:                 txHash,
		senderAccountId:        senderAccountId,
		creationTime:           time.Now(),
		retryCounter:           0,
		delay:                  initialTxProcDelay,
		hasWormholeMsg:         false,
		wormholeMsgBlockHeight: 0,
	}
}
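// runBlockPoll polls the NEAR RPC endpoint for the latest finalized block every blockPollInterval
// and enqueues the chunk headers of newly finalized blocks onto chunkProcessingQueue.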
func (e *Watcher) runBlockPoll(ctx context.Context) error {
	logger := supervisor.Logger(ctx)

	// As we start, get the height of the latest finalized block. We won't be processing any blocks before that.
	finalBlock, err := e.nearAPI.GetFinalBlock(ctx)
	if err != nil || finalBlock.Header.Height == 0 {
		logger.Error("failed to start NEAR block poll", zap.String("error_type", "startup_fail"), zap.String("log_msg_type", "startup_error"))
		return err
	}

	highestFinalBlockHeightObserved := finalBlock.Header.Height - 1 // minus one because we still want to process this block, just not any blocks before it

	supervisor.Signal(ctx, supervisor.SignalHealthy)

	timer := time.NewTimer(time.Nanosecond) // fires almost immediately for the first iteration

	for {
		select {
		case <-ctx.Done():
			return nil

		case <-timer.C:
			highestFinalBlockHeightObserved, err = e.ReadFinalChunksSince(logger, ctx, highestFinalBlockHeightObserved, e.chunkProcessingQueue)
			if err != nil {
				logger.Warn("NEAR poll error", zap.String("log_msg_type", "block_poll_error"), zap.String("error", err.Error()))
			}
			timer.Reset(blockPollInterval)
		}
	}
}
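// runChunkFetcher consumes chunk headers from chunkProcessingQueue, fetches and parses each chunk,
// and schedules a transaction processing job for every transaction found in it.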
func (e *Watcher) runChunkFetcher(ctx context.Context) error {
	logger := supervisor.Logger(ctx)

	for {
		select {
		case <-ctx.Done():
			return nil

		case chunkHeader := <-e.chunkProcessingQueue:
			newJobs, err := e.fetchAndParseChunk(logger, ctx, chunkHeader)
			if err != nil {
				logger.Warn("near.processChunk failed", zap.String("log_msg_type", "chunk_processing_failed"), zap.String("error", err.Error()))
				p2p.DefaultRegistry.AddErrorCount(vaa.ChainIDNear, 1)
				continue
			}
			for _, job := range newJobs {
				e.schedule(ctx, job, job.delay)
			}
		}
	}
}
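// runObsvReqProcessor handles re-observation requests: for every request targeting the NEAR chain,
// it schedules a transaction processing job for (near-)immediate execution.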
func (e *Watcher) runObsvReqProcessor(ctx context.Context) error {
	logger := supervisor.Logger(ctx)

	supervisor.Signal(ctx, supervisor.SignalHealthy)

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case r := <-e.obsvReqC:
			if vaa.ChainID(r.ChainId) != vaa.ChainIDNear {
				panic("invalid chain ID")
			}

			txHash := base58.Encode(r.TxHash)

			logger.Info("Received obsv request", zap.String("log_msg_type", "obsv_req_received"), zap.String("tx_hash", txHash))

			// TODO e.wormholeAccount is not the correct value for senderAccountId. Instead, it should be the account ID of the transaction sender.
			// This value is used by NEAR to determine which shard to query. An incorrect value here is not a security risk but could lead to re-observation requests failing.
			// Guardians currently run nodes for all shards and the API seems to return correct results independent of the senderAccountId that is set, but this could change in the future.
			// Fixing this would require adding the transaction sender's account ID to the observation request.
			job := newTransactionProcessingJob(txHash, e.wormholeAccount)
			e.schedule(ctx, job, time.Nanosecond)
		}
	}
}
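// runTxProcessor consumes jobs from transactionProcessingQueue. Failed jobs are retried with
// exponential backoff up to txProcRetry times; after processing, the job reports its duration
// (if it contained a Wormhole message) and the block height it observed.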
func (e *Watcher) runTxProcessor(ctx context.Context) error {
	logger := supervisor.Logger(ctx)
	supervisor.Signal(ctx, supervisor.SignalHealthy)
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()

		case job := <-e.transactionProcessingQueue:
			err := e.processTx(logger, ctx, job)
			if err != nil {
				// Transaction processing was unsuccessful. Retry if retryCounter is not exceeded.
				if job.retryCounter < txProcRetry {
					// Log and retry with exponential backoff
					logger.Info(
						"near.processTx",
						zap.String("log_msg_type", "tx_processing_retry"),
						zap.String("tx_hash", job.txHash),
						zap.String("error", err.Error()),
					)
					job.retryCounter++
					job.delay *= 2
					e.schedule(ctx, job, job.delay)
				} else {
					// Warn and do not retry
					logger.Warn(
						"near.processTx",
						zap.String("log_msg_type", "tx_processing_retries_exceeded"),
						zap.String("tx_hash", job.txHash),
						zap.String("error", err.Error()),
					)
					p2p.DefaultRegistry.AddErrorCount(vaa.ChainIDNear, 1)
				}
			}

			if job.hasWormholeMsg {
				// report how long it took to process this transaction
				e.eventChanTxProcessedDuration <- time.Since(job.creationTime)
			}

			// tell everyone about successful processing
			e.eventChanBlockProcessedHeight <- job.wormholeMsgBlockHeight
		}
	}
}
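// Run starts the NEAR watcher: it connects to the configured RPC node and spawns the metrics
// reporter, the block poller, the observation request processor, and the worker pools for
// chunk fetching and transaction processing under the supervisor.
//
// A minimal usage sketch (the RPC URL and contract account below are illustrative assumptions,
// not taken from this file):
//
//	msgC := make(chan *common.MessagePublication)
//	obsvReqC := make(chan *gossipv1.ObservationRequest)
//	w := NewWatcher("https://rpc.mainnet.near.org", "contract.wormhole_crypto.near", msgC, obsvReqC, true)
//	err := supervisor.Run(ctx, "nearwatch", w.Run)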
func (e *Watcher) Run(ctx context.Context) error {
	logger := supervisor.Logger(ctx)

	e.nearAPI = nearapi.NewNearApiImpl(nearapi.NewHttpNearRpc(e.nearRPC))
	e.finalizer = newFinalizer(e.eventChan, e.nearAPI, e.mainnet)

	p2p.DefaultRegistry.SetNetworkStats(vaa.ChainIDNear, &gossipv1.Heartbeat_Network{
		ContractAddress: e.wormholeAccount,
	})

	logger.Info("Near watcher connecting to RPC node", zap.String("url", e.nearRPC))

	// start metrics reporter
	err := supervisor.Run(ctx, "metrics", e.runMetrics)
	if err != nil {
		return err
	}
	// start one block poller
	err = supervisor.Run(ctx, "blockPoll", e.runBlockPoll)
	if err != nil {
		return err
	}
	// start one observation request processor
	err = supervisor.Run(ctx, "obsvReqProcessor", e.runObsvReqProcessor)
	if err != nil {
		return err
	}
	// start workerChunkFetching many chunk fetcher runners
	for i := 0; i < workerChunkFetching; i++ {
		err := supervisor.Run(ctx, fmt.Sprintf("chunk_fetcher_%d", i), e.runChunkFetcher)
		if err != nil {
			return err
		}
	}
	// start workerCountTxProcessing many transaction processing runners
	for i := 0; i < workerCountTxProcessing; i++ {
		err := supervisor.Run(ctx, fmt.Sprintf("txProcessor_%d", i), e.runTxProcessor)
		if err != nil {
			return err
		}
	}

	readiness.SetReady(common.ReadinessNearSyncing)
	supervisor.Signal(ctx, supervisor.SignalHealthy)
	<-ctx.Done()
	return ctx.Err()
}
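// schedule enqueues a transaction processing job onto transactionProcessingQueue once the given
// delay has elapsed, without blocking the caller. Because each job waits in its own goroutine,
// the queue itself can stay unbuffered; transactionProcessingQueueCounter tracks how many jobs
// are currently scheduled or waiting.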
func (e *Watcher) schedule(ctx context.Context, job *transactionProcessingJob, delay time.Duration) {
	go func() {
		timer := time.NewTimer(delay)
		defer timer.Stop()

		e.transactionProcessingQueueCounter.Add(1)
		defer e.transactionProcessingQueueCounter.Add(-1)

		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			// Don't block on processing if the context is cancelled
			select {
			case <-ctx.Done():
				return
			case e.transactionProcessingQueue <- job:
			}
		}
	}()
}