wormhole/bridge/pkg/solana/watcher.go

package solana
import (
	"context"
	"encoding/hex"
	"fmt"
	"math/big"
	"strings"
	"time"

	eth_common "github.com/ethereum/go-ethereum/common"

	"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
2020-08-20 12:48:58 -07:00
agentv1 "github.com/certusone/wormhole/bridge/pkg/proto/agent/v1"
2020-08-20 10:20:11 -07:00
"go.uber.org/zap"
"github.com/certusone/wormhole/bridge/pkg/common"
"github.com/certusone/wormhole/bridge/pkg/supervisor"
"github.com/certusone/wormhole/bridge/pkg/vaa"
)

type (
	// SolanaBridgeWatcher subscribes to lockup events from the Solana agent
	// and submits VAAs to it for execution on-chain.
	SolanaBridgeWatcher struct {
		url      string
		lockChan chan *common.ChainLock
		vaaChan  chan *vaa.VAA
	}
)

// NewSolanaBridgeWatcher returns a watcher for the agent at the given URL,
// publishing lockups to lockEvents and consuming VAAs from vaaQueue.
func NewSolanaBridgeWatcher(url string, lockEvents chan *common.ChainLock, vaaQueue chan *vaa.VAA) *SolanaBridgeWatcher {
	return &SolanaBridgeWatcher{url: url, lockChan: lockEvents, vaaChan: vaaQueue}
}

// Run connects to the agent, watches for lockup events and submits queued VAAs
// until the context is cancelled or the agent stream fails.
func (e *SolanaBridgeWatcher) Run(ctx context.Context) error {
	timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
	defer cancel()

	conn, err := grpc.DialContext(timeout, e.url, grpc.WithBlock(), grpc.WithInsecure())
	if err != nil {
		return fmt.Errorf("failed to dial agent at %s: %w", e.url, err)
	}
	defer conn.Close()

	c := agentv1.NewAgentClient(conn)

	errC := make(chan error)
	logger := supervisor.Logger(ctx)

	// Subscribe to new token lockups
	tokensLockedSub, err := c.WatchLockups(ctx, &agentv1.WatchLockupsRequest{})
	if err != nil {
		return fmt.Errorf("failed to subscribe to token lockup events: %w", err)
	}

	go func() {
		logger.Info("watching for on-chain events")

		for {
			ev, err := tokensLockedSub.Recv()
			if err != nil {
				errC <- fmt.Errorf("failed to receive message from agent: %w", err)
				return
			}

			switch event := ev.Event.(type) {
			case *agentv1.LockupEvent_New:
				logger.Debug("received lockup event",
					zap.Any("event", ev))

				lock := &common.ChainLock{
					TxHash:        eth_common.HexToHash(ev.LockupAddress),
					Timestamp:     time.Unix(int64(ev.Time), 0),
					Nonce:         event.New.Nonce,
					SourceChain:   vaa.ChainIDSolana,
					TargetChain:   vaa.ChainID(event.New.TargetChain),
					TokenChain:    vaa.ChainID(event.New.TokenChain),
					TokenDecimals: uint8(event.New.TokenDecimals),
					Amount:        new(big.Int).SetBytes(event.New.Amount),
				}
				copy(lock.TokenAddress[:], event.New.TokenAddress)
				copy(lock.SourceAddress[:], event.New.SourceAddress)
				copy(lock.TargetAddress[:], event.New.TargetAddress)

				e.lockChan <- lock
				logger.Info("found new lockup transaction", zap.String("lockup_address", ev.LockupAddress))
			}
		}
	}()

	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case v := <-e.vaaChan:
				vaaBytes, err := v.Marshal()
				if err != nil {
					panic(err)
				}

				// Calculate digest so we can log it (TODO: refactor to vaa method? we do this in different places)
				m, err := v.SigningMsg()
				if err != nil {
					panic(err)
				}
				h := hex.EncodeToString(m.Bytes())

				timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
				res, err := c.SubmitVAA(timeout, &agentv1.SubmitVAARequest{Vaa: vaaBytes})
				cancel()

				if err != nil {
					st, ok := status.FromError(err)
					if !ok {
						panic("err not a status")
					}

					// For transient errors, we can put the VAA back into the queue such that it can
					// be retried after the runnable has been rescheduled.
					switch st.Code() {
					case
						// Our context was cancelled, likely because the watcher stream died.
						codes.Canceled,
						// The agent encountered a transient error, likely node unavailability.
						codes.Unavailable,
						codes.Aborted:
						logger.Error("transient error, requeuing VAA", zap.Error(err), zap.String("digest", h))

						// Tombstone goroutine
						go func(v *vaa.VAA) {
							time.Sleep(10 * time.Second)
							e.vaaChan <- v
						}(v)
					case codes.Internal:
						// This VAA has already been executed on chain, successfully or not.
						// TODO: dissect InstructionError in agent and convert this to the proper gRPC code
						if strings.Contains(st.Message(), "custom program error: 0xb") { // AlreadyExists
							logger.Info("VAA already submitted on-chain, ignoring", zap.Error(err), zap.String("digest", h))
							break
						}
						fallthrough
					default:
						logger.Error("error submitting VAA", zap.Error(err), zap.String("digest", h))
					}

					break
				}
logger.Info("submitted VAA",
2020-08-21 11:49:33 -07:00
zap.String("tx_sig", res.Signature), zap.String("digest", h))
2020-08-20 12:48:58 -07:00
}
2020-08-20 10:20:11 -07:00
}
}()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case err := <-errC:
		return err
	}
}
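
The retry branch in the submission loop classifies gRPC status codes inline. Below is a minimal sketch of that classification factored into a helper, for illustration only; isTransientSubmitError is not part of the original file, and it mirrors exactly the codes handled in the switch above.

```go
// isTransientSubmitError reports whether a SubmitVAA error looks retryable,
// mirroring the Canceled/Unavailable/Aborted cases handled above.
// Illustrative sketch only; not part of the original watcher.
func isTransientSubmitError(err error) bool {
	st, ok := status.FromError(err)
	if !ok {
		return false
	}
	switch st.Code() {
	case codes.Canceled, codes.Unavailable, codes.Aborted:
		return true
	default:
		return false
	}
}
```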
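The digest computation in the submission loop carries a TODO about moving it into the vaa package. A hypothetical sketch of such a method is shown below; HexDigest does not exist in the original code, and it assumes SigningMsg() returns a hash value exposing Bytes(), as the watcher already relies on.

```go
// HexDigest is a hypothetical helper in the vaa package, per the TODO above:
// it wraps SigningMsg so callers don't repeat the hex encoding.
// Assumption: SigningMsg() returns a type with a Bytes() method.
func (v *VAA) HexDigest() (string, error) {
	m, err := v.SigningMsg()
	if err != nil {
		return "", err
	}
	return hex.EncodeToString(m.Bytes()), nil
}
```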