wormhole/bridge/pkg/solana/watcher.go

package solana
import (
	"context"
	"encoding/hex"
	"fmt"
	"math/big"
	"strings"
	"time"

	eth_common "github.com/ethereum/go-ethereum/common"

	"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
2020-08-20 12:48:58 -07:00
agentv1 "github.com/certusone/wormhole/bridge/pkg/proto/agent/v1"
2020-08-20 10:20:11 -07:00
"go.uber.org/zap"
"github.com/certusone/wormhole/bridge/pkg/common"
"github.com/certusone/wormhole/bridge/pkg/supervisor"
"github.com/certusone/wormhole/bridge/pkg/vaa"
)

type (
	// SolanaBridgeWatcher subscribes to lockup events from the Solana agent
	// and submits VAAs to it for execution on-chain.
	SolanaBridgeWatcher struct {
		url      string
		lockChan chan *common.ChainLock
		vaaChan  chan *vaa.VAA
	}
)

// NewSolanaBridgeWatcher returns a watcher for the agent at the given URL,
// publishing lockups to lockEvents and consuming VAAs from vaaQueue.
func NewSolanaBridgeWatcher(url string, lockEvents chan *common.ChainLock, vaaQueue chan *vaa.VAA) *SolanaBridgeWatcher {
	return &SolanaBridgeWatcher{url: url, lockChan: lockEvents, vaaChan: vaaQueue}
}

// Run connects to the agent, watches for lockup events and submits queued VAAs
// until the context is cancelled or the agent stream fails.
func (e *SolanaBridgeWatcher) Run(ctx context.Context) error {
	timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
	defer cancel()

	conn, err := grpc.DialContext(timeout, e.url, grpc.WithBlock(), grpc.WithInsecure())
	if err != nil {
		return fmt.Errorf("failed to dial agent at %s: %w", e.url, err)
	}
	defer conn.Close()

	c := agentv1.NewAgentClient(conn)

	errC := make(chan error)
	logger := supervisor.Logger(ctx)

	// Subscribe to new token lockups
	tokensLockedSub, err := c.WatchLockups(ctx, &agentv1.WatchLockupsRequest{})
	if err != nil {
		return fmt.Errorf("failed to subscribe to token lockup events: %w", err)
	}

	go func() {
		logger.Info("watching for on-chain events")

		for {
			ev, err := tokensLockedSub.Recv()
			if err != nil {
				errC <- fmt.Errorf("failed to receive message from agent: %w", err)
				return
			}

			switch event := ev.Event.(type) {
			case *agentv1.LockupEvent_New:
				logger.Debug("received lockup event",
					zap.Any("event", ev))

				lock := &common.ChainLock{
					TxHash:        eth_common.HexToHash(ev.LockupAddress),
					Timestamp:     time.Unix(int64(ev.Time), 0),
					Nonce:         event.New.Nonce,
					SourceChain:   vaa.ChainIDSolana,
					TargetChain:   vaa.ChainID(event.New.TargetChain),
					TokenChain:    vaa.ChainID(event.New.TokenChain),
					TokenDecimals: uint8(event.New.TokenDecimals),
					Amount:        new(big.Int).SetBytes(event.New.Amount),
				}
				copy(lock.TokenAddress[:], event.New.TokenAddress)
				copy(lock.SourceAddress[:], event.New.SourceAddress)
				copy(lock.TargetAddress[:], event.New.TargetAddress)

				e.lockChan <- lock
				logger.Info("found new lockup transaction", zap.String("lockup_address", ev.LockupAddress))
			}
		}
	}()

	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case v := <-e.vaaChan:
				vaaBytes, err := v.Marshal()
				if err != nil {
					panic(err)
				}

				// Calculate digest so we can log it (TODO: refactor to vaa method? we do this in different places)
				m, err := v.SigningMsg()
				if err != nil {
					panic(err)
				}
				h := hex.EncodeToString(m.Bytes())

				timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
				res, err := c.SubmitVAA(timeout, &agentv1.SubmitVAARequest{Vaa: vaaBytes})
				cancel()

				if err != nil {
					st, ok := status.FromError(err)
					if !ok {
						panic("err not a status")
					}

					// For transient errors, we can put the VAA back into the queue such that it can
					// be retried after the runnable has been rescheduled.
					switch st.Code() {
					case
						// Our context was cancelled, likely because the watcher stream died.
						codes.Canceled,
						// The agent encountered a transient error, likely node unavailability.
						codes.Unavailable,
						codes.Aborted:
						logger.Error("transient error, requeuing VAA", zap.Error(err), zap.String("digest", h))

						// Tombstone goroutine
						go func(v *vaa.VAA) {
							time.Sleep(10 * time.Second)
							e.vaaChan <- v
						}(v)
					case codes.Internal:
						// This VAA has already been executed on chain, successfully or not.
						// TODO: dissect InstructionError in agent and convert this to the proper gRPC code
						if strings.Contains(st.Message(), "custom program error: 0xb") { // AlreadyExists
							logger.Info("VAA already submitted on-chain, ignoring", zap.Error(err), zap.String("digest", h))
							break
						}
						fallthrough
					default:
						logger.Error("error submitting VAA", zap.Error(err), zap.String("digest", h))
					}

					break
				}
logger.Info("submitted VAA",
2020-08-21 11:49:33 -07:00
zap.String("tx_sig", res.Signature), zap.String("digest", h))
2020-08-20 12:48:58 -07:00
}
2020-08-20 10:20:11 -07:00
}
}()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case err := <-errC:
		return err
	}
}
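
The retry branch in the submission loop classifies gRPC status codes inline. Below is a minimal sketch of that classification factored into a helper, for illustration only; isTransientSubmitError is not part of the original file, and it mirrors exactly the codes handled in the switch above.

```go
// isTransientSubmitError reports whether a SubmitVAA error looks retryable,
// mirroring the Canceled/Unavailable/Aborted cases handled above.
// Illustrative sketch only; not part of the original watcher.
func isTransientSubmitError(err error) bool {
	st, ok := status.FromError(err)
	if !ok {
		return false
	}
	switch st.Code() {
	case codes.Canceled, codes.Unavailable, codes.Aborted:
		return true
	default:
		return false
	}
}
```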
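The digest computation in the submission loop carries a TODO about moving it into the vaa package. A hypothetical sketch of such a method is shown below; HexDigest does not exist in the original code, and it assumes SigningMsg() returns a hash value exposing Bytes(), as the watcher already relies on.

```go
// HexDigest is a hypothetical helper in the vaa package, per the TODO above:
// it wraps SigningMsg so callers don't repeat the hex encoding.
// Assumption: SigningMsg() returns a type with a Bytes() method.
func (v *VAA) HexDigest() (string, error) {
	m, err := v.SigningMsg()
	if err != nil {
		return "", err
	}
	return hex.EncodeToString(m.Bytes()), nil
}
```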