bridge/pkg/solana: retry VAA submission on transient errors

In particular, this fixes a race condition where the Solana devnet would
take longer to deploy than the ETH devnet, and we'd end up with an
outdated guardian set on Solana.

We currently create a goroutine for every pending resubmission, which
waits and blocks on the channel until solwatch is processing requests
again. This is effectively an unbounded queue. An alternative approach
would be a channel with sufficient capacity plus backoff.

Test Plan: Deployed without solana-devnet, waited for the initial guardian
set change VAA to be requeued, then deployed solana-devnet.

The VAA was successfully submitted once the transient error resolved:

```
[...]
21:08:44.712Z	ERROR	wormhole-guardian-0.supervisor	Runnable died	{"dn": "root.solwatch", "error": "returned error when NODE_STATE_HEALTHY: failed to receive message from agent: EOF"}
21:08:44.712Z	INFO	wormhole-guardian-0.supervisor	rescheduling supervised node	{"dn": "root.solwatch", "backoff": 0.737286432}
21:08:45.451Z	INFO	wormhole-guardian-0.root.solwatch	watching for on-chain events
21:08:50.031Z	ERROR	wormhole-guardian-0.root.solwatch	failed to submit VAA	{"error": "rpc error: code = Canceled desc = stream terminated by RST_STREAM with error code: CANCEL", "digest": "79[...]"}
21:08:50.031Z	ERROR	wormhole-guardian-0.root.solwatch	requeuing VAA	{"error": "rpc error: code = Canceled desc = stream terminated by RST_STREAM with error code: CANCEL", "digest": "79[...]"}
21:09:02.062Z	INFO	wormhole-guardian-0.root.solwatch	submitted VAA	{"tx_sig": "4EKmH[...]", "digest": "79[...]"}
```

ghstack-source-id: 1b1d05a4cb
Pull Request resolved: https://github.com/certusone/wormhole/pull/48
This commit is contained in:
Leo 2020-10-22 12:20:12 +02:00
parent faf9a71cef
commit 91241ee852
1 changed files with 27 additions and 0 deletions

View File

@ -9,6 +9,8 @@ import (
eth_common "github.com/ethereum/go-ethereum/common"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
agentv1 "github.com/certusone/wormhole/bridge/pkg/proto/agent/v1"
@ -110,6 +112,31 @@ func (e *SolanaBridgeWatcher) Run(ctx context.Context) error {
cancel()
if err != nil {
logger.Error("failed to submit VAA", zap.Error(err), zap.String("digest", h))
st, ok := status.FromError(err)
if !ok {
panic("err not a status")
}
// For transient errors, we can put the VAA back into the queue such that it can
// be retried after the runnable has been rescheduled.
switch st.Code() {
case
// Our context was cancelled, likely because the watcher stream died.
codes.Canceled,
// The agent encountered a transient error, likely node unavailability.
codes.Unavailable,
codes.Aborted:
logger.Error("requeuing VAA", zap.Error(err), zap.String("digest", h))
// Tombstone goroutine
go func(v *vaa.VAA) {
time.Sleep(10 * time.Second)
e.vaaChan <- v
}(v)
}
break
}