bridge/pkg/solana: retry VAA submission on transient errors
In particular, this fixes a race condition where the Solana devnet would
take longer to deploy than the ETH devnet, and we'd end up
with an outdated guardian set on Solana.
We currently create a Goroutine for every pending resubmission, which
waits and blocks on the channel until solwatch is processing requests
again. This is effectively an unbounded queue. An alternative approach
would be a channel with sufficient capacity plus backoff.
Test Plan: Deployed without solana-devnet, waited for initial guardian
set change VAA to be requeued, then deployed solana-devnet.
The VAA was successfully submitted once the transient error resolved:
```
[...]
21:08:44.712Z ERROR wormhole-guardian-0.supervisor Runnable died {"dn": "root.solwatch", "error": "returned error when NODE_STATE_HEALTHY: failed to receive message from agent: EOF"}
21:08:44.712Z INFO wormhole-guardian-0.supervisor rescheduling supervised node {"dn": "root.solwatch", "backoff": 0.737286432}
21:08:45.451Z INFO wormhole-guardian-0.root.solwatch watching for on-chain events
21:08:50.031Z ERROR wormhole-guardian-0.root.solwatch failed to submit VAA {"error": "rpc error: code = Canceled desc = stream terminated by RST_STREAM with error code: CANCEL", "digest": "79[...]"}
21:08:50.031Z ERROR wormhole-guardian-0.root.solwatch requeuing VAA {"error": "rpc error: code = Canceled desc = stream terminated by RST_STREAM with error code: CANCEL", "digest": "79[...]"}
21:09:02.062Z INFO wormhole-guardian-0.root.solwatch submitted VAA {"tx_sig": "4EKmH[...]", "digest": "79[...]"}
```
ghstack-source-id: 1b1d05a4cb
Pull Request resolved: https://github.com/certusone/wormhole/pull/48
This commit is contained in:
parent
faf9a71cef
commit
91241ee852
|
@ -9,6 +9,8 @@ import (
|
|||
|
||||
eth_common "github.com/ethereum/go-ethereum/common"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
|
||||
agentv1 "github.com/certusone/wormhole/bridge/pkg/proto/agent/v1"
|
||||
|
||||
|
@ -110,6 +112,31 @@ func (e *SolanaBridgeWatcher) Run(ctx context.Context) error {
|
|||
cancel()
|
||||
if err != nil {
|
||||
logger.Error("failed to submit VAA", zap.Error(err), zap.String("digest", h))
|
||||
|
||||
st, ok := status.FromError(err)
|
||||
if !ok {
|
||||
panic("err not a status")
|
||||
}
|
||||
|
||||
// For transient errors, we can put the VAA back into the queue such that it can
|
||||
// be retried after the runnable has been rescheduled.
|
||||
switch st.Code() {
|
||||
case
|
||||
// Our context was cancelled, likely because the watcher stream died.
|
||||
codes.Canceled,
|
||||
// The agent encountered a transient error, likely node unavailability.
|
||||
codes.Unavailable,
|
||||
codes.Aborted:
|
||||
|
||||
logger.Error("requeuing VAA", zap.Error(err), zap.String("digest", h))
|
||||
|
||||
// Tombstone goroutine
|
||||
go func(v *vaa.VAA) {
|
||||
time.Sleep(10 * time.Second)
|
||||
e.vaaChan <- v
|
||||
}(v)
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue