bridge: count misses for settled VAAs

This commit is contained in:
Leo 2021-01-25 20:12:28 +01:00
parent c5b59ac6a0
commit bc356a5e51
3 changed files with 53 additions and 7 deletions

View File

@ -2,6 +2,7 @@ package processor
import (
"context"
"github.com/certusone/wormhole/bridge/pkg/common"
"github.com/prometheus/client_golang/prometheus"
"time"
@ -34,6 +35,11 @@ var (
Name: "wormhole_aggregation_state_unobserved_total",
Help: "Total number of aggregation states expired due to no matching local lockup observations",
})
aggregationStateFulfillment = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "wormhole_aggregation_state_settled_signatures_total",
Help: "Total number of signatures produced by a validator, counted after waiting a fixed amount of time",
}, []string{"addr", "origin", "status"})
)
func init() {
@ -42,6 +48,7 @@ func init() {
prometheus.MustRegister(aggregationStateTimeout)
prometheus.MustRegister(aggregationStateRetries)
prometheus.MustRegister(aggregationStateUnobserved)
prometheus.MustRegister(aggregationStateFulfillment)
}
// handleCleanup handles periodic retransmissions and cleanup of VAAs
@ -53,6 +60,28 @@ func (p *Processor) handleCleanup(ctx context.Context) {
delta := time.Now().Sub(s.firstObserved)
switch {
case !s.settled && delta.Seconds() >= 30:
// After 30 seconds, the VAA is considered settled - it's unlikely that more observations will
// arrive, barring special circumstances. This is a better time to count misses than submission,
// because we submit right when we reach quorum rather than waiting for all observations to arrive.
s.settled = true
p.logger.Info("VAA considered settled", zap.String("digest", hash))
// Use the guardian set stored with this VAA if available; otherwise (a VAA we haven't seen ourselves) fall back to the most recent one.
var gs *common.GuardianSet
if s.gs != nil {
gs = s.gs
} else {
gs = p.gs
}
for _, k := range gs.Keys {
if _, ok := s.signatures[k]; ok {
aggregationStateFulfillment.WithLabelValues(k.Hex(), s.source, "present").Inc()
} else {
aggregationStateFulfillment.WithLabelValues(k.Hex(), s.source, "missing").Inc()
}
}
case s.submitted && delta.Hours() >= 1:
// We could delete submitted VAAs right away, but then we'd lose context about additional (late)
// observations that come in. Therefore, keep them for a reasonable amount of time.

View File

@ -148,7 +148,6 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
// byzantine, but now we know who we're dealing with.
// We can now count events by guardian without worrying about cardinality explosions:
// TODO: add source_chain
observationsReceivedByGuardianAddressTotal.WithLabelValues(their_addr.Hex()).Inc()
// []byte isn't hashable in a map. Paying a small extra cost for encoding for easier debugging.
@ -166,6 +165,7 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
p.state.vaaSignatures[hash] = &vaaState{
firstObserved: time.Now(),
signatures: map[common.Address][]byte{},
source: "unknown",
}
}
@ -231,6 +231,7 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
switch t := v.Payload.(type) {
case *vaa.BodyTransfer:
p.state.vaaSignatures[hash].source = t.SourceChain.String()
// Depending on the target chain, guardians submit VAAs directly to the chain.
switch t.TargetChain {
@ -250,10 +251,14 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
zap.Stringer("target_chain", t.TargetChain))
}
case *vaa.BodyGuardianSetUpdate:
p.state.vaaSignatures[hash].source = "guardian_set_upgrade"
// A guardian set update is broadcast to every chain that we talk to.
p.devnetVAASubmission(ctx, signed, hash)
p.terraVAASubmission(ctx, signed, hash)
case *vaa.BodyContractUpgrade:
p.state.vaaSignatures[hash].source = "contract_upgrade"
switch t.ChainID {
case vaa.ChainIDSolana:
// Already submitted to Solana.

View File

@ -21,13 +21,25 @@ import (
type (
// vaaState represents the local view of a given VAA
vaaState struct {
// First time this digest was seen (possibly even before we saw its lockup).
firstObserved time.Time
ourVAA *vaa.VAA
signatures map[ethcommon.Address][]byte
submitted bool
retryCount uint
ourMsg []byte
gs *common.GuardianSet
// Copy of the VAA we constructed when we saw the lockup.
ourVAA *vaa.VAA
// Map of signatures seen by guardian. During guardian set updates, this may contain signatures belonging
// to either the old or new guardian set.
signatures map[ethcommon.Address][]byte
// Flag set after reaching quorum and submitting the VAA.
submitted bool
// Flag set by the cleanup service after the settlement timeout has expired and misses were counted.
settled bool
// Human-readable description of the VAA's source, used for metrics.
source string
// Number of times the cleanup service has attempted to retransmit this VAA.
retryCount uint
// Copy of the bytes we submitted (ourVAA, but signed and serialized). Used for retransmissions.
ourMsg []byte
// Copy of the guardian set valid at lockup/injection time.
gs *common.GuardianSet
}
vaaMap map[string]*vaaState