diff --git a/node/pkg/processor/cleanup.go b/node/pkg/processor/cleanup.go index 40367792..b11632a3 100644 --- a/node/pkg/processor/cleanup.go +++ b/node/pkg/processor/cleanup.go @@ -4,6 +4,7 @@ import ( "context" "encoding/hex" "github.com/certusone/wormhole/node/pkg/common" + "github.com/certusone/wormhole/node/pkg/db" "github.com/certusone/wormhole/node/pkg/vaa" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -23,6 +24,11 @@ var ( Name: "wormhole_aggregation_state_expirations_total", Help: "Total number of expired submitted aggregation states", }) + aggregationStateLate = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "wormhole_aggregation_state_late_total", + Help: "Total number of late aggregation states (cluster achieved consensus without us)", + }) aggregationStateTimeout = promauto.NewCounter( prometheus.CounterOpts{ Name: "wormhole_aggregation_state_timeout_total", @@ -58,6 +64,29 @@ func (p *Processor) handleCleanup(ctx context.Context) { delta := time.Since(s.firstObserved) switch { + case !s.submitted && s.ourVAA != nil && delta > settlementTime: + // Expire pending VAAs post settlement time if we have a stored quorum VAA. + // + // This occurs when we observed a message after the cluster has already reached + // consensus on it, causing us to never achieve quorum. + + if _, err := p.db.GetSignedVAABytes(*db.VaaIDFromVAA(s.ourVAA)); err == nil { + // If we have a stored quorum VAA, we can safely expire the state. + // + // This is a rare case, and we can safely expire the state, since we + // have a quorum VAA. + p.logger.Info("Expiring late VAA", zap.String("digest", hash), zap.Duration("delta", delta)) + aggregationStateLate.Inc() + delete(p.state.vaaSignatures, hash) + break + } else if err != db.ErrVAANotFound { + p.logger.Error("failed to look up VAA in database", + zap.String("digest", hash), + zap.Error(err), + ) + } + + fallthrough case !s.settled && delta > settlementTime: // After 30 seconds, the VAA is considered settled - it's unlikely that more observations will // arrive, barring special circumstances. This is a better time to count misses than submission,