diff --git a/CHANGELOG.md b/CHANGELOG.md index b27842d..004ea20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ 1.1.0: + - added metrics to track strategy operation results - provide release metric in `vouch_release` - provide ready metric in `vouch_ready` - handle chain reorganisations, updating duties as appropriate diff --git a/docs/metrics/prometheus.md b/docs/metrics/prometheus.md index e9ac084..7da9cef 100644 --- a/docs/metrics/prometheus.md +++ b/docs/metrics/prometheus.md @@ -56,7 +56,7 @@ Vouch's job scheduler provides a number of metrics. The specific metrics are: ## Client operations Client operations metrics provide information about the response time of beacon nodes, as well as if the request to them succeeded or failed. This can be used to understand how quickly and how well beacon nodes are responding to requests, for example if Vouch using multiple beacon nodes in different data centres this can be used to obtain data about their response times due to network latency. -`vouch_client_opeation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has two labels: +`vouch_client_operation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has two labels: - `proposer` is the endpoint for the operation - `operation` is the operation that took place (_e.g._ "beacon block proposal") @@ -67,6 +67,21 @@ There is also a companion metric `vouch_client_operation_requests_total`, which - `operation` is the operation that took place (_e.g._ "beacon block proposal") - `result` is the result of the operation, either "succeeded" or "failed" +## Strategy operations +Strategy operations metrics provide information about the results and calculation times of strategies. 
This can be used to understand which beacon nodes are providing the most useful information to Vouch, and how quickly Vouch is deciding on which data to use in its attestations and proposals. + +`vouch_strategy_operation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has three labels: + + - `strategy` is the strategy for the operation + - `provider` is the provider for the operation + - `operation` is the operation that took place (_e.g._ "beacon block proposal") + +There is also a companion metric `vouch_strategy_operation_used_total`, which is a simple count of the number of operations that have taken place. It has three labels: + + - `strategy` is the strategy for the operation + - `provider` is the provider for the operation + - `operation` is the operation that took place (_e.g._ "beacon block proposal") + ## Network Network metrics provide information about the network from vouch's point of view. Although these are not under vouch's control, they have an impact on the performance of the validator. The specific metrics are: diff --git a/services/metrics/null/service.go b/services/metrics/null/service.go index 4210760..0c988d3 100644 --- a/services/metrics/null/service.go +++ b/services/metrics/null/service.go @@ -77,3 +77,7 @@ func (s *Service) Accounts(state string, count uint64) {} // ClientOperation provides a generic monitor for client operations. func (s *Service) ClientOperation(provider string, name string, succeeded bool, duration time.Duration) { } + +// StrategyOperation provides a generic monitor for strategy operations. 
+func (s *Service) StrategyOperation(strategy string, provider string, operation string, duration time.Duration) { +} diff --git a/services/metrics/prometheus/client.go b/services/metrics/prometheus/client.go index 80abfba..726edd2 100644 --- a/services/metrics/prometheus/client.go +++ b/services/metrics/prometheus/client.go @@ -43,6 +43,30 @@ func (s *Service) setupClientMetrics() error { if err := prometheus.Register(s.clientOperationTimer); err != nil { return err } + s.strategyOperationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "vouch", + Subsystem: "strategy_operation", + Name: "used_total", + Help: "The results used by a strategy.", + }, []string{"strategy", "provider", "operation"}) + if err := prometheus.Register(s.strategyOperationCounter); err != nil { + return err + } + s.strategyOperationTimer = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "vouch", + Subsystem: "strategy_operation", + Name: "duration_seconds", + Help: "The time vouch spends in strategy operations.", + Buckets: []float64{ + 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, + 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, + 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, + 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, + }, + }, []string{"strategy", "provider", "operation"}) + if err := prometheus.Register(s.strategyOperationTimer); err != nil { + return err + } return nil } @@ -56,3 +80,9 @@ func (s *Service) ClientOperation(provider string, operation string, succeeded b s.clientOperationCounter.WithLabelValues(provider, operation, "failed").Add(1) } } + +// StrategyOperation provides a generic monitor for strategy operations. 
+func (s *Service) StrategyOperation(strategy string, provider string, operation string, duration time.Duration) { + s.strategyOperationCounter.WithLabelValues(strategy, provider, operation).Add(1) + s.strategyOperationTimer.WithLabelValues(strategy, provider, operation).Observe(duration.Seconds()) +} diff --git a/services/metrics/prometheus/service.go b/services/metrics/prometheus/service.go index e3a4571..74965f1 100644 --- a/services/metrics/prometheus/service.go +++ b/services/metrics/prometheus/service.go @@ -50,8 +50,10 @@ type Service struct { accountManagerAccounts *prometheus.GaugeVec - clientOperationCounter *prometheus.CounterVec - clientOperationTimer *prometheus.HistogramVec + clientOperationCounter *prometheus.CounterVec + clientOperationTimer *prometheus.HistogramVec + strategyOperationCounter *prometheus.CounterVec + strategyOperationTimer *prometheus.HistogramVec } // module-wide log. diff --git a/services/metrics/service.go b/services/metrics/service.go index 5f29306..5ba3a0e 100644 --- a/services/metrics/service.go +++ b/services/metrics/service.go @@ -83,6 +83,8 @@ type AccountManagerMonitor interface { type ClientMonitor interface { // ClientOperation provides a generic monitor for client operations. ClientOperation(provider string, name string, succeeded bool, duration time.Duration) + // StrategyOperation provides a generic monitor for strategy operations. + StrategyOperation(strategy string, provider string, operation string, duration time.Duration) } // ValidatorsManagerMonitor provides methods to monitor the validators manager. 
diff --git a/strategies/aggregateattestation/best/aggregateattestation.go b/strategies/aggregateattestation/best/aggregateattestation.go index eaf6114..723af88 100644 --- a/strategies/aggregateattestation/best/aggregateattestation.go +++ b/strategies/aggregateattestation/best/aggregateattestation.go @@ -23,6 +23,7 @@ import ( ) type aggregateAttestationResponse struct { + provider string aggregate *spec.Attestation score float64 } @@ -57,6 +58,7 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte score := s.scoreAggregateAttestation(ctx, name, aggregate) respCh <- &aggregateAttestationResponse{ + provider: name, aggregate: aggregate, score: score, } @@ -68,6 +70,8 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte errored := 0 bestScore := float64(0) var bestAggregateAttestation *spec.Attestation + bestProvider := "" + for responded+errored != len(s.aggregateAttestationProviders) { select { case <-ctx.Done(): @@ -81,6 +85,7 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte if bestAggregateAttestation == nil || resp.score > bestScore { bestAggregateAttestation = resp.aggregate bestScore = resp.score + bestProvider = resp.provider } } } @@ -90,6 +95,9 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte return nil, errors.New("no aggregate attestations received") } log.Trace().Stringer("aggregate_attestation", bestAggregateAttestation).Float64("score", bestScore).Msg("Selected best aggregate attestation") + if bestProvider != "" { + s.clientMonitor.StrategyOperation("best", bestProvider, "aggregate attestation", time.Since(started)) + } return bestAggregateAttestation, nil } diff --git a/strategies/attestationdata/best/attestationdata.go b/strategies/attestationdata/best/attestationdata.go index 7d6abdc..dc5d0f3 100644 --- a/strategies/attestationdata/best/attestationdata.go +++ b/strategies/attestationdata/best/attestationdata.go @@ -23,6 
+23,7 @@ import ( ) type attestationDataResponse struct { + provider string attestationData *spec.AttestationData score float64 } @@ -53,6 +54,7 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee score := s.scoreAttestationData(ctx, provider, name, attestationData) respCh <- &attestationDataResponse{ + provider: name, attestationData: attestationData, score: score, } @@ -64,6 +66,8 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee errored := 0 bestScore := float64(0) var bestAttestationData *spec.AttestationData + bestProvider := "" + for responded+errored != len(s.attestationDataProviders) { select { case <-ctx.Done(): @@ -77,6 +81,7 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee if bestAttestationData == nil || resp.score > bestScore { bestAttestationData = resp.attestationData bestScore = resp.score + bestProvider = resp.provider } } } @@ -87,6 +92,9 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee return nil, errors.New("no attestations received") } log.Trace().Stringer("attestation_data", bestAttestationData).Float64("score", bestScore).Msg("Selected best attestation") + if bestProvider != "" { + s.clientMonitor.StrategyOperation("best", bestProvider, "attestation data", time.Since(started)) + } return bestAttestationData, nil } diff --git a/strategies/beaconblockproposal/best/beaconblockproposal.go b/strategies/beaconblockproposal/best/beaconblockproposal.go index 0d2a10b..709c3d5 100644 --- a/strategies/beaconblockproposal/best/beaconblockproposal.go +++ b/strategies/beaconblockproposal/best/beaconblockproposal.go @@ -29,6 +29,7 @@ func (s *Service) BeaconBlockProposal(ctx context.Context, slot spec.Slot, randa var mu sync.Mutex bestScore := float64(0) var bestProposal *spec.BeaconBlock + bestProvider := "" started := time.Now() sem := semaphore.NewWeighted(s.processConcurrency) @@ -80,11 +81,16 @@ func (s *Service) 
BeaconBlockProposal(ctx context.Context, slot spec.Slot, randa if score > bestScore || bestProposal == nil { bestScore = score bestProposal = proposal + bestProvider = name } mu.Unlock() }(ctx, sem, &wg, name, provider, &mu) } wg.Wait() + if bestProvider != "" { + s.clientMonitor.StrategyOperation("best", bestProvider, "beacon block proposal", time.Since(started)) + } + return bestProposal, nil }