Add metrics to track strategy operations.

This commit is contained in:
Jim McDonald 2021-05-19 15:29:23 +01:00
parent 1439ee1937
commit b825ba40b7
No known key found for this signature in database
GPG Key ID: 89CEB61B2AD2A5E7
9 changed files with 79 additions and 3 deletions

View File

@ -1,4 +1,5 @@
1.1.0:
- added metrics to track strategy operation results
- provide release metric in `vouch_release`
- provide ready metric in `vouch_ready`
- handle chain reorganisations, updating duties as appropriate

View File

@ -56,7 +56,7 @@ Vouch's job scheduler provides a number of metrics. The specific metrics are:
## Client operations
Client operations metrics provide information about the response time of beacon nodes, as well as whether the requests to them succeeded or failed. This can be used to understand how quickly and how well beacon nodes are responding to requests; for example, if Vouch is using multiple beacon nodes in different data centres, this can be used to obtain data about their response times due to network latency.
`vouch_client_opeation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has two labels:
`vouch_client_operation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has two labels:
- `proposer` is the endpoint for the operation
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
@ -67,6 +67,21 @@ There is also a companion metric `vouch_client_operation_requests_total`, which
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
- `result` is the result of the operation, either "succeeded" or "failed"
## Strategy operations
Strategy operations metrics provide information about the results and calculation times of strategies. This can be used to understand which beacon nodes are providing the most useful information to Vouch, and how quickly Vouch is deciding on which data to use in its attestations and proposals.
`vouch_strategy_operation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has three labels:
- `strategy` is the strategy for the operation
- `provider` is the provider for the operation
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
There is also a companion metric `vouch_strategy_operation_used_total`, which is a simple count of the number of operations that have taken place, recorded against the provider whose result was used. It has three labels:
- `strategy` is the strategy for the operation
- `provider` is the provider for the operation
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
## Network
Network metrics provide information about the network from Vouch's point of view. Although these are not under Vouch's control, they have an impact on the performance of the validator. The specific metrics are:

View File

@ -77,3 +77,7 @@ func (s *Service) Accounts(state string, count uint64) {}
// ClientOperation is the client-operation monitoring hook.
// This implementation has an empty body: the provider, operation name,
// success flag and duration are accepted and nothing is recorded.
func (s *Service) ClientOperation(provider string, name string, succeeded bool, duration time.Duration) {}
// StrategyOperation is the strategy-operation monitoring hook.
// This implementation has an empty body: the strategy, provider, operation
// and duration are accepted and nothing is recorded.
func (s *Service) StrategyOperation(strategy string, provider string, operation string, duration time.Duration) {}

View File

@ -43,6 +43,30 @@ func (s *Service) setupClientMetrics() error {
if err := prometheus.Register(s.clientOperationTimer); err != nil {
return err
}
s.strategyOperationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "vouch",
Subsystem: "strategy_operation",
Name: "used_total",
Help: "The results used by a strategy.",
}, []string{"strategy", "provider", "operation"})
if err := prometheus.Register(s.strategyOperationCounter); err != nil {
return err
}
s.strategyOperationTimer = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "vouch",
Subsystem: "strategy_operation",
Name: "duration_seconds",
Help: "The time vouch spends in strategy operations.",
Buckets: []float64{
0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0,
3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0,
},
}, []string{"strategy", "provider", "operation"})
if err := prometheus.Register(s.strategyOperationTimer); err != nil {
return err
}
return nil
}
@ -56,3 +80,9 @@ func (s *Service) ClientOperation(provider string, operation string, succeeded b
s.clientOperationCounter.WithLabelValues(provider, operation, "failed").Add(1)
}
}
// StrategyOperation records the outcome of a strategy operation.
// It adds one to the strategy operation counter and observes the supplied
// duration, in seconds, on the strategy operation timer. Both metrics are
// labelled with the strategy, provider and operation, in that order.
func (s *Service) StrategyOperation(strategy string, provider string, operation string, duration time.Duration) {
	elapsed := duration.Seconds()

	used := s.strategyOperationCounter.WithLabelValues(strategy, provider, operation)
	used.Add(1)

	timer := s.strategyOperationTimer.WithLabelValues(strategy, provider, operation)
	timer.Observe(elapsed)
}

View File

@ -50,8 +50,10 @@ type Service struct {
accountManagerAccounts *prometheus.GaugeVec
clientOperationCounter *prometheus.CounterVec
clientOperationTimer *prometheus.HistogramVec
clientOperationCounter *prometheus.CounterVec
clientOperationTimer *prometheus.HistogramVec
strategyOperationCounter *prometheus.CounterVec
strategyOperationTimer *prometheus.HistogramVec
}
// module-wide log.

View File

@ -83,6 +83,8 @@ type AccountManagerMonitor interface {
type ClientMonitor interface {
// ClientOperation provides a generic monitor for client operations.
// It is given the provider, the name of the operation, whether the
// operation succeeded, and how long the operation took.
ClientOperation(provider string, name string, succeeded bool, duration time.Duration)
// StrategyOperation provides a generic monitor for strategy operations.
// It is given the strategy, the provider, the operation, and a duration.
// In the callers visible in this change the provider is the one whose
// response the strategy selected, and the duration is the time elapsed
// since the strategy call started.
StrategyOperation(strategy string, provider string, operation string, duration time.Duration)
}
// ValidatorsManagerMonitor provides methods to monitor the validators manager.

View File

@ -23,6 +23,7 @@ import (
)
// aggregateAttestationResponse carries one provider's aggregate attestation,
// together with its score, over the response channel to the selection loop.
type aggregateAttestationResponse struct {
// provider is the name of the provider that supplied the aggregate.
provider string
// aggregate is the aggregate attestation obtained from the provider.
aggregate *spec.Attestation
// score is the value returned by scoreAggregateAttestation for this
// aggregate; the response with the highest score is selected.
score float64
}
@ -57,6 +58,7 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
score := s.scoreAggregateAttestation(ctx, name, aggregate)
respCh <- &aggregateAttestationResponse{
provider: name,
aggregate: aggregate,
score: score,
}
@ -68,6 +70,8 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
errored := 0
bestScore := float64(0)
var bestAggregateAttestation *spec.Attestation
bestProvider := ""
for responded+errored != len(s.aggregateAttestationProviders) {
select {
case <-ctx.Done():
@ -81,6 +85,7 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
if bestAggregateAttestation == nil || resp.score > bestScore {
bestAggregateAttestation = resp.aggregate
bestScore = resp.score
bestProvider = resp.provider
}
}
}
@ -90,6 +95,9 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
return nil, errors.New("no aggregate attestations received")
}
log.Trace().Stringer("aggregate_attestation", bestAggregateAttestation).Float64("score", bestScore).Msg("Selected best aggregate attestation")
if bestProvider != "" {
s.clientMonitor.StrategyOperation("best", bestProvider, "aggregate attestation", time.Since(started))
}
return bestAggregateAttestation, nil
}

View File

@ -23,6 +23,7 @@ import (
)
// attestationDataResponse carries one provider's attestation data, together
// with its score, over the response channel to the selection loop.
type attestationDataResponse struct {
// provider is the name of the provider that supplied the attestation data.
provider string
// attestationData is the attestation data obtained from the provider.
attestationData *spec.AttestationData
// score is the value returned by scoreAttestationData for this data;
// the response with the highest score is selected.
score float64
}
@ -53,6 +54,7 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
score := s.scoreAttestationData(ctx, provider, name, attestationData)
respCh <- &attestationDataResponse{
provider: name,
attestationData: attestationData,
score: score,
}
@ -64,6 +66,8 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
errored := 0
bestScore := float64(0)
var bestAttestationData *spec.AttestationData
bestProvider := ""
for responded+errored != len(s.attestationDataProviders) {
select {
case <-ctx.Done():
@ -77,6 +81,7 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
if bestAttestationData == nil || resp.score > bestScore {
bestAttestationData = resp.attestationData
bestScore = resp.score
bestProvider = resp.provider
}
}
}
@ -87,6 +92,9 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
return nil, errors.New("no attestations received")
}
log.Trace().Stringer("attestation_data", bestAttestationData).Float64("score", bestScore).Msg("Selected best attestation")
if bestProvider != "" {
s.clientMonitor.StrategyOperation("best", bestProvider, "attestation data", time.Since(started))
}
return bestAttestationData, nil
}

View File

@ -29,6 +29,7 @@ func (s *Service) BeaconBlockProposal(ctx context.Context, slot spec.Slot, randa
var mu sync.Mutex
bestScore := float64(0)
var bestProposal *spec.BeaconBlock
bestProvider := ""
started := time.Now()
sem := semaphore.NewWeighted(s.processConcurrency)
@ -80,11 +81,16 @@ func (s *Service) BeaconBlockProposal(ctx context.Context, slot spec.Slot, randa
if score > bestScore || bestProposal == nil {
bestScore = score
bestProposal = proposal
bestProvider = name
}
mu.Unlock()
}(ctx, sem, &wg, name, provider, &mu)
}
wg.Wait()
if bestProvider != "" {
s.clientMonitor.StrategyOperation("best", bestProvider, "beacon block proposal", time.Since(started))
}
return bestProposal, nil
}