Add metrics to track strategy operations.

This commit is contained in:
Jim McDonald 2021-05-19 15:29:23 +01:00
parent 1439ee1937
commit b825ba40b7
No known key found for this signature in database
GPG Key ID: 89CEB61B2AD2A5E7
9 changed files with 79 additions and 3 deletions

View File

@ -1,4 +1,5 @@
1.1.0: 1.1.0:
- added metrics to track strategy operation results
- provide release metric in `vouch_release` - provide release metric in `vouch_release`
- provide ready metric in `vouch_ready` - provide ready metric in `vouch_ready`
- handle chain reorganisations, updating duties as appropriate - handle chain reorganisations, updating duties as appropriate

View File

@ -56,7 +56,7 @@ Vouch's job scheduler provides a number of metrics. The specific metrics are:
## Client operations ## Client operations
Client operations metrics provide information about the response time of beacon nodes, as well as if the request to them succeeded or failed. This can be used to understand how quickly and how well beacon nodes are responding to requests, for example if Vouch using multiple beacon nodes in different data centres this can be used to obtain data about their response times due to network latency. Client operations metrics provide information about the response time of beacon nodes, as well as if the request to them succeeded or failed. This can be used to understand how quickly and how well beacon nodes are responding to requests, for example if Vouch using multiple beacon nodes in different data centres this can be used to obtain data about their response times due to network latency.
`vouch_client_opeation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has two labels: `vouch_client_operation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has two labels:
- `proposer` is the endpoint for the operation - `proposer` is the endpoint for the operation
- `operation` is the operation that took place (_e.g._ "beacon block proposal") - `operation` is the operation that took place (_e.g._ "beacon block proposal")
@ -67,6 +67,21 @@ There is also a companion metric `vouch_client_operation_requests_total`, which
- `operation` is the operation that took place (_e.g._ "beacon block proposal") - `operation` is the operation that took place (_e.g._ "beacon block proposal")
- `result` is the result of the operation, either "succeeded" or "failed" - `result` is the result of the operation, either "succeeded" or "failed"
## Strategy operations
Strategy operations metrics provide information about the results and calculation times of strategies. This can be used to understand which beacon nodes are providing the most useful information to Vouch, and how quickly Vouch is deciding on which data to use in its attestations and proposals.
`vouch_strategy_operation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has three labels:
- `strategy` is the strategy for the operation
- `provider` is the provider for the operation
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
There is also a companion metric `vouch_strategy_operation_used_total`, which is a simple count of the number of operations that have taken place. It has three labels:
- `strategy` is the strategy for the operation
- `provider` is the provider for the operation
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
## Network ## Network
Network metrics provide information about the network from vouch's point of view. Although these are not under vouch's control, they have an impact on the performance of the validator. The specific metrics are: Network metrics provide information about the network from vouch's point of view. Although these are not under vouch's control, they have an impact on the performance of the validator. The specific metrics are:

View File

@ -77,3 +77,7 @@ func (s *Service) Accounts(state string, count uint64) {}
// ClientOperation provides a generic monitor for client operations. // ClientOperation provides a generic monitor for client operations.
func (s *Service) ClientOperation(provider string, name string, succeeded bool, duration time.Duration) { func (s *Service) ClientOperation(provider string, name string, succeeded bool, duration time.Duration) {
} }
// StrategyOperation is the monitoring hook for strategy operations.
// This implementation has an empty body, so every call is discarded.
func (s *Service) StrategyOperation(strategy, provider, operation string, duration time.Duration) {}

View File

@ -43,6 +43,30 @@ func (s *Service) setupClientMetrics() error {
if err := prometheus.Register(s.clientOperationTimer); err != nil { if err := prometheus.Register(s.clientOperationTimer); err != nil {
return err return err
} }
s.strategyOperationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "vouch",
Subsystem: "strategy_operation",
Name: "used_total",
Help: "The results used by a strategy.",
}, []string{"strategy", "provider", "operation"})
if err := prometheus.Register(s.strategyOperationCounter); err != nil {
return err
}
s.strategyOperationTimer = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "vouch",
Subsystem: "strategy_operation",
Name: "duration_seconds",
Help: "The time vouch spends in strategy operations.",
Buckets: []float64{
0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0,
3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0,
},
}, []string{"strategy", "provider", "operation"})
if err := prometheus.Register(s.strategyOperationTimer); err != nil {
return err
}
return nil return nil
} }
@ -56,3 +80,9 @@ func (s *Service) ClientOperation(provider string, operation string, succeeded b
s.clientOperationCounter.WithLabelValues(provider, operation, "failed").Add(1) s.clientOperationCounter.WithLabelValues(provider, operation, "failed").Add(1)
} }
} }
// StrategyOperation records one strategy operation in the Prometheus metrics.
// It increments the strategy operation counter by one and observes the
// duration, expressed in seconds, on the strategy operation timer. Both
// metrics are labelled with the strategy, provider and operation, in that order.
func (s *Service) StrategyOperation(strategy string, provider string, operation string, duration time.Duration) {
	// The counter and the timer take the same label values.
	labels := []string{strategy, provider, operation}

	s.strategyOperationCounter.WithLabelValues(labels...).Inc()
	s.strategyOperationTimer.WithLabelValues(labels...).Observe(duration.Seconds())
}

View File

@ -50,8 +50,10 @@ type Service struct {
accountManagerAccounts *prometheus.GaugeVec accountManagerAccounts *prometheus.GaugeVec
clientOperationCounter *prometheus.CounterVec clientOperationCounter *prometheus.CounterVec
clientOperationTimer *prometheus.HistogramVec clientOperationTimer *prometheus.HistogramVec
strategyOperationCounter *prometheus.CounterVec
strategyOperationTimer *prometheus.HistogramVec
} }
// module-wide log. // module-wide log.

View File

@ -83,6 +83,8 @@ type AccountManagerMonitor interface {
type ClientMonitor interface { type ClientMonitor interface {
// ClientOperation provides a generic monitor for client operations. // ClientOperation provides a generic monitor for client operations.
ClientOperation(provider string, name string, succeeded bool, duration time.Duration) ClientOperation(provider string, name string, succeeded bool, duration time.Duration)
// StrategyOperation provides a generic monitor for strategy operations.
StrategyOperation(strategy string, provider string, operation string, duration time.Duration)
} }
// ValidatorsManagerMonitor provides methods to monitor the validators manager. // ValidatorsManagerMonitor provides methods to monitor the validators manager.

View File

@ -23,6 +23,7 @@ import (
) )
type aggregateAttestationResponse struct { type aggregateAttestationResponse struct {
provider string
aggregate *spec.Attestation aggregate *spec.Attestation
score float64 score float64
} }
@ -57,6 +58,7 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
score := s.scoreAggregateAttestation(ctx, name, aggregate) score := s.scoreAggregateAttestation(ctx, name, aggregate)
respCh <- &aggregateAttestationResponse{ respCh <- &aggregateAttestationResponse{
provider: name,
aggregate: aggregate, aggregate: aggregate,
score: score, score: score,
} }
@ -68,6 +70,8 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
errored := 0 errored := 0
bestScore := float64(0) bestScore := float64(0)
var bestAggregateAttestation *spec.Attestation var bestAggregateAttestation *spec.Attestation
bestProvider := ""
for responded+errored != len(s.aggregateAttestationProviders) { for responded+errored != len(s.aggregateAttestationProviders) {
select { select {
case <-ctx.Done(): case <-ctx.Done():
@ -81,6 +85,7 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
if bestAggregateAttestation == nil || resp.score > bestScore { if bestAggregateAttestation == nil || resp.score > bestScore {
bestAggregateAttestation = resp.aggregate bestAggregateAttestation = resp.aggregate
bestScore = resp.score bestScore = resp.score
bestProvider = resp.provider
} }
} }
} }
@ -90,6 +95,9 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
return nil, errors.New("no aggregate attestations received") return nil, errors.New("no aggregate attestations received")
} }
log.Trace().Stringer("aggregate_attestation", bestAggregateAttestation).Float64("score", bestScore).Msg("Selected best aggregate attestation") log.Trace().Stringer("aggregate_attestation", bestAggregateAttestation).Float64("score", bestScore).Msg("Selected best aggregate attestation")
if bestProvider != "" {
s.clientMonitor.StrategyOperation("best", bestProvider, "aggregate attestation", time.Since(started))
}
return bestAggregateAttestation, nil return bestAggregateAttestation, nil
} }

View File

@ -23,6 +23,7 @@ import (
) )
type attestationDataResponse struct { type attestationDataResponse struct {
provider string
attestationData *spec.AttestationData attestationData *spec.AttestationData
score float64 score float64
} }
@ -53,6 +54,7 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
score := s.scoreAttestationData(ctx, provider, name, attestationData) score := s.scoreAttestationData(ctx, provider, name, attestationData)
respCh <- &attestationDataResponse{ respCh <- &attestationDataResponse{
provider: name,
attestationData: attestationData, attestationData: attestationData,
score: score, score: score,
} }
@ -64,6 +66,8 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
errored := 0 errored := 0
bestScore := float64(0) bestScore := float64(0)
var bestAttestationData *spec.AttestationData var bestAttestationData *spec.AttestationData
bestProvider := ""
for responded+errored != len(s.attestationDataProviders) { for responded+errored != len(s.attestationDataProviders) {
select { select {
case <-ctx.Done(): case <-ctx.Done():
@ -77,6 +81,7 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
if bestAttestationData == nil || resp.score > bestScore { if bestAttestationData == nil || resp.score > bestScore {
bestAttestationData = resp.attestationData bestAttestationData = resp.attestationData
bestScore = resp.score bestScore = resp.score
bestProvider = resp.provider
} }
} }
} }
@ -87,6 +92,9 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
return nil, errors.New("no attestations received") return nil, errors.New("no attestations received")
} }
log.Trace().Stringer("attestation_data", bestAttestationData).Float64("score", bestScore).Msg("Selected best attestation") log.Trace().Stringer("attestation_data", bestAttestationData).Float64("score", bestScore).Msg("Selected best attestation")
if bestProvider != "" {
s.clientMonitor.StrategyOperation("best", bestProvider, "attestation data", time.Since(started))
}
return bestAttestationData, nil return bestAttestationData, nil
} }

View File

@ -29,6 +29,7 @@ func (s *Service) BeaconBlockProposal(ctx context.Context, slot spec.Slot, randa
var mu sync.Mutex var mu sync.Mutex
bestScore := float64(0) bestScore := float64(0)
var bestProposal *spec.BeaconBlock var bestProposal *spec.BeaconBlock
bestProvider := ""
started := time.Now() started := time.Now()
sem := semaphore.NewWeighted(s.processConcurrency) sem := semaphore.NewWeighted(s.processConcurrency)
@ -80,11 +81,16 @@ func (s *Service) BeaconBlockProposal(ctx context.Context, slot spec.Slot, randa
if score > bestScore || bestProposal == nil { if score > bestScore || bestProposal == nil {
bestScore = score bestScore = score
bestProposal = proposal bestProposal = proposal
bestProvider = name
} }
mu.Unlock() mu.Unlock()
}(ctx, sem, &wg, name, provider, &mu) }(ctx, sem, &wg, name, provider, &mu)
} }
wg.Wait() wg.Wait()
if bestProvider != "" {
s.clientMonitor.StrategyOperation("best", bestProvider, "beacon block proposal", time.Since(started))
}
return bestProposal, nil return bestProposal, nil
} }