mirror of https://github.com/certusone/vouch.git
Add metrics to track strategy operations.
This commit is contained in:
parent
1439ee1937
commit
b825ba40b7
|
@ -1,4 +1,5 @@
|
||||||
1.1.0:
|
1.1.0:
|
||||||
|
- added metrics to track strategy operation results
|
||||||
- provide release metric in `vouch_release`
|
- provide release metric in `vouch_release`
|
||||||
- provide ready metric in `vouch_ready`
|
- provide ready metric in `vouch_ready`
|
||||||
- handle chain reorganisations, updating duties as appropriate
|
- handle chain reorganisations, updating duties as appropriate
|
||||||
|
|
|
@ -56,7 +56,7 @@ Vouch's job scheduler provides a number of metrics. The specific metrics are:
|
||||||
## Client operations
|
## Client operations
|
||||||
Client operations metrics provide information about the response time of beacon nodes, as well as if the request to them succeeded or failed. This can be used to understand how quickly and how well beacon nodes are responding to requests, for example if Vouch is using multiple beacon nodes in different data centres this can be used to obtain data about their response times due to network latency.
|
Client operations metrics provide information about the response time of beacon nodes, as well as if the request to them succeeded or failed. This can be used to understand how quickly and how well beacon nodes are responding to requests, for example if Vouch is using multiple beacon nodes in different data centres this can be used to obtain data about their response times due to network latency.
|
||||||
|
|
||||||
`vouch_client_opeation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has two labels:
|
`vouch_client_operation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has two labels:
|
||||||
|
|
||||||
- `proposer` is the endpoint for the operation
|
- `proposer` is the endpoint for the operation
|
||||||
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
|
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
|
||||||
|
@ -67,6 +67,21 @@ There is also a companion metric `vouch_client_operation_requests_total`, which
|
||||||
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
|
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
|
||||||
- `result` is the result of the operation, either "succeeded" or "failed"
|
- `result` is the result of the operation, either "succeeded" or "failed"
|
||||||
|
|
||||||
|
## Strategy operations
|
||||||
|
Strategy operations metrics provide information about the results and calculation times of strategies. This can be used to understand which beacon nodes are providing the most useful information to Vouch, and how quickly Vouch is deciding on which data to use in its attestations and proposals.
|
||||||
|
|
||||||
|
`vouch_strategy_operation_duration_seconds` is provided as a histogram, with buckets in increments of 0.1 seconds up to 4 seconds. It has three labels:
|
||||||
|
|
||||||
|
- `strategy` is the strategy for the operation
|
||||||
|
- `provider` is the provider for the operation
|
||||||
|
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
|
||||||
|
|
||||||
|
There is also a companion metric `vouch_strategy_operation_used_total`, which is a simple count of the number of times a provider's result has been used by a strategy. It has three labels:
|
||||||
|
|
||||||
|
- `strategy` is the strategy for the operation
|
||||||
|
- `provider` is the provider for the operation
|
||||||
|
- `operation` is the operation that took place (_e.g._ "beacon block proposal")
|
||||||
|
|
||||||
## Network
|
## Network
|
||||||
Network metrics provide information about the network from vouch's point of view. Although these are not under vouch's control, they have an impact on the performance of the validator. The specific metrics are:
|
Network metrics provide information about the network from vouch's point of view. Although these are not under vouch's control, they have an impact on the performance of the validator. The specific metrics are:
|
||||||
|
|
||||||
|
|
|
@ -77,3 +77,7 @@ func (s *Service) Accounts(state string, count uint64) {}
|
||||||
// ClientOperation provides a generic monitor for client operations.
|
// ClientOperation provides a generic monitor for client operations.
|
||||||
func (s *Service) ClientOperation(provider string, name string, succeeded bool, duration time.Duration) {
|
func (s *Service) ClientOperation(provider string, name string, succeeded bool, duration time.Duration) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// StrategyOperation provides a generic monitor for strategy operations.
// This implementation has an empty body, so the supplied strategy, provider,
// operation and duration are accepted and nothing is recorded.
|
||||||
|
func (s *Service) StrategyOperation(strategy string, provider string, operation string, duration time.Duration) {
|
||||||
|
}
|
||||||
|
|
|
@ -43,6 +43,30 @@ func (s *Service) setupClientMetrics() error {
|
||||||
if err := prometheus.Register(s.clientOperationTimer); err != nil {
|
if err := prometheus.Register(s.clientOperationTimer); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
s.strategyOperationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Namespace: "vouch",
|
||||||
|
Subsystem: "strategy_operation",
|
||||||
|
Name: "used_total",
|
||||||
|
Help: "The results used by a strategy.",
|
||||||
|
}, []string{"strategy", "provider", "operation"})
|
||||||
|
if err := prometheus.Register(s.strategyOperationCounter); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
s.strategyOperationTimer = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: "vouch",
|
||||||
|
Subsystem: "strategy_operation",
|
||||||
|
Name: "duration_seconds",
|
||||||
|
Help: "The time vouch spends in strategy operations.",
|
||||||
|
Buckets: []float64{
|
||||||
|
0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
|
||||||
|
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
|
||||||
|
2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0,
|
||||||
|
3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0,
|
||||||
|
},
|
||||||
|
}, []string{"strategy", "provider", "operation"})
|
||||||
|
if err := prometheus.Register(s.strategyOperationTimer); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -56,3 +80,9 @@ func (s *Service) ClientOperation(provider string, operation string, succeeded b
|
||||||
s.clientOperationCounter.WithLabelValues(provider, operation, "failed").Add(1)
|
s.clientOperationCounter.WithLabelValues(provider, operation, "failed").Add(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// StrategyOperation provides a generic monitor for strategy operations.
// It adds 1 to the strategy operation counter and records the duration, in
// seconds, in the strategy operation histogram; both are labelled with the
// supplied strategy, provider and operation values.
|
||||||
|
func (s *Service) StrategyOperation(strategy string, provider string, operation string, duration time.Duration) {
|
||||||
|
s.strategyOperationCounter.WithLabelValues(strategy, provider, operation).Add(1)
|
||||||
|
s.strategyOperationTimer.WithLabelValues(strategy, provider, operation).Observe(duration.Seconds())
|
||||||
|
}
|
||||||
|
|
|
@ -50,8 +50,10 @@ type Service struct {
|
||||||
|
|
||||||
accountManagerAccounts *prometheus.GaugeVec
|
accountManagerAccounts *prometheus.GaugeVec
|
||||||
|
|
||||||
clientOperationCounter *prometheus.CounterVec
|
clientOperationCounter *prometheus.CounterVec
|
||||||
clientOperationTimer *prometheus.HistogramVec
|
clientOperationTimer *prometheus.HistogramVec
|
||||||
|
strategyOperationCounter *prometheus.CounterVec
|
||||||
|
strategyOperationTimer *prometheus.HistogramVec
|
||||||
}
|
}
|
||||||
|
|
||||||
// module-wide log.
|
// module-wide log.
|
||||||
|
|
|
@ -83,6 +83,8 @@ type AccountManagerMonitor interface {
|
||||||
type ClientMonitor interface {
|
type ClientMonitor interface {
|
||||||
// ClientOperation provides a generic monitor for client operations.
|
// ClientOperation provides a generic monitor for client operations.
|
||||||
ClientOperation(provider string, name string, succeeded bool, duration time.Duration)
|
ClientOperation(provider string, name string, succeeded bool, duration time.Duration)
|
||||||
|
// StrategyOperation provides a generic monitor for strategy operations.
|
||||||
|
StrategyOperation(strategy string, provider string, operation string, duration time.Duration)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ValidatorsManagerMonitor provides methods to monitor the validators manager.
|
// ValidatorsManagerMonitor provides methods to monitor the validators manager.
|
||||||
|
|
|
@ -23,6 +23,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
type aggregateAttestationResponse struct {
|
type aggregateAttestationResponse struct {
|
||||||
|
provider string
|
||||||
aggregate *spec.Attestation
|
aggregate *spec.Attestation
|
||||||
score float64
|
score float64
|
||||||
}
|
}
|
||||||
|
@ -57,6 +58,7 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
|
||||||
|
|
||||||
score := s.scoreAggregateAttestation(ctx, name, aggregate)
|
score := s.scoreAggregateAttestation(ctx, name, aggregate)
|
||||||
respCh <- &aggregateAttestationResponse{
|
respCh <- &aggregateAttestationResponse{
|
||||||
|
provider: name,
|
||||||
aggregate: aggregate,
|
aggregate: aggregate,
|
||||||
score: score,
|
score: score,
|
||||||
}
|
}
|
||||||
|
@ -68,6 +70,8 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
|
||||||
errored := 0
|
errored := 0
|
||||||
bestScore := float64(0)
|
bestScore := float64(0)
|
||||||
var bestAggregateAttestation *spec.Attestation
|
var bestAggregateAttestation *spec.Attestation
|
||||||
|
bestProvider := ""
|
||||||
|
|
||||||
for responded+errored != len(s.aggregateAttestationProviders) {
|
for responded+errored != len(s.aggregateAttestationProviders) {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
@ -81,6 +85,7 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
|
||||||
if bestAggregateAttestation == nil || resp.score > bestScore {
|
if bestAggregateAttestation == nil || resp.score > bestScore {
|
||||||
bestAggregateAttestation = resp.aggregate
|
bestAggregateAttestation = resp.aggregate
|
||||||
bestScore = resp.score
|
bestScore = resp.score
|
||||||
|
bestProvider = resp.provider
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -90,6 +95,9 @@ func (s *Service) AggregateAttestation(ctx context.Context, slot spec.Slot, atte
|
||||||
return nil, errors.New("no aggregate attestations received")
|
return nil, errors.New("no aggregate attestations received")
|
||||||
}
|
}
|
||||||
log.Trace().Stringer("aggregate_attestation", bestAggregateAttestation).Float64("score", bestScore).Msg("Selected best aggregate attestation")
|
log.Trace().Stringer("aggregate_attestation", bestAggregateAttestation).Float64("score", bestScore).Msg("Selected best aggregate attestation")
|
||||||
|
if bestProvider != "" {
|
||||||
|
s.clientMonitor.StrategyOperation("best", bestProvider, "aggregate attestation", time.Since(started))
|
||||||
|
}
|
||||||
|
|
||||||
return bestAggregateAttestation, nil
|
return bestAggregateAttestation, nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
type attestationDataResponse struct {
|
type attestationDataResponse struct {
|
||||||
|
provider string
|
||||||
attestationData *spec.AttestationData
|
attestationData *spec.AttestationData
|
||||||
score float64
|
score float64
|
||||||
}
|
}
|
||||||
|
@ -53,6 +54,7 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
|
||||||
|
|
||||||
score := s.scoreAttestationData(ctx, provider, name, attestationData)
|
score := s.scoreAttestationData(ctx, provider, name, attestationData)
|
||||||
respCh <- &attestationDataResponse{
|
respCh <- &attestationDataResponse{
|
||||||
|
provider: name,
|
||||||
attestationData: attestationData,
|
attestationData: attestationData,
|
||||||
score: score,
|
score: score,
|
||||||
}
|
}
|
||||||
|
@ -64,6 +66,8 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
|
||||||
errored := 0
|
errored := 0
|
||||||
bestScore := float64(0)
|
bestScore := float64(0)
|
||||||
var bestAttestationData *spec.AttestationData
|
var bestAttestationData *spec.AttestationData
|
||||||
|
bestProvider := ""
|
||||||
|
|
||||||
for responded+errored != len(s.attestationDataProviders) {
|
for responded+errored != len(s.attestationDataProviders) {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
@ -77,6 +81,7 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
|
||||||
if bestAttestationData == nil || resp.score > bestScore {
|
if bestAttestationData == nil || resp.score > bestScore {
|
||||||
bestAttestationData = resp.attestationData
|
bestAttestationData = resp.attestationData
|
||||||
bestScore = resp.score
|
bestScore = resp.score
|
||||||
|
bestProvider = resp.provider
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -87,6 +92,9 @@ func (s *Service) AttestationData(ctx context.Context, slot spec.Slot, committee
|
||||||
return nil, errors.New("no attestations received")
|
return nil, errors.New("no attestations received")
|
||||||
}
|
}
|
||||||
log.Trace().Stringer("attestation_data", bestAttestationData).Float64("score", bestScore).Msg("Selected best attestation")
|
log.Trace().Stringer("attestation_data", bestAttestationData).Float64("score", bestScore).Msg("Selected best attestation")
|
||||||
|
if bestProvider != "" {
|
||||||
|
s.clientMonitor.StrategyOperation("best", bestProvider, "attestation data", time.Since(started))
|
||||||
|
}
|
||||||
|
|
||||||
return bestAttestationData, nil
|
return bestAttestationData, nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,7 @@ func (s *Service) BeaconBlockProposal(ctx context.Context, slot spec.Slot, randa
|
||||||
var mu sync.Mutex
|
var mu sync.Mutex
|
||||||
bestScore := float64(0)
|
bestScore := float64(0)
|
||||||
var bestProposal *spec.BeaconBlock
|
var bestProposal *spec.BeaconBlock
|
||||||
|
bestProvider := ""
|
||||||
|
|
||||||
started := time.Now()
|
started := time.Now()
|
||||||
sem := semaphore.NewWeighted(s.processConcurrency)
|
sem := semaphore.NewWeighted(s.processConcurrency)
|
||||||
|
@ -80,11 +81,16 @@ func (s *Service) BeaconBlockProposal(ctx context.Context, slot spec.Slot, randa
|
||||||
if score > bestScore || bestProposal == nil {
|
if score > bestScore || bestProposal == nil {
|
||||||
bestScore = score
|
bestScore = score
|
||||||
bestProposal = proposal
|
bestProposal = proposal
|
||||||
|
bestProvider = name
|
||||||
}
|
}
|
||||||
mu.Unlock()
|
mu.Unlock()
|
||||||
}(ctx, sem, &wg, name, provider, &mu)
|
}(ctx, sem, &wg, name, provider, &mu)
|
||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
|
if bestProvider != "" {
|
||||||
|
s.clientMonitor.StrategyOperation("best", bestProvider, "beacon block proposal", time.Since(started))
|
||||||
|
}
|
||||||
|
|
||||||
return bestProposal, nil
|
return bestProposal, nil
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue