Add additional metrics (#2500)

* Add additional metrics
Continues addressing https://github.com/cosmos/cosmos-sdk/issues/2169.
* Add nop metrics to fix NPE
* Tweak buckets, code review
* Update buckets
* Update docs with new metrics
* Code review updates
This commit is contained in:
Matthew Slipper 2018-10-10 09:27:43 -07:00 committed by Alexander Simmerl
parent ee7b3d260e
commit 92343ef484
9 changed files with 183 additions and 45 deletions

View File

@ -44,6 +44,9 @@ type Metrics struct {
CommittedHeight metrics.Gauge
// Whether or not a node is fast syncing. 1 if yes, 0 if no.
FastSyncing metrics.Gauge
// Number of blockparts transmitted by peer.
BlockParts metrics.Counter
}
// PrometheusMetrics returns Metrics build using Prometheus client library.
@ -136,6 +139,12 @@ func PrometheusMetrics(namespace string) *Metrics {
Name: "fast_syncing",
Help: "Whether or not a node is fast syncing. 1 if yes, 0 if no.",
}, []string{}),
BlockParts: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "block_parts",
Help: "Number of blockparts transmitted by peer.",
}, []string{"peer_id"}),
}
}
@ -160,5 +169,6 @@ func NopMetrics() *Metrics {
TotalTxs: discard.NewGauge(),
CommittedHeight: discard.NewGauge(),
FastSyncing: discard.NewGauge(),
BlockParts: discard.NewCounter(),
}
}

View File

@ -8,7 +8,7 @@ import (
"github.com/pkg/errors"
amino "github.com/tendermint/go-amino"
"github.com/tendermint/go-amino"
cstypes "github.com/tendermint/tendermint/consensus/types"
cmn "github.com/tendermint/tendermint/libs/common"
@ -274,7 +274,7 @@ func (conR *ConsensusReactor) Receive(chID byte, src p2p.Peer, msgBytes []byte)
ps.ApplyProposalPOLMessage(msg)
case *BlockPartMessage:
ps.SetHasProposalBlockPart(msg.Height, msg.Round, msg.Part.Index)
conR.metrics.BlockParts.With("peer_id", string(src.ID())).Add(1)
conR.conS.peerMsgQueue <- msgInfo{msg, src.ID()}
default:
conR.Logger.Error(fmt.Sprintf("Unknown message type %v", reflect.TypeOf(msg)))

View File

@ -8,35 +8,45 @@ This functionality is disabled by default.
To enable the Prometheus metrics, set `instrumentation.prometheus=true` if your
config file. Metrics will be served under `/metrics` on 26660 port by default.
Listen address can be changed in the config file (see
`instrumentation.prometheus_listen_addr`).
`instrumentation.prometheus\_listen\_addr`).
## List of available metrics
The following metrics are available:
```
| Name | Type | Since | Description |
| --------------------------------------- | ------- | --------- | ----------------------------------------------------------------------------- |
| consensus_height | Gauge | 0.21.0 | Height of the chain |
| consensus_validators | Gauge | 0.21.0 | Number of validators |
| consensus_validators_power | Gauge | 0.21.0 | Total voting power of all validators |
| consensus_missing_validators | Gauge | 0.21.0 | Number of validators who did not sign |
| consensus_missing_validators_power | Gauge | 0.21.0 | Total voting power of the missing validators |
| consensus_byzantine_validators | Gauge | 0.21.0 | Number of validators who tried to double sign |
| consensus_byzantine_validators_power | Gauge | 0.21.0 | Total voting power of the byzantine validators |
| consensus_block_interval_seconds | Histogram | 0.21.0 | Time between this and last block (Block.Header.Time) in seconds |
| consensus_rounds | Gauge | 0.21.0 | Number of rounds |
| consensus_num_txs | Gauge | 0.21.0 | Number of transactions |
| mempool_size | Gauge | 0.21.0 | Number of uncommitted transactions |
| consensus_total_txs | Gauge | 0.21.0 | Total number of transactions committed |
| consensus_block_size_bytes | Gauge | 0.21.0 | Block size in bytes |
| p2p_peers | Gauge | 0.21.0 | Number of peers node's connected to |
```
| **Name** | **Type** | **Since** | **Tags** | **Description** |
|-----------------------------------------|-----------|-----------|----------|-----------------------------------------------------------------|
| consensus\_height | Gauge | 0.21.0 | | Height of the chain |
| consensus\_validators | Gauge | 0.21.0 | | Number of validators |
| consensus\_validators\_power | Gauge | 0.21.0 | | Total voting power of all validators |
| consensus\_missing\_validators | Gauge | 0.21.0 | | Number of validators who did not sign |
| consensus\_missing\_validators\_power | Gauge | 0.21.0 | | Total voting power of the missing validators |
| consensus\_byzantine\_validators | Gauge | 0.21.0 | | Number of validators who tried to double sign |
| consensus\_byzantine\_validators\_power | Gauge | 0.21.0 | | Total voting power of the byzantine validators |
| consensus\_block\_interval\_seconds | Histogram | 0.21.0 | | Time between this and last block (Block.Header.Time) in seconds |
| consensus\_rounds | Gauge | 0.21.0 | | Number of rounds |
| consensus\_num\_txs | Gauge | 0.21.0 | | Number of transactions |
| consensus\_block\_parts | counter | on dev | peer\_id | number of blockparts transmitted by peer |
| consensus\_latest\_block\_height | gauge | on dev | | /status sync\_info number |
| consensus\_fast\_syncing | gauge | on dev | | either 0 (not fast syncing) or 1 (syncing) |
| consensus\_total\_txs | Gauge | 0.21.0 | | Total number of transactions committed |
| consensus\_block\_size\_bytes | Gauge | 0.21.0 | | Block size in bytes |
| p2p\_peers | Gauge | 0.21.0 | | Number of peers node's connected to |
| p2p\_peer\_receive\_bytes\_total | counter | on dev | peer\_id | number of bytes received from a given peer |
| p2p\_peer\_send\_bytes\_total | counter | on dev | peer\_id | number of bytes sent to a given peer |
| p2p\_peer\_pending\_send\_bytes | gauge | on dev | peer\_id | number of pending bytes to be sent to a given peer |
| p2p\_num\_txs | gauge | on dev | peer\_id | number of transactions submitted by each peer\_id |
| p2p\_pending\_send\_bytes | gauge | on dev | peer\_id | amount of data pending to be sent to peer |
| mempool\_size | Gauge | 0.21.0 | | Number of uncommitted transactions |
| mempool\_tx\_size\_bytes | histogram | on dev | | transaction sizes in bytes |
| mempool\_failed\_txs | counter | on dev | | number of failed transactions |
| mempool\_recheck\_times | counter | on dev | | number of transactions rechecked in the mempool |
| state\_block\_processing\_time | histogram | on dev | | time between BeginBlock and EndBlock in ms |
## Useful queries
Percentage of missing + byzantine validators:
```
((consensus_byzantine_validators_power + consensus_missing_validators_power) / consensus_validators_power) * 100
((consensus\_byzantine\_validators\_power + consensus\_missing\_validators\_power) / consensus\_validators\_power) * 100
```

View File

@ -326,6 +326,7 @@ func (mem *Mempool) resCb(req *abci.Request, res *abci.Response) {
if mem.recheckCursor == nil {
mem.resCbNormal(req, res)
} else {
mem.metrics.RecheckTimes.Add(1)
mem.resCbRecheck(req, res)
}
mem.metrics.Size.Set(float64(mem.Size()))
@ -346,11 +347,12 @@ func (mem *Mempool) resCbNormal(req *abci.Request, res *abci.Response) {
}
mem.txs.PushBack(memTx)
mem.logger.Info("Added good transaction", "tx", TxID(tx), "res", r, "total", mem.Size())
mem.metrics.TxSizeBytes.Observe(float64(len(tx)))
mem.notifyTxsAvailable()
} else {
// ignore bad transaction
mem.logger.Info("Rejected bad transaction", "tx", TxID(tx), "res", r)
mem.metrics.FailedTxs.Add(1)
// remove from cache (it might be good later)
mem.cache.Remove(tx)
}

View File

@ -3,15 +3,23 @@ package mempool
import (
"github.com/go-kit/kit/metrics"
"github.com/go-kit/kit/metrics/discard"
prometheus "github.com/go-kit/kit/metrics/prometheus"
"github.com/go-kit/kit/metrics/prometheus"
stdprometheus "github.com/prometheus/client_golang/prometheus"
)
const MetricsSubsytem = "mempool"
// Metrics contains metrics exposed by this package.
// see MetricsProvider for descriptions.
type Metrics struct {
// Size of the mempool.
Size metrics.Gauge
// Histogram of transaction sizes, in bytes.
TxSizeBytes metrics.Histogram
// Number of failed transactions.
FailedTxs metrics.Counter
// Number of times transactions are rechecked in the mempool.
RecheckTimes metrics.Counter
}
// PrometheusMetrics returns Metrics build using Prometheus client library.
@ -19,16 +27,38 @@ func PrometheusMetrics(namespace string) *Metrics {
return &Metrics{
Size: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: "mempool",
Subsystem: MetricsSubsytem,
Name: "size",
Help: "Size of the mempool (number of uncommitted transactions).",
}, []string{}),
TxSizeBytes: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
Namespace: namespace,
Subsystem: MetricsSubsytem,
Name: "tx_size_bytes",
Help: "Transaction sizes in bytes.",
Buckets: stdprometheus.ExponentialBuckets(1, 3, 17),
}, []string{}),
FailedTxs: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsytem,
Name: "failed_txs",
Help: "Number of failed transactions.",
}, []string{}),
RecheckTimes: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsytem,
Name: "recheck_times",
Help: "Number of times transactions are rechecked in the mempool.",
}, []string{}),
}
}
// NopMetrics returns no-op Metrics.
func NopMetrics() *Metrics {
return &Metrics{
Size: discard.NewGauge(),
Size: discard.NewGauge(),
TxSizeBytes: discard.NewHistogram(),
FailedTxs: discard.NewCounter(),
RecheckTimes: discard.NewCounter(),
}
}

View File

@ -13,7 +13,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
amino "github.com/tendermint/go-amino"
"github.com/tendermint/go-amino"
abci "github.com/tendermint/tendermint/abci/types"
bc "github.com/tendermint/tendermint/blockchain"
@ -32,8 +32,8 @@ import (
rpccore "github.com/tendermint/tendermint/rpc/core"
ctypes "github.com/tendermint/tendermint/rpc/core/types"
grpccore "github.com/tendermint/tendermint/rpc/grpc"
rpc "github.com/tendermint/tendermint/rpc/lib"
rpcserver "github.com/tendermint/tendermint/rpc/lib/server"
"github.com/tendermint/tendermint/rpc/lib"
"github.com/tendermint/tendermint/rpc/lib/server"
sm "github.com/tendermint/tendermint/state"
"github.com/tendermint/tendermint/state/txindex"
"github.com/tendermint/tendermint/state/txindex/kv"
@ -98,16 +98,17 @@ func DefaultNewNode(config *cfg.Config, logger log.Logger) (*Node, error) {
}
// MetricsProvider returns a consensus, p2p and mempool Metrics.
type MetricsProvider func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics)
type MetricsProvider func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics, *sm.Metrics)
// DefaultMetricsProvider returns Metrics build using Prometheus client library
// if Prometheus is enabled. Otherwise, it returns no-op Metrics.
func DefaultMetricsProvider(config *cfg.InstrumentationConfig) MetricsProvider {
return func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics) {
return func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics, *sm.Metrics) {
if config.Prometheus {
return cs.PrometheusMetrics(config.Namespace), p2p.PrometheusMetrics(config.Namespace), mempl.PrometheusMetrics(config.Namespace)
return cs.PrometheusMetrics(config.Namespace), p2p.PrometheusMetrics(config.Namespace),
mempl.PrometheusMetrics(config.Namespace), sm.PrometheusMetrics(config.Namespace)
}
return cs.NopMetrics(), p2p.NopMetrics(), mempl.NopMetrics()
return cs.NopMetrics(), p2p.NopMetrics(), mempl.NopMetrics(), sm.NopMetrics()
}
}
@ -245,7 +246,7 @@ func NewNode(config *cfg.Config,
consensusLogger.Info("This node is not a validator", "addr", privValidator.GetAddress(), "pubKey", privValidator.GetPubKey())
}
csMetrics, p2pMetrics, memplMetrics := metricsProvider()
csMetrics, p2pMetrics, memplMetrics, smMetrics := metricsProvider()
// Make MempoolReactor
mempool := mempl.NewMempool(
@ -289,7 +290,14 @@ func NewNode(config *cfg.Config,
blockExecLogger := logger.With("module", "state")
// make block executor for consensus and blockchain reactors to execute blocks
blockExec := sm.NewBlockExecutor(stateDB, blockExecLogger, proxyApp.Consensus(), mempool, evidencePool)
blockExec := sm.NewBlockExecutor(
stateDB,
blockExecLogger,
proxyApp.Consensus(),
mempool,
evidencePool,
sm.BlockExecutorWithMetrics(smMetrics),
)
// Make BlockchainReactor
bcReactor := bc.NewBlockchainReactor(state.Copy(), blockExec, blockStore, fastSync)

View File

@ -3,7 +3,7 @@ package p2p
import (
"github.com/go-kit/kit/metrics"
"github.com/go-kit/kit/metrics/discard"
prometheus "github.com/go-kit/kit/metrics/prometheus"
"github.com/go-kit/kit/metrics/prometheus"
stdprometheus "github.com/prometheus/client_golang/prometheus"
)
@ -19,6 +19,8 @@ type Metrics struct {
PeerSendBytesTotal metrics.Counter
// Pending bytes to be sent to a given peer.
PeerPendingSendBytes metrics.Gauge
// Number of transactions submitted by each peer.
NumTxs metrics.Gauge
}
// PrometheusMetrics returns Metrics build using Prometheus client library.
@ -48,6 +50,13 @@ func PrometheusMetrics(namespace string) *Metrics {
Name: "peer_pending_send_bytes",
Help: "Number of pending bytes to be sent to a given peer.",
}, []string{"peer_id"}),
NumTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "num_txs",
Help: "Number of transactions submitted by each peer.",
}, []string{"peer_id"}),
}
}
@ -58,5 +67,6 @@ func NopMetrics() *Metrics {
PeerReceiveBytesTotal: discard.NewCounter(),
PeerSendBytesTotal: discard.NewCounter(),
PeerPendingSendBytes: discard.NewGauge(),
NumTxs: discard.NewGauge(),
}
}

View File

@ -2,8 +2,9 @@ package state
import (
"fmt"
"time"
fail "github.com/ebuchman/fail-test"
"github.com/ebuchman/fail-test"
abci "github.com/tendermint/tendermint/abci/types"
dbm "github.com/tendermint/tendermint/libs/db"
"github.com/tendermint/tendermint/libs/log"
@ -33,20 +34,37 @@ type BlockExecutor struct {
evpool EvidencePool
logger log.Logger
metrics *Metrics
}
type BlockExecutorOption func(executor *BlockExecutor)
func BlockExecutorWithMetrics(metrics *Metrics) BlockExecutorOption {
return func(blockExec *BlockExecutor) {
blockExec.metrics = metrics
}
}
// NewBlockExecutor returns a new BlockExecutor with a NopEventBus.
// Call SetEventBus to provide one.
func NewBlockExecutor(db dbm.DB, logger log.Logger, proxyApp proxy.AppConnConsensus,
mempool Mempool, evpool EvidencePool) *BlockExecutor {
return &BlockExecutor{
mempool Mempool, evpool EvidencePool, options ...BlockExecutorOption) *BlockExecutor {
res := &BlockExecutor{
db: db,
proxyApp: proxyApp,
eventBus: types.NopEventBus{},
mempool: mempool,
evpool: evpool,
logger: logger,
metrics: NopMetrics(),
}
for _, option := range options {
option(res)
}
return res
}
// SetEventBus - sets the event bus for publishing block related events.
@ -74,7 +92,10 @@ func (blockExec *BlockExecutor) ApplyBlock(state State, blockID types.BlockID, b
return state, ErrInvalidBlock(err)
}
startTime := time.Now().UnixNano()
abciResponses, err := execBlockOnProxyApp(blockExec.logger, blockExec.proxyApp, block, state.LastValidators, blockExec.db)
endTime := time.Now().UnixNano()
blockExec.metrics.BlockProcessingTime.Observe(float64(endTime - startTime) / 1000000)
if err != nil {
return state, ErrProxyAppConn(err)
}
@ -176,8 +197,13 @@ func (blockExec *BlockExecutor) Commit(
// Executes block's transactions on proxyAppConn.
// Returns a list of transaction results and updates to the validator set
func execBlockOnProxyApp(logger log.Logger, proxyAppConn proxy.AppConnConsensus,
block *types.Block, lastValSet *types.ValidatorSet, stateDB dbm.DB) (*ABCIResponses, error) {
func execBlockOnProxyApp(
logger log.Logger,
proxyAppConn proxy.AppConnConsensus,
block *types.Block,
lastValSet *types.ValidatorSet,
stateDB dbm.DB,
) (*ABCIResponses, error) {
var validTxs, invalidTxs = 0, 0
txIndex := 0
@ -333,8 +359,12 @@ func updateValidators(currentSet *types.ValidatorSet, abciUpdates []abci.Validat
}
// updateState returns a new State updated according to the header and responses.
func updateState(state State, blockID types.BlockID, header *types.Header,
abciResponses *ABCIResponses) (State, error) {
func updateState(
state State,
blockID types.BlockID,
header *types.Header,
abciResponses *ABCIResponses,
) (State, error) {
// Copy the valset so we can apply changes from EndBlock
// and update s.LastValidators and s.Validators.
@ -417,8 +447,13 @@ func fireEvents(logger log.Logger, eventBus types.BlockEventPublisher, block *ty
// ExecCommitBlock executes and commits a block on the proxyApp without validating or mutating the state.
// It returns the application root hash (result of abci.Commit).
func ExecCommitBlock(appConnConsensus proxy.AppConnConsensus, block *types.Block,
logger log.Logger, lastValSet *types.ValidatorSet, stateDB dbm.DB) ([]byte, error) {
func ExecCommitBlock(
appConnConsensus proxy.AppConnConsensus,
block *types.Block,
logger log.Logger,
lastValSet *types.ValidatorSet,
stateDB dbm.DB,
) ([]byte, error) {
_, err := execBlockOnProxyApp(logger, appConnConsensus, block, lastValSet, stateDB)
if err != nil {
logger.Error("Error executing block on proxy app", "height", block.Height, "err", err)

33
state/metrics.go Normal file
View File

@ -0,0 +1,33 @@
package state
import (
"github.com/go-kit/kit/metrics"
"github.com/go-kit/kit/metrics/prometheus"
stdprometheus "github.com/prometheus/client_golang/prometheus"
"github.com/go-kit/kit/metrics/discard"
)
const MetricsSubsystem = "state"
type Metrics struct {
// Time between BeginBlock and EndBlock.
BlockProcessingTime metrics.Histogram
}
func PrometheusMetrics(namespace string) *Metrics {
return &Metrics{
BlockProcessingTime: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "block_processing_time",
Help: "Time between BeginBlock and EndBlock in ms.",
Buckets: stdprometheus.LinearBuckets(1, 10, 10),
}, []string{}),
}
}
func NopMetrics() *Metrics {
return &Metrics{
BlockProcessingTime: discard.NewHistogram(),
}
}