Add the solana_leader_slots_by_epoch metric
This commit is contained in:
parent
7fbc2a450e
commit
70eafb273e
|
@ -2,6 +2,7 @@ package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
@ -43,9 +44,16 @@ var (
|
||||||
leaderSlotsTotal = prometheus.NewCounterVec(
|
leaderSlotsTotal = prometheus.NewCounterVec(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Name: "solana_leader_slots_total",
|
Name: "solana_leader_slots_total",
|
||||||
Help: "Number of leader slots per leader, grouped by skip status (max confirmation)",
|
Help: "(DEPRECATED) Number of leader slots per leader, grouped by skip status",
|
||||||
},
|
},
|
||||||
[]string{"status", "nodekey"})
|
[]string{"status", "nodekey"})
|
||||||
|
|
||||||
|
leaderSlotsByEpoch = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Name: "solana_leader_slots_by_epoch",
|
||||||
|
Help: "Number of leader slots per leader, grouped by skip status and epoch",
|
||||||
|
},
|
||||||
|
[]string{"status", "nodekey", "epoch"})
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
@ -55,14 +63,10 @@ func init() {
|
||||||
prometheus.MustRegister(epochFirstSlot)
|
prometheus.MustRegister(epochFirstSlot)
|
||||||
prometheus.MustRegister(epochLastSlot)
|
prometheus.MustRegister(epochLastSlot)
|
||||||
prometheus.MustRegister(leaderSlotsTotal)
|
prometheus.MustRegister(leaderSlotsTotal)
|
||||||
|
prometheus.MustRegister(leaderSlotsByEpoch)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *solanaCollector) WatchSlots() {
|
func (c *solanaCollector) WatchSlots() {
|
||||||
var (
|
|
||||||
// Last slot number we generated ticks for.
|
|
||||||
watermark int64
|
|
||||||
)
|
|
||||||
|
|
||||||
// Get current slot height and epoch info
|
// Get current slot height and epoch info
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), httpTimeout)
|
ctx, cancel := context.WithTimeout(context.Background(), httpTimeout)
|
||||||
info, err := c.rpcClient.GetEpochInfo(ctx, rpc.CommitmentMax)
|
info, err := c.rpcClient.GetEpochInfo(ctx, rpc.CommitmentMax)
|
||||||
|
@ -72,8 +76,11 @@ func (c *solanaCollector) WatchSlots() {
|
||||||
}
|
}
|
||||||
cancel()
|
cancel()
|
||||||
|
|
||||||
// Set watermark to current offset on startup (we do not backfill slots we missed at startup)
|
// watermark is the last slot number we generated ticks for. Set it to the current offset on startup (we do not backfill slots we missed at startup)
|
||||||
watermark = info.AbsoluteSlot
|
watermark := info.AbsoluteSlot
|
||||||
|
currentEpoch := info.Epoch
|
||||||
|
firstSlot := info.AbsoluteSlot - info.SlotIndex
|
||||||
|
lastSlot := firstSlot + info.SlotsInEpoch
|
||||||
|
|
||||||
ticker := time.NewTicker(slotPacerSchedule)
|
ticker := time.NewTicker(slotPacerSchedule)
|
||||||
|
|
||||||
|
@ -90,8 +97,18 @@ func (c *solanaCollector) WatchSlots() {
|
||||||
}
|
}
|
||||||
cancel()
|
cancel()
|
||||||
|
|
||||||
|
if currentEpoch != info.Epoch {
|
||||||
|
last, err := updateCounters(c, currentEpoch, watermark, &lastSlot)
|
||||||
|
if err != nil {
|
||||||
|
klog.Info(err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
watermark = last
|
||||||
|
}
|
||||||
|
|
||||||
|
currentEpoch = info.Epoch
|
||||||
// Calculate first and last slot in epoch.
|
// Calculate first and last slot in epoch.
|
||||||
firstSlot := info.AbsoluteSlot - info.SlotIndex
|
firstSlot = info.AbsoluteSlot - info.SlotIndex
|
||||||
lastSlot := firstSlot + info.SlotsInEpoch
|
lastSlot := firstSlot + info.SlotsInEpoch
|
||||||
|
|
||||||
totalTransactionsTotal.Set(float64(info.TransactionCount))
|
totalTransactionsTotal.Set(float64(info.TransactionCount))
|
||||||
|
@ -108,31 +125,74 @@ func (c *solanaCollector) WatchSlots() {
|
||||||
klog.Infof("confirmed slot %d (offset %d, +%d), epoch %d (from slot %d to %d, %d remaining)",
|
klog.Infof("confirmed slot %d (offset %d, +%d), epoch %d (from slot %d to %d, %d remaining)",
|
||||||
info.AbsoluteSlot, info.SlotIndex, info.AbsoluteSlot-watermark, info.Epoch, firstSlot, lastSlot, lastSlot-info.AbsoluteSlot)
|
info.AbsoluteSlot, info.SlotIndex, info.AbsoluteSlot-watermark, info.Epoch, firstSlot, lastSlot, lastSlot-info.AbsoluteSlot)
|
||||||
|
|
||||||
first := watermark + 1
|
last, err := updateCounters(c, currentEpoch, watermark, nil)
|
||||||
ctx, cancel = context.WithTimeout(context.Background(), 5*time.Minute)
|
|
||||||
blockProduction, err := c.rpcClient.GetBlockProduction(ctx, &first, nil)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.Infof("failed to fetch block production, retrying: %v", err)
|
klog.Info(err)
|
||||||
cancel()
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
watermark = last
|
||||||
for host, prod := range blockProduction.Hosts {
|
|
||||||
leaderSlotsTotal.
|
|
||||||
With(prometheus.Labels{"status": "valid", "nodekey": host}).
|
|
||||||
Add(float64(prod.BlocksProduced))
|
|
||||||
leaderSlotsTotal.
|
|
||||||
With(prometheus.Labels{"status": "skipped", "nodekey": host}).
|
|
||||||
Add(float64(prod.LeaderSlots - prod.BlocksProduced))
|
|
||||||
klog.V(1).Infof(
|
|
||||||
"Slot %d, node %s: Added %d valid and %d skipped slots",
|
|
||||||
blockProduction.LastSlot,
|
|
||||||
host,
|
|
||||||
prod.BlocksProduced,
|
|
||||||
prod.LeaderSlots-prod.BlocksProduced,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
watermark = blockProduction.LastSlot
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func updateCounters(c *solanaCollector, epoch, firstSlot int64, lastSlotOpt *int64) (int64, error) {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||||
|
|
||||||
|
var lastSlot int64
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if lastSlotOpt == nil {
|
||||||
|
klog.V(2).Info("LastSlot is nil, getting last published slot")
|
||||||
|
lastSlot, err = c.rpcClient.GetSlot(ctx)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
cancel()
|
||||||
|
return 0, fmt.Errorf("Error while getting the last slot: %v", err)
|
||||||
|
}
|
||||||
|
cancel()
|
||||||
|
} else {
|
||||||
|
lastSlot = *lastSlotOpt
|
||||||
|
}
|
||||||
|
klog.V(2).Infof("LastSlot is %d", lastSlot)
|
||||||
|
|
||||||
|
if firstSlot > lastSlot {
|
||||||
|
return 0, fmt.Errorf(
|
||||||
|
"In epoch %d, firstSlot (%d) > lastSlot (%d). This should not happen. Not updating.",
|
||||||
|
epoch,
|
||||||
|
firstSlot,
|
||||||
|
lastSlot,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), 5*time.Minute)
|
||||||
|
blockProduction, err := c.rpcClient.GetBlockProduction(ctx, &firstSlot, &lastSlot)
|
||||||
|
if err != nil {
|
||||||
|
cancel()
|
||||||
|
return 0, fmt.Errorf("failed to fetch block production, retrying: %v", err)
|
||||||
|
}
|
||||||
|
cancel()
|
||||||
|
|
||||||
|
for host, prod := range blockProduction.Hosts {
|
||||||
|
valid := float64(prod.BlocksProduced)
|
||||||
|
skipped := float64(prod.LeaderSlots - prod.BlocksProduced)
|
||||||
|
|
||||||
|
epochStr := fmt.Sprintf("%d", epoch)
|
||||||
|
|
||||||
|
leaderSlotsTotal.WithLabelValues("valid", host).Add(valid)
|
||||||
|
leaderSlotsTotal.WithLabelValues("skipped", host).Add(skipped)
|
||||||
|
|
||||||
|
leaderSlotsByEpoch.WithLabelValues("valid", host, epochStr).Add(valid)
|
||||||
|
leaderSlotsByEpoch.WithLabelValues("skipped", host, epochStr).Add(skipped)
|
||||||
|
|
||||||
|
klog.V(1).Infof(
|
||||||
|
"Epoch %s, slots %d-%d, node %s: Added %d valid and %d skipped slots",
|
||||||
|
epochStr,
|
||||||
|
firstSlot,
|
||||||
|
lastSlot,
|
||||||
|
host,
|
||||||
|
prod.BlocksProduced,
|
||||||
|
prod.LeaderSlots-prod.BlocksProduced,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
return lastSlot, nil
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
package rpc
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"k8s.io/klog/v2"
|
||||||
|
)
|
||||||
|
|
||||||
|
type getSlotResponse struct {
|
||||||
|
Result int64 `json:"result"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// https://solana.com/docs/rpc/http/getslot
|
||||||
|
func (c *RPCClient) GetSlot(ctx context.Context) (int64, error) {
|
||||||
|
body, err := c.rpcRequest(ctx, formatRPCRequest("getSlot", []interface{}{}))
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("RPC call failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
klog.V(2).Infof("getSlot response: %v", string(body))
|
||||||
|
|
||||||
|
var resp getSlotResponse
|
||||||
|
if err = json.Unmarshal(body, &resp); err != nil {
|
||||||
|
return 0, fmt.Errorf("failed to decode response body: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return resp.Result, nil
|
||||||
|
}
|
Loading…
Reference in New Issue