diff --git a/cmd/solana_exporter/exporter.go b/cmd/solana_exporter/exporter.go index eb7cc2e..51b092a 100644 --- a/cmd/solana_exporter/exporter.go +++ b/cmd/solana_exporter/exporter.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "github.com/asymmetric-research/solana_exporter/pkg/rpc" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -46,6 +47,8 @@ type SolanaCollector struct { validatorDelinquent *prometheus.Desc solanaVersion *prometheus.Desc balances *prometheus.Desc + isHealthy *prometheus.Desc + numSlotsBehind *prometheus.Desc } func NewSolanaCollector( @@ -97,6 +100,18 @@ func NewSolanaCollector( []string{AddressLabel}, nil, ), + isHealthy: prometheus.NewDesc( + "solana_is_healthy", + "Whether the node is healthy or not.", + nil, + nil, + ), + numSlotsBehind: prometheus.NewDesc( + "solana_num_slots_behind", + "The number of slots that the node is behind the latest cluster confirmed slot.", + nil, + nil, + ), } return collector } @@ -109,11 +124,14 @@ func (c *SolanaCollector) Describe(ch chan<- *prometheus.Desc) { ch <- c.validatorRootSlot ch <- c.validatorDelinquent ch <- c.balances + ch <- c.isHealthy + ch <- c.numSlotsBehind } func (c *SolanaCollector) collectVoteAccounts(ctx context.Context, ch chan<- prometheus.Metric) { voteAccounts, err := c.rpcClient.GetVoteAccounts(ctx, rpc.CommitmentConfirmed, nil) if err != nil { + klog.Errorf("failed to get vote accounts: %v", err) ch <- prometheus.NewInvalidMetric(c.totalValidatorsDesc, err) ch <- prometheus.NewInvalidMetric(c.validatorActivatedStake, err) ch <- prometheus.NewInvalidMetric(c.validatorLastVote, err) @@ -169,6 +187,7 @@ func (c *SolanaCollector) collectVersion(ctx context.Context, ch chan<- promethe version, err := c.rpcClient.GetVersion(ctx) if err != nil { + klog.Errorf("failed to get version: %v", err) ch <- prometheus.NewInvalidMetric(c.solanaVersion, err) return } @@ -179,6 +198,7 @@ func (c *SolanaCollector) collectVersion(ctx context.Context, ch chan<- promethe func (c *SolanaCollector) collectBalances(ctx context.Context, ch chan<- prometheus.Metric) { balances, err := FetchBalances(ctx, c.rpcClient, c.balanceAddresses) if err != nil { + klog.Errorf("failed to get balances: %v", err) ch <- prometheus.NewInvalidMetric(c.solanaVersion, err) return } @@ -188,6 +208,45 @@ func (c *SolanaCollector) collectBalances(ctx context.Context, ch chan<- prometh } } +func (c *SolanaCollector) collectHealth(ctx context.Context, ch chan<- prometheus.Metric) { + var ( + isHealthy = 1 + numSlotsBehind int64 + ) + + _, err := c.rpcClient.GetHealth(ctx) + if err != nil { + var rpcError *rpc.RPCError + if errors.As(err, &rpcError) { + var errorData rpc.NodeUnhealthyErrorData + if rpcError.Data == nil { + // if there is no data, then this is some unexpected error and should just be logged + klog.Errorf("failed to get health: %v", err) + ch <- prometheus.NewInvalidMetric(c.isHealthy, err) + ch <- prometheus.NewInvalidMetric(c.numSlotsBehind, err) + return + } + if err = rpc.UnpackRpcErrorData(rpcError, errorData); err != nil { + // if we error here, it means we have the incorrect format + klog.Fatalf("failed to unpack %s rpc error: %v", rpcError.Method, err.Error()) + } + isHealthy = 0 + numSlotsBehind = errorData.NumSlotsBehind + } else { + // if it's not an RPC error, log it + klog.Errorf("failed to get health: %v", err) + ch <- prometheus.NewInvalidMetric(c.isHealthy, err) + ch <- prometheus.NewInvalidMetric(c.numSlotsBehind, err) + return + } + } + + ch <- prometheus.MustNewConstMetric(c.isHealthy, prometheus.GaugeValue, float64(isHealthy)) + ch <- prometheus.MustNewConstMetric(c.numSlotsBehind, prometheus.GaugeValue, float64(numSlotsBehind)) + + return +} + func (c *SolanaCollector) Collect(ch chan<- prometheus.Metric) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -195,6 +254,7 @@ func (c *SolanaCollector) Collect(ch chan<- prometheus.Metric) { c.collectVoteAccounts(ctx, ch) c.collectVersion(ctx, ch) c.collectBalances(ctx, ch) + c.collectHealth(ctx, ch) } func main() { diff --git a/cmd/solana_exporter/exporter_test.go b/cmd/solana_exporter/exporter_test.go index a63edaf..38adb41 100644 --- a/cmd/solana_exporter/exporter_test.go +++ b/cmd/solana_exporter/exporter_test.go @@ -178,6 +178,12 @@ func (c *staticRPCClient) GetBlock( return nil, nil } +//goland:noinspection GoUnusedParameter +func (c *staticRPCClient) GetHealth(ctx context.Context) (*string, error) { + health := "ok" + return &health, nil +} + /* ===== DYNAMIC CLIENT =====: */ @@ -373,6 +379,12 @@ func (c *dynamicRPCClient) GetBlock( return nil, nil } +//goland:noinspection GoUnusedParameter +func (c *dynamicRPCClient) GetHealth(ctx context.Context) (*string, error) { + health := "ok" + return &health, nil +} + /* ===== OTHER TEST UTILITIES =====: */ diff --git a/cmd/solana_exporter/slots.go b/cmd/solana_exporter/slots.go index f4ec113..cd56485 100644 --- a/cmd/solana_exporter/slots.go +++ b/cmd/solana_exporter/slots.go @@ -205,7 +205,7 @@ func (c *SlotWatcher) WatchSlots(ctx context.Context, pace time.Duration) { // and updates the prometheus gauges associated with those metrics. func (c *SlotWatcher) trackEpoch(ctx context.Context, epoch *rpc.EpochInfo) { klog.Infof("Tracking epoch %v (from %v)", epoch.Epoch, c.currentEpoch) - firstSlot, lastSlot := getEpochBounds(epoch) + firstSlot, lastSlot := GetEpochBounds(epoch) // if we haven't yet set c.currentEpoch, that (hopefully) means this is the initial setup, // and so we can simply store the tracking numbers if c.currentEpoch == 0 { @@ -393,12 +393,6 @@ func (c *SlotWatcher) fetchAndEmitSingleBlockInfo( return nil } -// getEpochBounds returns the first slot and last slot within an [inclusive] Epoch -func getEpochBounds(info *rpc.EpochInfo) (int64, int64) { - firstSlot := info.AbsoluteSlot - info.SlotIndex - return firstSlot, firstSlot + info.SlotsInEpoch - 1 -} - // fetchAndEmitInflationRewards fetches and emits the inflation rewards for the configured inflationRewardAddresses // at the provided epoch func (c *SlotWatcher) fetchAndEmitInflationRewards(ctx context.Context, epoch int64) error { diff --git a/cmd/solana_exporter/slots_test.go b/cmd/solana_exporter/slots_test.go index bf07fbc..ca44114 100644 --- a/cmd/solana_exporter/slots_test.go +++ b/cmd/solana_exporter/slots_test.go @@ -106,7 +106,7 @@ func TestSolanaCollector_WatchSlots_Static(t *testing.T) { assert.NoError(t, err) time.Sleep(1 * time.Second) - firstSlot, lastSlot := getEpochBounds(&staticEpochInfo) + firstSlot, lastSlot := GetEpochBounds(&staticEpochInfo) type testCase struct { expectedValue float64 metric prometheus.Gauge diff --git a/cmd/solana_exporter/utils.go b/cmd/solana_exporter/utils.go index 492036f..0c66a36 100644 --- a/cmd/solana_exporter/utils.go +++ b/cmd/solana_exporter/utils.go @@ -113,3 +113,9 @@ func CombineUnique[T comparable](args ...[]T) []T { } return uniqueItems } + +// GetEpochBounds returns the first slot and last slot within an [inclusive] Epoch +func GetEpochBounds(info *rpc.EpochInfo) (int64, int64) { + firstSlot := info.AbsoluteSlot - info.SlotIndex + return firstSlot, firstSlot + info.SlotsInEpoch - 1 +} diff --git a/cmd/solana_exporter/utils_test.go b/cmd/solana_exporter/utils_test.go index 8054091..a05bea6 100644 --- a/cmd/solana_exporter/utils_test.go +++ b/cmd/solana_exporter/utils_test.go @@ -56,3 +56,10 @@ func TestGetAssociatedVoteAccounts(t *testing.T) { assert.NoError(t, err) assert.Equal(t, votekeys, voteAccounts) } + +func TestGetEpochBounds(t *testing.T) { + epoch := rpc.EpochInfo{AbsoluteSlot: 25, SlotIndex: 5, SlotsInEpoch: 10} + first, last := GetEpochBounds(&epoch) + assert.Equal(t, int64(20), first) + assert.Equal(t, int64(29), last) +} diff --git a/pkg/rpc/client.go b/pkg/rpc/client.go index a4817d1..8fcff76 100644 --- a/pkg/rpc/client.go +++ b/pkg/rpc/client.go @@ -73,6 +73,8 @@ type Provider interface { GetLeaderSchedule(ctx context.Context, commitment Commitment, slot int64) (map[string][]int64, error) GetBlock(ctx context.Context, commitment Commitment, slot int64, transactionDetails string) (*Block, error) + + GetHealth(ctx context.Context) (*string, error) } func (c Commitment) MarshalJSON() ([]byte, error) { @@ -138,6 +140,7 @@ func getResponse[T any]( // check for an actual rpc error if rpcResponse.Error.Code != 0 { + rpcResponse.Error.Method = method return &rpcResponse.Error } return nil @@ -301,3 +304,14 @@ func (c *Client) GetBlock( } return &resp.Result, nil } + +// GetHealth returns the current health of the node. A healthy node is one that is within a blockchain-configured slots +// of the latest cluster confirmed slot. +// See API docs: https://solana.com/docs/rpc/http/gethealth +func (c *Client) GetHealth(ctx context.Context) (*string, error) { + var resp response[string] + if err := getResponse(ctx, c, "getHealth", []any{}, &resp); err != nil { + return nil, err + } + return &resp.Result, nil +} diff --git a/pkg/rpc/errors.go b/pkg/rpc/errors.go index 5e3fb9f..552f2e9 100644 --- a/pkg/rpc/errors.go +++ b/pkg/rpc/errors.go @@ -1,5 +1,10 @@ package rpc +import ( + "encoding/json" + "fmt" +) + // error codes: https://github.com/anza-xyz/agave/blob/489f483e1d7b30ef114e0123994818b2accfa389/rpc-client-api/src/custom_error.rs#L17 const ( BlockCleanedUpCode = -32001 @@ -21,3 +26,20 @@ const ( EpochRewardsPeriodActiveCode = -32017 SlotNotEpochBoundaryCode = -32018 ) + +type ( + NodeUnhealthyErrorData struct { + NumSlotsBehind int64 `json:"numSlotsBehind"` + } +) + +func UnpackRpcErrorData[T any](rpcErr *RPCError, formatted T) error { + bytesData, err := json.Marshal(rpcErr.Data) + if err != nil { + return fmt.Errorf("failed to marshal %s rpc-error data: %w", rpcErr.Method, err) + } + if err = json.Unmarshal(bytesData, formatted); err != nil { + return fmt.Errorf("failed to unmarshal %s rpc-error data: %w", rpcErr.Method, err) + } + return nil +} diff --git a/pkg/rpc/responses.go b/pkg/rpc/responses.go index 529a7bf..f587f22 100644 --- a/pkg/rpc/responses.go +++ b/pkg/rpc/responses.go @@ -10,6 +10,8 @@ type ( Message string `json:"message"` Code int64 `json:"code"` Data map[string]any `json:"data"` + // Method is not returned by the RPC, rather added by the client for visibility purposes + Method string } response[T any] struct { @@ -99,7 +101,7 @@ type ( ) func (e *RPCError) Error() string { - return fmt.Sprintf("rpc error (code: %d): %s (data: %v)", e.Code, e.Message, e.Data) + return fmt.Sprintf("%s rpc error (code: %d): %s (data: %v)", e.Method, e.Code, e.Message, e.Data) } func (hp *HostProduction) UnmarshalJSON(data []byte) error {