Update validators and healthcheck endpoint (#5)

This commit is contained in:
Nathaniel Parke 2021-01-11 10:54:17 -08:00 committed by GitHub
parent 48ee54d4f8
commit 2da98313c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 54 additions and 38 deletions

View File

@ -8,12 +8,9 @@ PATH=/home/sol/.local/share/solana/install/active_release/bin:/usr/sbin:/usr/bin
# Parameters from https://docs.solana.com/clusters#mainnet-beta
ENTRYPOINT=mainnet-beta.solana.com:8001
TRUSTED_VALIDATOR_PUBKEYS=(7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2 GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S)
EXPECTED_BANK_HASH=5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d
EXPECTED_BANK_HASH=Fi4p8z3AkfsuGXZzQ4TD28N8QDNSWC7ccqAqTs2GPdPu
EXPECTED_GENESIS_HASH=5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d
EXPECTED_SHRED_VERSION=64864
# NOTE: Check if this is reasonable
RPC_HEALTH_CHECK_SLOT_DISTANCE=15
EXPECTED_SHRED_VERSION=13490
# Delete any zero-length snapshots that can cause validator startup to fail
find /data/sol/ledger/snapshot-* -size 0 -print -exec rm {} \; || true
@ -48,6 +45,7 @@ args=(
--identity "$identity_keypair"
--enable-rpc-transaction-history
--limit-ledger-size 50000000
--health-check-slot-distance 500
--cuda
--rpc-port 8899
--private-rpc
@ -61,10 +59,6 @@ args=(
--wal-recovery-mode skip_any_corrupted_record
)
if [[ -n "$RPC_HEALTH_CHECK_SLOT_DISTANCE" ]]; then
args+=(--health-check-slot-distance "$RPC_HEALTH_CHECK_SLOT_DISTANCE")
fi
# Note: can get into a bad state that requires actually fetching a new snapshot. One such error that indicates this:
# "...processing for bank 0 must succeed: FailedToLoadEntries(InvalidShredData(Custom(\"could not reconstruct entries\")))"
if [[ -d /data/sol/ledger ]]; then

View File

@ -1,10 +1,12 @@
import logging
import socket
import time
import traceback
from functools import wraps
from pathlib import Path
from typing import Union, Tuple, Optional
from typing import Union, Tuple, Optional, Dict
import gevent
import jsonpickle
import requests
from flask import Flask
@ -15,10 +17,16 @@ app = Flask('health.main')
# Module-level logger shared by every handler and helper in this file.
logger = logging.getLogger('health.main')
# Port handed to serve_flask_app() by the __main__ block at the bottom of the file.
PORT = 9090
# Single-endpoint constants read by the blockHeight-based handler lines.
# NOTE(review): ENDPOINTS below repeats both URLs; these two look like the
# pre-change side of the diff -- confirm before relying on them.
TRUSTED_VALIDATOR_ENDPOINT = 'http://vip-api.mainnet-beta.solana.com'
LOCAL_VALIDATOR_ENDPOINT = 'http://localhost:8899'
# Name -> JSON-RPC URL. get_all_slots() queries every entry; the /health
# handler reads the 'local' key as this node and takes the max of the rest.
ENDPOINTS = {
'local': 'http://localhost:8899',
'mainnet': 'http://vip-api.mainnet-beta.solana.com',
'cluster': 'https://solana-api.projectserum.com',
}
# Fallback lag threshold for /health, used when
# load_data_file_locally('unhealthy_block_threshold') returns a falsy value.
UNHEALTHY_BLOCKHEIGHT_DIFF = 15
# NOTE(review): not referenced in the visible lines; presumably the directory
# load_data_file_locally() reads from -- confirm against its body.
DATA_DIR = 'data'
# How long (seconds, compared against time.time()) /health tolerates every
# upstream reporting 0 before it raises.
UPSTREAM_DOWN_TOLERANCE_SECONDS = 30
# time.time() of the most recent /health call that saw a non-zero upstream
# slot. Starts at 0, so the tolerance window is already expired at startup.
_last_successful_trusted_fetch = 0
def serve_flask_app(app: Flask, port: int, allow_remote_connections: bool = False,
@ -52,41 +60,37 @@ def api_endpoint(f):
return wrapped
@app.route('/')
@api_endpoint
def get_status():
    """Return a short greeting that names the host serving the request."""
    hostname = socket.gethostname()
    return f'Hello from {hostname}.'
# GET /status: report the chain position of the local node and its upstreams.
#
# NOTE(review): this span looks like a diff rendering with the +/- markers
# stripped. The two get_epoch_info(...)['result']['blockHeight'] lookups and
# the {'local', 'trusted'} dict appear to be the removed implementation, and
# the trailing `return get_all_slots()` its replacement. Read literally, the
# first `return` makes the second one unreachable -- confirm which side is
# intended before editing the code.
@app.route('/status')
@api_endpoint
def get_validator_status():
local = get_epoch_info(LOCAL_VALIDATOR_ENDPOINT)['result']['blockHeight']
trusted = get_epoch_info(TRUSTED_VALIDATOR_ENDPOINT)['result']['blockHeight']
return {
'local': local,
'trusted': trusted
}
return get_all_slots()
# GET /health: raise when the local validator lags its upstreams by more than
# the configured threshold, otherwise return the observed values.
#
# NOTE(review): this span looks like a diff rendering with the +/- markers
# stripped, so lines from two implementations are interleaved and it is not
# valid Python as shown:
#   * blockHeight-based lines (`local = get_epoch_info(...)`, `trusted = ...`,
#     `diff = trusted - local`, `behind = max(0, diff)`, and the final
#     {'local', 'trusted'} dict) appear to be the removed side;
#   * slot-based lines (`slots = get_all_slots()`, `upstream_height`, the
#     _last_successful_trusted_fetch bookkeeping, `return slots`) appear to be
#     the added side.
# The comments below describe the slot-based lines only.
@app.route('/health')
@api_endpoint
def get_health_status():
local = get_epoch_info(LOCAL_VALIDATOR_ENDPOINT)['result']['blockHeight']
trusted = get_epoch_info(TRUSTED_VALIDATOR_ENDPOINT)['result']['blockHeight']
diff = trusted - local
if diff < 0:
logger.info(f'Local block height is greater than trusted validator. '
global _last_successful_trusted_fetch
slots = get_all_slots()
logger.info(f'slots: {slots}')
local = slots['local']
upstream_height = max([v for k, v in slots.items() if k != 'local'])
# get_slot() returns 0 on any fetch error, so upstream_height == 0 means every
# non-local endpoint failed on this call. That is tolerated until
# UPSTREAM_DOWN_TOLERANCE_SECONDS have passed since the last non-zero reading.
# NOTE(review): _last_successful_trusted_fetch starts at 0, so an all-upstreams
# failure on the first calls after startup raises immediately -- confirm intended.
if upstream_height == 0 and _last_successful_trusted_fetch < time.time() - UPSTREAM_DOWN_TOLERANCE_SECONDS:
raise Exception(
f'Both upstreams have been returning errors for more than {UPSTREAM_DOWN_TOLERANCE_SECONDS} seconds'
)
elif upstream_height > 0:
_last_successful_trusted_fetch = time.time()
behind = upstream_height - local
if behind < 0:
logger.info(f'Local block height is greater than upstreams. '
f'Current block height: {local}, '
f'Trusted block height: {trusted}')
behind = max(0, diff)
f'Upstream block height: {upstream_height}')
# The threshold comes from the 'unhealthy_block_threshold' data file when it
# yields a truthy value, else from UNHEALTHY_BLOCKHEIGHT_DIFF; int() is applied
# because the file helper is annotated as returning Optional[str].
# NOTE(review): inside the tolerance window with upstream_height == 0, `behind`
# is <= 0 and this check passes even if the local fetch also failed (local == 0)
# -- confirm that reporting healthy in that case is acceptable.
unhealthy_blockheight_diff = load_data_file_locally('unhealthy_block_threshold') or UNHEALTHY_BLOCKHEIGHT_DIFF
if behind > int(unhealthy_blockheight_diff):
raise Exception(f'Local validator is behind trusted validator by more than {unhealthy_blockheight_diff} blocks.')
return {
'local': local,
'trusted': trusted
}
return slots
def load_data_file_locally(filename: str, mode='r') -> Optional[str]:
@ -97,19 +101,37 @@ def load_data_file_locally(filename: str, mode='r') -> Optional[str]:
return None
def get_all_slots() -> Dict[str, int]:
    """Fetch the slot reported by every endpoint in ENDPOINTS.

    One greenlet running get_slot() is spawned per endpoint before any result
    is collected, so no request waits for another to be started.

    Returns:
        A dict keyed like ENDPOINTS whose values are whatever get_slot()
        returned for that URL (0 when the fetch failed).
    """
    # Start every fetch first...
    pending = {}
    for name, url in ENDPOINTS.items():
        pending[name] = gevent.spawn(get_slot, url)

    # ...then wait on each greenlet in turn and gather its return value.
    slots = {}
    for name, greenlet in pending.items():
        slots[name] = greenlet.get()
    return slots
def get_slot(url: str) -> int:
    """Return the 'absoluteSlot' reported by the RPC node at *url*.

    This is deliberately best-effort: any exception raised while fetching or
    unpacking the response is logged at INFO level and reported as 0, which
    callers treat as "endpoint unavailable".

    Args:
        url: JSON-RPC endpoint passed straight to get_epoch_info().

    Returns:
        The value under ['result']['absoluteSlot'], or 0 on any error.
    """
    try:
        epoch_info = get_epoch_info(url)
        slot = epoch_info['result']['absoluteSlot']
    except Exception as err:
        logger.info(f'Received error fetching blockheight from {url}')
        logger.info(err)
        slot = 0
    return slot
# POST a JSON-RPC 'getEpochInfo' request to `url` and return the decoded JSON
# body. Callers in this file read ['result']['absoluteSlot'] or
# ['result']['blockHeight'] from it.
#
# NOTE(review): this span looks like a diff rendering with the +/- markers
# stripped. The `headers={...}` argument and the one-line
# `json={"jsonrpc":"2.0", ...}` appear to be the removed side; the multi-line
# `json={...}` with a 'single' commitment and `timeout=1` appear to be the
# added side. As shown the call is not valid Python (the headers dict is never
# closed and `json=` is passed twice) -- resolve the diff before editing.
# NOTE(review): confirm that the target RPC nodes still accept the 'single'
# commitment level.
def get_epoch_info(url: str):
res = requests.post(
url,
headers={
'Content-Type': 'application/json'
json={
'jsonrpc': '2.0',
'id': 1,
'method': 'getEpochInfo',
'params': [{'commitment': 'single'}],
},
json={"jsonrpc":"2.0", "id":1, "method":"getEpochInfo", "params":[]}
timeout=1,
)
# Turn HTTP error statuses into exceptions; get_slot() maps those to 0.
res.raise_for_status()
return res.json()
if __name__ == '__main__':
    # Script entry point: emit INFO-level logs, then start serving the Flask
    # app on PORT with remote connections and multiple listeners allowed.
    logging.basicConfig(level=logging.INFO)
    serve_flask_app(
        app,
        PORT,
        allow_remote_connections=True,
        allow_multiple_listeners=True,
    )