From f116cdeed9f895559ade7f4a12b33e50dd765e78 Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Wed, 13 Nov 2019 15:58:14 -0700 Subject: [PATCH] Add validator catchup command (#6922) --- Cargo.lock | 1 + .../running-validator/validator-monitor.md | 37 ++++---- cli/Cargo.toml | 1 + cli/src/cli.rs | 9 +- cli/src/cluster_query.rs | 94 ++++++++++++++++++- client/src/rpc_client.rs | 21 ++++- net/remote/remote-node.sh | 5 +- 7 files changed, 141 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0bd872b3b7..b892060492 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3199,6 +3199,7 @@ dependencies = [ "criterion-stats 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "ctrlc 3.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "dirs 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "indicatif 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", "num-traits 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/book/src/running-validator/validator-monitor.md b/book/src/running-validator/validator-monitor.md index bc35aeea42..205ad18225 100644 --- a/book/src/running-validator/validator-monitor.md +++ b/book/src/running-validator/validator-monitor.md @@ -14,46 +14,40 @@ From another console, confirm the IP address and **identity pubkey** of your val solana-gossip --entrypoint testnet.solana.com:8001 spy ``` -## Check Vote Activity +## Monitoring Catch Up -The vote pubkey for the validator can be found by running: +It may take some time to catch up with the cluster after your validator boots. 
+Use the `catchup` command to monitor your validator through this process: ```bash -solana-keygen pubkey ~/validator-vote-keypair.json +solana catchup ~/validator-keypair.json ``` -Provide the **vote pubkey** to the `solana show-vote-account` command to view the recent voting activity from your validator: +Until your validator has caught up, it will not be able to vote successfully and +stake cannot be delegated to it. -```bash -solana show-vote-account 2ozWvfaXQd1X6uKh8jERoRGApDqSqcEy6fF1oN13LL2G -``` +Also if you find the cluster's slot advancing faster than yours, you will likely +never catch up. This typically implies some kind of networking issue between +your validator and the rest of the cluster. ## Check Your Balance -Your account balance should decrease by the transaction fee amount as your validator submits votes, and increase after serving as the leader. Pass the `--lamports` are to observe in finer detail: +Your account balance should decrease by the transaction fee amount as your +validator submits votes, and increase after serving as the leader. Pass the +`--lamports` argument to observe in finer detail: ```bash solana balance --lamports ``` -## Check Slot Number +## Check Vote Activity -After your validator boots, it may take some time to catch up with the cluster. Use the `get-slot` command to view the current slot that the cluster is processing: +The `solana show-vote-account` command displays the recent voting activity from your validator: ```bash -solana get-slot +solana show-vote-account ~/validator-vote-keypair.json ``` -The current slot that your validator is processing can then been seen with: - -```bash -solana --url http://127.0.0.1:8899 get-slot -``` - -Until your validator has caught up, it will not be able to vote successfully and stake cannot be delegated to it. - -Also if you find the cluster's slot advancing faster than yours, you will likely never catch up. 
This typically implies some kind of networking issue between your validator and the rest of the cluster. - ## Get Cluster Info There are several useful JSON-RPC endpoints for monitoring your validator on the cluster, as well as the health of the cluster: @@ -69,6 +63,7 @@ curl -X POST -H "Content-Type: application/json" -d '{"jsonrpc":"2.0","id":1, "m curl -X POST -H "Content-Type: application/json" -d '{"jsonrpc":"2.0","id":1, "method":"getEpochInfo"}' http://testnet.solana.com:8899 ``` + ## Validator Metrics Metrics are available for local monitoring of your validator. diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 9eaee3e74c..bdc4a9dcb1 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -19,6 +19,7 @@ console = "0.9.1" dirs = "2.0.2" lazy_static = "1.4.0" log = "0.4.8" +indicatif = "0.13.0" num-traits = "0.2" pretty-hex = "0.1.1" reqwest = { version = "0.9.22", default-features = false, features = ["rustls-tls"] } diff --git a/cli/src/cli.rs b/cli/src/cli.rs index 1b1b5503a2..82000bc735 100644 --- a/cli/src/cli.rs +++ b/cli/src/cli.rs @@ -6,7 +6,6 @@ use clap::{App, AppSettings, Arg, ArgMatches, SubCommand}; use log::*; use num_traits::FromPrimitive; use serde_json::{self, json, Value}; - use solana_budget_api::budget_instruction::{self, BudgetError}; use solana_clap_utils::{input_parsers::*, input_validators::*}; use solana_client::{client_error::ClientError, rpc_client::RpcClient}; @@ -33,7 +32,6 @@ use solana_sdk::{ use solana_stake_api::stake_state::{Lockup, StakeAuthorize}; use solana_storage_api::storage_instruction::StorageAccountType; use solana_vote_api::vote_state::VoteAuthorize; - use std::{ fs::File, io::{Read, Write}, @@ -71,6 +69,9 @@ impl std::ops::Deref for KeypairEq { #[allow(clippy::large_enum_variant)] pub enum CliCommand { // Cluster Query Commands + Catchup { + node_pubkey: Pubkey, + }, ClusterVersion, Fees, GetEpochInfo, @@ -237,6 +238,7 @@ impl Default for CliConfig { pub fn parse_command(matches: &ArgMatches<'_>) -> Result> { let 
response = match matches.subcommand() { // Cluster Query Commands + ("catchup", Some(matches)) => parse_catchup(matches), ("cluster-version", Some(_matches)) => Ok(CliCommandInfo { command: CliCommand::ClusterVersion, require_keypair: false, @@ -849,7 +851,8 @@ pub fn process_command(config: &CliConfig) -> ProcessResult { // Cluster Query Commands // Return software version of solana-cli and cluster entrypoint node - CliCommand::ClusterVersion => process_cluster_version(&rpc_client, config), + CliCommand::Catchup { node_pubkey } => process_catchup(&rpc_client, node_pubkey), + CliCommand::ClusterVersion => process_cluster_version(&rpc_client), CliCommand::Fees => process_fees(&rpc_client), CliCommand::GetGenesisHash => process_get_genesis_hash(&rpc_client), CliCommand::GetSlot => process_get_slot(&rpc_client), diff --git a/cli/src/cluster_query.rs b/cli/src/cluster_query.rs index b833a08121..4c442f68fb 100644 --- a/cli/src/cluster_query.rs +++ b/cli/src/cluster_query.rs @@ -7,16 +7,20 @@ use crate::{ }; use clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand}; use console::{style, Emoji}; +use indicatif::{ProgressBar, ProgressStyle}; +use solana_clap_utils::{input_parsers::*, input_validators::*}; use solana_client::{rpc_client::RpcClient, rpc_request::RpcVoteAccountInfo}; use solana_sdk::{ clock, commitment_config::CommitmentConfig, hash::Hash, + pubkey::Pubkey, signature::{Keypair, KeypairUtil}, system_transaction, }; use std::{ collections::VecDeque, + thread::sleep, time::{Duration, Instant}, }; @@ -31,6 +35,19 @@ pub trait ClusterQuerySubCommands { impl ClusterQuerySubCommands for App<'_, '_> { fn cluster_query_subcommands(self) -> Self { self.subcommand( + SubCommand::with_name("catchup") + .about("Wait for a validator to catch up to the cluster") + .arg( + Arg::with_name("node_pubkey") + .index(1) + .takes_value(true) + .value_name("PUBKEY") + .validator(is_pubkey_or_keypair) + .required(true) + .help("Identity pubkey of the validator"), + ), + ) + 
.subcommand( SubCommand::with_name("cluster-version") .about("Get the version of the cluster entrypoint"), ) @@ -97,6 +114,14 @@ impl ClusterQuerySubCommands for App<'_, '_> { } } +pub fn parse_catchup(matches: &ArgMatches<'_>) -> Result { + let node_pubkey = pubkey_of(matches, "node_pubkey").unwrap(); + Ok(CliCommandInfo { + command: CliCommand::Catchup { node_pubkey }, + require_keypair: false, + }) +} + pub fn parse_cluster_ping(matches: &ArgMatches<'_>) -> Result { let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64)); let count = if matches.is_present("count") { @@ -130,7 +155,74 @@ pub fn parse_show_validators(matches: &ArgMatches<'_>) -> Result ProcessResult { +/// Creates a new process bar for processing that will take an unknown amount of time +fn new_spinner_progress_bar() -> ProgressBar { + let progress_bar = ProgressBar::new(42); + progress_bar + .set_style(ProgressStyle::default_spinner().template("{spinner:.green} {wide_msg}")); + progress_bar.enable_steady_tick(100); + progress_bar +} + +pub fn process_catchup(rpc_client: &RpcClient, node_pubkey: &Pubkey) -> ProcessResult { + let cluster_nodes = rpc_client.get_cluster_nodes()?; + + let rpc_addr = cluster_nodes + .iter() + .find(|contact_info| contact_info.pubkey == node_pubkey.to_string()) + .ok_or_else(|| format!("Contact information not found for {}", node_pubkey))? 
+ .rpc + .ok_or_else(|| format!("RPC service not found for {}", node_pubkey))?; + + let progress_bar = new_spinner_progress_bar(); + progress_bar.set_message("Connecting..."); + + let node_client = RpcClient::new_socket(rpc_addr); + let mut previous_rpc_slot = std::u64::MAX; + let mut previous_slot_distance = 0; + let sleep_interval = 5; + loop { + let rpc_slot = rpc_client.get_slot_with_commitment(CommitmentConfig::recent())?; + let node_slot = node_client.get_slot_with_commitment(CommitmentConfig::recent())?; + if node_slot > std::cmp::min(previous_rpc_slot, rpc_slot) { + progress_bar.finish_and_clear(); + return Ok(format!( + "{} has caught up (us:{} them:{})", + node_pubkey, node_slot, rpc_slot, + )); + } + + let slot_distance = rpc_slot as i64 - node_slot as i64; + progress_bar.set_message(&format!( + "Validator is {} slots away (us:{} them:{}){}", + slot_distance, + node_slot, + rpc_slot, + if previous_rpc_slot == std::u64::MAX { + "".to_string() + } else { + let slots_per_second = + (previous_slot_distance - slot_distance) as f64 / f64::from(sleep_interval); + + format!( + " and {} at {:.1} slots/second", + if slots_per_second < 0.0 { + "falling behind" + } else { + "gaining" + }, + slots_per_second, + ) + } + )); + + sleep(Duration::from_secs(sleep_interval as u64)); + previous_rpc_slot = rpc_slot; + previous_slot_distance = slot_distance; + } +} + +pub fn process_cluster_version(rpc_client: &RpcClient) -> ProcessResult { let remote_version = rpc_client.get_version()?; Ok(remote_version.solana_core) } diff --git a/client/src/rpc_client.rs b/client/src/rpc_client.rs index 768b444cb0..6809a6fcb1 100644 --- a/client/src/rpc_client.rs +++ b/client/src/rpc_client.rs @@ -4,7 +4,7 @@ use crate::{ generic_rpc_client_request::GenericRpcClientRequest, mock_rpc_client_request::MockRpcClientRequest, rpc_client_request::RpcClientRequest, - rpc_request::{RpcEpochInfo, RpcRequest, RpcVersionInfo, RpcVoteAccountStatus}, + rpc_request::{RpcContactInfo, RpcEpochInfo, 
RpcRequest, RpcVersionInfo, RpcVoteAccountStatus}, }; use bincode::serialize; use log::*; @@ -177,6 +177,25 @@ impl RpcClient { }) } + pub fn get_cluster_nodes(&self) -> io::Result> { + let response = self + .client + .send(&RpcRequest::GetClusterNodes, None, 0, None) + .map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("GetClusterNodes request failure: {:?}", err), + ) + })?; + + serde_json::from_value(response).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("GetClusterNodes parse failure: {}", err), + ) + }) + } + pub fn get_epoch_info(&self) -> io::Result { let response = self .client diff --git a/net/remote/remote-node.sh b/net/remote/remote-node.sh index 14ae3be233..c355fac241 100755 --- a/net/remote/remote-node.sh +++ b/net/remote/remote-node.sh @@ -378,9 +378,12 @@ EOF waitForNodeToInit if [[ $skipSetup != true && $nodeType != blockstreamer ]]; then + # Wait for the validator to catch up to the bootstrap leader before + # delegating stake to it + solana --url http://"$entrypointIp":8899 catchup config/validator-identity.json + args=( --url http://"$entrypointIp":8899 - --force "$stake" ) if [[ $airdropsEnabled != true ]]; then