diff --git a/Cargo.lock b/Cargo.lock index f1d41aea49..d69dc92082 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6985,6 +6985,7 @@ dependencies = [ "solana-vote-program", "spl-token-2022", "symlink", + "thiserror", "tikv-jemallocator", ] diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index ced6b20375..c9cbef28f7 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -6191,6 +6191,7 @@ dependencies = [ "solana-version", "solana-vote-program", "symlink", + "thiserror", "tikv-jemallocator", ] diff --git a/validator/Cargo.toml b/validator/Cargo.toml index 44c836e3bd..0cf6cf13dd 100644 --- a/validator/Cargo.toml +++ b/validator/Cargo.toml @@ -58,6 +58,7 @@ solana-tpu-client = { path = "../tpu-client", version = "=1.16.0", default-featu solana-version = { path = "../version", version = "=1.16.0" } solana-vote-program = { path = "../programs/vote", version = "=1.16.0" } symlink = "0.1.0" +thiserror = "1.0" [dev-dependencies] solana-account-decoder = { path = "../account-decoder", version = "=1.16.0" } diff --git a/validator/src/bootstrap.rs b/validator/src/bootstrap.rs index d5ef543760..e0bb70826c 100644 --- a/validator/src/bootstrap.rs +++ b/validator/src/bootstrap.rs @@ -36,6 +36,7 @@ use { }, time::{Duration, Instant}, }, + thiserror::Error, }; /// When downloading snapshots, wait at most this long for snapshot hashes from @@ -48,6 +49,8 @@ const BLACKLIST_CLEAR_THRESHOLD: Duration = Duration::from_secs(60); /// If we can't find a good snapshot download candidate after this time, just /// give up. const NEWER_SNAPSHOT_THRESHOLD: Duration = Duration::from_secs(180); +/// If we haven't found any RPC peers after this time, just give up. 
+const GET_RPC_PEERS_TIMEOUT: Duration = Duration::from_secs(300); pub const MAX_RPC_CONNECTIONS_EVALUATED_PER_ITERATION: usize = 32; @@ -316,6 +319,15 @@ fn check_vote_account( Ok(()) } +#[derive(Error, Debug)] +pub enum GetRpcNodeError { + #[error("Unable to find any RPC peers")] + NoRpcPeersFound, + + #[error("Giving up, did not get newer snapshots from the cluster")] + NoNewerSnapshots, +} + /// Struct to wrap the return value from get_rpc_nodes(). The `rpc_contact_info` is the peer to /// download from, and `snapshot_hash` is the (optional) full and (optional) incremental /// snapshots to download. @@ -526,18 +538,25 @@ pub fn rpc_bootstrap( } while vetted_rpc_nodes.is_empty() { - let rpc_node_details_vec = get_rpc_nodes( + let rpc_node_details = match get_rpc_nodes( &gossip.as_ref().unwrap().0, cluster_entrypoints, validator_config, &mut blacklisted_rpc_nodes.write().unwrap(), &bootstrap_config, - ); - if rpc_node_details_vec.is_empty() { - return; - } + ) { + Ok(rpc_node_details) => rpc_node_details, + Err(err) => { + error!( + "Failed to get RPC nodes: {err}. 
Consider checking system \ + clock, removing `--no-port-check`, or adjusting \ + `--known-validator ...` arguments as applicable" + ); + exit(1); + } + }; - vetted_rpc_nodes = rpc_node_details_vec + vetted_rpc_nodes = rpc_node_details .into_par_iter() .map(|rpc_node_details| { let GetRpcNodeResult { @@ -626,8 +645,9 @@ fn get_rpc_nodes( validator_config: &ValidatorConfig, blacklisted_rpc_nodes: &mut HashSet, bootstrap_config: &RpcBootstrapConfig, -) -> Vec<GetRpcNodeResult> { +) -> Result<Vec<GetRpcNodeResult>, GetRpcNodeError> { let mut blacklist_timeout = Instant::now(); + let mut get_rpc_peers_timeout = Instant::now(); let mut newer_cluster_snapshot_timeout = None; let mut retry_reason = None; loop { @@ -645,16 +665,21 @@ fn get_rpc_nodes( bootstrap_config, ); if rpc_peers.is_empty() { + if get_rpc_peers_timeout.elapsed() > GET_RPC_PEERS_TIMEOUT { + return Err(GetRpcNodeError::NoRpcPeersFound); + } continue; } + // Reset timeouts if we found any viable RPC peers. blacklist_timeout = Instant::now(); + get_rpc_peers_timeout = Instant::now(); if bootstrap_config.no_snapshot_fetch { let random_peer = &rpc_peers[thread_rng().gen_range(0, rpc_peers.len())]; - return vec![GetRpcNodeResult { + return Ok(vec![GetRpcNodeResult { rpc_contact_info: random_peer.clone(), snapshot_hash: None, - }]; + }]); } let known_validators_to_wait_for = if newer_cluster_snapshot_timeout @@ -678,8 +703,7 @@ fn get_rpc_nodes( None => newer_cluster_snapshot_timeout = Some(Instant::now()), Some(newer_cluster_snapshot_timeout) => { if newer_cluster_snapshot_timeout.elapsed() > NEWER_SNAPSHOT_THRESHOLD { - warn!("Giving up, did not get newer snapshots from the cluster."); - return vec![]; + return Err(GetRpcNodeError::NoNewerSnapshots); } } } @@ -709,7 +733,7 @@ fn get_rpc_nodes( }) .take(MAX_RPC_CONNECTIONS_EVALUATED_PER_ITERATION) .collect(); - return rpc_node_results; + return Ok(rpc_node_results); } } }