Add timeout for getting bootstrap RPC peers (#30277)

* Add timeout for getting bootstrap RPC peers

* Get RPC return errors when no peers found

* Add config guidance
This commit is contained in:
Brennan 2023-02-15 14:24:42 -08:00 committed by GitHub
parent 615d1a8b69
commit ba43883cce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 12 deletions

1
Cargo.lock generated
View File

@ -6985,6 +6985,7 @@ dependencies = [
"solana-vote-program",
"spl-token-2022",
"symlink",
"thiserror",
"tikv-jemallocator",
]

View File

@ -6191,6 +6191,7 @@ dependencies = [
"solana-version",
"solana-vote-program",
"symlink",
"thiserror",
"tikv-jemallocator",
]

View File

@ -58,6 +58,7 @@ solana-tpu-client = { path = "../tpu-client", version = "=1.16.0", default-featu
solana-version = { path = "../version", version = "=1.16.0" }
solana-vote-program = { path = "../programs/vote", version = "=1.16.0" }
symlink = "0.1.0"
thiserror = "1.0"
[dev-dependencies]
solana-account-decoder = { path = "../account-decoder", version = "=1.16.0" }

View File

@ -36,6 +36,7 @@ use {
},
time::{Duration, Instant},
},
thiserror::Error,
};
/// When downloading snapshots, wait at most this long for snapshot hashes from
@ -48,6 +49,8 @@ const BLACKLIST_CLEAR_THRESHOLD: Duration = Duration::from_secs(60);
/// If we can't find a good snapshot download candidate after this time, just
/// give up.
const NEWER_SNAPSHOT_THRESHOLD: Duration = Duration::from_secs(180);
/// If we haven't found any RPC peers after this time, just give up.
const GET_RPC_PEERS_TIMEOUT: Duration = Duration::from_secs(300);
pub const MAX_RPC_CONNECTIONS_EVALUATED_PER_ITERATION: usize = 32;
@ -316,6 +319,15 @@ fn check_vote_account(
Ok(())
}
#[derive(Error, Debug)]
pub enum GetRpcNodeError {
#[error("Unable to find any RPC peers")]
NoRpcPeersFound,
#[error("Giving up, did not get newer snapshots from the cluster")]
NoNewerSnapshots,
}
/// Struct to wrap the return value from get_rpc_nodes(). The `rpc_contact_info` is the peer to
/// download from, and `snapshot_hash` is the (optional) full and (optional) incremental
/// snapshots to download.
@ -526,18 +538,25 @@ pub fn rpc_bootstrap(
}
while vetted_rpc_nodes.is_empty() {
let rpc_node_details_vec = get_rpc_nodes(
let rpc_node_details = match get_rpc_nodes(
&gossip.as_ref().unwrap().0,
cluster_entrypoints,
validator_config,
&mut blacklisted_rpc_nodes.write().unwrap(),
&bootstrap_config,
);
if rpc_node_details_vec.is_empty() {
return;
}
) {
Ok(rpc_node_details) => rpc_node_details,
Err(err) => {
error!(
"Failed to get RPC nodes: {err}. Consider checking system \
clock, removing `--no-port-check`, or adjusting \
`--known-validator ...` arguments as applicable"
);
exit(1);
}
};
vetted_rpc_nodes = rpc_node_details_vec
vetted_rpc_nodes = rpc_node_details
.into_par_iter()
.map(|rpc_node_details| {
let GetRpcNodeResult {
@ -626,8 +645,9 @@ fn get_rpc_nodes(
validator_config: &ValidatorConfig,
blacklisted_rpc_nodes: &mut HashSet<Pubkey>,
bootstrap_config: &RpcBootstrapConfig,
) -> Vec<GetRpcNodeResult> {
) -> Result<Vec<GetRpcNodeResult>, GetRpcNodeError> {
let mut blacklist_timeout = Instant::now();
let mut get_rpc_peers_timout = Instant::now();
let mut newer_cluster_snapshot_timeout = None;
let mut retry_reason = None;
loop {
@ -645,16 +665,21 @@ fn get_rpc_nodes(
bootstrap_config,
);
if rpc_peers.is_empty() {
if get_rpc_peers_timout.elapsed() > GET_RPC_PEERS_TIMEOUT {
return Err(GetRpcNodeError::NoRpcPeersFound);
}
continue;
}
// Reset timeouts if we found any viable RPC peers.
blacklist_timeout = Instant::now();
get_rpc_peers_timout = Instant::now();
if bootstrap_config.no_snapshot_fetch {
let random_peer = &rpc_peers[thread_rng().gen_range(0, rpc_peers.len())];
return vec![GetRpcNodeResult {
return Ok(vec![GetRpcNodeResult {
rpc_contact_info: random_peer.clone(),
snapshot_hash: None,
}];
}]);
}
let known_validators_to_wait_for = if newer_cluster_snapshot_timeout
@ -678,8 +703,7 @@ fn get_rpc_nodes(
None => newer_cluster_snapshot_timeout = Some(Instant::now()),
Some(newer_cluster_snapshot_timeout) => {
if newer_cluster_snapshot_timeout.elapsed() > NEWER_SNAPSHOT_THRESHOLD {
warn!("Giving up, did not get newer snapshots from the cluster.");
return vec![];
return Err(GetRpcNodeError::NoNewerSnapshots);
}
}
}
@ -709,7 +733,7 @@ fn get_rpc_nodes(
})
.take(MAX_RPC_CONNECTIONS_EVALUATED_PER_ITERATION)
.collect();
return rpc_node_results;
return Ok(rpc_node_results);
}
}
}