Add timeout for getting bootstrap RPC peers (#30277)
* Add timeout for getting bootstrap RPC peers * Get RPC return errors when no peers found * Add config guidance
This commit is contained in:
parent
615d1a8b69
commit
ba43883cce
|
@ -6985,6 +6985,7 @@ dependencies = [
|
|||
"solana-vote-program",
|
||||
"spl-token-2022",
|
||||
"symlink",
|
||||
"thiserror",
|
||||
"tikv-jemallocator",
|
||||
]
|
||||
|
||||
|
|
|
@ -6191,6 +6191,7 @@ dependencies = [
|
|||
"solana-version",
|
||||
"solana-vote-program",
|
||||
"symlink",
|
||||
"thiserror",
|
||||
"tikv-jemallocator",
|
||||
]
|
||||
|
||||
|
|
|
@ -58,6 +58,7 @@ solana-tpu-client = { path = "../tpu-client", version = "=1.16.0", default-featu
|
|||
solana-version = { path = "../version", version = "=1.16.0" }
|
||||
solana-vote-program = { path = "../programs/vote", version = "=1.16.0" }
|
||||
symlink = "0.1.0"
|
||||
thiserror = "1.0"
|
||||
|
||||
[dev-dependencies]
|
||||
solana-account-decoder = { path = "../account-decoder", version = "=1.16.0" }
|
||||
|
|
|
@ -36,6 +36,7 @@ use {
|
|||
},
|
||||
time::{Duration, Instant},
|
||||
},
|
||||
thiserror::Error,
|
||||
};
|
||||
|
||||
/// When downloading snapshots, wait at most this long for snapshot hashes from
|
||||
|
@ -48,6 +49,8 @@ const BLACKLIST_CLEAR_THRESHOLD: Duration = Duration::from_secs(60);
|
|||
/// If we can't find a good snapshot download candidate after this time, just
|
||||
/// give up.
|
||||
const NEWER_SNAPSHOT_THRESHOLD: Duration = Duration::from_secs(180);
|
||||
/// If we haven't found any RPC peers after this time, just give up.
|
||||
const GET_RPC_PEERS_TIMEOUT: Duration = Duration::from_secs(300);
|
||||
|
||||
pub const MAX_RPC_CONNECTIONS_EVALUATED_PER_ITERATION: usize = 32;
|
||||
|
||||
|
@ -316,6 +319,15 @@ fn check_vote_account(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum GetRpcNodeError {
|
||||
#[error("Unable to find any RPC peers")]
|
||||
NoRpcPeersFound,
|
||||
|
||||
#[error("Giving up, did not get newer snapshots from the cluster")]
|
||||
NoNewerSnapshots,
|
||||
}
|
||||
|
||||
/// Struct to wrap the return value from get_rpc_nodes(). The `rpc_contact_info` is the peer to
|
||||
/// download from, and `snapshot_hash` is the (optional) full and (optional) incremental
|
||||
/// snapshots to download.
|
||||
|
@ -526,18 +538,25 @@ pub fn rpc_bootstrap(
|
|||
}
|
||||
|
||||
while vetted_rpc_nodes.is_empty() {
|
||||
let rpc_node_details_vec = get_rpc_nodes(
|
||||
let rpc_node_details = match get_rpc_nodes(
|
||||
&gossip.as_ref().unwrap().0,
|
||||
cluster_entrypoints,
|
||||
validator_config,
|
||||
&mut blacklisted_rpc_nodes.write().unwrap(),
|
||||
&bootstrap_config,
|
||||
);
|
||||
if rpc_node_details_vec.is_empty() {
|
||||
return;
|
||||
}
|
||||
) {
|
||||
Ok(rpc_node_details) => rpc_node_details,
|
||||
Err(err) => {
|
||||
error!(
|
||||
"Failed to get RPC nodes: {err}. Consider checking system \
|
||||
clock, removing `--no-port-check`, or adjusting \
|
||||
`--known-validator ...` arguments as applicable"
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
vetted_rpc_nodes = rpc_node_details_vec
|
||||
vetted_rpc_nodes = rpc_node_details
|
||||
.into_par_iter()
|
||||
.map(|rpc_node_details| {
|
||||
let GetRpcNodeResult {
|
||||
|
@ -626,8 +645,9 @@ fn get_rpc_nodes(
|
|||
validator_config: &ValidatorConfig,
|
||||
blacklisted_rpc_nodes: &mut HashSet<Pubkey>,
|
||||
bootstrap_config: &RpcBootstrapConfig,
|
||||
) -> Vec<GetRpcNodeResult> {
|
||||
) -> Result<Vec<GetRpcNodeResult>, GetRpcNodeError> {
|
||||
let mut blacklist_timeout = Instant::now();
|
||||
let mut get_rpc_peers_timout = Instant::now();
|
||||
let mut newer_cluster_snapshot_timeout = None;
|
||||
let mut retry_reason = None;
|
||||
loop {
|
||||
|
@ -645,16 +665,21 @@ fn get_rpc_nodes(
|
|||
bootstrap_config,
|
||||
);
|
||||
if rpc_peers.is_empty() {
|
||||
if get_rpc_peers_timout.elapsed() > GET_RPC_PEERS_TIMEOUT {
|
||||
return Err(GetRpcNodeError::NoRpcPeersFound);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Reset timeouts if we found any viable RPC peers.
|
||||
blacklist_timeout = Instant::now();
|
||||
get_rpc_peers_timout = Instant::now();
|
||||
if bootstrap_config.no_snapshot_fetch {
|
||||
let random_peer = &rpc_peers[thread_rng().gen_range(0, rpc_peers.len())];
|
||||
return vec![GetRpcNodeResult {
|
||||
return Ok(vec![GetRpcNodeResult {
|
||||
rpc_contact_info: random_peer.clone(),
|
||||
snapshot_hash: None,
|
||||
}];
|
||||
}]);
|
||||
}
|
||||
|
||||
let known_validators_to_wait_for = if newer_cluster_snapshot_timeout
|
||||
|
@ -678,8 +703,7 @@ fn get_rpc_nodes(
|
|||
None => newer_cluster_snapshot_timeout = Some(Instant::now()),
|
||||
Some(newer_cluster_snapshot_timeout) => {
|
||||
if newer_cluster_snapshot_timeout.elapsed() > NEWER_SNAPSHOT_THRESHOLD {
|
||||
warn!("Giving up, did not get newer snapshots from the cluster.");
|
||||
return vec![];
|
||||
return Err(GetRpcNodeError::NoNewerSnapshots);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -709,7 +733,7 @@ fn get_rpc_nodes(
|
|||
})
|
||||
.take(MAX_RPC_CONNECTIONS_EVALUATED_PER_ITERATION)
|
||||
.collect();
|
||||
return rpc_node_results;
|
||||
return Ok(rpc_node_results);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue