Replicator timeout (#2480)

* Add timeout to Replicator::new; used when polling for leader
* Add timeout functionality to replicator ledger download
  Shares the same timeout as polling for leader
  Defaults to 30 seconds
* Add docs for Replicator::new

parent 6e8b69fc88
commit f37eb533f1
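The visible API change: Replicator::new gains a fifth argument, timeout: Option<Duration>. A minimal sketch of the two call shapes, using the argument names from the call sites in this diff (None falls back to the 30-second default):

    use std::time::Duration;

    // Accept the built-in 30-second default:
    let replicator =
        Replicator::new(ledger_path, node, &leader_info, &keypair, None).unwrap();

    // Or bound leader polling and ledger download explicitly:
    let replicator = Replicator::new(
        ledger_path,
        node,
        &leader_info,
        &keypair,
        Some(Duration::from_secs(3)),
    )
    .unwrap();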
@@ -82,7 +82,7 @@ fn main() {
     let leader_info = NodeInfo::new_entry_point(&network_addr);
 
-    let replicator = Replicator::new(ledger_path, node, &leader_info, &keypair).unwrap();
+    let replicator = Replicator::new(ledger_path, node, &leader_info, &keypair, None).unwrap();
 
     replicator.join();
 }
 
@@ -6,7 +6,7 @@ use crate::cluster_info::{ClusterInfo, Node, NodeInfo};
 use crate::db_ledger::DbLedger;
 use crate::gossip_service::GossipService;
 use crate::leader_scheduler::LeaderScheduler;
-use crate::result::Result;
+use crate::result::{self, Result};
 use crate::rpc_request::{RpcClient, RpcRequest, RpcRequestHandler};
 use crate::service::Service;
 use crate::storage_stage::{get_segment_from_entry, ENTRIES_PER_SEGMENT};
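The switch from `use crate::result::Result;` to `use crate::result::{self, Result};` brings the result module itself into scope, so the timeout paths added below can construct result::Error::IO(...) values directly.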
@@ -35,7 +35,7 @@ use std::sync::mpsc::channel;
 use std::sync::{Arc, RwLock};
 use std::thread::sleep;
 use std::thread::JoinHandle;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 
 pub struct Replicator {
     gossip_service: GossipService,
@@ -99,15 +99,27 @@ fn get_entry_heights_from_last_id(
 }
 
 impl Replicator {
+    /// Returns a Result that contains a replicator on success
+    ///
+    /// # Arguments
+    /// * `ledger_path` - (Not actually optional) path to where the ledger will be stored.
+    ///   Causes panic if none
+    /// * `node` - The replicator node
+    /// * `leader_info` - NodeInfo representing the leader
+    /// * `keypair` - Keypair for this replicator
+    /// * `timeout` - (optional) timeout for polling for leader/downloading the ledger. Defaults to
+    ///   30 seconds
     #[allow(clippy::new_ret_no_self)]
     pub fn new(
         ledger_path: Option<&str>,
         node: Node,
         leader_info: &NodeInfo,
         keypair: &Keypair,
+        timeout: Option<Duration>,
     ) -> Result<Self> {
         let exit = Arc::new(AtomicBool::new(false));
         let done = Arc::new(AtomicBool::new(false));
+        let timeout = timeout.unwrap_or_else(|| Duration::new(30, 0));
 
         info!("Replicator: id: {}", keypair.pubkey());
         info!("Creating cluster info....");
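The Option<Duration> is resolved once at the top of new; Duration::new(30, 0) is 30 seconds, 0 nanoseconds. The same defaulting pattern in isolation (a sketch; resolve_timeout is a hypothetical helper, not part of this diff):

    use std::time::Duration;

    // Hypothetical helper mirroring the unwrap_or_else line above.
    fn resolve_timeout(timeout: Option<Duration>) -> Duration {
        timeout.unwrap_or_else(|| Duration::new(30, 0))
    }

    fn main() {
        assert_eq!(resolve_timeout(None), Duration::from_secs(30));
        assert_eq!(resolve_timeout(Some(Duration::from_secs(3))), Duration::from_secs(3));
    }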
@@ -138,7 +150,7 @@ impl Replicator {
         );
 
         info!("polling for leader");
-        let leader = Self::poll_for_leader(&cluster_info)?;
+        let leader = Self::poll_for_leader(&cluster_info, timeout)?;
 
         info!("Got leader: {:?}", leader);
 
@@ -161,6 +173,8 @@ impl Replicator {
         // todo: pull blobs off the retransmit_receiver and recycle them?
         let (retransmit_sender, retransmit_receiver) = channel();
 
+        let (entry_sender, entry_receiver) = channel();
+
         let t_window = window_service(
             db_ledger.clone(),
             cluster_info.clone(),
@@ -168,7 +182,7 @@ impl Replicator {
             entry_height,
             max_entry_height,
             blob_fetch_receiver,
-            None,
+            Some(entry_sender),
             retransmit_sender,
             repair_socket,
             Arc::new(RwLock::new(LeaderScheduler::from_bootstrap_leader(
|
||||||
);
|
);
|
||||||
|
|
||||||
info!("window created, waiting for ledger download done");
|
info!("window created, waiting for ledger download done");
|
||||||
|
let start = Instant::now();
|
||||||
|
let mut received_so_far = 0;
|
||||||
|
|
||||||
while !done.load(Ordering::Relaxed) {
|
while !done.load(Ordering::Relaxed) {
|
||||||
sleep(Duration::from_millis(100));
|
sleep(Duration::from_millis(100));
|
||||||
|
|
||||||
|
let elapsed = start.elapsed();
|
||||||
|
received_so_far += entry_receiver.try_recv().map(|v| v.len()).unwrap_or(0);
|
||||||
|
|
||||||
|
if received_so_far == 0 && elapsed > timeout {
|
||||||
|
return Err(result::Error::IO(io::Error::new(
|
||||||
|
ErrorKind::TimedOut,
|
||||||
|
"Timed out waiting to receive any blocks",
|
||||||
|
)));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Done receiving entries from window_service");
|
||||||
|
|
||||||
let mut node_info = node.info.clone();
|
let mut node_info = node.info.clone();
|
||||||
node_info.tvu = "0.0.0.0:0".parse().unwrap();
|
node_info.tvu = "0.0.0.0:0".parse().unwrap();
|
||||||
|
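The download wait above and poll_for_leader below share one pattern: capture Instant::now() before the loop, compare start.elapsed() to the deadline on each iteration, and convert an exceeded deadline into an io::Error with ErrorKind::TimedOut. Note the download loop only errors while no entries at all have arrived (received_so_far == 0); once blobs flow it waits indefinitely for done. A self-contained sketch of the pattern using the same std types (make_progress is a stand-in for the real work, not part of this diff):

    use std::io::{self, ErrorKind};
    use std::thread::sleep;
    use std::time::{Duration, Instant};

    // Poll `make_progress` until it succeeds or `timeout` wall-clock time passes.
    fn wait_with_deadline(timeout: Duration, mut make_progress: impl FnMut() -> bool) -> io::Result<()> {
        let start = Instant::now();
        loop {
            if make_progress() {
                return Ok(());
            }
            if start.elapsed() > timeout {
                // Same error shape the diff produces on timeout.
                return Err(io::Error::new(ErrorKind::TimedOut, "timed out"));
            }
            sleep(Duration::from_millis(100));
        }
    }

    fn main() {
        // Never makes progress, so this returns the TimedOut error after ~1 second.
        let res = wait_with_deadline(Duration::from_secs(1), || false);
        assert!(res.is_err());
    }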
@@ -279,16 +308,27 @@ impl Replicator {
         self.entry_height
     }
 
-    fn poll_for_leader(cluster_info: &Arc<RwLock<ClusterInfo>>) -> Result<NodeInfo> {
-        for _ in 0..30 {
+    fn poll_for_leader(
+        cluster_info: &Arc<RwLock<ClusterInfo>>,
+        timeout: Duration,
+    ) -> Result<NodeInfo> {
+        let start = Instant::now();
+        loop {
             if let Some(l) = cluster_info.read().unwrap().get_gossip_top_leader() {
                 return Ok(l.clone());
             }
 
+            let elapsed = start.elapsed();
+            if elapsed > timeout {
+                return Err(result::Error::IO(io::Error::new(
+                    ErrorKind::TimedOut,
+                    "Timed out waiting to receive any blocks",
+                )));
+            }
+
             sleep(Duration::from_millis(900));
             info!("{}", cluster_info.read().unwrap().node_info_trace());
         }
-        Err(Error::new(ErrorKind::Other, "Couldn't find leader"))?
     }
 
     fn poll_for_last_id_and_entry_height(
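A behavioral change worth noting: the old bounded retry, for _ in 0..30 (roughly 27 seconds at 900 ms per sleep), becomes an unbounded loop cut off by wall-clock time, and the old "Couldn't find leader" error is replaced by a TimedOut error that reuses the "Timed out waiting to receive any blocks" message from the download path.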
@@ -355,7 +395,6 @@ impl Replicator {
 
 #[cfg(test)]
 mod tests {
-
     use crate::replicator::sample_file;
     use solana_sdk::hash::Hash;
     use solana_sdk::signature::{Keypair, KeypairUtil};
@@ -429,5 +468,4 @@ mod tests {
         let res = sample_file(&in_path, &samples);
         assert!(res.is_err());
     }
-
 }
@@ -132,6 +132,7 @@ fn test_replicator_startup() {
         replicator_node,
         &leader_info,
         &replicator_keypair,
+        None,
     )
     .unwrap();
 
@@ -217,3 +218,116 @@ fn test_replicator_startup() {
     let _ignored = remove_dir_all(&leader_ledger_path);
     let _ignored = remove_dir_all(&replicator_ledger_path);
 }
+
+#[test]
+fn test_replicator_startup_leader_hang() {
+    use std::net::{IpAddr, Ipv4Addr, SocketAddr};
+    use std::time::Duration;
+
+    solana_logger::setup();
+    info!("starting replicator test");
+
+    let replicator_ledger_path = &get_tmp_ledger_path("replicator_test_replicator_ledger");
+    let leader_ledger_path = "replicator_test_leader_ledger";
+
+    {
+        let replicator_keypair = Keypair::new();
+
+        info!("starting replicator node");
+        let replicator_node = Node::new_localhost_with_pubkey(replicator_keypair.pubkey());
+
+        let fake_gossip = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0);
+        let leader_info = NodeInfo::new_entry_point(&fake_gossip);
+
+        let replicator_res = Replicator::new(
+            Some(replicator_ledger_path),
+            replicator_node,
+            &leader_info,
+            &replicator_keypair,
+            Some(Duration::from_secs(3)),
+        );
+
+        assert!(replicator_res.is_err());
+    }
+
+    let _ignored = DbLedger::destroy(&leader_ledger_path);
+    let _ignored = DbLedger::destroy(&replicator_ledger_path);
+    let _ignored = remove_dir_all(&leader_ledger_path);
+    let _ignored = remove_dir_all(&replicator_ledger_path);
+}
+
+#[test]
+fn test_replicator_startup_ledger_hang() {
+    use std::net::UdpSocket;
+
+    solana_logger::setup();
+    info!("starting replicator test");
+    let replicator_ledger_path = &get_tmp_ledger_path("replicator_test_replicator_ledger");
+
+    info!("starting leader node");
+    let leader_keypair = Arc::new(Keypair::new());
+    let leader_node = Node::new_localhost_with_pubkey(leader_keypair.pubkey());
+    let leader_info = leader_node.info.clone();
+
+    let leader_ledger_path = "replicator_test_leader_ledger";
+    let (_, leader_ledger_path) = create_tmp_genesis(leader_ledger_path, 100, leader_info.id, 1);
+
+    let validator_ledger_path =
+        tmp_copy_ledger(&leader_ledger_path, "replicator_test_validator_ledger");
+
+    {
+        let signer_proxy =
+            VoteSignerProxy::new(&leader_keypair, Box::new(LocalVoteSigner::default()));
+
+        let _ = Fullnode::new(
+            leader_node,
+            &leader_ledger_path,
+            leader_keypair,
+            Arc::new(signer_proxy),
+            None,
+            false,
+            LeaderScheduler::from_bootstrap_leader(leader_info.id.clone()),
+            None,
+        );
+
+        let validator_keypair = Arc::new(Keypair::new());
+        let signer_proxy =
+            VoteSignerProxy::new(&validator_keypair, Box::new(LocalVoteSigner::default()));
+        let validator_node = Node::new_localhost_with_pubkey(validator_keypair.pubkey());
+
+        let _ = Fullnode::new(
+            validator_node,
+            &validator_ledger_path,
+            validator_keypair,
+            Arc::new(signer_proxy),
+            Some(leader_info.gossip),
+            false,
+            LeaderScheduler::from_bootstrap_leader(leader_info.id),
+            None,
+        );
+
+        info!("starting replicator node");
+        let bad_keys = Keypair::new();
+        let mut replicator_node = Node::new_localhost_with_pubkey(bad_keys.pubkey());
+
+        // Pass bad TVU sockets to prevent successful ledger download
+        replicator_node.sockets.tvu = vec![UdpSocket::bind("0.0.0.0:0").unwrap()];
+
+        let leader_info = NodeInfo::new_entry_point(&leader_info.gossip);
+
+        let replicator_res = Replicator::new(
+            Some(replicator_ledger_path),
+            replicator_node,
+            &leader_info,
+            &bad_keys,
+            Some(Duration::from_secs(3)),
+        );
+
+        assert!(replicator_res.is_err());
+    }
+
+    let _ignored = DbLedger::destroy(&leader_ledger_path);
+    let _ignored = DbLedger::destroy(&replicator_ledger_path);
+    let _ignored = remove_dir_all(&leader_ledger_path);
+    let _ignored = remove_dir_all(&replicator_ledger_path);
+}
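Both new tests exercise the timeout with a 3-second bound: test_replicator_startup_leader_hang points the replicator's entry point at 0.0.0.0:0 so no leader can ever be found, while test_replicator_startup_ledger_hang stands up a real leader and validator but swaps in a throwaway TVU socket so the ledger download can never complete; each asserts that Replicator::new returns an error instead of hanging.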