tvu and tpu timeout on joining its microservices (#24111)

* panic when test timeout

* nonblocking send when when droping banks

* debug log

* timeout for tvu

* unused varaible

* timeout for tpu

* Revert "debug log"

This reverts commit da780a3301a51d7c496141a85fcd35014fe6dff5.

* add timeout const

* fix typo

* Revert "nonblocking send when when droping banks".
I will create another pull request for this.

This reverts commit 088c98ec0facf825b5eca058fb860deba6d28888.

* Update core/src/tpu.rs

Co-authored-by: Trent Nelson <trent.a.b.nelson@gmail.com>

* Update core/src/tpu.rs

Co-authored-by: Trent Nelson <trent.a.b.nelson@gmail.com>

* Update core/src/tvu.rs

Co-authored-by: Trent Nelson <trent.a.b.nelson@gmail.com>

* Update core/src/tvu.rs

Co-authored-by: Trent Nelson <trent.a.b.nelson@gmail.com>

* Update core/src/validator.rs

Co-authored-by: Trent Nelson <trent.a.b.nelson@gmail.com>

Co-authored-by: Trent Nelson <trent.a.b.nelson@gmail.com>
This commit is contained in:
HaoranYi 2022-04-07 20:20:13 -05:00 committed by GitHub
parent fbe5e51a16
commit e105547c14
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 56 additions and 3 deletions

View File

@ -14,7 +14,7 @@ use {
sigverify::TransactionSigVerifier,
sigverify_stage::SigVerifyStage,
},
crossbeam_channel::{unbounded, Receiver},
crossbeam_channel::{bounded, unbounded, Receiver, RecvTimeoutError},
solana_gossip::cluster_info::ClusterInfo,
solana_ledger::{blockstore::Blockstore, blockstore_processor::TransactionStatusSender},
solana_poh::poh_recorder::{PohRecorder, WorkingBankEntry},
@ -32,11 +32,15 @@ use {
net::UdpSocket,
sync::{atomic::AtomicBool, Arc, Mutex, RwLock},
thread,
time::Duration,
},
};
pub const DEFAULT_TPU_COALESCE_MS: u64 = 5;
/// Timeout interval when joining threads during TPU close
const TPU_THREADS_JOIN_TIMEOUT_SECONDS: u64 = 10;
// allow multiple connections for NAT and any open/close overlap
pub const MAX_QUIC_CONNECTIONS_PER_IP: usize = 8;
@ -208,6 +212,22 @@ impl Tpu {
}
pub fn join(self) -> thread::Result<()> {
// spawn a new thread to wait for tpu close
let (sender, receiver) = bounded(0);
let _ = thread::spawn(move || {
let _ = self.do_join();
sender.send(()).unwrap();
});
// exit can deadlock. put an upper-bound on how long we wait for it
let timeout = Duration::from_secs(TPU_THREADS_JOIN_TIMEOUT_SECONDS);
if let Err(RecvTimeoutError::Timeout) = receiver.recv_timeout(timeout) {
error!("timeout for closing tvu");
}
Ok(())
}
fn do_join(self) -> thread::Result<()> {
let results = vec![
self.fetch_stage.join(),
self.sigverify_stage.join(),

View File

@ -26,7 +26,7 @@ use {
tower_storage::TowerStorage,
voting_service::VotingService,
},
crossbeam_channel::{unbounded, Receiver},
crossbeam_channel::{bounded, unbounded, Receiver, RecvTimeoutError},
solana_geyser_plugin_manager::block_metadata_notifier_interface::BlockMetadataNotifierLock,
solana_gossip::cluster_info::ClusterInfo,
solana_ledger::{
@ -60,9 +60,13 @@ use {
net::UdpSocket,
sync::{atomic::AtomicBool, Arc, Mutex, RwLock},
thread,
time::Duration,
},
};
/// Timeout interval when joining threads during TVU close
const TVU_THREADS_JOIN_TIMEOUT_SECONDS: u64 = 10;
pub struct Tvu {
fetch_stage: ShredFetchStage,
sigverify_stage: SigVerifyStage,
@ -358,6 +362,22 @@ impl Tvu {
}
pub fn join(self) -> thread::Result<()> {
// spawn a new thread to wait for tvu close
let (sender, receiver) = bounded(0);
let _ = thread::spawn(move || {
let _ = self.do_join();
sender.send(()).unwrap();
});
// exit can deadlock. put an upper-bound on how long we wait for it
let timeout = Duration::from_secs(TVU_THREADS_JOIN_TIMEOUT_SECONDS);
if let Err(RecvTimeoutError::Timeout) = receiver.recv_timeout(timeout) {
error!("timeout for closing tvu");
}
Ok(())
}
fn do_join(self) -> thread::Result<()> {
self.retransmit_stage.join()?;
self.fetch_stage.join()?;
self.sigverify_stage.join()?;

View File

@ -1844,7 +1844,20 @@ mod tests {
*start_progress.read().unwrap(),
ValidatorStartProgress::Running
);
validator.close();
// spawn a new thread to wait for validator close
let (sender, receiver) = bounded(0);
let _ = thread::spawn(move || {
validator.close();
sender.send(()).unwrap();
});
// exit can deadlock. put an upper-bound on how long we wait for it
let timeout = Duration::from_secs(30);
if let Err(RecvTimeoutError::Timeout) = receiver.recv_timeout(timeout) {
panic!("timeout for closing validator");
}
remove_dir_all(validator_ledger_path).unwrap();
}