diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 33622ca393..8fec74b3f5 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -1088,7 +1088,12 @@ pub fn create_test_recorder( poh_recorder.set_bank(&bank); let poh_recorder = Arc::new(Mutex::new(poh_recorder)); - let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit); + let poh_service = PohService::new( + poh_recorder.clone(), + &poh_config, + &exit, + bank.ticks_per_slot(), + ); (exit, poh_recorder, poh_service, entry_receiver) } diff --git a/core/src/poh_service.rs b/core/src/poh_service.rs index 656933d035..07bf38dbc9 100644 --- a/core/src/poh_service.rs +++ b/core/src/poh_service.rs @@ -1,7 +1,6 @@ //! The `poh_service` module implements a service that records the passing of //! "ticks", a measure of time in the PoH stream use crate::poh_recorder::PohRecorder; -use solana_sdk::clock::DEFAULT_TICKS_PER_SLOT; use solana_sdk::poh_config::PohConfig; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; @@ -25,6 +24,7 @@ impl PohService { poh_recorder: Arc>, poh_config: &Arc, poh_exit: &Arc, + ticks_per_slot: u64, ) -> Self { let poh_exit_ = poh_exit.clone(); let poh_config = poh_config.clone(); @@ -49,7 +49,12 @@ impl PohService { if let Some(cores) = core_affinity::get_core_ids() { core_affinity::set_for_current(cores[0]); } - Self::tick_producer(poh_recorder, &poh_exit_); + Self::tick_producer( + poh_recorder, + &poh_exit_, + poh_config.target_tick_duration.as_nanos() as u64, + ticks_per_slot, + ); } poh_exit_.store(true, Ordering::Relaxed); }) @@ -85,27 +90,48 @@ impl PohService { } } - fn tick_producer(poh_recorder: Arc>, poh_exit: &AtomicBool) { + fn tick_producer( + poh_recorder: Arc>, + poh_exit: &AtomicBool, + target_tick_ns: u64, + ticks_per_slot: u64, + ) { + info!("starting with target ns: {}", target_tick_ns); let poh = poh_recorder.lock().unwrap().poh.clone(); let mut now = Instant::now(); + let mut last_metric = Instant::now(); let mut num_ticks = 0; let mut num_hashes = 0; + let mut total_sleep_us = 0; loop { num_hashes += NUM_HASHES_PER_BATCH; if poh.lock().unwrap().hash(NUM_HASHES_PER_BATCH) { // Lock PohRecorder only for the final hash... poh_recorder.lock().unwrap().tick(); num_ticks += 1; - if num_ticks >= DEFAULT_TICKS_PER_SLOT * 2 { + let elapsed_ns = now.elapsed().as_nanos() as u64; + // sleep is not accurate enough to get a predictable time. + // Kernel can not schedule the thread for a while. + while (now.elapsed().as_nanos() as u64) < target_tick_ns { + std::sync::atomic::spin_loop_hint(); + } + total_sleep_us += (now.elapsed().as_nanos() as u64 - elapsed_ns) / 1000; + now = Instant::now(); + + if last_metric.elapsed().as_millis() > 1000 { + let elapsed_ms = last_metric.elapsed().as_millis() as u64; + let ms_per_slot = (elapsed_ms * ticks_per_slot) / num_ticks; datapoint_info!( "poh-service", ("ticks", num_ticks as i64, i64), ("hashes", num_hashes as i64, i64), - ("elapsed_ms", now.elapsed().as_millis() as i64, i64), + ("elapsed_ms", ms_per_slot, i64), + ("total_sleep_ms", total_sleep_us / 1000, i64), ); + total_sleep_us = 0; num_ticks = 0; num_hashes = 0; - now = Instant::now(); + last_metric = Instant::now(); } if poh_exit.load(Ordering::Relaxed) { break; @@ -189,7 +215,7 @@ mod tests { .unwrap() }; - let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit); + let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit, 0); poh_recorder.lock().unwrap().set_working_bank(working_bank); // get some events diff --git a/core/src/validator.rs b/core/src/validator.rs index 2ebc5f6ea0..a525c1ed79 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -40,6 +40,7 @@ use solana_ledger::{ blockstore_processor::{self, TransactionStatusSender}, leader_schedule::FixedSchedule, leader_schedule_cache::LeaderScheduleCache, + poh::compute_hash_time_ns, }; use solana_measure::measure::Measure; use solana_metrics::datapoint_info; @@ -114,6 +115,7 @@ pub struct ValidatorConfig { pub bpf_jit: bool, pub send_transaction_retry_ms: u64, pub send_transaction_leader_forward_count: u64, + pub no_poh_speed_test: bool, } impl Default for ValidatorConfig { @@ -156,6 +158,7 @@ impl Default for ValidatorConfig { bpf_jit: false, send_transaction_retry_ms: 2000, send_transaction_leader_forward_count: 2, + no_poh_speed_test: true, } } } @@ -531,11 +534,20 @@ impl Validator { (None, None) }; + if !config.no_poh_speed_test { + check_poh_speed(&genesis_config, None); + } + if wait_for_supermajority(config, &bank, &cluster_info, rpc_override_health_check) { abort(); } - let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit); + let poh_service = PohService::new( + poh_recorder.clone(), + &poh_config, + &exit, + bank.ticks_per_slot(), + ); assert_eq!( blockstore.new_shreds_signals.len(), 1, @@ -754,6 +766,35 @@ fn active_vote_account_exists_in_bank(bank: &Arc, vote_account: &Pubkey) - false } +fn check_poh_speed(genesis_config: &GenesisConfig, maybe_hash_samples: Option) { + if let Some(hashes_per_tick) = genesis_config.hashes_per_tick() { + let ticks_per_slot = genesis_config.ticks_per_slot(); + let hashes_per_slot = hashes_per_tick * ticks_per_slot; + + let hash_samples = maybe_hash_samples.unwrap_or(hashes_per_slot); + let hash_time_ns = compute_hash_time_ns(hash_samples); + + let my_ns_per_slot = (hash_time_ns * hashes_per_slot) / hash_samples; + debug!("computed: ns_per_slot: {}", my_ns_per_slot); + let target_ns_per_slot = genesis_config.ns_per_slot() as u64; + debug!( + "cluster ns_per_hash: {}ns ns_per_slot: {}", + target_ns_per_slot / hashes_per_slot, + target_ns_per_slot + ); + if my_ns_per_slot < target_ns_per_slot { + let extra_ns = target_ns_per_slot - my_ns_per_slot; + info!("PoH speed check: Will sleep {}ns per slot.", extra_ns); + } else { + error!( + "PoH is slower than cluster target tick rate! mine: {} cluster: {}. If you wish to continue, try --ignore-poh-speed", + my_ns_per_slot, target_ns_per_slot, + ); + abort(); + } + } +} + fn post_process_restored_tower( restored_tower: crate::consensus::Result, validator_identity: &Pubkey, @@ -1266,6 +1307,8 @@ pub fn is_snapshot_config_invalid( mod tests { use super::*; use solana_ledger::{create_new_tmp_ledger, genesis_utils::create_genesis_config_with_leader}; + use solana_sdk::genesis_config::create_genesis_config; + use solana_sdk::poh_config::PohConfig; use std::fs::remove_dir_all; #[test] @@ -1384,7 +1427,6 @@ mod tests { #[test] fn test_wait_for_supermajority() { solana_logger::setup(); - use solana_sdk::genesis_config::create_genesis_config; use solana_sdk::hash::hash; let node_keypair = Arc::new(Keypair::new()); let cluster_info = ClusterInfo::new( @@ -1441,4 +1483,38 @@ mod tests { assert!(!is_snapshot_config_invalid(500, 100)); assert!(!is_snapshot_config_invalid(5, 5)); } + + #[test] + #[should_panic] + fn test_poh_speed() { + solana_logger::setup(); + let poh_config = PohConfig { + target_tick_duration: Duration::from_millis(solana_sdk::clock::MS_PER_TICK), + // make PoH rate really fast to cause the panic condition + hashes_per_tick: Some( + 100 * solana_sdk::clock::DEFAULT_HASHES_PER_SECOND + / solana_sdk::clock::DEFAULT_TICKS_PER_SECOND, + ), + ..PohConfig::default() + }; + let genesis_config = GenesisConfig { + poh_config, + ..GenesisConfig::default() + }; + check_poh_speed(&genesis_config, Some(10_000)); + } + + #[test] + fn test_poh_speed_no_hashes_per_tick() { + let poh_config = PohConfig { + target_tick_duration: Duration::from_millis(solana_sdk::clock::MS_PER_TICK), + hashes_per_tick: None, + ..PohConfig::default() + }; + let genesis_config = GenesisConfig { + poh_config, + ..GenesisConfig::default() + }; + check_poh_speed(&genesis_config, Some(10_000)); + } } diff --git a/ledger/src/poh.rs b/ledger/src/poh.rs index eaf075473a..7edc629f59 100644 --- a/ledger/src/poh.rs +++ b/ledger/src/poh.rs @@ -82,18 +82,18 @@ impl Poh { } } -pub fn compute_hashes_per_tick(duration: Duration, hashes_sample_size: u64) -> u64 { - // calculate hash rate with the system under maximum load - info!( - "Running {} hashes in parallel on all threads...", - hashes_sample_size - ); +pub fn compute_hash_time_ns(hashes_sample_size: u64) -> u64 { + info!("Running {} hashes...", hashes_sample_size); let mut v = Hash::default(); let start = Instant::now(); for _ in 0..hashes_sample_size { v = hash(&v.as_ref()); } - let elapsed = start.elapsed().as_millis() as u64; + start.elapsed().as_nanos() as u64 +} + +pub fn compute_hashes_per_tick(duration: Duration, hashes_sample_size: u64) -> u64 { + let elapsed = compute_hash_time_ns(hashes_sample_size) / (1000 * 1000); duration.as_millis() as u64 * hashes_sample_size / elapsed } diff --git a/validator/src/main.rs b/validator/src/main.rs index 957233ef11..3ff602c133 100644 --- a/validator/src/main.rs +++ b/validator/src/main.rs @@ -1104,6 +1104,11 @@ pub fn main() { .default_value("10000") .help("Milliseconds between printing contact debug from gossip."), ) + .arg( + Arg::with_name("no_poh_speed_test") + .long("no-poh-speed-test") + .help("Skip the check for PoH speed."), + ) .arg( Arg::with_name("accounts_hash_interval_slots") .long("accounts-hash-slots") @@ -1544,6 +1549,7 @@ pub fn main() { "rpc_send_transaction_leader_forward_count", u64 ), + no_poh_speed_test: matches.is_present("no_poh_speed_test"), ..ValidatorConfig::default() };