Add poh speed check and tick speed calibration (#14292)

sakridge 2020-12-29 09:35:57 -08:00 committed by GitHub
parent 444ed768dc
commit 2074e407cd
5 changed files with 130 additions and 17 deletions


@@ -1088,7 +1088,12 @@ pub fn create_test_recorder(
     poh_recorder.set_bank(&bank);
     let poh_recorder = Arc::new(Mutex::new(poh_recorder));
-    let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
+    let poh_service = PohService::new(
+        poh_recorder.clone(),
+        &poh_config,
+        &exit,
+        bank.ticks_per_slot(),
+    );
     (exit, poh_recorder, poh_service, entry_receiver)
 }


@@ -1,7 +1,6 @@
 //! The `poh_service` module implements a service that records the passing of
 //! "ticks", a measure of time in the PoH stream
 use crate::poh_recorder::PohRecorder;
-use solana_sdk::clock::DEFAULT_TICKS_PER_SLOT;
 use solana_sdk::poh_config::PohConfig;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, Mutex};
@@ -25,6 +24,7 @@ impl PohService {
         poh_recorder: Arc<Mutex<PohRecorder>>,
         poh_config: &Arc<PohConfig>,
         poh_exit: &Arc<AtomicBool>,
+        ticks_per_slot: u64,
     ) -> Self {
         let poh_exit_ = poh_exit.clone();
         let poh_config = poh_config.clone();
@@ -49,7 +49,12 @@ impl PohService {
                     if let Some(cores) = core_affinity::get_core_ids() {
                         core_affinity::set_for_current(cores[0]);
                     }
-                    Self::tick_producer(poh_recorder, &poh_exit_);
+                    Self::tick_producer(
+                        poh_recorder,
+                        &poh_exit_,
+                        poh_config.target_tick_duration.as_nanos() as u64,
+                        ticks_per_slot,
+                    );
                 }
                 poh_exit_.store(true, Ordering::Relaxed);
             })
@@ -85,27 +90,48 @@
         }
     }
 
-    fn tick_producer(poh_recorder: Arc<Mutex<PohRecorder>>, poh_exit: &AtomicBool) {
+    fn tick_producer(
+        poh_recorder: Arc<Mutex<PohRecorder>>,
+        poh_exit: &AtomicBool,
+        target_tick_ns: u64,
+        ticks_per_slot: u64,
+    ) {
+        info!("starting with target ns: {}", target_tick_ns);
         let poh = poh_recorder.lock().unwrap().poh.clone();
         let mut now = Instant::now();
+        let mut last_metric = Instant::now();
         let mut num_ticks = 0;
         let mut num_hashes = 0;
+        let mut total_sleep_us = 0;
         loop {
             num_hashes += NUM_HASHES_PER_BATCH;
             if poh.lock().unwrap().hash(NUM_HASHES_PER_BATCH) {
                 // Lock PohRecorder only for the final hash...
                 poh_recorder.lock().unwrap().tick();
                 num_ticks += 1;
-                if num_ticks >= DEFAULT_TICKS_PER_SLOT * 2 {
+                let elapsed_ns = now.elapsed().as_nanos() as u64;
+                // sleep is not accurate enough to get a predictable time.
+                // Kernel can not schedule the thread for a while.
+                while (now.elapsed().as_nanos() as u64) < target_tick_ns {
+                    std::sync::atomic::spin_loop_hint();
+                }
+                total_sleep_us += (now.elapsed().as_nanos() as u64 - elapsed_ns) / 1000;
+                now = Instant::now();
+                if last_metric.elapsed().as_millis() > 1000 {
+                    let elapsed_ms = last_metric.elapsed().as_millis() as u64;
+                    let ms_per_slot = (elapsed_ms * ticks_per_slot) / num_ticks;
                     datapoint_info!(
                         "poh-service",
                         ("ticks", num_ticks as i64, i64),
                         ("hashes", num_hashes as i64, i64),
-                        ("elapsed_ms", now.elapsed().as_millis() as i64, i64),
+                        ("elapsed_ms", ms_per_slot, i64),
+                        ("total_sleep_ms", total_sleep_us / 1000, i64),
                     );
+                    total_sleep_us = 0;
                     num_ticks = 0;
                     num_hashes = 0;
-                    now = Instant::now();
+                    last_metric = Instant::now();
                 }
                 if poh_exit.load(Ordering::Relaxed) {
                     break;
@@ -189,7 +215,7 @@ mod tests {
             .unwrap()
         };
-        let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
+        let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit, 0);
         poh_recorder.lock().unwrap().set_working_bank(working_bank);
 
         // get some events
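For orientation, here is a minimal standalone sketch (not part of the commit) of the pacing technique the new tick_producer uses: instead of calling thread::sleep, which the kernel can overshoot once it deschedules the thread, the loop busy-waits on Instant::elapsed() until the target tick duration has passed. The 6.25 ms target matches the default target_tick_duration; the tick count and println are purely illustrative, and std::hint::spin_loop() stands in for the spin_loop_hint() call used in the diff.

    use std::time::Instant;

    fn main() {
        let target_tick_ns: u64 = 6_250_000; // ~6.25 ms, the default tick target
        let mut now = Instant::now();
        for tick in 0..8u64 {
            // ... per-tick work (hashing) would happen here ...
            // Busy-wait out the remainder of the tick rather than sleeping.
            while (now.elapsed().as_nanos() as u64) < target_tick_ns {
                std::hint::spin_loop();
            }
            println!("tick {} finished after {} ns", tick, now.elapsed().as_nanos());
            now = Instant::now();
        }
    }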


@@ -40,6 +40,7 @@ use solana_ledger::{
     blockstore_processor::{self, TransactionStatusSender},
     leader_schedule::FixedSchedule,
     leader_schedule_cache::LeaderScheduleCache,
+    poh::compute_hash_time_ns,
 };
 use solana_measure::measure::Measure;
 use solana_metrics::datapoint_info;
@@ -114,6 +115,7 @@ pub struct ValidatorConfig {
     pub bpf_jit: bool,
     pub send_transaction_retry_ms: u64,
     pub send_transaction_leader_forward_count: u64,
+    pub no_poh_speed_test: bool,
 }
 
 impl Default for ValidatorConfig {
@@ -156,6 +158,7 @@ impl Default for ValidatorConfig {
             bpf_jit: false,
             send_transaction_retry_ms: 2000,
             send_transaction_leader_forward_count: 2,
+            no_poh_speed_test: true,
         }
     }
 }
@@ -531,11 +534,20 @@ impl Validator {
             (None, None)
         };
 
+        if !config.no_poh_speed_test {
+            check_poh_speed(&genesis_config, None);
+        }
+
         if wait_for_supermajority(config, &bank, &cluster_info, rpc_override_health_check) {
             abort();
         }
 
-        let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
+        let poh_service = PohService::new(
+            poh_recorder.clone(),
+            &poh_config,
+            &exit,
+            bank.ticks_per_slot(),
+        );
         assert_eq!(
             blockstore.new_shreds_signals.len(),
             1,
@@ -754,6 +766,35 @@ fn active_vote_account_exists_in_bank(bank: &Arc<Bank>, vote_account: &Pubkey) -
     false
 }
 
+fn check_poh_speed(genesis_config: &GenesisConfig, maybe_hash_samples: Option<u64>) {
+    if let Some(hashes_per_tick) = genesis_config.hashes_per_tick() {
+        let ticks_per_slot = genesis_config.ticks_per_slot();
+        let hashes_per_slot = hashes_per_tick * ticks_per_slot;
+
+        let hash_samples = maybe_hash_samples.unwrap_or(hashes_per_slot);
+        let hash_time_ns = compute_hash_time_ns(hash_samples);
+
+        let my_ns_per_slot = (hash_time_ns * hashes_per_slot) / hash_samples;
+        debug!("computed: ns_per_slot: {}", my_ns_per_slot);
+        let target_ns_per_slot = genesis_config.ns_per_slot() as u64;
+        debug!(
+            "cluster ns_per_hash: {}ns ns_per_slot: {}",
+            target_ns_per_slot / hashes_per_slot,
+            target_ns_per_slot
+        );
+        if my_ns_per_slot < target_ns_per_slot {
+            let extra_ns = target_ns_per_slot - my_ns_per_slot;
+            info!("PoH speed check: Will sleep {}ns per slot.", extra_ns);
+        } else {
+            error!(
+                "PoH is slower than cluster target tick rate! mine: {} cluster: {}. If you wish to continue, try --no-poh-speed-test",
+                my_ns_per_slot, target_ns_per_slot,
+            );
+            abort();
+        }
+    }
+}
+
 fn post_process_restored_tower(
     restored_tower: crate::consensus::Result<Tower>,
     validator_identity: &Pubkey,
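To make check_poh_speed's arithmetic concrete, here is a rough worked example (not from the commit) using the cluster defaults of 12,500 hashes per tick and 64 ticks per slot, so 800,000 hashes and 400 ms per slot; the 300 ms measurement below is a made-up stand-in for what compute_hash_time_ns would return on a given machine.

    fn main() {
        let hashes_per_tick: u64 = 12_500; // DEFAULT_HASHES_PER_SECOND / DEFAULT_TICKS_PER_SECOND
        let ticks_per_slot: u64 = 64;
        let hashes_per_slot = hashes_per_tick * ticks_per_slot; // 800_000
        let hash_samples = hashes_per_slot; // maybe_hash_samples == None samples one slot's worth
        let hash_time_ns: u64 = 300_000_000; // hypothetical: the sample took 300 ms on this machine
        let my_ns_per_slot = (hash_time_ns * hashes_per_slot) / hash_samples; // 300 ms per slot
        let target_ns_per_slot: u64 = 400_000_000; // 6.25 ms per tick * 64 ticks
        if my_ns_per_slot < target_ns_per_slot {
            // Check passes; PoH has ~100 ms of slack to spin away each slot.
            println!("slack per slot: {} ns", target_ns_per_slot - my_ns_per_slot);
        } else {
            // check_poh_speed would log an error and abort() here.
            println!("too slow: {} ns > {} ns", my_ns_per_slot, target_ns_per_slot);
        }
    }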
@@ -1266,6 +1307,8 @@ pub fn is_snapshot_config_invalid(
 mod tests {
     use super::*;
     use solana_ledger::{create_new_tmp_ledger, genesis_utils::create_genesis_config_with_leader};
+    use solana_sdk::genesis_config::create_genesis_config;
+    use solana_sdk::poh_config::PohConfig;
     use std::fs::remove_dir_all;
 
     #[test]
@@ -1384,7 +1427,6 @@ mod tests {
     #[test]
     fn test_wait_for_supermajority() {
         solana_logger::setup();
-        use solana_sdk::genesis_config::create_genesis_config;
         use solana_sdk::hash::hash;
         let node_keypair = Arc::new(Keypair::new());
         let cluster_info = ClusterInfo::new(
@@ -1441,4 +1483,38 @@ mod tests {
         assert!(!is_snapshot_config_invalid(500, 100));
         assert!(!is_snapshot_config_invalid(5, 5));
     }
+
+    #[test]
+    #[should_panic]
+    fn test_poh_speed() {
+        solana_logger::setup();
+        let poh_config = PohConfig {
+            target_tick_duration: Duration::from_millis(solana_sdk::clock::MS_PER_TICK),
+            // make PoH rate really fast to cause the panic condition
+            hashes_per_tick: Some(
+                100 * solana_sdk::clock::DEFAULT_HASHES_PER_SECOND
+                    / solana_sdk::clock::DEFAULT_TICKS_PER_SECOND,
+            ),
+            ..PohConfig::default()
+        };
+        let genesis_config = GenesisConfig {
+            poh_config,
+            ..GenesisConfig::default()
+        };
+        check_poh_speed(&genesis_config, Some(10_000));
+    }
+
+    #[test]
+    fn test_poh_speed_no_hashes_per_tick() {
+        let poh_config = PohConfig {
+            target_tick_duration: Duration::from_millis(solana_sdk::clock::MS_PER_TICK),
+            hashes_per_tick: None,
+            ..PohConfig::default()
+        };
+        let genesis_config = GenesisConfig {
+            poh_config,
+            ..GenesisConfig::default()
+        };
+        check_poh_speed(&genesis_config, Some(10_000));
+    }
 }
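A note on the new tests: test_poh_speed inflates hashes_per_tick to 100x the default rate so that check_poh_speed is all but guaranteed to take the abort() path (which panics under test) on any real machine, satisfying #[should_panic]; test_poh_speed_no_hashes_per_tick confirms the check is a no-op when the genesis config carries no hashes_per_tick. Assuming the validator code lives in the solana-core crate, they can be run with something like: cargo test -p solana-core test_poh_speed -- --nocapture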


@@ -82,18 +82,18 @@ impl Poh {
     }
 }
 
-pub fn compute_hashes_per_tick(duration: Duration, hashes_sample_size: u64) -> u64 {
-    // calculate hash rate with the system under maximum load
-    info!(
-        "Running {} hashes in parallel on all threads...",
-        hashes_sample_size
-    );
+pub fn compute_hash_time_ns(hashes_sample_size: u64) -> u64 {
+    info!("Running {} hashes...", hashes_sample_size);
     let mut v = Hash::default();
     let start = Instant::now();
     for _ in 0..hashes_sample_size {
         v = hash(&v.as_ref());
     }
-    let elapsed = start.elapsed().as_millis() as u64;
+    start.elapsed().as_nanos() as u64
+}
+
+pub fn compute_hashes_per_tick(duration: Duration, hashes_sample_size: u64) -> u64 {
+    let elapsed = compute_hash_time_ns(hashes_sample_size) / (1000 * 1000);
     duration.as_millis() as u64 * hashes_sample_size / elapsed
 }
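The refactor above splits the timing loop out of compute_hashes_per_tick so the validator can reuse it for the startup check. As a standalone illustration (not from the commit) of what that measurement does, the same loop can be reproduced against solana_sdk alone, assuming it is available as a dependency; the sample size is arbitrary.

    use solana_sdk::hash::{hash, Hash};
    use std::time::Instant;

    fn main() {
        let hashes_sample_size: u64 = 1_000_000; // arbitrary sample size
        let mut v = Hash::default();
        let start = Instant::now();
        for _ in 0..hashes_sample_size {
            v = hash(v.as_ref()); // chain SHA-256 hashes, as compute_hash_time_ns does
        }
        let elapsed_ns = start.elapsed().as_nanos() as u64;
        println!("final hash: {}", v); // print v so the work is observable
        println!("ns per hash: {}", elapsed_ns / hashes_sample_size);
        println!("hashes per second: {}", hashes_sample_size * 1_000_000_000 / elapsed_ns);
    }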


@@ -1104,6 +1104,11 @@ pub fn main() {
                 .default_value("10000")
                 .help("Milliseconds between printing contact debug from gossip."),
         )
+        .arg(
+            Arg::with_name("no_poh_speed_test")
+                .long("no-poh-speed-test")
+                .help("Skip the check for PoH speed."),
+        )
         .arg(
             Arg::with_name("accounts_hash_interval_slots")
                 .long("accounts-hash-slots")
@@ -1544,6 +1549,7 @@ pub fn main() {
             "rpc_send_transaction_leader_forward_count",
             u64
         ),
+        no_poh_speed_test: matches.is_present("no_poh_speed_test"),
        ..ValidatorConfig::default()
    };
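Usage note: with this change the speed check runs at validator startup by default, and a machine that cannot hash a full slot's worth of PoH within the slot time (400 ms with the default genesis parameters) will refuse to start. An operator who wants to opt out can pass the new flag to the validator binary, e.g. solana-validator --no-poh-speed-test alongside their usual arguments.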