Add poh speed check and tick speed calibration (#14292)

This commit is contained in:
sakridge 2020-12-29 09:35:57 -08:00 committed by GitHub
parent 444ed768dc
commit 2074e407cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 130 additions and 17 deletions

View File

@ -1088,7 +1088,12 @@ pub fn create_test_recorder(
poh_recorder.set_bank(&bank);
let poh_recorder = Arc::new(Mutex::new(poh_recorder));
let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
let poh_service = PohService::new(
poh_recorder.clone(),
&poh_config,
&exit,
bank.ticks_per_slot(),
);
(exit, poh_recorder, poh_service, entry_receiver)
}

View File

@ -1,7 +1,6 @@
//! The `poh_service` module implements a service that records the passing of
//! "ticks", a measure of time in the PoH stream
use crate::poh_recorder::PohRecorder;
use solana_sdk::clock::DEFAULT_TICKS_PER_SLOT;
use solana_sdk::poh_config::PohConfig;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex};
@ -25,6 +24,7 @@ impl PohService {
poh_recorder: Arc<Mutex<PohRecorder>>,
poh_config: &Arc<PohConfig>,
poh_exit: &Arc<AtomicBool>,
ticks_per_slot: u64,
) -> Self {
let poh_exit_ = poh_exit.clone();
let poh_config = poh_config.clone();
@ -49,7 +49,12 @@ impl PohService {
if let Some(cores) = core_affinity::get_core_ids() {
core_affinity::set_for_current(cores[0]);
}
Self::tick_producer(poh_recorder, &poh_exit_);
Self::tick_producer(
poh_recorder,
&poh_exit_,
poh_config.target_tick_duration.as_nanos() as u64,
ticks_per_slot,
);
}
poh_exit_.store(true, Ordering::Relaxed);
})
@ -85,27 +90,48 @@ impl PohService {
}
}
fn tick_producer(poh_recorder: Arc<Mutex<PohRecorder>>, poh_exit: &AtomicBool) {
fn tick_producer(
poh_recorder: Arc<Mutex<PohRecorder>>,
poh_exit: &AtomicBool,
target_tick_ns: u64,
ticks_per_slot: u64,
) {
info!("starting with target ns: {}", target_tick_ns);
let poh = poh_recorder.lock().unwrap().poh.clone();
let mut now = Instant::now();
let mut last_metric = Instant::now();
let mut num_ticks = 0;
let mut num_hashes = 0;
let mut total_sleep_us = 0;
loop {
num_hashes += NUM_HASHES_PER_BATCH;
if poh.lock().unwrap().hash(NUM_HASHES_PER_BATCH) {
// Lock PohRecorder only for the final hash...
poh_recorder.lock().unwrap().tick();
num_ticks += 1;
if num_ticks >= DEFAULT_TICKS_PER_SLOT * 2 {
let elapsed_ns = now.elapsed().as_nanos() as u64;
// sleep is not accurate enough to get a predictable time.
// Kernel can not schedule the thread for a while.
while (now.elapsed().as_nanos() as u64) < target_tick_ns {
std::sync::atomic::spin_loop_hint();
}
total_sleep_us += (now.elapsed().as_nanos() as u64 - elapsed_ns) / 1000;
now = Instant::now();
if last_metric.elapsed().as_millis() > 1000 {
let elapsed_ms = last_metric.elapsed().as_millis() as u64;
let ms_per_slot = (elapsed_ms * ticks_per_slot) / num_ticks;
datapoint_info!(
"poh-service",
("ticks", num_ticks as i64, i64),
("hashes", num_hashes as i64, i64),
("elapsed_ms", now.elapsed().as_millis() as i64, i64),
("elapsed_ms", ms_per_slot, i64),
("total_sleep_ms", total_sleep_us / 1000, i64),
);
total_sleep_us = 0;
num_ticks = 0;
num_hashes = 0;
now = Instant::now();
last_metric = Instant::now();
}
if poh_exit.load(Ordering::Relaxed) {
break;
@ -189,7 +215,7 @@ mod tests {
.unwrap()
};
let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit, 0);
poh_recorder.lock().unwrap().set_working_bank(working_bank);
// get some events

View File

@ -40,6 +40,7 @@ use solana_ledger::{
blockstore_processor::{self, TransactionStatusSender},
leader_schedule::FixedSchedule,
leader_schedule_cache::LeaderScheduleCache,
poh::compute_hash_time_ns,
};
use solana_measure::measure::Measure;
use solana_metrics::datapoint_info;
@ -114,6 +115,7 @@ pub struct ValidatorConfig {
pub bpf_jit: bool,
pub send_transaction_retry_ms: u64,
pub send_transaction_leader_forward_count: u64,
pub no_poh_speed_test: bool,
}
impl Default for ValidatorConfig {
@ -156,6 +158,7 @@ impl Default for ValidatorConfig {
bpf_jit: false,
send_transaction_retry_ms: 2000,
send_transaction_leader_forward_count: 2,
no_poh_speed_test: true,
}
}
}
@ -531,11 +534,20 @@ impl Validator {
(None, None)
};
if !config.no_poh_speed_test {
check_poh_speed(&genesis_config, None);
}
if wait_for_supermajority(config, &bank, &cluster_info, rpc_override_health_check) {
abort();
}
let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
let poh_service = PohService::new(
poh_recorder.clone(),
&poh_config,
&exit,
bank.ticks_per_slot(),
);
assert_eq!(
blockstore.new_shreds_signals.len(),
1,
@ -754,6 +766,35 @@ fn active_vote_account_exists_in_bank(bank: &Arc<Bank>, vote_account: &Pubkey) -
false
}
fn check_poh_speed(genesis_config: &GenesisConfig, maybe_hash_samples: Option<u64>) {
if let Some(hashes_per_tick) = genesis_config.hashes_per_tick() {
let ticks_per_slot = genesis_config.ticks_per_slot();
let hashes_per_slot = hashes_per_tick * ticks_per_slot;
let hash_samples = maybe_hash_samples.unwrap_or(hashes_per_slot);
let hash_time_ns = compute_hash_time_ns(hash_samples);
let my_ns_per_slot = (hash_time_ns * hashes_per_slot) / hash_samples;
debug!("computed: ns_per_slot: {}", my_ns_per_slot);
let target_ns_per_slot = genesis_config.ns_per_slot() as u64;
debug!(
"cluster ns_per_hash: {}ns ns_per_slot: {}",
target_ns_per_slot / hashes_per_slot,
target_ns_per_slot
);
if my_ns_per_slot < target_ns_per_slot {
let extra_ns = target_ns_per_slot - my_ns_per_slot;
info!("PoH speed check: Will sleep {}ns per slot.", extra_ns);
} else {
error!(
"PoH is slower than cluster target tick rate! mine: {} cluster: {}. If you wish to continue, try --ignore-poh-speed",
my_ns_per_slot, target_ns_per_slot,
);
abort();
}
}
}
fn post_process_restored_tower(
restored_tower: crate::consensus::Result<Tower>,
validator_identity: &Pubkey,
@ -1266,6 +1307,8 @@ pub fn is_snapshot_config_invalid(
mod tests {
use super::*;
use solana_ledger::{create_new_tmp_ledger, genesis_utils::create_genesis_config_with_leader};
use solana_sdk::genesis_config::create_genesis_config;
use solana_sdk::poh_config::PohConfig;
use std::fs::remove_dir_all;
#[test]
@ -1384,7 +1427,6 @@ mod tests {
#[test]
fn test_wait_for_supermajority() {
solana_logger::setup();
use solana_sdk::genesis_config::create_genesis_config;
use solana_sdk::hash::hash;
let node_keypair = Arc::new(Keypair::new());
let cluster_info = ClusterInfo::new(
@ -1441,4 +1483,38 @@ mod tests {
assert!(!is_snapshot_config_invalid(500, 100));
assert!(!is_snapshot_config_invalid(5, 5));
}
#[test]
#[should_panic]
fn test_poh_speed() {
solana_logger::setup();
let poh_config = PohConfig {
target_tick_duration: Duration::from_millis(solana_sdk::clock::MS_PER_TICK),
// make PoH rate really fast to cause the panic condition
hashes_per_tick: Some(
100 * solana_sdk::clock::DEFAULT_HASHES_PER_SECOND
/ solana_sdk::clock::DEFAULT_TICKS_PER_SECOND,
),
..PohConfig::default()
};
let genesis_config = GenesisConfig {
poh_config,
..GenesisConfig::default()
};
check_poh_speed(&genesis_config, Some(10_000));
}
#[test]
fn test_poh_speed_no_hashes_per_tick() {
let poh_config = PohConfig {
target_tick_duration: Duration::from_millis(solana_sdk::clock::MS_PER_TICK),
hashes_per_tick: None,
..PohConfig::default()
};
let genesis_config = GenesisConfig {
poh_config,
..GenesisConfig::default()
};
check_poh_speed(&genesis_config, Some(10_000));
}
}

View File

@ -82,18 +82,18 @@ impl Poh {
}
}
pub fn compute_hashes_per_tick(duration: Duration, hashes_sample_size: u64) -> u64 {
// calculate hash rate with the system under maximum load
info!(
"Running {} hashes in parallel on all threads...",
hashes_sample_size
);
pub fn compute_hash_time_ns(hashes_sample_size: u64) -> u64 {
info!("Running {} hashes...", hashes_sample_size);
let mut v = Hash::default();
let start = Instant::now();
for _ in 0..hashes_sample_size {
v = hash(&v.as_ref());
}
let elapsed = start.elapsed().as_millis() as u64;
start.elapsed().as_nanos() as u64
}
pub fn compute_hashes_per_tick(duration: Duration, hashes_sample_size: u64) -> u64 {
let elapsed = compute_hash_time_ns(hashes_sample_size) / (1000 * 1000);
duration.as_millis() as u64 * hashes_sample_size / elapsed
}

View File

@ -1104,6 +1104,11 @@ pub fn main() {
.default_value("10000")
.help("Milliseconds between printing contact debug from gossip."),
)
.arg(
Arg::with_name("no_poh_speed_test")
.long("no-poh-speed-test")
.help("Skip the check for PoH speed."),
)
.arg(
Arg::with_name("accounts_hash_interval_slots")
.long("accounts-hash-slots")
@ -1544,6 +1549,7 @@ pub fn main() {
"rpc_send_transaction_leader_forward_count",
u64
),
no_poh_speed_test: matches.is_present("no_poh_speed_test"),
..ValidatorConfig::default()
};