Add poh speed check and tick speed calibration (#14292)
This commit is contained in:
parent
444ed768dc
commit
2074e407cd
|
@ -1088,7 +1088,12 @@ pub fn create_test_recorder(
|
||||||
poh_recorder.set_bank(&bank);
|
poh_recorder.set_bank(&bank);
|
||||||
|
|
||||||
let poh_recorder = Arc::new(Mutex::new(poh_recorder));
|
let poh_recorder = Arc::new(Mutex::new(poh_recorder));
|
||||||
let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
|
let poh_service = PohService::new(
|
||||||
|
poh_recorder.clone(),
|
||||||
|
&poh_config,
|
||||||
|
&exit,
|
||||||
|
bank.ticks_per_slot(),
|
||||||
|
);
|
||||||
|
|
||||||
(exit, poh_recorder, poh_service, entry_receiver)
|
(exit, poh_recorder, poh_service, entry_receiver)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
//! The `poh_service` module implements a service that records the passing of
|
//! The `poh_service` module implements a service that records the passing of
|
||||||
//! "ticks", a measure of time in the PoH stream
|
//! "ticks", a measure of time in the PoH stream
|
||||||
use crate::poh_recorder::PohRecorder;
|
use crate::poh_recorder::PohRecorder;
|
||||||
use solana_sdk::clock::DEFAULT_TICKS_PER_SLOT;
|
|
||||||
use solana_sdk::poh_config::PohConfig;
|
use solana_sdk::poh_config::PohConfig;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
@ -25,6 +24,7 @@ impl PohService {
|
||||||
poh_recorder: Arc<Mutex<PohRecorder>>,
|
poh_recorder: Arc<Mutex<PohRecorder>>,
|
||||||
poh_config: &Arc<PohConfig>,
|
poh_config: &Arc<PohConfig>,
|
||||||
poh_exit: &Arc<AtomicBool>,
|
poh_exit: &Arc<AtomicBool>,
|
||||||
|
ticks_per_slot: u64,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let poh_exit_ = poh_exit.clone();
|
let poh_exit_ = poh_exit.clone();
|
||||||
let poh_config = poh_config.clone();
|
let poh_config = poh_config.clone();
|
||||||
|
@ -49,7 +49,12 @@ impl PohService {
|
||||||
if let Some(cores) = core_affinity::get_core_ids() {
|
if let Some(cores) = core_affinity::get_core_ids() {
|
||||||
core_affinity::set_for_current(cores[0]);
|
core_affinity::set_for_current(cores[0]);
|
||||||
}
|
}
|
||||||
Self::tick_producer(poh_recorder, &poh_exit_);
|
Self::tick_producer(
|
||||||
|
poh_recorder,
|
||||||
|
&poh_exit_,
|
||||||
|
poh_config.target_tick_duration.as_nanos() as u64,
|
||||||
|
ticks_per_slot,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
poh_exit_.store(true, Ordering::Relaxed);
|
poh_exit_.store(true, Ordering::Relaxed);
|
||||||
})
|
})
|
||||||
|
@ -85,27 +90,48 @@ impl PohService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tick_producer(poh_recorder: Arc<Mutex<PohRecorder>>, poh_exit: &AtomicBool) {
|
fn tick_producer(
|
||||||
|
poh_recorder: Arc<Mutex<PohRecorder>>,
|
||||||
|
poh_exit: &AtomicBool,
|
||||||
|
target_tick_ns: u64,
|
||||||
|
ticks_per_slot: u64,
|
||||||
|
) {
|
||||||
|
info!("starting with target ns: {}", target_tick_ns);
|
||||||
let poh = poh_recorder.lock().unwrap().poh.clone();
|
let poh = poh_recorder.lock().unwrap().poh.clone();
|
||||||
let mut now = Instant::now();
|
let mut now = Instant::now();
|
||||||
|
let mut last_metric = Instant::now();
|
||||||
let mut num_ticks = 0;
|
let mut num_ticks = 0;
|
||||||
let mut num_hashes = 0;
|
let mut num_hashes = 0;
|
||||||
|
let mut total_sleep_us = 0;
|
||||||
loop {
|
loop {
|
||||||
num_hashes += NUM_HASHES_PER_BATCH;
|
num_hashes += NUM_HASHES_PER_BATCH;
|
||||||
if poh.lock().unwrap().hash(NUM_HASHES_PER_BATCH) {
|
if poh.lock().unwrap().hash(NUM_HASHES_PER_BATCH) {
|
||||||
// Lock PohRecorder only for the final hash...
|
// Lock PohRecorder only for the final hash...
|
||||||
poh_recorder.lock().unwrap().tick();
|
poh_recorder.lock().unwrap().tick();
|
||||||
num_ticks += 1;
|
num_ticks += 1;
|
||||||
if num_ticks >= DEFAULT_TICKS_PER_SLOT * 2 {
|
let elapsed_ns = now.elapsed().as_nanos() as u64;
|
||||||
|
// sleep is not accurate enough to get a predictable time.
|
||||||
|
// Kernel can not schedule the thread for a while.
|
||||||
|
while (now.elapsed().as_nanos() as u64) < target_tick_ns {
|
||||||
|
std::sync::atomic::spin_loop_hint();
|
||||||
|
}
|
||||||
|
total_sleep_us += (now.elapsed().as_nanos() as u64 - elapsed_ns) / 1000;
|
||||||
|
now = Instant::now();
|
||||||
|
|
||||||
|
if last_metric.elapsed().as_millis() > 1000 {
|
||||||
|
let elapsed_ms = last_metric.elapsed().as_millis() as u64;
|
||||||
|
let ms_per_slot = (elapsed_ms * ticks_per_slot) / num_ticks;
|
||||||
datapoint_info!(
|
datapoint_info!(
|
||||||
"poh-service",
|
"poh-service",
|
||||||
("ticks", num_ticks as i64, i64),
|
("ticks", num_ticks as i64, i64),
|
||||||
("hashes", num_hashes as i64, i64),
|
("hashes", num_hashes as i64, i64),
|
||||||
("elapsed_ms", now.elapsed().as_millis() as i64, i64),
|
("elapsed_ms", ms_per_slot, i64),
|
||||||
|
("total_sleep_ms", total_sleep_us / 1000, i64),
|
||||||
);
|
);
|
||||||
|
total_sleep_us = 0;
|
||||||
num_ticks = 0;
|
num_ticks = 0;
|
||||||
num_hashes = 0;
|
num_hashes = 0;
|
||||||
now = Instant::now();
|
last_metric = Instant::now();
|
||||||
}
|
}
|
||||||
if poh_exit.load(Ordering::Relaxed) {
|
if poh_exit.load(Ordering::Relaxed) {
|
||||||
break;
|
break;
|
||||||
|
@ -189,7 +215,7 @@ mod tests {
|
||||||
.unwrap()
|
.unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
|
let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit, 0);
|
||||||
poh_recorder.lock().unwrap().set_working_bank(working_bank);
|
poh_recorder.lock().unwrap().set_working_bank(working_bank);
|
||||||
|
|
||||||
// get some events
|
// get some events
|
||||||
|
|
|
@ -40,6 +40,7 @@ use solana_ledger::{
|
||||||
blockstore_processor::{self, TransactionStatusSender},
|
blockstore_processor::{self, TransactionStatusSender},
|
||||||
leader_schedule::FixedSchedule,
|
leader_schedule::FixedSchedule,
|
||||||
leader_schedule_cache::LeaderScheduleCache,
|
leader_schedule_cache::LeaderScheduleCache,
|
||||||
|
poh::compute_hash_time_ns,
|
||||||
};
|
};
|
||||||
use solana_measure::measure::Measure;
|
use solana_measure::measure::Measure;
|
||||||
use solana_metrics::datapoint_info;
|
use solana_metrics::datapoint_info;
|
||||||
|
@ -114,6 +115,7 @@ pub struct ValidatorConfig {
|
||||||
pub bpf_jit: bool,
|
pub bpf_jit: bool,
|
||||||
pub send_transaction_retry_ms: u64,
|
pub send_transaction_retry_ms: u64,
|
||||||
pub send_transaction_leader_forward_count: u64,
|
pub send_transaction_leader_forward_count: u64,
|
||||||
|
pub no_poh_speed_test: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for ValidatorConfig {
|
impl Default for ValidatorConfig {
|
||||||
|
@ -156,6 +158,7 @@ impl Default for ValidatorConfig {
|
||||||
bpf_jit: false,
|
bpf_jit: false,
|
||||||
send_transaction_retry_ms: 2000,
|
send_transaction_retry_ms: 2000,
|
||||||
send_transaction_leader_forward_count: 2,
|
send_transaction_leader_forward_count: 2,
|
||||||
|
no_poh_speed_test: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -531,11 +534,20 @@ impl Validator {
|
||||||
(None, None)
|
(None, None)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if !config.no_poh_speed_test {
|
||||||
|
check_poh_speed(&genesis_config, None);
|
||||||
|
}
|
||||||
|
|
||||||
if wait_for_supermajority(config, &bank, &cluster_info, rpc_override_health_check) {
|
if wait_for_supermajority(config, &bank, &cluster_info, rpc_override_health_check) {
|
||||||
abort();
|
abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
let poh_service = PohService::new(poh_recorder.clone(), &poh_config, &exit);
|
let poh_service = PohService::new(
|
||||||
|
poh_recorder.clone(),
|
||||||
|
&poh_config,
|
||||||
|
&exit,
|
||||||
|
bank.ticks_per_slot(),
|
||||||
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
blockstore.new_shreds_signals.len(),
|
blockstore.new_shreds_signals.len(),
|
||||||
1,
|
1,
|
||||||
|
@ -754,6 +766,35 @@ fn active_vote_account_exists_in_bank(bank: &Arc<Bank>, vote_account: &Pubkey) -
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn check_poh_speed(genesis_config: &GenesisConfig, maybe_hash_samples: Option<u64>) {
|
||||||
|
if let Some(hashes_per_tick) = genesis_config.hashes_per_tick() {
|
||||||
|
let ticks_per_slot = genesis_config.ticks_per_slot();
|
||||||
|
let hashes_per_slot = hashes_per_tick * ticks_per_slot;
|
||||||
|
|
||||||
|
let hash_samples = maybe_hash_samples.unwrap_or(hashes_per_slot);
|
||||||
|
let hash_time_ns = compute_hash_time_ns(hash_samples);
|
||||||
|
|
||||||
|
let my_ns_per_slot = (hash_time_ns * hashes_per_slot) / hash_samples;
|
||||||
|
debug!("computed: ns_per_slot: {}", my_ns_per_slot);
|
||||||
|
let target_ns_per_slot = genesis_config.ns_per_slot() as u64;
|
||||||
|
debug!(
|
||||||
|
"cluster ns_per_hash: {}ns ns_per_slot: {}",
|
||||||
|
target_ns_per_slot / hashes_per_slot,
|
||||||
|
target_ns_per_slot
|
||||||
|
);
|
||||||
|
if my_ns_per_slot < target_ns_per_slot {
|
||||||
|
let extra_ns = target_ns_per_slot - my_ns_per_slot;
|
||||||
|
info!("PoH speed check: Will sleep {}ns per slot.", extra_ns);
|
||||||
|
} else {
|
||||||
|
error!(
|
||||||
|
"PoH is slower than cluster target tick rate! mine: {} cluster: {}. If you wish to continue, try --ignore-poh-speed",
|
||||||
|
my_ns_per_slot, target_ns_per_slot,
|
||||||
|
);
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn post_process_restored_tower(
|
fn post_process_restored_tower(
|
||||||
restored_tower: crate::consensus::Result<Tower>,
|
restored_tower: crate::consensus::Result<Tower>,
|
||||||
validator_identity: &Pubkey,
|
validator_identity: &Pubkey,
|
||||||
|
@ -1266,6 +1307,8 @@ pub fn is_snapshot_config_invalid(
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use solana_ledger::{create_new_tmp_ledger, genesis_utils::create_genesis_config_with_leader};
|
use solana_ledger::{create_new_tmp_ledger, genesis_utils::create_genesis_config_with_leader};
|
||||||
|
use solana_sdk::genesis_config::create_genesis_config;
|
||||||
|
use solana_sdk::poh_config::PohConfig;
|
||||||
use std::fs::remove_dir_all;
|
use std::fs::remove_dir_all;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -1384,7 +1427,6 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_wait_for_supermajority() {
|
fn test_wait_for_supermajority() {
|
||||||
solana_logger::setup();
|
solana_logger::setup();
|
||||||
use solana_sdk::genesis_config::create_genesis_config;
|
|
||||||
use solana_sdk::hash::hash;
|
use solana_sdk::hash::hash;
|
||||||
let node_keypair = Arc::new(Keypair::new());
|
let node_keypair = Arc::new(Keypair::new());
|
||||||
let cluster_info = ClusterInfo::new(
|
let cluster_info = ClusterInfo::new(
|
||||||
|
@ -1441,4 +1483,38 @@ mod tests {
|
||||||
assert!(!is_snapshot_config_invalid(500, 100));
|
assert!(!is_snapshot_config_invalid(500, 100));
|
||||||
assert!(!is_snapshot_config_invalid(5, 5));
|
assert!(!is_snapshot_config_invalid(5, 5));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn test_poh_speed() {
|
||||||
|
solana_logger::setup();
|
||||||
|
let poh_config = PohConfig {
|
||||||
|
target_tick_duration: Duration::from_millis(solana_sdk::clock::MS_PER_TICK),
|
||||||
|
// make PoH rate really fast to cause the panic condition
|
||||||
|
hashes_per_tick: Some(
|
||||||
|
100 * solana_sdk::clock::DEFAULT_HASHES_PER_SECOND
|
||||||
|
/ solana_sdk::clock::DEFAULT_TICKS_PER_SECOND,
|
||||||
|
),
|
||||||
|
..PohConfig::default()
|
||||||
|
};
|
||||||
|
let genesis_config = GenesisConfig {
|
||||||
|
poh_config,
|
||||||
|
..GenesisConfig::default()
|
||||||
|
};
|
||||||
|
check_poh_speed(&genesis_config, Some(10_000));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_poh_speed_no_hashes_per_tick() {
|
||||||
|
let poh_config = PohConfig {
|
||||||
|
target_tick_duration: Duration::from_millis(solana_sdk::clock::MS_PER_TICK),
|
||||||
|
hashes_per_tick: None,
|
||||||
|
..PohConfig::default()
|
||||||
|
};
|
||||||
|
let genesis_config = GenesisConfig {
|
||||||
|
poh_config,
|
||||||
|
..GenesisConfig::default()
|
||||||
|
};
|
||||||
|
check_poh_speed(&genesis_config, Some(10_000));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,18 +82,18 @@ impl Poh {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn compute_hashes_per_tick(duration: Duration, hashes_sample_size: u64) -> u64 {
|
pub fn compute_hash_time_ns(hashes_sample_size: u64) -> u64 {
|
||||||
// calculate hash rate with the system under maximum load
|
info!("Running {} hashes...", hashes_sample_size);
|
||||||
info!(
|
|
||||||
"Running {} hashes in parallel on all threads...",
|
|
||||||
hashes_sample_size
|
|
||||||
);
|
|
||||||
let mut v = Hash::default();
|
let mut v = Hash::default();
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
for _ in 0..hashes_sample_size {
|
for _ in 0..hashes_sample_size {
|
||||||
v = hash(&v.as_ref());
|
v = hash(&v.as_ref());
|
||||||
}
|
}
|
||||||
let elapsed = start.elapsed().as_millis() as u64;
|
start.elapsed().as_nanos() as u64
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn compute_hashes_per_tick(duration: Duration, hashes_sample_size: u64) -> u64 {
|
||||||
|
let elapsed = compute_hash_time_ns(hashes_sample_size) / (1000 * 1000);
|
||||||
duration.as_millis() as u64 * hashes_sample_size / elapsed
|
duration.as_millis() as u64 * hashes_sample_size / elapsed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1104,6 +1104,11 @@ pub fn main() {
|
||||||
.default_value("10000")
|
.default_value("10000")
|
||||||
.help("Milliseconds between printing contact debug from gossip."),
|
.help("Milliseconds between printing contact debug from gossip."),
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("no_poh_speed_test")
|
||||||
|
.long("no-poh-speed-test")
|
||||||
|
.help("Skip the check for PoH speed."),
|
||||||
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::with_name("accounts_hash_interval_slots")
|
Arg::with_name("accounts_hash_interval_slots")
|
||||||
.long("accounts-hash-slots")
|
.long("accounts-hash-slots")
|
||||||
|
@ -1544,6 +1549,7 @@ pub fn main() {
|
||||||
"rpc_send_transaction_leader_forward_count",
|
"rpc_send_transaction_leader_forward_count",
|
||||||
u64
|
u64
|
||||||
),
|
),
|
||||||
|
no_poh_speed_test: matches.is_present("no_poh_speed_test"),
|
||||||
..ValidatorConfig::default()
|
..ValidatorConfig::default()
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue