Improve poh recorder metrics (#22730)
* Improve poh recorder metrics * Add metric for poh service send record * feedback * clean up
This commit is contained in:
parent
3993cd765c
commit
115b488807
|
@ -21,10 +21,11 @@ use {
|
||||||
genesis_utils::{create_genesis_config, GenesisConfigInfo},
|
genesis_utils::{create_genesis_config, GenesisConfigInfo},
|
||||||
leader_schedule_cache::LeaderScheduleCache,
|
leader_schedule_cache::LeaderScheduleCache,
|
||||||
},
|
},
|
||||||
|
solana_measure::measure::Measure,
|
||||||
solana_runtime::bank::Bank,
|
solana_runtime::bank::Bank,
|
||||||
solana_sdk::{
|
solana_sdk::{
|
||||||
clock::NUM_CONSECUTIVE_LEADER_SLOTS, hash::Hash, poh_config::PohConfig, pubkey::Pubkey,
|
clock::NUM_CONSECUTIVE_LEADER_SLOTS, hash::Hash, poh_config::PohConfig, pubkey::Pubkey,
|
||||||
timing, transaction::VersionedTransaction,
|
transaction::VersionedTransaction,
|
||||||
},
|
},
|
||||||
std::{
|
std::{
|
||||||
cmp,
|
cmp,
|
||||||
|
@ -217,12 +218,11 @@ pub struct PohRecorder {
|
||||||
record_lock_contention_us: u64,
|
record_lock_contention_us: u64,
|
||||||
flush_cache_no_tick_us: u64,
|
flush_cache_no_tick_us: u64,
|
||||||
flush_cache_tick_us: u64,
|
flush_cache_tick_us: u64,
|
||||||
prepare_send_us: u64,
|
send_entry_us: u64,
|
||||||
send_us: u64,
|
|
||||||
tick_lock_contention_us: u64,
|
tick_lock_contention_us: u64,
|
||||||
tick_overhead_us: u64,
|
|
||||||
total_sleep_us: u64,
|
total_sleep_us: u64,
|
||||||
record_us: u64,
|
record_us: u64,
|
||||||
|
report_metrics_us: u64,
|
||||||
ticks_from_record: u64,
|
ticks_from_record: u64,
|
||||||
last_metric: Instant,
|
last_metric: Instant,
|
||||||
record_sender: Sender<Record>,
|
record_sender: Sender<Record>,
|
||||||
|
@ -539,20 +539,22 @@ impl PohRecorder {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn tick(&mut self) {
|
pub fn tick(&mut self) {
|
||||||
let now = Instant::now();
|
let ((poh_entry, target_time), tick_lock_contention_time) = Measure::this(
|
||||||
let (poh_entry, target_time) = {
|
|_| {
|
||||||
let mut poh_l = self.poh.lock().unwrap();
|
let mut poh_l = self.poh.lock().unwrap();
|
||||||
let poh_entry = poh_l.tick();
|
let poh_entry = poh_l.tick();
|
||||||
let target_time = if poh_entry.is_some() {
|
let target_time = if poh_entry.is_some() {
|
||||||
Some(poh_l.target_poh_time(self.target_ns_per_tick))
|
Some(poh_l.target_poh_time(self.target_ns_per_tick))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
(poh_entry, target_time)
|
||||||
|
},
|
||||||
|
(),
|
||||||
|
"TickLockContention",
|
||||||
|
);
|
||||||
|
self.tick_lock_contention_us += tick_lock_contention_time.as_us();
|
||||||
|
|
||||||
(poh_entry, target_time)
|
|
||||||
};
|
|
||||||
self.tick_lock_contention_us += timing::duration_as_us(&now.elapsed());
|
|
||||||
let now = Instant::now();
|
|
||||||
if let Some(poh_entry) = poh_entry {
|
if let Some(poh_entry) = poh_entry {
|
||||||
self.tick_height += 1;
|
self.tick_height += 1;
|
||||||
trace!("tick_height {}", self.tick_height);
|
trace!("tick_height {}", self.tick_height);
|
||||||
|
@ -561,30 +563,37 @@ impl PohRecorder {
|
||||||
.leader_first_tick_height_including_grace_ticks
|
.leader_first_tick_height_including_grace_ticks
|
||||||
.is_none()
|
.is_none()
|
||||||
{
|
{
|
||||||
self.tick_overhead_us += timing::duration_as_us(&now.elapsed());
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let entry = Entry {
|
self.tick_cache.push((
|
||||||
num_hashes: poh_entry.num_hashes,
|
Entry {
|
||||||
hash: poh_entry.hash,
|
num_hashes: poh_entry.num_hashes,
|
||||||
transactions: vec![],
|
hash: poh_entry.hash,
|
||||||
};
|
transactions: vec![],
|
||||||
self.tick_overhead_us += timing::duration_as_us(&now.elapsed());
|
},
|
||||||
|
self.tick_height,
|
||||||
|
));
|
||||||
|
|
||||||
let now = Instant::now();
|
let (_flush_res, flush_cache_and_tick_time) =
|
||||||
self.tick_cache.push((entry, self.tick_height));
|
Measure::this(|_| self.flush_cache(true), (), "FlushCacheAndTick");
|
||||||
let _ = self.flush_cache(true);
|
self.flush_cache_tick_us += flush_cache_and_tick_time.as_us();
|
||||||
self.flush_cache_tick_us += timing::duration_as_us(&now.elapsed());
|
|
||||||
let target_time = target_time.unwrap();
|
let sleep_time = Measure::this(
|
||||||
// sleep is not accurate enough to get a predictable time.
|
|_| {
|
||||||
// Kernel can not schedule the thread for a while.
|
let target_time = target_time.unwrap();
|
||||||
let started_waiting = Instant::now();
|
// sleep is not accurate enough to get a predictable time.
|
||||||
while Instant::now() < target_time {
|
// Kernel can not schedule the thread for a while.
|
||||||
// TODO: a caller could possibly desire to reset or record while we're spinning here
|
while Instant::now() < target_time {
|
||||||
std::hint::spin_loop();
|
// TODO: a caller could possibly desire to reset or record while we're spinning here
|
||||||
}
|
std::hint::spin_loop();
|
||||||
self.total_sleep_us += started_waiting.elapsed().as_nanos() as u64 / 1000;
|
}
|
||||||
|
},
|
||||||
|
(),
|
||||||
|
"PohSleep",
|
||||||
|
)
|
||||||
|
.1;
|
||||||
|
self.total_sleep_us += sleep_time.as_us();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -597,28 +606,26 @@ impl PohRecorder {
|
||||||
("record_us", self.record_us, i64),
|
("record_us", self.record_us, i64),
|
||||||
("flush_cache_no_tick_us", self.flush_cache_no_tick_us, i64),
|
("flush_cache_no_tick_us", self.flush_cache_no_tick_us, i64),
|
||||||
("flush_cache_tick_us", self.flush_cache_tick_us, i64),
|
("flush_cache_tick_us", self.flush_cache_tick_us, i64),
|
||||||
("prepare_send_us", self.prepare_send_us, i64),
|
("send_entry_us", self.send_entry_us, i64),
|
||||||
("send_us", self.send_us, i64),
|
|
||||||
("ticks_from_record", self.ticks_from_record, i64),
|
("ticks_from_record", self.ticks_from_record, i64),
|
||||||
("total_sleep_us", self.total_sleep_us, i64),
|
("total_sleep_us", self.total_sleep_us, i64),
|
||||||
("tick_overhead", self.tick_overhead_us, i64),
|
|
||||||
(
|
(
|
||||||
"record_lock_contention",
|
"record_lock_contention_us",
|
||||||
self.record_lock_contention_us,
|
self.record_lock_contention_us,
|
||||||
i64
|
i64
|
||||||
),
|
),
|
||||||
|
("report_metrics_us", self.report_metrics_us, i64),
|
||||||
);
|
);
|
||||||
|
|
||||||
self.tick_lock_contention_us = 0;
|
self.tick_lock_contention_us = 0;
|
||||||
self.record_us = 0;
|
self.record_us = 0;
|
||||||
self.tick_overhead_us = 0;
|
|
||||||
self.total_sleep_us = 0;
|
self.total_sleep_us = 0;
|
||||||
self.record_lock_contention_us = 0;
|
self.record_lock_contention_us = 0;
|
||||||
self.flush_cache_no_tick_us = 0;
|
self.flush_cache_no_tick_us = 0;
|
||||||
self.flush_cache_tick_us = 0;
|
self.flush_cache_tick_us = 0;
|
||||||
self.prepare_send_us = 0;
|
self.send_entry_us = 0;
|
||||||
self.send_us = 0;
|
|
||||||
self.ticks_from_record = 0;
|
self.ticks_from_record = 0;
|
||||||
|
self.report_metrics_us = 0;
|
||||||
self.last_metric = Instant::now();
|
self.last_metric = Instant::now();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -632,11 +639,16 @@ impl PohRecorder {
|
||||||
// Entries without transactions are used to track real-time passing in the ledger and
|
// Entries without transactions are used to track real-time passing in the ledger and
|
||||||
// cannot be generated by `record()`
|
// cannot be generated by `record()`
|
||||||
assert!(!transactions.is_empty(), "No transactions provided");
|
assert!(!transactions.is_empty(), "No transactions provided");
|
||||||
self.report_metrics(bank_slot);
|
|
||||||
|
let ((), report_metrics_time) =
|
||||||
|
Measure::this(|_| self.report_metrics(bank_slot), (), "ReportMetrics");
|
||||||
|
self.report_metrics_us += report_metrics_time.as_us();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let now = Instant::now();
|
let (flush_cache_res, flush_cache_time) =
|
||||||
self.flush_cache(false)?;
|
Measure::this(|_| self.flush_cache(false), (), "FlushCache");
|
||||||
self.flush_cache_no_tick_us += timing::duration_as_us(&now.elapsed());
|
self.flush_cache_no_tick_us += flush_cache_time.as_us();
|
||||||
|
flush_cache_res?;
|
||||||
|
|
||||||
let working_bank = self
|
let working_bank = self
|
||||||
.working_bank
|
.working_bank
|
||||||
|
@ -646,30 +658,34 @@ impl PohRecorder {
|
||||||
return Err(PohRecorderError::MaxHeightReached);
|
return Err(PohRecorderError::MaxHeightReached);
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
let (mut poh_lock, poh_lock_time) =
|
||||||
let now = Instant::now();
|
Measure::this(|_| self.poh.lock().unwrap(), (), "PohLock");
|
||||||
let mut poh_lock = self.poh.lock().unwrap();
|
self.record_lock_contention_us += poh_lock_time.as_us();
|
||||||
|
|
||||||
self.record_lock_contention_us += timing::duration_as_us(&now.elapsed());
|
let (record_mixin_res, record_mixin_time) =
|
||||||
let now = Instant::now();
|
Measure::this(|_| poh_lock.record(mixin), (), "RecordMixin");
|
||||||
let res = poh_lock.record(mixin);
|
self.record_us += record_mixin_time.as_us();
|
||||||
drop(poh_lock);
|
|
||||||
self.record_us += timing::duration_as_us(&now.elapsed());
|
drop(poh_lock);
|
||||||
let now = Instant::now();
|
|
||||||
if let Some(poh_entry) = res {
|
if let Some(poh_entry) = record_mixin_res {
|
||||||
let entry = Entry {
|
let (send_entry_res, send_entry_time) = Measure::this(
|
||||||
num_hashes: poh_entry.num_hashes,
|
|_| {
|
||||||
hash: poh_entry.hash,
|
let entry = Entry {
|
||||||
transactions,
|
num_hashes: poh_entry.num_hashes,
|
||||||
};
|
hash: poh_entry.hash,
|
||||||
let bank_clone = working_bank.bank.clone();
|
transactions,
|
||||||
self.prepare_send_us += timing::duration_as_us(&now.elapsed());
|
};
|
||||||
let now = Instant::now();
|
let bank_clone = working_bank.bank.clone();
|
||||||
self.sender.send((bank_clone, (entry, self.tick_height)))?;
|
self.sender.send((bank_clone, (entry, self.tick_height)))
|
||||||
self.send_us += timing::duration_as_us(&now.elapsed());
|
},
|
||||||
return Ok(());
|
(),
|
||||||
}
|
"SendPohEntry",
|
||||||
|
);
|
||||||
|
self.send_entry_us += send_entry_time.as_us();
|
||||||
|
return Ok(send_entry_res?);
|
||||||
}
|
}
|
||||||
|
|
||||||
// record() might fail if the next PoH hash needs to be a tick. But that's ok, tick()
|
// record() might fail if the next PoH hash needs to be a tick. But that's ok, tick()
|
||||||
// and re-record()
|
// and re-record()
|
||||||
self.ticks_from_record += 1;
|
self.ticks_from_record += 1;
|
||||||
|
@ -729,11 +745,10 @@ impl PohRecorder {
|
||||||
record_lock_contention_us: 0,
|
record_lock_contention_us: 0,
|
||||||
flush_cache_tick_us: 0,
|
flush_cache_tick_us: 0,
|
||||||
flush_cache_no_tick_us: 0,
|
flush_cache_no_tick_us: 0,
|
||||||
prepare_send_us: 0,
|
send_entry_us: 0,
|
||||||
send_us: 0,
|
|
||||||
tick_lock_contention_us: 0,
|
tick_lock_contention_us: 0,
|
||||||
record_us: 0,
|
record_us: 0,
|
||||||
tick_overhead_us: 0,
|
report_metrics_us: 0,
|
||||||
total_sleep_us: 0,
|
total_sleep_us: 0,
|
||||||
ticks_from_record: 0,
|
ticks_from_record: 0,
|
||||||
last_metric: Instant::now(),
|
last_metric: Instant::now(),
|
||||||
|
|
|
@ -43,6 +43,7 @@ struct PohTiming {
|
||||||
total_tick_time_ns: u64,
|
total_tick_time_ns: u64,
|
||||||
last_metric: Instant,
|
last_metric: Instant,
|
||||||
total_record_time_us: u64,
|
total_record_time_us: u64,
|
||||||
|
total_send_record_result_us: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PohTiming {
|
impl PohTiming {
|
||||||
|
@ -56,6 +57,7 @@ impl PohTiming {
|
||||||
total_tick_time_ns: 0,
|
total_tick_time_ns: 0,
|
||||||
last_metric: Instant::now(),
|
last_metric: Instant::now(),
|
||||||
total_record_time_us: 0,
|
total_record_time_us: 0,
|
||||||
|
total_send_record_result_us: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn report(&mut self, ticks_per_slot: u64) {
|
fn report(&mut self, ticks_per_slot: u64) {
|
||||||
|
@ -72,6 +74,11 @@ impl PohTiming {
|
||||||
("total_lock_time_us", self.total_lock_time_ns / 1000, i64),
|
("total_lock_time_us", self.total_lock_time_ns / 1000, i64),
|
||||||
("total_hash_time_us", self.total_hash_time_ns / 1000, i64),
|
("total_hash_time_us", self.total_hash_time_ns / 1000, i64),
|
||||||
("total_record_time_us", self.total_record_time_us, i64),
|
("total_record_time_us", self.total_record_time_us, i64),
|
||||||
|
(
|
||||||
|
"total_send_record_result_us",
|
||||||
|
self.total_send_record_result_us,
|
||||||
|
i64
|
||||||
|
),
|
||||||
);
|
);
|
||||||
self.total_sleep_us = 0;
|
self.total_sleep_us = 0;
|
||||||
self.num_ticks = 0;
|
self.num_ticks = 0;
|
||||||
|
@ -81,6 +88,7 @@ impl PohTiming {
|
||||||
self.total_hash_time_ns = 0;
|
self.total_hash_time_ns = 0;
|
||||||
self.last_metric = Instant::now();
|
self.last_metric = Instant::now();
|
||||||
self.total_record_time_us = 0;
|
self.total_record_time_us = 0;
|
||||||
|
self.total_send_record_result_us = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -254,7 +262,10 @@ impl PohService {
|
||||||
record.mixin,
|
record.mixin,
|
||||||
std::mem::take(&mut record.transactions),
|
std::mem::take(&mut record.transactions),
|
||||||
);
|
);
|
||||||
let _ = record.sender.send(res); // what do we do on failure here? Ignore for now.
|
// what do we do on failure here? Ignore for now.
|
||||||
|
let (_send_res, send_record_result_time) =
|
||||||
|
Measure::this(|_| record.sender.send(res), (), "SendRecordResult");
|
||||||
|
timing.total_send_record_result_us += send_record_result_time.as_us();
|
||||||
timing.num_hashes += 1; // note: may have also ticked inside record
|
timing.num_hashes += 1; // note: may have also ticked inside record
|
||||||
|
|
||||||
let new_record_result = record_receiver.try_recv();
|
let new_record_result = record_receiver.try_recv();
|
||||||
|
|
Loading…
Reference in New Issue