Improve poh recorder metrics (#22730)

* Improve poh recorder metrics

* Add metric for poh service send record

* feedback

* clean up
This commit is contained in:
Justin Starry 2022-01-27 08:44:41 +08:00 committed by GitHub
parent 3993cd765c
commit 115b488807
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 100 additions and 74 deletions

View File

@ -21,10 +21,11 @@ use {
genesis_utils::{create_genesis_config, GenesisConfigInfo}, genesis_utils::{create_genesis_config, GenesisConfigInfo},
leader_schedule_cache::LeaderScheduleCache, leader_schedule_cache::LeaderScheduleCache,
}, },
solana_measure::measure::Measure,
solana_runtime::bank::Bank, solana_runtime::bank::Bank,
solana_sdk::{ solana_sdk::{
clock::NUM_CONSECUTIVE_LEADER_SLOTS, hash::Hash, poh_config::PohConfig, pubkey::Pubkey, clock::NUM_CONSECUTIVE_LEADER_SLOTS, hash::Hash, poh_config::PohConfig, pubkey::Pubkey,
timing, transaction::VersionedTransaction, transaction::VersionedTransaction,
}, },
std::{ std::{
cmp, cmp,
@ -217,12 +218,11 @@ pub struct PohRecorder {
record_lock_contention_us: u64, record_lock_contention_us: u64,
flush_cache_no_tick_us: u64, flush_cache_no_tick_us: u64,
flush_cache_tick_us: u64, flush_cache_tick_us: u64,
prepare_send_us: u64, send_entry_us: u64,
send_us: u64,
tick_lock_contention_us: u64, tick_lock_contention_us: u64,
tick_overhead_us: u64,
total_sleep_us: u64, total_sleep_us: u64,
record_us: u64, record_us: u64,
report_metrics_us: u64,
ticks_from_record: u64, ticks_from_record: u64,
last_metric: Instant, last_metric: Instant,
record_sender: Sender<Record>, record_sender: Sender<Record>,
@ -539,8 +539,8 @@ impl PohRecorder {
} }
pub fn tick(&mut self) { pub fn tick(&mut self) {
let now = Instant::now(); let ((poh_entry, target_time), tick_lock_contention_time) = Measure::this(
let (poh_entry, target_time) = { |_| {
let mut poh_l = self.poh.lock().unwrap(); let mut poh_l = self.poh.lock().unwrap();
let poh_entry = poh_l.tick(); let poh_entry = poh_l.tick();
let target_time = if poh_entry.is_some() { let target_time = if poh_entry.is_some() {
@ -548,11 +548,13 @@ impl PohRecorder {
} else { } else {
None None
}; };
(poh_entry, target_time) (poh_entry, target_time)
}; },
self.tick_lock_contention_us += timing::duration_as_us(&now.elapsed()); (),
let now = Instant::now(); "TickLockContention",
);
self.tick_lock_contention_us += tick_lock_contention_time.as_us();
if let Some(poh_entry) = poh_entry { if let Some(poh_entry) = poh_entry {
self.tick_height += 1; self.tick_height += 1;
trace!("tick_height {}", self.tick_height); trace!("tick_height {}", self.tick_height);
@ -561,30 +563,37 @@ impl PohRecorder {
.leader_first_tick_height_including_grace_ticks .leader_first_tick_height_including_grace_ticks
.is_none() .is_none()
{ {
self.tick_overhead_us += timing::duration_as_us(&now.elapsed());
return; return;
} }
let entry = Entry { self.tick_cache.push((
Entry {
num_hashes: poh_entry.num_hashes, num_hashes: poh_entry.num_hashes,
hash: poh_entry.hash, hash: poh_entry.hash,
transactions: vec![], transactions: vec![],
}; },
self.tick_overhead_us += timing::duration_as_us(&now.elapsed()); self.tick_height,
));
let now = Instant::now(); let (_flush_res, flush_cache_and_tick_time) =
self.tick_cache.push((entry, self.tick_height)); Measure::this(|_| self.flush_cache(true), (), "FlushCacheAndTick");
let _ = self.flush_cache(true); self.flush_cache_tick_us += flush_cache_and_tick_time.as_us();
self.flush_cache_tick_us += timing::duration_as_us(&now.elapsed());
let sleep_time = Measure::this(
|_| {
let target_time = target_time.unwrap(); let target_time = target_time.unwrap();
// sleep is not accurate enough to get a predictable time. // sleep is not accurate enough to get a predictable time.
// Kernel can not schedule the thread for a while. // Kernel can not schedule the thread for a while.
let started_waiting = Instant::now();
while Instant::now() < target_time { while Instant::now() < target_time {
// TODO: a caller could possibly desire to reset or record while we're spinning here // TODO: a caller could possibly desire to reset or record while we're spinning here
std::hint::spin_loop(); std::hint::spin_loop();
} }
self.total_sleep_us += started_waiting.elapsed().as_nanos() as u64 / 1000; },
(),
"PohSleep",
)
.1;
self.total_sleep_us += sleep_time.as_us();
} }
} }
@ -597,28 +606,26 @@ impl PohRecorder {
("record_us", self.record_us, i64), ("record_us", self.record_us, i64),
("flush_cache_no_tick_us", self.flush_cache_no_tick_us, i64), ("flush_cache_no_tick_us", self.flush_cache_no_tick_us, i64),
("flush_cache_tick_us", self.flush_cache_tick_us, i64), ("flush_cache_tick_us", self.flush_cache_tick_us, i64),
("prepare_send_us", self.prepare_send_us, i64), ("send_entry_us", self.send_entry_us, i64),
("send_us", self.send_us, i64),
("ticks_from_record", self.ticks_from_record, i64), ("ticks_from_record", self.ticks_from_record, i64),
("total_sleep_us", self.total_sleep_us, i64), ("total_sleep_us", self.total_sleep_us, i64),
("tick_overhead", self.tick_overhead_us, i64),
( (
"record_lock_contention", "record_lock_contention_us",
self.record_lock_contention_us, self.record_lock_contention_us,
i64 i64
), ),
("report_metrics_us", self.report_metrics_us, i64),
); );
self.tick_lock_contention_us = 0; self.tick_lock_contention_us = 0;
self.record_us = 0; self.record_us = 0;
self.tick_overhead_us = 0;
self.total_sleep_us = 0; self.total_sleep_us = 0;
self.record_lock_contention_us = 0; self.record_lock_contention_us = 0;
self.flush_cache_no_tick_us = 0; self.flush_cache_no_tick_us = 0;
self.flush_cache_tick_us = 0; self.flush_cache_tick_us = 0;
self.prepare_send_us = 0; self.send_entry_us = 0;
self.send_us = 0;
self.ticks_from_record = 0; self.ticks_from_record = 0;
self.report_metrics_us = 0;
self.last_metric = Instant::now(); self.last_metric = Instant::now();
} }
} }
@ -632,11 +639,16 @@ impl PohRecorder {
// Entries without transactions are used to track real-time passing in the ledger and // Entries without transactions are used to track real-time passing in the ledger and
// cannot be generated by `record()` // cannot be generated by `record()`
assert!(!transactions.is_empty(), "No transactions provided"); assert!(!transactions.is_empty(), "No transactions provided");
self.report_metrics(bank_slot);
let ((), report_metrics_time) =
Measure::this(|_| self.report_metrics(bank_slot), (), "ReportMetrics");
self.report_metrics_us += report_metrics_time.as_us();
loop { loop {
let now = Instant::now(); let (flush_cache_res, flush_cache_time) =
self.flush_cache(false)?; Measure::this(|_| self.flush_cache(false), (), "FlushCache");
self.flush_cache_no_tick_us += timing::duration_as_us(&now.elapsed()); self.flush_cache_no_tick_us += flush_cache_time.as_us();
flush_cache_res?;
let working_bank = self let working_bank = self
.working_bank .working_bank
@ -646,30 +658,34 @@ impl PohRecorder {
return Err(PohRecorderError::MaxHeightReached); return Err(PohRecorderError::MaxHeightReached);
} }
{ let (mut poh_lock, poh_lock_time) =
let now = Instant::now(); Measure::this(|_| self.poh.lock().unwrap(), (), "PohLock");
let mut poh_lock = self.poh.lock().unwrap(); self.record_lock_contention_us += poh_lock_time.as_us();
let (record_mixin_res, record_mixin_time) =
Measure::this(|_| poh_lock.record(mixin), (), "RecordMixin");
self.record_us += record_mixin_time.as_us();
self.record_lock_contention_us += timing::duration_as_us(&now.elapsed());
let now = Instant::now();
let res = poh_lock.record(mixin);
drop(poh_lock); drop(poh_lock);
self.record_us += timing::duration_as_us(&now.elapsed());
let now = Instant::now(); if let Some(poh_entry) = record_mixin_res {
if let Some(poh_entry) = res { let (send_entry_res, send_entry_time) = Measure::this(
|_| {
let entry = Entry { let entry = Entry {
num_hashes: poh_entry.num_hashes, num_hashes: poh_entry.num_hashes,
hash: poh_entry.hash, hash: poh_entry.hash,
transactions, transactions,
}; };
let bank_clone = working_bank.bank.clone(); let bank_clone = working_bank.bank.clone();
self.prepare_send_us += timing::duration_as_us(&now.elapsed()); self.sender.send((bank_clone, (entry, self.tick_height)))
let now = Instant::now(); },
self.sender.send((bank_clone, (entry, self.tick_height)))?; (),
self.send_us += timing::duration_as_us(&now.elapsed()); "SendPohEntry",
return Ok(()); );
} self.send_entry_us += send_entry_time.as_us();
return Ok(send_entry_res?);
} }
// record() might fail if the next PoH hash needs to be a tick. But that's ok, tick() // record() might fail if the next PoH hash needs to be a tick. But that's ok, tick()
// and re-record() // and re-record()
self.ticks_from_record += 1; self.ticks_from_record += 1;
@ -729,11 +745,10 @@ impl PohRecorder {
record_lock_contention_us: 0, record_lock_contention_us: 0,
flush_cache_tick_us: 0, flush_cache_tick_us: 0,
flush_cache_no_tick_us: 0, flush_cache_no_tick_us: 0,
prepare_send_us: 0, send_entry_us: 0,
send_us: 0,
tick_lock_contention_us: 0, tick_lock_contention_us: 0,
record_us: 0, record_us: 0,
tick_overhead_us: 0, report_metrics_us: 0,
total_sleep_us: 0, total_sleep_us: 0,
ticks_from_record: 0, ticks_from_record: 0,
last_metric: Instant::now(), last_metric: Instant::now(),

View File

@ -43,6 +43,7 @@ struct PohTiming {
total_tick_time_ns: u64, total_tick_time_ns: u64,
last_metric: Instant, last_metric: Instant,
total_record_time_us: u64, total_record_time_us: u64,
total_send_record_result_us: u64,
} }
impl PohTiming { impl PohTiming {
@ -56,6 +57,7 @@ impl PohTiming {
total_tick_time_ns: 0, total_tick_time_ns: 0,
last_metric: Instant::now(), last_metric: Instant::now(),
total_record_time_us: 0, total_record_time_us: 0,
total_send_record_result_us: 0,
} }
} }
fn report(&mut self, ticks_per_slot: u64) { fn report(&mut self, ticks_per_slot: u64) {
@ -72,6 +74,11 @@ impl PohTiming {
("total_lock_time_us", self.total_lock_time_ns / 1000, i64), ("total_lock_time_us", self.total_lock_time_ns / 1000, i64),
("total_hash_time_us", self.total_hash_time_ns / 1000, i64), ("total_hash_time_us", self.total_hash_time_ns / 1000, i64),
("total_record_time_us", self.total_record_time_us, i64), ("total_record_time_us", self.total_record_time_us, i64),
(
"total_send_record_result_us",
self.total_send_record_result_us,
i64
),
); );
self.total_sleep_us = 0; self.total_sleep_us = 0;
self.num_ticks = 0; self.num_ticks = 0;
@ -81,6 +88,7 @@ impl PohTiming {
self.total_hash_time_ns = 0; self.total_hash_time_ns = 0;
self.last_metric = Instant::now(); self.last_metric = Instant::now();
self.total_record_time_us = 0; self.total_record_time_us = 0;
self.total_send_record_result_us = 0;
} }
} }
} }
@ -254,7 +262,10 @@ impl PohService {
record.mixin, record.mixin,
std::mem::take(&mut record.transactions), std::mem::take(&mut record.transactions),
); );
let _ = record.sender.send(res); // what do we do on failure here? Ignore for now. // what do we do on failure here? Ignore for now.
let (_send_res, send_record_result_time) =
Measure::this(|_| record.sender.send(res), (), "SendRecordResult");
timing.total_send_record_result_us += send_record_result_time.as_us();
timing.num_hashes += 1; // note: may have also ticked inside record timing.num_hashes += 1; // note: may have also ticked inside record
let new_record_result = record_receiver.try_recv(); let new_record_result = record_receiver.try_recv();