Improve poh recorder metrics (#22730)

* Improve poh recorder metrics

* Add metric for poh service send record

* feedback

* clean up
This commit is contained in:
Justin Starry 2022-01-27 08:44:41 +08:00 committed by GitHub
parent 3993cd765c
commit 115b488807
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 100 additions and 74 deletions

View File

@ -21,10 +21,11 @@ use {
genesis_utils::{create_genesis_config, GenesisConfigInfo},
leader_schedule_cache::LeaderScheduleCache,
},
solana_measure::measure::Measure,
solana_runtime::bank::Bank,
solana_sdk::{
clock::NUM_CONSECUTIVE_LEADER_SLOTS, hash::Hash, poh_config::PohConfig, pubkey::Pubkey,
timing, transaction::VersionedTransaction,
transaction::VersionedTransaction,
},
std::{
cmp,
@ -217,12 +218,11 @@ pub struct PohRecorder {
record_lock_contention_us: u64,
flush_cache_no_tick_us: u64,
flush_cache_tick_us: u64,
prepare_send_us: u64,
send_us: u64,
send_entry_us: u64,
tick_lock_contention_us: u64,
tick_overhead_us: u64,
total_sleep_us: u64,
record_us: u64,
report_metrics_us: u64,
ticks_from_record: u64,
last_metric: Instant,
record_sender: Sender<Record>,
@ -539,20 +539,22 @@ impl PohRecorder {
}
pub fn tick(&mut self) {
let now = Instant::now();
let (poh_entry, target_time) = {
let mut poh_l = self.poh.lock().unwrap();
let poh_entry = poh_l.tick();
let target_time = if poh_entry.is_some() {
Some(poh_l.target_poh_time(self.target_ns_per_tick))
} else {
None
};
let ((poh_entry, target_time), tick_lock_contention_time) = Measure::this(
|_| {
let mut poh_l = self.poh.lock().unwrap();
let poh_entry = poh_l.tick();
let target_time = if poh_entry.is_some() {
Some(poh_l.target_poh_time(self.target_ns_per_tick))
} else {
None
};
(poh_entry, target_time)
},
(),
"TickLockContention",
);
self.tick_lock_contention_us += tick_lock_contention_time.as_us();
(poh_entry, target_time)
};
self.tick_lock_contention_us += timing::duration_as_us(&now.elapsed());
let now = Instant::now();
if let Some(poh_entry) = poh_entry {
self.tick_height += 1;
trace!("tick_height {}", self.tick_height);
@ -561,30 +563,37 @@ impl PohRecorder {
.leader_first_tick_height_including_grace_ticks
.is_none()
{
self.tick_overhead_us += timing::duration_as_us(&now.elapsed());
return;
}
let entry = Entry {
num_hashes: poh_entry.num_hashes,
hash: poh_entry.hash,
transactions: vec![],
};
self.tick_overhead_us += timing::duration_as_us(&now.elapsed());
self.tick_cache.push((
Entry {
num_hashes: poh_entry.num_hashes,
hash: poh_entry.hash,
transactions: vec![],
},
self.tick_height,
));
let now = Instant::now();
self.tick_cache.push((entry, self.tick_height));
let _ = self.flush_cache(true);
self.flush_cache_tick_us += timing::duration_as_us(&now.elapsed());
let target_time = target_time.unwrap();
// sleep is not accurate enough to get a predictable time.
// Kernel can not schedule the thread for a while.
let started_waiting = Instant::now();
while Instant::now() < target_time {
// TODO: a caller could possibly desire to reset or record while we're spinning here
std::hint::spin_loop();
}
self.total_sleep_us += started_waiting.elapsed().as_nanos() as u64 / 1000;
let (_flush_res, flush_cache_and_tick_time) =
Measure::this(|_| self.flush_cache(true), (), "FlushCacheAndTick");
self.flush_cache_tick_us += flush_cache_and_tick_time.as_us();
let sleep_time = Measure::this(
|_| {
let target_time = target_time.unwrap();
// sleep is not accurate enough to get a predictable time.
// Kernel can not schedule the thread for a while.
while Instant::now() < target_time {
// TODO: a caller could possibly desire to reset or record while we're spinning here
std::hint::spin_loop();
}
},
(),
"PohSleep",
)
.1;
self.total_sleep_us += sleep_time.as_us();
}
}
@ -597,28 +606,26 @@ impl PohRecorder {
("record_us", self.record_us, i64),
("flush_cache_no_tick_us", self.flush_cache_no_tick_us, i64),
("flush_cache_tick_us", self.flush_cache_tick_us, i64),
("prepare_send_us", self.prepare_send_us, i64),
("send_us", self.send_us, i64),
("send_entry_us", self.send_entry_us, i64),
("ticks_from_record", self.ticks_from_record, i64),
("total_sleep_us", self.total_sleep_us, i64),
("tick_overhead", self.tick_overhead_us, i64),
(
"record_lock_contention",
"record_lock_contention_us",
self.record_lock_contention_us,
i64
),
("report_metrics_us", self.report_metrics_us, i64),
);
self.tick_lock_contention_us = 0;
self.record_us = 0;
self.tick_overhead_us = 0;
self.total_sleep_us = 0;
self.record_lock_contention_us = 0;
self.flush_cache_no_tick_us = 0;
self.flush_cache_tick_us = 0;
self.prepare_send_us = 0;
self.send_us = 0;
self.send_entry_us = 0;
self.ticks_from_record = 0;
self.report_metrics_us = 0;
self.last_metric = Instant::now();
}
}
@ -632,11 +639,16 @@ impl PohRecorder {
// Entries without transactions are used to track real-time passing in the ledger and
// cannot be generated by `record()`
assert!(!transactions.is_empty(), "No transactions provided");
self.report_metrics(bank_slot);
let ((), report_metrics_time) =
Measure::this(|_| self.report_metrics(bank_slot), (), "ReportMetrics");
self.report_metrics_us += report_metrics_time.as_us();
loop {
let now = Instant::now();
self.flush_cache(false)?;
self.flush_cache_no_tick_us += timing::duration_as_us(&now.elapsed());
let (flush_cache_res, flush_cache_time) =
Measure::this(|_| self.flush_cache(false), (), "FlushCache");
self.flush_cache_no_tick_us += flush_cache_time.as_us();
flush_cache_res?;
let working_bank = self
.working_bank
@ -646,30 +658,34 @@ impl PohRecorder {
return Err(PohRecorderError::MaxHeightReached);
}
{
let now = Instant::now();
let mut poh_lock = self.poh.lock().unwrap();
let (mut poh_lock, poh_lock_time) =
Measure::this(|_| self.poh.lock().unwrap(), (), "PohLock");
self.record_lock_contention_us += poh_lock_time.as_us();
self.record_lock_contention_us += timing::duration_as_us(&now.elapsed());
let now = Instant::now();
let res = poh_lock.record(mixin);
drop(poh_lock);
self.record_us += timing::duration_as_us(&now.elapsed());
let now = Instant::now();
if let Some(poh_entry) = res {
let entry = Entry {
num_hashes: poh_entry.num_hashes,
hash: poh_entry.hash,
transactions,
};
let bank_clone = working_bank.bank.clone();
self.prepare_send_us += timing::duration_as_us(&now.elapsed());
let now = Instant::now();
self.sender.send((bank_clone, (entry, self.tick_height)))?;
self.send_us += timing::duration_as_us(&now.elapsed());
return Ok(());
}
let (record_mixin_res, record_mixin_time) =
Measure::this(|_| poh_lock.record(mixin), (), "RecordMixin");
self.record_us += record_mixin_time.as_us();
drop(poh_lock);
if let Some(poh_entry) = record_mixin_res {
let (send_entry_res, send_entry_time) = Measure::this(
|_| {
let entry = Entry {
num_hashes: poh_entry.num_hashes,
hash: poh_entry.hash,
transactions,
};
let bank_clone = working_bank.bank.clone();
self.sender.send((bank_clone, (entry, self.tick_height)))
},
(),
"SendPohEntry",
);
self.send_entry_us += send_entry_time.as_us();
return Ok(send_entry_res?);
}
// record() might fail if the next PoH hash needs to be a tick. But that's ok, tick()
// and re-record()
self.ticks_from_record += 1;
@ -729,11 +745,10 @@ impl PohRecorder {
record_lock_contention_us: 0,
flush_cache_tick_us: 0,
flush_cache_no_tick_us: 0,
prepare_send_us: 0,
send_us: 0,
send_entry_us: 0,
tick_lock_contention_us: 0,
record_us: 0,
tick_overhead_us: 0,
report_metrics_us: 0,
total_sleep_us: 0,
ticks_from_record: 0,
last_metric: Instant::now(),

View File

@ -43,6 +43,7 @@ struct PohTiming {
total_tick_time_ns: u64,
last_metric: Instant,
total_record_time_us: u64,
total_send_record_result_us: u64,
}
impl PohTiming {
@ -56,6 +57,7 @@ impl PohTiming {
total_tick_time_ns: 0,
last_metric: Instant::now(),
total_record_time_us: 0,
total_send_record_result_us: 0,
}
}
fn report(&mut self, ticks_per_slot: u64) {
@ -72,6 +74,11 @@ impl PohTiming {
("total_lock_time_us", self.total_lock_time_ns / 1000, i64),
("total_hash_time_us", self.total_hash_time_ns / 1000, i64),
("total_record_time_us", self.total_record_time_us, i64),
(
"total_send_record_result_us",
self.total_send_record_result_us,
i64
),
);
self.total_sleep_us = 0;
self.num_ticks = 0;
@ -81,6 +88,7 @@ impl PohTiming {
self.total_hash_time_ns = 0;
self.last_metric = Instant::now();
self.total_record_time_us = 0;
self.total_send_record_result_us = 0;
}
}
}
@ -254,7 +262,10 @@ impl PohService {
record.mixin,
std::mem::take(&mut record.transactions),
);
let _ = record.sender.send(res); // what do we do on failure here? Ignore for now.
// what do we do on failure here? Ignore for now.
let (_send_res, send_record_result_time) =
Measure::this(|_| record.sender.send(res), (), "SendRecordResult");
timing.total_send_record_result_us += send_record_result_time.as_us();
timing.num_hashes += 1; // note: may have also ticked inside record
let new_record_result = record_receiver.try_recv();