Entry verify cleanup and gossip counters (#10632)

* Add prune message counter

* Switch to us verification time to match other counters

* Add separate transaction/poh verify timing
This commit is contained in:
sakridge 2020-06-16 14:00:29 -07:00 committed by GitHub
parent 1b7ef69bce
commit 1eca9b19ab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 94 additions and 62 deletions

View File

@ -226,6 +226,8 @@ struct GossipStats {
process_prune: Counter, process_prune: Counter,
process_push_message: Counter, process_push_message: Counter,
prune_received_cache: Counter, prune_received_cache: Counter,
prune_message_count: Counter,
prune_message_len: Counter,
purge: Counter, purge: Counter,
epoch_slots_lookup: Counter, epoch_slots_lookup: Counter,
epoch_slots_push: Counter, epoch_slots_push: Counter,
@ -1641,7 +1643,7 @@ impl ClusterInfo {
} }
ret ret
}); });
let rsp = Self::handle_push_message(self, recycler, &from, data, stakes); let rsp = self.handle_push_message(recycler, &from, data, stakes);
if let Some(rsp) = rsp { if let Some(rsp) = rsp {
let _ignore_disconnect = response_sender.send(rsp); let _ignore_disconnect = response_sender.send(rsp);
} }
@ -1653,11 +1655,10 @@ impl ClusterInfo {
Protocol::PruneMessage(from, data) => { Protocol::PruneMessage(from, data) => {
let start = allocated.get(); let start = allocated.get();
if data.verify() { if data.verify() {
inc_new_counter_debug!("cluster_info-prune_message", 1); self.stats.prune_message_count.add_relaxed(1);
inc_new_counter_debug!( self.stats
"cluster_info-prune_message-size", .prune_message_len
data.prunes.len() .add_relaxed(data.prunes.len() as u64);
);
match self match self
.time_gossip_write_lock("process_prune", &self.stats.process_prune) .time_gossip_write_lock("process_prune", &self.stats.process_prune)
.process_prune_msg( .process_prune_msg(
@ -2241,6 +2242,16 @@ impl ClusterInfo {
self.stats.new_pull_requests_count.clear(), self.stats.new_pull_requests_count.clear(),
i64 i64
), ),
(
"prune_message_count",
self.stats.prune_message_count.clear(),
i64
),
(
"prune_message_len",
self.stats.prune_message_len.clear(),
i64
),
); );
*last_print = Instant::now(); *last_print = Instant::now();

View File

@ -42,7 +42,16 @@ impl ReplaySlotStats {
self.fetch_fail_elapsed as i64, self.fetch_fail_elapsed as i64,
i64 i64
), ),
("entry_verification_time", self.verify_elapsed as i64, i64), (
"entry_poh_verification_time",
self.poh_verify_elapsed as i64,
i64
),
(
"entry_transaction_verification_time",
self.transaction_verify_elapsed as i64,
i64
),
("replay_time", self.replay_elapsed as i64, i64), ("replay_time", self.replay_elapsed as i64, i64),
( (
"replay_total_elapsed", "replay_total_elapsed",

View File

@ -484,7 +484,8 @@ fn confirm_full_slot(
pub struct ConfirmationTiming { pub struct ConfirmationTiming {
pub started: Instant, pub started: Instant,
pub replay_elapsed: u64, pub replay_elapsed: u64,
pub verify_elapsed: u64, pub poh_verify_elapsed: u64,
pub transaction_verify_elapsed: u64,
pub fetch_elapsed: u64, pub fetch_elapsed: u64,
pub fetch_fail_elapsed: u64, pub fetch_fail_elapsed: u64,
} }
@ -494,7 +495,8 @@ impl Default for ConfirmationTiming {
Self { Self {
started: Instant::now(), started: Instant::now(),
replay_elapsed: 0, replay_elapsed: 0,
verify_elapsed: 0, poh_verify_elapsed: 0,
transaction_verify_elapsed: 0,
fetch_elapsed: 0, fetch_elapsed: 0,
fetch_fail_elapsed: 0, fetch_fail_elapsed: 0,
} }
@ -599,11 +601,13 @@ pub fn confirm_slot(
timing.replay_elapsed += replay_elapsed.as_us(); timing.replay_elapsed += replay_elapsed.as_us();
if let Some(mut verifier) = verifier { if let Some(mut verifier) = verifier {
if !verifier.finish_verify(&entries) { let verified = verifier.finish_verify(&entries);
timing.poh_verify_elapsed += verifier.poh_duration_us();
timing.transaction_verify_elapsed += verifier.transaction_duration_us();
if !verified {
warn!("Ledger proof of history failed at slot: {}", bank.slot()); warn!("Ledger proof of history failed at slot: {}", bank.slot());
return Err(BlockError::InvalidEntryHash.into()); return Err(BlockError::InvalidEntryHash.into());
} }
timing.verify_elapsed += verifier.duration_ms();
} }
process_result?; process_result?;

View File

@ -151,12 +151,22 @@ pub fn next_hash(start_hash: &Hash, num_hashes: u64, transactions: &[Transaction
} }
} }
pub struct VerificationData { pub struct GpuVerificationData {
thread_h: Option<JoinHandle<u64>>, thread_h: Option<JoinHandle<u64>>,
verification_status: EntryVerificationStatus,
hashes: Option<Arc<Mutex<PinnedVec<Hash>>>>, hashes: Option<Arc<Mutex<PinnedVec<Hash>>>>,
tx_hashes: Vec<Option<Hash>>, tx_hashes: Vec<Option<Hash>>,
duration_ms: u64, }
pub enum DeviceVerificationData {
CPU(),
GPU(GpuVerificationData),
}
pub struct EntryVerificationState {
verification_status: EntryVerificationStatus,
poh_duration_us: u64,
transaction_duration_us: u64,
device_verification_data: DeviceVerificationData,
} }
#[derive(Default, Clone)] #[derive(Default, Clone)]
@ -172,33 +182,30 @@ pub enum EntryVerificationStatus {
Pending, Pending,
} }
pub enum EntryVerificationState {
CPU(VerificationData),
GPU(VerificationData),
}
impl EntryVerificationState { impl EntryVerificationState {
pub fn status(&self) -> EntryVerificationStatus { pub fn status(&self) -> EntryVerificationStatus {
match self { self.verification_status
EntryVerificationState::CPU(state) => state.verification_status,
EntryVerificationState::GPU(state) => state.verification_status,
}
} }
pub fn duration_ms(&self) -> u64 { pub fn poh_duration_us(&self) -> u64 {
match self { self.poh_duration_us
EntryVerificationState::CPU(state) => state.duration_ms, }
EntryVerificationState::GPU(state) => state.duration_ms,
} pub fn set_transaction_duration_us(&mut self, new: u64) {
self.transaction_duration_us = new;
}
pub fn transaction_duration_us(&self) -> u64 {
self.transaction_duration_us
} }
pub fn finish_verify(&mut self, entries: &[Entry]) -> bool { pub fn finish_verify(&mut self, entries: &[Entry]) -> bool {
match self { match &mut self.device_verification_data {
EntryVerificationState::GPU(verification_state) => { DeviceVerificationData::GPU(verification_state) => {
let gpu_time_ms = verification_state.thread_h.take().unwrap().join().unwrap(); let gpu_time_us = verification_state.thread_h.take().unwrap().join().unwrap();
let mut verify_check_time = Measure::start("verify_check"); let mut verify_check_time = Measure::start("verify_check");
let hashes = verification_state.hashes.take().expect("hashes.as_ref"); let hashes = verification_state.hashes.take().unwrap();
let hashes = Arc::try_unwrap(hashes) let hashes = Arc::try_unwrap(hashes)
.expect("unwrap Arc") .expect("unwrap Arc")
.into_inner() .into_inner()
@ -225,21 +232,17 @@ impl EntryVerificationState {
}); });
verify_check_time.stop(); verify_check_time.stop();
verification_state.duration_ms += gpu_time_ms + verify_check_time.as_ms(); self.poh_duration_us += gpu_time_us + verify_check_time.as_us();
inc_new_counter_info!(
"entry_verify-duration",
verification_state.duration_ms as usize
);
verification_state.verification_status = if res { self.verification_status = if res {
EntryVerificationStatus::Success EntryVerificationStatus::Success
} else { } else {
EntryVerificationStatus::Failure EntryVerificationStatus::Failure
}; };
res res
} }
EntryVerificationState::CPU(verification_state) => { DeviceVerificationData::CPU() => {
verification_state.verification_status == EntryVerificationStatus::Success self.verification_status == EntryVerificationStatus::Success
} }
} }
} }
@ -290,19 +293,17 @@ impl EntrySlice for [Entry] {
}) })
}) })
}); });
let duration_ms = timing::duration_as_ms(&now.elapsed()); let poh_duration_us = timing::duration_as_us(&now.elapsed());
inc_new_counter_warn!("entry_verify-duration", duration_ms as usize); EntryVerificationState {
EntryVerificationState::CPU(VerificationData {
thread_h: None,
verification_status: if res { verification_status: if res {
EntryVerificationStatus::Success EntryVerificationStatus::Success
} else { } else {
EntryVerificationStatus::Failure EntryVerificationStatus::Failure
}, },
hashes: None, poh_duration_us,
tx_hashes: vec![], transaction_duration_us: 0,
duration_ms, device_verification_data: DeviceVerificationData::CPU(),
}) }
} }
fn verify_transaction_signatures(&self) -> bool { fn verify_transaction_signatures(&self) -> bool {
@ -324,19 +325,22 @@ impl EntrySlice for [Entry] {
) -> EntryVerificationState { ) -> EntryVerificationState {
let start = Instant::now(); let start = Instant::now();
let res = self.verify_transaction_signatures(); let res = self.verify_transaction_signatures();
let transaction_duration_us = timing::duration_as_us(&start.elapsed());
if !res { if !res {
return EntryVerificationState::CPU(VerificationData { return EntryVerificationState {
thread_h: None,
verification_status: EntryVerificationStatus::Failure, verification_status: EntryVerificationStatus::Failure,
duration_ms: timing::duration_as_ms(&start.elapsed()), transaction_duration_us,
hashes: None, poh_duration_us: 0,
tx_hashes: vec![], device_verification_data: DeviceVerificationData::CPU(),
}); };
} }
let start = Instant::now();
let api = perf_libs::api(); let api = perf_libs::api();
if api.is_none() { if api.is_none() {
return self.verify_cpu(start_hash); let mut res: EntryVerificationState = self.verify_cpu(start_hash);
res.set_transaction_duration_us(transaction_duration_us);
return res;
} }
let api = api.unwrap(); let api = api.unwrap();
inc_new_counter_warn!("entry_verify-num_entries", self.len() as usize); inc_new_counter_warn!("entry_verify-num_entries", self.len() as usize);
@ -386,11 +390,11 @@ impl EntrySlice for [Entry] {
if res != 0 { if res != 0 {
panic!("GPU PoH verify many failed"); panic!("GPU PoH verify many failed");
} }
inc_new_counter_warn!( inc_new_counter_info!(
"entry_verify-gpu_thread", "entry_verify-gpu_thread",
timing::duration_as_ms(&gpu_wait.elapsed()) as usize timing::duration_as_us(&gpu_wait.elapsed()) as usize
); );
timing::duration_as_ms(&gpu_wait.elapsed()) timing::duration_as_us(&gpu_wait.elapsed())
}); });
let tx_hashes = PAR_THREAD_POOL.with(|thread_pool| { let tx_hashes = PAR_THREAD_POOL.with(|thread_pool| {
@ -407,13 +411,17 @@ impl EntrySlice for [Entry] {
}) })
}); });
EntryVerificationState::GPU(VerificationData { let device_verification_data = DeviceVerificationData::GPU(GpuVerificationData {
thread_h: Some(gpu_verify_thread), thread_h: Some(gpu_verify_thread),
verification_status: EntryVerificationStatus::Pending,
tx_hashes, tx_hashes,
duration_ms: timing::duration_as_ms(&start.elapsed()),
hashes: Some(hashes), hashes: Some(hashes),
}) });
EntryVerificationState {
verification_status: EntryVerificationStatus::Pending,
poh_duration_us: timing::duration_as_us(&start.elapsed()),
transaction_duration_us,
device_verification_data,
}
} }
fn verify_tick_hash_count(&self, tick_hash_count: &mut u64, hashes_per_tick: u64) -> bool { fn verify_tick_hash_count(&self, tick_hash_count: &mut u64, hashes_per_tick: u64) -> bool {