adds counters for errors in window-service run_insert (#20670)

This commit is contained in:
behzad nouri 2021-10-15 14:13:26 +00:00 committed by GitHub
parent 0419e6c22e
commit 0f03971c3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 43 additions and 5 deletions

View File

@ -50,6 +50,12 @@ struct WindowServiceMetrics {
num_shreds_received: u64,
shred_receiver_elapsed_us: u64,
prune_shreds_elapsed_us: u64,
num_shreds_pruned_invalid_repair: usize,
num_errors: u64,
num_errors_blockstore: u64,
num_errors_cross_beam_recv_timeout: u64,
num_errors_other: u64,
num_errors_try_crossbeam_send: u64,
}
impl WindowServiceMetrics {
@ -68,8 +74,39 @@ impl WindowServiceMetrics {
self.prune_shreds_elapsed_us as i64,
i64
),
(
"num_shreds_pruned_invalid_repair",
self.num_shreds_pruned_invalid_repair,
i64
),
("num_errors", self.num_errors, i64),
("num_errors_blockstore", self.num_errors_blockstore, i64),
("num_errors_other", self.num_errors_other, i64),
(
"num_errors_try_crossbeam_send",
self.num_errors_try_crossbeam_send,
i64
),
(
"num_errors_cross_beam_recv_timeout",
self.num_errors_cross_beam_recv_timeout,
i64
),
);
}
fn record_error(&mut self, err: &Error) {
self.num_errors += 1;
match err {
Error::TryCrossbeamSend => self.num_errors_try_crossbeam_send += 1,
Error::CrossbeamRecvTimeout(_) => self.num_errors_cross_beam_recv_timeout += 1,
Error::Blockstore(err) => {
self.num_errors_blockstore += 1;
error!("blockstore error: {}", err);
}
_ => self.num_errors_other += 1,
}
}
}
#[derive(Default)]
@ -269,6 +306,7 @@ fn run_insert<F>(
where
F: Fn(Shred),
{
ws_metrics.run_insert_count += 1;
let mut shred_receiver_elapsed = Measure::start("shred_receiver_elapsed");
let timer = Duration::from_millis(200);
let (mut shreds, mut repair_infos) = shred_receiver.recv_timeout(timer)?;
@ -277,15 +315,19 @@ where
repair_infos.extend(more_repair_infos);
}
shred_receiver_elapsed.stop();
ws_metrics.shred_receiver_elapsed_us += shred_receiver_elapsed.as_us();
ws_metrics.num_shreds_received += shreds.len() as u64;
let mut prune_shreds_elapsed = Measure::start("prune_shreds_elapsed");
let num_shreds = shreds.len();
prune_shreds_invalid_repair(&mut shreds, &mut repair_infos, outstanding_requests);
ws_metrics.num_shreds_pruned_invalid_repair = num_shreds - shreds.len();
let repairs: Vec<_> = repair_infos
.iter()
.map(|repair_info| repair_info.is_some())
.collect();
prune_shreds_elapsed.stop();
ws_metrics.prune_shreds_elapsed_us += prune_shreds_elapsed.as_us();
let (completed_data_sets, inserted_indices) = blockstore.insert_shreds_handle_duplicate(
shreds,
@ -303,11 +345,6 @@ where
}
completed_data_sets_sender.try_send(completed_data_sets)?;
ws_metrics.run_insert_count += 1;
ws_metrics.shred_receiver_elapsed_us += shred_receiver_elapsed.as_us();
ws_metrics.prune_shreds_elapsed_us += prune_shreds_elapsed.as_us();
Ok(())
}
@ -567,6 +604,7 @@ impl WindowService {
&retransmit_sender,
&outstanding_requests,
) {
ws_metrics.record_error(&e);
if Self::should_exit_on_error(e, &mut handle_timeout, &handle_error) {
break;
}