add metrics for startup (#17913)

* add metrics for startup

* roll timings up higher

* fix test

* fix duplicate
Jeff Washington (jwash), 2021-06-14 17:46:49 -05:00, committed by GitHub
commit 471b34132e (parent 7fde9b6ff0)
4 changed files with 67 additions and 23 deletions
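
The commit times the three phases of loading a bank from a snapshot archive (untar, rebuild, verify) and, per the "roll timings up higher" note, reports them in the existing process_blockstore_from_root datapoint rather than logging each phase locally. A minimal sketch of the Measure idiom the diff leans on, assuming the solana_measure crate; the phase name and the sleep are placeholders for real work:

    use solana_measure::measure::Measure;

    fn main() {
        // Measure::start records a label and a start time; stop freezes the
        // elapsed duration so it can be read later in microseconds.
        let mut untar = Measure::start("untar");
        std::thread::sleep(std::time::Duration::from_millis(10)); // stand-in work
        untar.stop();
        println!("untar took {} us", untar.as_us());
    }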

View File

@@ -146,7 +146,7 @@ mod tests {
         let old_last_bank = old_bank_forks.get(old_last_slot).unwrap();

-        let deserialized_bank = snapshot_utils::bank_from_archive(
+        let (deserialized_bank, _timing) = snapshot_utils::bank_from_archive(
             &account_paths,
             &[],
             &old_bank_forks

View File

@@ -130,7 +130,7 @@ fn load_from_snapshot(
         process::exit(1);
     }

-    let deserialized_bank = snapshot_utils::bank_from_archive(
+    let (deserialized_bank, timings) = snapshot_utils::bank_from_archive(
         &account_paths,
         &process_options.frozen_accounts,
         &snapshot_config.snapshot_path,
@@ -174,6 +174,7 @@ fn load_from_snapshot(
             &VerifyRecyclers::default(),
             transaction_status_sender,
             cache_block_meta_sender,
+            timings,
         ),
         Some(deserialized_bank_slot_and_hash),
     )
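
The shape of the change in this file: bank_from_archive now returns its timings alongside the bank, and load_from_snapshot forwards them to process_blockstore_from_root, where they are reported. A std-only sketch of that plumbing; StartupTimings, load, and process are illustrative names, not this crate's API:

    use std::time::Instant;

    // Illustrative stand-in for BankFromArchiveTimings.
    #[derive(Debug, Default)]
    struct StartupTimings {
        load_us: u64,
    }

    // Producer: measures its own phase and returns the numbers with its result.
    fn load() -> (String, StartupTimings) {
        let start = Instant::now();
        let bank = String::from("bank"); // placeholder for the deserialized bank
        let timings = StartupTimings {
            load_us: start.elapsed().as_micros() as u64,
        };
        (bank, timings)
    }

    // Consumer: receives the timings where they are finally reported.
    fn process(bank: &str, timings: StartupTimings) {
        println!("processed {}, timings: {:?}", bank, timings);
    }

    fn main() {
        let (bank, timings) = load();
        process(&bank, timings);
    }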

View File

@@ -25,6 +25,7 @@ use solana_runtime::{
     bank_forks::BankForks,
     bank_utils,
     commitment::VOTE_THRESHOLD_SIZE,
+    snapshot_utils::BankFromArchiveTimings,
     transaction_batch::TransactionBatch,
     vote_account::ArcVoteAccount,
     vote_sender_types::ReplayVoteSender,
@@ -421,6 +422,7 @@ pub fn process_blockstore(
         &recyclers,
         None,
         cache_block_meta_sender,
+        BankFromArchiveTimings::default(),
     )
 }
@@ -432,6 +434,7 @@ pub(crate) fn process_blockstore_from_root(
     recyclers: &VerifyRecyclers,
     transaction_status_sender: Option<&TransactionStatusSender>,
     cache_block_meta_sender: Option<&CacheBlockMetaSender>,
+    timings: BankFromArchiveTimings,
 ) -> BlockstoreProcessorResult {
     do_process_blockstore_from_root(
         blockstore,
@@ -440,6 +443,7 @@ pub(crate) fn process_blockstore_from_root(
         recyclers,
         transaction_status_sender,
         cache_block_meta_sender,
+        timings,
     )
 }
@@ -450,6 +454,7 @@ fn do_process_blockstore_from_root(
     recyclers: &VerifyRecyclers,
     transaction_status_sender: Option<&TransactionStatusSender>,
     cache_block_meta_sender: Option<&CacheBlockMetaSender>,
+    timings: BankFromArchiveTimings,
 ) -> BlockstoreProcessorResult {
     info!("processing ledger from slot {}...", bank.slot());
@@ -529,18 +534,45 @@ fn do_process_blockstore_from_root(
     }

     let bank_forks = BankForks::new_from_banks(&initial_forks, root);
+    let processing_time = now.elapsed();
+
+    let debug_verify = false;
+    let mut time_cap = Measure::start("capitalization");
+    // We might be promptly restarted after bad capitalization was detected while creating newer snapshot.
+    // In that case, we're most likely restored from the last good snapshot and replayed up to this root.
+    // So again check here for the bad capitalization to avoid to continue until the next snapshot creation.
+    if !bank_forks
+        .root_bank()
+        .calculate_and_verify_capitalization(debug_verify)
+    {
+        return Err(BlockstoreProcessorError::RootBankWithMismatchedCapitalization(root));
+    }
+    time_cap.stop();

     datapoint_info!(
         "process_blockstore_from_root",
-        ("total_time_us", now.elapsed().as_micros(), i64),
+        ("total_time_us", processing_time.as_micros(), i64),
         ("frozen_banks", bank_forks.frozen_banks().len(), i64),
         ("slot", bank_forks.root(), i64),
-        ("forks", initial_forks.len(), i64)
+        ("forks", initial_forks.len(), i64),
+        ("calculate_capitalization_us", time_cap.as_us(), i64),
+        ("untar_us", timings.untar_us, i64),
+        (
+            "rebuild_bank_from_snapshots_us",
+            timings.rebuild_bank_from_snapshots_us,
+            i64
+        ),
+        (
+            "verify_snapshot_bank_us",
+            timings.verify_snapshot_bank_us,
+            i64
+        ),
     );

     info!("ledger processing timing: {:?}", timing);
     info!(
         "ledger processed in {}. root slot is {}, {} fork{} at {}, with {} frozen bank{}",
-        HumanTime::from(chrono::Duration::from_std(now.elapsed()).unwrap())
+        HumanTime::from(chrono::Duration::from_std(processing_time).unwrap())
             .to_text_en(Accuracy::Precise, Tense::Present),
         bank_forks.root(),
         initial_forks.len(),
@@ -558,17 +590,6 @@ fn do_process_blockstore_from_root(
     );

     assert!(bank_forks.active_banks().is_empty());

-    let debug_verify = false;
-    // We might be promptly restarted after bad capitalization was detected while creating newer snapshot.
-    // In that case, we're most likely restored from the last good snapshot and replayed up to this root.
-    // So again check here for the bad capitalization to avoid to continue until the next snapshot creation.
-    if !bank_forks
-        .root_bank()
-        .calculate_and_verify_capitalization(debug_verify)
-    {
-        return Err(BlockstoreProcessorError::RootBankWithMismatchedCapitalization(root));
-    }
-
     Ok((bank_forks, leader_schedule_cache))
 }
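
Two details in the pair of hunks above: the capitalization check moves ahead of the datapoint so its duration (time_cap) can be emitted in the same row as the snapshot timings, and now.elapsed() is captured once as processing_time so the datapoint and the human-readable log report the same figure. A sketch of the measure-then-report ordering, assuming the solana_measure and solana_metrics crates; verify_capitalization is a placeholder for calculate_and_verify_capitalization:

    use solana_measure::measure::Measure;
    use solana_metrics::datapoint_info;

    fn verify_capitalization() -> bool {
        true // placeholder; the real check recomputes and compares capitalization
    }

    fn main() {
        // The check has to finish (and time_cap has to stop) before the
        // datapoint is emitted, which is why the block moved up.
        let mut time_cap = Measure::start("capitalization");
        assert!(verify_capitalization());
        time_cap.stop();

        datapoint_info!(
            "process_blockstore_from_root",
            ("calculate_capitalization_us", time_cap.as_us(), i64)
        );
    }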
@@ -2897,9 +2918,16 @@ pub mod tests {
         bank1.squash();

         // Test process_blockstore_from_root() from slot 1 onwards
-        let (bank_forks, _leader_schedule) =
-            do_process_blockstore_from_root(&blockstore, bank1, &opts, &recyclers, None, None)
-                .unwrap();
+        let (bank_forks, _leader_schedule) = do_process_blockstore_from_root(
+            &blockstore,
+            bank1,
+            &opts,
+            &recyclers,
+            None,
+            None,
+            BankFromArchiveTimings::default(),
+        )
+        .unwrap();

         assert_eq!(frozen_bank_slots(&bank_forks), vec![5, 6]);
         assert_eq!(bank_forks.working_bank().slot(), 6);

View File

@@ -592,6 +592,13 @@ pub fn remove_snapshot<P: AsRef<Path>>(slot: Slot, snapshot_path: P) -> Result<()> {
     Ok(())
 }

+#[derive(Debug, Default)]
+pub struct BankFromArchiveTimings {
+    pub rebuild_bank_from_snapshots_us: u64,
+    pub untar_us: u64,
+    pub verify_snapshot_bank_us: u64,
+}
+
 #[allow(clippy::too_many_arguments)]
 pub fn bank_from_archive<P: AsRef<Path>>(
     account_paths: &[PathBuf],
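
Deriving Default is what lets the genesis-boot path pass BankFromArchiveTimings::default() in blockstore_processor above: a run that never touched a snapshot archive reports zeros for all three phases instead of omitting the fields. A small self-contained sketch; the struct body mirrors the one added in this hunk:

    #[derive(Debug, Default)]
    pub struct BankFromArchiveTimings {
        pub rebuild_bank_from_snapshots_us: u64,
        pub untar_us: u64,
        pub verify_snapshot_bank_us: u64,
    }

    fn main() {
        // Default fills every u64 with zero, a neutral value for startup paths
        // that skipped the archive entirely.
        let zeros = BankFromArchiveTimings::default();
        assert_eq!(zeros.untar_us, 0);
        println!("{:?}", zeros);
    }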
@@ -606,17 +613,19 @@ pub fn bank_from_archive<P: AsRef<Path>>(
     accounts_db_caching_enabled: bool,
     limit_load_slot_count_from_snapshot: Option<usize>,
     shrink_ratio: AccountShrinkThreshold,
-) -> Result<Bank> {
+) -> Result<(Bank, BankFromArchiveTimings)> {
     let unpack_dir = tempfile::Builder::new()
         .prefix(TMP_SNAPSHOT_PREFIX)
         .tempdir_in(snapshot_path)?;

+    let mut untar = Measure::start("untar");
     let unpacked_append_vec_map = untar_snapshot_in(
         &snapshot_tar,
         &unpack_dir.as_ref(),
         account_paths,
         archive_format,
     )?;
+    untar.stop();

     let mut measure = Measure::start("bank rebuild from snapshot");
     let unpacked_snapshots_dir = unpack_dir.as_ref().join("snapshots");
@@ -639,14 +648,20 @@ pub fn bank_from_archive<P: AsRef<Path>>(
         limit_load_slot_count_from_snapshot,
         shrink_ratio,
     )?;
+    measure.stop();

+    let mut verify = Measure::start("verify");
     if !bank.verify_snapshot_bank() {
         panic!("Snapshot bank for slot {} failed to verify", bank.slot());
     }
-    measure.stop();
-    info!("{}", measure);
+    verify.stop();
+    let timings = BankFromArchiveTimings {
+        rebuild_bank_from_snapshots_us: measure.as_us(),
+        untar_us: untar.as_us(),
+        verify_snapshot_bank_us: verify.as_us(),
+    };

-    Ok(bank)
+    Ok((bank, timings))
 }

 pub fn get_snapshot_archive_path(