add metrics for startup (#17913)

* add metrics for startup

* roll timings up higher

* fix test

* fix duplicate
Jeff Washington (jwash) 2021-06-14 17:46:49 -05:00 committed by GitHub
parent 7fde9b6ff0
commit 471b34132e
4 changed files with 67 additions and 23 deletions
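
In outline: bank_from_archive in solana_runtime's snapshot_utils now times its three phases (untar, rebuild, verify), returns them alongside the Bank in a new BankFromArchiveTimings struct, and the blockstore processor folds them into its existing "process_blockstore_from_root" datapoint instead of logging them at the point of measurement ("roll timings up higher"). A minimal, self-contained sketch of the reporting half, using the struct as this commit defines it (the standalone report function and the sample values are illustrative, not part of the commit):

    use solana_metrics::datapoint_info;

    // Mirror of the struct this commit adds to snapshot_utils.
    #[derive(Debug, Default)]
    pub struct BankFromArchiveTimings {
        pub rebuild_bank_from_snapshots_us: u64,
        pub untar_us: u64,
        pub verify_snapshot_bank_us: u64,
    }

    fn report_startup_timings(timings: &BankFromArchiveTimings) {
        // Each ("name", value, i64) tuple becomes one field of the datapoint.
        datapoint_info!(
            "process_blockstore_from_root",
            ("untar_us", timings.untar_us, i64),
            ("rebuild_bank_from_snapshots_us", timings.rebuild_bank_from_snapshots_us, i64),
            ("verify_snapshot_bank_us", timings.verify_snapshot_bank_us, i64),
        );
    }

    fn main() {
        report_startup_timings(&BankFromArchiveTimings {
            untar_us: 1_500_000,
            rebuild_bank_from_snapshots_us: 4_200_000,
            verify_snapshot_bank_us: 800_000,
        });
    }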


@@ -146,7 +146,7 @@ mod tests {
         let old_last_bank = old_bank_forks.get(old_last_slot).unwrap();
-        let deserialized_bank = snapshot_utils::bank_from_archive(
+        let (deserialized_bank, _timing) = snapshot_utils::bank_from_archive(
             &account_paths,
             &[],
             &old_bank_forks


@@ -130,7 +130,7 @@ fn load_from_snapshot(
         process::exit(1);
     }
-    let deserialized_bank = snapshot_utils::bank_from_archive(
+    let (deserialized_bank, timings) = snapshot_utils::bank_from_archive(
         &account_paths,
         &process_options.frozen_accounts,
         &snapshot_config.snapshot_path,
@@ -174,6 +174,7 @@ fn load_from_snapshot(
             &VerifyRecyclers::default(),
             transaction_status_sender,
             cache_block_meta_sender,
+            timings,
         ),
         Some(deserialized_bank_slot_and_hash),
     )
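
This hunk is the "roll timings up higher" step: load_from_snapshot destructures the new tuple from bank_from_archive and forwards the timings into the blockstore processor, which becomes the single reporting point. Reduced to a standalone sketch (load and process are illustrative stand-ins, not the real signatures):

    #[derive(Debug, Default)]
    struct Timings {
        untar_us: u64,
    }

    // Stand-in for snapshot_utils::bank_from_archive: does the work and
    // returns its own measurements to the caller.
    fn load() -> (String, Timings) {
        ("bank".to_string(), Timings { untar_us: 42 })
    }

    // Stand-in for process_blockstore_from_root: the single reporting point.
    fn process(bank: &str, timings: Timings) {
        println!("processed {}; untar took {} us", bank, timings.untar_us);
    }

    fn main() {
        let (bank, timings) = load(); // producer measures...
        process(&bank, timings);      // ...consumer reports, one level up.
    }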


@@ -25,6 +25,7 @@ use solana_runtime::{
     bank_forks::BankForks,
     bank_utils,
     commitment::VOTE_THRESHOLD_SIZE,
+    snapshot_utils::BankFromArchiveTimings,
     transaction_batch::TransactionBatch,
     vote_account::ArcVoteAccount,
     vote_sender_types::ReplayVoteSender,
@@ -421,6 +422,7 @@ pub fn process_blockstore(
         &recyclers,
         None,
         cache_block_meta_sender,
+        BankFromArchiveTimings::default(),
     )
 }
@@ -432,6 +434,7 @@ pub(crate) fn process_blockstore_from_root(
     recyclers: &VerifyRecyclers,
     transaction_status_sender: Option<&TransactionStatusSender>,
     cache_block_meta_sender: Option<&CacheBlockMetaSender>,
+    timings: BankFromArchiveTimings,
 ) -> BlockstoreProcessorResult {
     do_process_blockstore_from_root(
         blockstore,
@@ -440,6 +443,7 @@ pub(crate) fn process_blockstore_from_root(
         recyclers,
         transaction_status_sender,
         cache_block_meta_sender,
+        timings,
     )
 }
@@ -450,6 +454,7 @@ fn do_process_blockstore_from_root(
     recyclers: &VerifyRecyclers,
     transaction_status_sender: Option<&TransactionStatusSender>,
     cache_block_meta_sender: Option<&CacheBlockMetaSender>,
+    timings: BankFromArchiveTimings,
 ) -> BlockstoreProcessorResult {
     info!("processing ledger from slot {}...", bank.slot());
@@ -529,18 +534,45 @@ fn do_process_blockstore_from_root(
     }
     let bank_forks = BankForks::new_from_banks(&initial_forks, root);
+    let processing_time = now.elapsed();
+
+    let debug_verify = false;
+    let mut time_cap = Measure::start("capitalization");
+    // We might be promptly restarted after bad capitalization was detected while creating a newer snapshot.
+    // In that case, we were most likely restored from the last good snapshot and replayed up to this root.
+    // So check again here for bad capitalization, to avoid continuing until the next snapshot creation.
+    if !bank_forks
+        .root_bank()
+        .calculate_and_verify_capitalization(debug_verify)
+    {
+        return Err(BlockstoreProcessorError::RootBankWithMismatchedCapitalization(root));
+    }
+    time_cap.stop();
     datapoint_info!(
         "process_blockstore_from_root",
-        ("total_time_us", now.elapsed().as_micros(), i64),
+        ("total_time_us", processing_time.as_micros(), i64),
         ("frozen_banks", bank_forks.frozen_banks().len(), i64),
         ("slot", bank_forks.root(), i64),
-        ("forks", initial_forks.len(), i64)
+        ("forks", initial_forks.len(), i64),
+        ("calculate_capitalization_us", time_cap.as_us(), i64),
+        ("untar_us", timings.untar_us, i64),
+        (
+            "rebuild_bank_from_snapshots_us",
+            timings.rebuild_bank_from_snapshots_us,
+            i64
+        ),
+        (
+            "verify_snapshot_bank_us",
+            timings.verify_snapshot_bank_us,
+            i64
+        ),
     );
     info!("ledger processing timing: {:?}", timing);
     info!(
         "ledger processed in {}. root slot is {}, {} fork{} at {}, with {} frozen bank{}",
-        HumanTime::from(chrono::Duration::from_std(now.elapsed()).unwrap())
+        HumanTime::from(chrono::Duration::from_std(processing_time).unwrap())
             .to_text_en(Accuracy::Precise, Tense::Present),
         bank_forks.root(),
         initial_forks.len(),
@@ -558,17 +590,6 @@ fn do_process_blockstore_from_root(
     );
     assert!(bank_forks.active_banks().is_empty());
-    let debug_verify = false;
-    // We might be promptly restarted after bad capitalization was detected while creating a newer snapshot.
-    // In that case, we were most likely restored from the last good snapshot and replayed up to this root.
-    // So check again here for bad capitalization, to avoid continuing until the next snapshot creation.
-    if !bank_forks
-        .root_bank()
-        .calculate_and_verify_capitalization(debug_verify)
-    {
-        return Err(BlockstoreProcessorError::RootBankWithMismatchedCapitalization(root));
-    }
     Ok((bank_forks, leader_schedule_cache))
 }
@@ -2897,9 +2918,16 @@ pub mod tests {
         bank1.squash();
         // Test process_blockstore_from_root() from slot 1 onwards
-        let (bank_forks, _leader_schedule) =
-            do_process_blockstore_from_root(&blockstore, bank1, &opts, &recyclers, None, None)
-                .unwrap();
+        let (bank_forks, _leader_schedule) = do_process_blockstore_from_root(
+            &blockstore,
+            bank1,
+            &opts,
+            &recyclers,
+            None,
+            None,
+            BankFromArchiveTimings::default(),
+        )
+        .unwrap();
         assert_eq!(frozen_bank_slots(&bank_forks), vec![5, 6]);
         assert_eq!(bank_forks.working_bank().slot(), 6);
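
One consequence of threading the struct through process_blockstore_from_root: callers that never touch a snapshot, such as process_blockstore on the genesis path and the test updated above, pass BankFromArchiveTimings::default(), so the three snapshot columns simply read zero there while the datapoint schema stays identical. That behavior follows from the derived Default; a standalone check, assuming only the struct and import path as defined in this commit:

    use solana_runtime::snapshot_utils::BankFromArchiveTimings;

    fn main() {
        // #[derive(Default)] zeroes every u64 field.
        let timings = BankFromArchiveTimings::default();
        assert_eq!(timings.untar_us, 0);
        assert_eq!(timings.rebuild_bank_from_snapshots_us, 0);
        assert_eq!(timings.verify_snapshot_bank_us, 0);
    }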


@@ -592,6 +592,13 @@ pub fn remove_snapshot<P: AsRef<Path>>(slot: Slot, snapshot_path: P) -> Result<()> {
     Ok(())
 }
+#[derive(Debug, Default)]
+pub struct BankFromArchiveTimings {
+    pub rebuild_bank_from_snapshots_us: u64,
+    pub untar_us: u64,
+    pub verify_snapshot_bank_us: u64,
+}
+
 #[allow(clippy::too_many_arguments)]
 pub fn bank_from_archive<P: AsRef<Path>>(
     account_paths: &[PathBuf],
@@ -606,17 +613,19 @@ pub fn bank_from_archive<P: AsRef<Path>>(
     accounts_db_caching_enabled: bool,
     limit_load_slot_count_from_snapshot: Option<usize>,
     shrink_ratio: AccountShrinkThreshold,
-) -> Result<Bank> {
+) -> Result<(Bank, BankFromArchiveTimings)> {
     let unpack_dir = tempfile::Builder::new()
         .prefix(TMP_SNAPSHOT_PREFIX)
         .tempdir_in(snapshot_path)?;
+    let mut untar = Measure::start("untar");
     let unpacked_append_vec_map = untar_snapshot_in(
         &snapshot_tar,
         &unpack_dir.as_ref(),
         account_paths,
         archive_format,
     )?;
+    untar.stop();
     let mut measure = Measure::start("bank rebuild from snapshot");
     let unpacked_snapshots_dir = unpack_dir.as_ref().join("snapshots");
@@ -639,14 +648,20 @@ pub fn bank_from_archive<P: AsRef<Path>>(
         limit_load_slot_count_from_snapshot,
         shrink_ratio,
     )?;
+    measure.stop();
+    let mut verify = Measure::start("verify");
     if !bank.verify_snapshot_bank() {
         panic!("Snapshot bank for slot {} failed to verify", bank.slot());
     }
-    measure.stop();
-    info!("{}", measure);
-    Ok(bank)
+    verify.stop();
+    let timings = BankFromArchiveTimings {
+        rebuild_bank_from_snapshots_us: measure.as_us(),
+        untar_us: untar.as_us(),
+        verify_snapshot_bank_us: verify.as_us(),
+    };
+    Ok((bank, timings))
 }

 pub fn get_snapshot_archive_path(
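
For reference, the Measure type used to bracket each phase comes from the solana_measure crate, which this file already used for the "bank rebuild from snapshot" timing before this commit. A self-contained sketch of the small API surface the commit relies on (the sleep is a stand-in for real work):

    use solana_measure::measure::Measure;

    fn main() {
        let mut untar = Measure::start("untar");
        std::thread::sleep(std::time::Duration::from_millis(10)); // stand-in for work
        untar.stop();
        // as_us() is what gets stored into BankFromArchiveTimings.
        println!("untar took {} us", untar.as_us());
    }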