add metrics to handle_snapshot_requests (#17937)

This commit is contained in:
Jeff Washington (jwash) 2021-06-14 15:46:19 -05:00 committed by GitHub
parent a0872232d3
commit e6bbd4b3f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 91 additions and 70 deletions

View File

@ -222,7 +222,7 @@ mod tests {
// set_root should send a snapshot request // set_root should send a snapshot request
bank_forks.set_root(bank.slot(), &request_sender, None); bank_forks.set_root(bank.slot(), &request_sender, None);
bank.update_accounts_hash(); bank.update_accounts_hash();
snapshot_request_handler.handle_snapshot_requests(false, false, false); snapshot_request_handler.handle_snapshot_requests(false, false, false, 0);
} }
} }

View File

@ -89,11 +89,13 @@ impl SnapshotRequestHandler {
accounts_db_caching_enabled: bool, accounts_db_caching_enabled: bool,
test_hash_calculation: bool, test_hash_calculation: bool,
use_index_hash_calculation: bool, use_index_hash_calculation: bool,
non_snapshot_time_us: u128,
) -> Option<u64> { ) -> Option<u64> {
self.snapshot_request_receiver self.snapshot_request_receiver
.try_iter() .try_iter()
.last() .last()
.map(|snapshot_request| { .map(|snapshot_request| {
let mut total_time = Measure::start("wallclock time elapsed");
let SnapshotRequest { let SnapshotRequest {
snapshot_root_bank, snapshot_root_bank,
status_cache_slot_deltas, status_cache_slot_deltas,
@ -188,6 +190,7 @@ impl SnapshotRequestHandler {
let mut purge_old_snapshots_time = Measure::start("purge_old_snapshots_time"); let mut purge_old_snapshots_time = Measure::start("purge_old_snapshots_time");
snapshot_utils::purge_old_snapshots(&self.snapshot_config.snapshot_path); snapshot_utils::purge_old_snapshots(&self.snapshot_config.snapshot_path);
purge_old_snapshots_time.stop(); purge_old_snapshots_time.stop();
total_time.stop();
datapoint_info!( datapoint_info!(
"handle_snapshot_requests-timing", "handle_snapshot_requests-timing",
@ -205,6 +208,8 @@ impl SnapshotRequestHandler {
purge_old_snapshots_time.as_us(), purge_old_snapshots_time.as_us(),
i64 i64
), ),
("total_us", total_time.as_us(), i64),
("non_snapshot_time_us", non_snapshot_time_us, i64),
); );
snapshot_root_bank.block_height() snapshot_root_bank.block_height()
}) })
@ -251,6 +256,7 @@ impl AbsRequestHandler {
accounts_db_caching_enabled: bool, accounts_db_caching_enabled: bool,
test_hash_calculation: bool, test_hash_calculation: bool,
use_index_hash_calculation: bool, use_index_hash_calculation: bool,
non_snapshot_time_us: u128,
) -> Option<u64> { ) -> Option<u64> {
self.snapshot_request_handler self.snapshot_request_handler
.as_ref() .as_ref()
@ -259,6 +265,7 @@ impl AbsRequestHandler {
accounts_db_caching_enabled, accounts_db_caching_enabled,
test_hash_calculation, test_hash_calculation,
use_index_hash_calculation, use_index_hash_calculation,
non_snapshot_time_us,
) )
}) })
} }
@ -297,87 +304,101 @@ impl AccountsBackgroundService {
let mut last_expiration_check_time = Instant::now(); let mut last_expiration_check_time = Instant::now();
let t_background = Builder::new() let t_background = Builder::new()
.name("solana-bg-accounts".to_string()) .name("solana-bg-accounts".to_string())
.spawn(move || loop { .spawn(move || {
if exit.load(Ordering::Relaxed) { let mut last_snapshot_end_time = None;
break; loop {
} if exit.load(Ordering::Relaxed) {
break;
}
// Grab the current root bank // Grab the current root bank
let bank = bank_forks.read().unwrap().root_bank().clone(); let bank = bank_forks.read().unwrap().root_bank().clone();
// Purge accounts of any dead slots // Purge accounts of any dead slots
Self::remove_dead_slots( Self::remove_dead_slots(
&bank, &bank,
&request_handler, &request_handler,
&mut removed_slots_count, &mut removed_slots_count,
&mut total_remove_slots_time, &mut total_remove_slots_time,
); );
Self::expire_old_recycle_stores(&bank, &mut last_expiration_check_time); Self::expire_old_recycle_stores(&bank, &mut last_expiration_check_time);
// Check to see if there were any requests for snapshotting banks let non_snapshot_time = last_snapshot_end_time
// < the current root bank `bank` above. .map(|last_snapshot_end_time: Instant| {
last_snapshot_end_time.elapsed().as_micros()
})
.unwrap_or_default();
// Claim: Any snapshot request for slot `N` found here implies that the last cleanup // Check to see if there were any requests for snapshotting banks
// slot `M` satisfies `M < N` // < the current root bank `bank` above.
//
// Proof: Assume for contradiction that we find a snapshot request for slot `N` here, // Claim: Any snapshot request for slot `N` found here implies that the last cleanup
// but cleanup has already happened on some slot `M >= N`. Because the call to // slot `M` satisfies `M < N`
// `bank.clean_accounts(true)` (in the code below) implies we only clean slots `<= bank - 1`, //
// then that means in some *previous* iteration of this loop, we must have gotten a root // Proof: Assume for contradiction that we find a snapshot request for slot `N` here,
// bank for slot some slot `R` where `R > N`, but did not see the snapshot for `N` in the // but cleanup has already happened on some slot `M >= N`. Because the call to
// snapshot request channel. // `bank.clean_accounts(true)` (in the code below) implies we only clean slots `<= bank - 1`,
// // then that means in some *previous* iteration of this loop, we must have gotten a root
// However, this is impossible because BankForks.set_root() will always flush the snapshot // bank for slot some slot `R` where `R > N`, but did not see the snapshot for `N` in the
// request for `N` to the snapshot request channel before setting a root `R > N`, and // snapshot request channel.
// snapshot_request_handler.handle_requests() will always look for the latest //
// available snapshot in the channel. // However, this is impossible because BankForks.set_root() will always flush the snapshot
let snapshot_block_height = request_handler.handle_snapshot_requests( // request for `N` to the snapshot request channel before setting a root `R > N`, and
accounts_db_caching_enabled, // snapshot_request_handler.handle_requests() will always look for the latest
test_hash_calculation, // available snapshot in the channel.
use_index_hash_calculation, let snapshot_block_height = request_handler.handle_snapshot_requests(
); accounts_db_caching_enabled,
if accounts_db_caching_enabled { test_hash_calculation,
// Note that the flush will do an internal clean of the use_index_hash_calculation,
// cache up to bank.slot(), so should be safe as long non_snapshot_time,
// as any later snapshots that are taken are of );
// slots >= bank.slot() if snapshot_block_height.is_some() {
bank.flush_accounts_cache_if_needed(); last_snapshot_end_time = Some(Instant::now());
} }
if let Some(snapshot_block_height) = snapshot_block_height {
// Safe, see proof above
assert!(last_cleaned_block_height <= snapshot_block_height);
last_cleaned_block_height = snapshot_block_height;
} else {
if accounts_db_caching_enabled { if accounts_db_caching_enabled {
bank.shrink_candidate_slots(); // Note that the flush will do an internal clean of the
// cache up to bank.slot(), so should be safe as long
// as any later snapshots that are taken are of
// slots >= bank.slot()
bank.flush_accounts_cache_if_needed();
}
if let Some(snapshot_block_height) = snapshot_block_height {
// Safe, see proof above
assert!(last_cleaned_block_height <= snapshot_block_height);
last_cleaned_block_height = snapshot_block_height;
} else { } else {
// under sustained writes, shrink can lag behind so cap to
// SHRUNKEN_ACCOUNT_PER_INTERVAL (which is based on INTERVAL_MS,
// which in turn roughly associated block time)
consumed_budget = bank
.process_stale_slot_with_budget(
consumed_budget,
SHRUNKEN_ACCOUNT_PER_INTERVAL,
)
.min(SHRUNKEN_ACCOUNT_PER_INTERVAL);
}
if bank.block_height() - last_cleaned_block_height
> (CLEAN_INTERVAL_BLOCKS + thread_rng().gen_range(0, 10))
{
if accounts_db_caching_enabled { if accounts_db_caching_enabled {
// Note that the flush will do an internal clean of the bank.shrink_candidate_slots();
// cache up to bank.slot(), so should be safe as long } else {
// as any later snapshots that are taken are of // under sustained writes, shrink can lag behind so cap to
// slots >= bank.slot() // SHRUNKEN_ACCOUNT_PER_INTERVAL (which is based on INTERVAL_MS,
bank.force_flush_accounts_cache(); // which in turn roughly associated block time)
consumed_budget = bank
.process_stale_slot_with_budget(
consumed_budget,
SHRUNKEN_ACCOUNT_PER_INTERVAL,
)
.min(SHRUNKEN_ACCOUNT_PER_INTERVAL);
}
if bank.block_height() - last_cleaned_block_height
> (CLEAN_INTERVAL_BLOCKS + thread_rng().gen_range(0, 10))
{
if accounts_db_caching_enabled {
// Note that the flush will do an internal clean of the
// cache up to bank.slot(), so should be safe as long
// as any later snapshots that are taken are of
// slots >= bank.slot()
bank.force_flush_accounts_cache();
}
bank.clean_accounts(true, false);
last_cleaned_block_height = bank.block_height();
} }
bank.clean_accounts(true, false);
last_cleaned_block_height = bank.block_height();
} }
sleep(Duration::from_millis(INTERVAL_MS));
} }
sleep(Duration::from_millis(INTERVAL_MS));
}) })
.unwrap(); .unwrap();
Self { t_background } Self { t_background }