Fix intermittent failures in test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_startup() (#24440)

This commit is contained in:
Brooks Prumo 2022-04-18 13:08:06 -05:00 committed by GitHub
parent bcc5b96bda
commit 1769a4e7b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 83 additions and 74 deletions

View File

@ -891,7 +891,7 @@ fn test_incremental_snapshot_download() {
#[test] #[test]
#[serial] #[serial]
fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_startup() { fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_startup() {
solana_logger::setup_with_default(RUST_LOG_FILTER); solana_logger::setup_with_default("error,local_cluster=info");
// If these intervals change, also make sure to change the loop timers accordingly. // If these intervals change, also make sure to change the loop timers accordingly.
let accounts_hash_interval = 3; let accounts_hash_interval = 3;
let incremental_snapshot_interval = accounts_hash_interval * 3; let incremental_snapshot_interval = accounts_hash_interval * 3;
@ -951,30 +951,28 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
); );
info!("Waiting for leader to create snapshots..."); info!("Waiting for leader to create snapshots...");
let (incremental_snapshot_archive_info, full_snapshot_archive_info) = let (incremental_snapshot_archive, full_snapshot_archive) =
LocalCluster::wait_for_next_incremental_snapshot( LocalCluster::wait_for_next_incremental_snapshot(
&cluster, &cluster,
leader_snapshot_test_config.snapshot_archives_dir.path(), leader_snapshot_test_config.snapshot_archives_dir.path(),
); );
debug!( debug!(
"Found snapshots:\n\tfull snapshot: {}\n\tincremental snapshot: {}", "Found snapshots:\n\tfull snapshot: {}\n\tincremental snapshot: {}",
full_snapshot_archive_info.path().display(), full_snapshot_archive.path().display(),
incremental_snapshot_archive_info.path().display() incremental_snapshot_archive.path().display()
); );
assert_eq!( assert_eq!(
full_snapshot_archive_info.slot(), full_snapshot_archive.slot(),
incremental_snapshot_archive_info.base_slot() incremental_snapshot_archive.base_slot()
); );
info!("Waiting for leader to create snapshots... DONE");
// Download the snapshots, then boot a validator from them. // Download the snapshots, then boot a validator from them.
info!("Downloading full snapshot to validator..."); info!("Downloading full snapshot to validator...");
download_snapshot_archive( download_snapshot_archive(
&cluster.entry_point_info.rpc, &cluster.entry_point_info.rpc,
validator_snapshot_test_config.snapshot_archives_dir.path(), validator_snapshot_test_config.snapshot_archives_dir.path(),
( (full_snapshot_archive.slot(), *full_snapshot_archive.hash()),
full_snapshot_archive_info.slot(),
*full_snapshot_archive_info.hash(),
),
SnapshotType::FullSnapshot, SnapshotType::FullSnapshot,
validator_snapshot_test_config validator_snapshot_test_config
.validator_config .validator_config
@ -992,25 +990,25 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
&mut None, &mut None,
) )
.unwrap(); .unwrap();
let downloaded_full_snapshot_archive_info = let downloaded_full_snapshot_archive = snapshot_utils::get_highest_full_snapshot_archive_info(
snapshot_utils::get_highest_full_snapshot_archive_info( validator_snapshot_test_config.snapshot_archives_dir.path(),
validator_snapshot_test_config.snapshot_archives_dir.path(), )
) .unwrap();
.unwrap();
debug!( debug!(
"Downloaded full snapshot, slot: {}", "Downloaded full snapshot, slot: {}",
downloaded_full_snapshot_archive_info.slot() downloaded_full_snapshot_archive.slot()
); );
info!("Downloading full snapshot to validator... DONE");
info!("Downloading incremental snapshot to validator..."); info!("Downloading incremental snapshot to validator...");
download_snapshot_archive( download_snapshot_archive(
&cluster.entry_point_info.rpc, &cluster.entry_point_info.rpc,
validator_snapshot_test_config.snapshot_archives_dir.path(), validator_snapshot_test_config.snapshot_archives_dir.path(),
( (
incremental_snapshot_archive_info.slot(), incremental_snapshot_archive.slot(),
*incremental_snapshot_archive_info.hash(), *incremental_snapshot_archive.hash(),
), ),
SnapshotType::IncrementalSnapshot(incremental_snapshot_archive_info.base_slot()), SnapshotType::IncrementalSnapshot(incremental_snapshot_archive.base_slot()),
validator_snapshot_test_config validator_snapshot_test_config
.validator_config .validator_config
.snapshot_config .snapshot_config
@ -1027,21 +1025,22 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
&mut None, &mut None,
) )
.unwrap(); .unwrap();
let downloaded_incremental_snapshot_archive_info = let downloaded_incremental_snapshot_archive =
snapshot_utils::get_highest_incremental_snapshot_archive_info( snapshot_utils::get_highest_incremental_snapshot_archive_info(
validator_snapshot_test_config.snapshot_archives_dir.path(), validator_snapshot_test_config.snapshot_archives_dir.path(),
full_snapshot_archive_info.slot(), full_snapshot_archive.slot(),
) )
.unwrap(); .unwrap();
debug!( debug!(
"Downloaded incremental snapshot, slot: {}, base slot: {}", "Downloaded incremental snapshot, slot: {}, base slot: {}",
downloaded_incremental_snapshot_archive_info.slot(), downloaded_incremental_snapshot_archive.slot(),
downloaded_incremental_snapshot_archive_info.base_slot(), downloaded_incremental_snapshot_archive.base_slot(),
); );
assert_eq!( assert_eq!(
downloaded_full_snapshot_archive_info.slot(), downloaded_full_snapshot_archive.slot(),
downloaded_incremental_snapshot_archive_info.base_slot() downloaded_incremental_snapshot_archive.base_slot()
); );
info!("Downloading incremental snapshot to validator... DONE");
// closure to copy files in a directory to another directory // closure to copy files in a directory to another directory
let copy_files = |from: &Path, to: &Path| { let copy_files = |from: &Path, to: &Path| {
@ -1109,7 +1108,7 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
backup_validator_snapshot_archives_dir.path(), backup_validator_snapshot_archives_dir.path(),
); );
info!("Starting a new validator..."); info!("Starting the validator...");
let validator_identity = Arc::new(Keypair::new()); let validator_identity = Arc::new(Keypair::new());
cluster.add_validator( cluster.add_validator(
&validator_snapshot_test_config.validator_config, &validator_snapshot_test_config.validator_config,
@ -1118,11 +1117,14 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
None, None,
SocketAddrSpace::Unspecified, SocketAddrSpace::Unspecified,
); );
info!("Starting the validator... DONE");
// To ensure that a snapshot will be taken during startup, the blockstore needs to have roots // To ensure that a snapshot will be taken during startup, the blockstore needs to have roots
// that cross a full snapshot interval. // that cross a full snapshot interval.
info!("Waiting for the validator to see enough slots to cross a full snapshot interval..."); info!("Waiting for the validator to see enough slots to cross a full snapshot interval...");
let starting_slot = incremental_snapshot_archive_info.slot(); let starting_slot = incremental_snapshot_archive.slot();
let next_full_snapshot_slot = starting_slot + full_snapshot_interval;
trace!("next full snapshot slot: {next_full_snapshot_slot}");
let timer = Instant::now(); let timer = Instant::now();
loop { loop {
let validator_current_slot = cluster let validator_current_slot = cluster
@ -1130,6 +1132,7 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
.unwrap() .unwrap()
.get_slot_with_commitment(CommitmentConfig::finalized()) .get_slot_with_commitment(CommitmentConfig::finalized())
.unwrap(); .unwrap();
trace!("validator current slot: {validator_current_slot}");
if validator_current_slot > (starting_slot + full_snapshot_interval) { if validator_current_slot > (starting_slot + full_snapshot_interval) {
break; break;
} }
@ -1139,7 +1142,10 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
); );
std::thread::yield_now(); std::thread::yield_now();
} }
trace!("Waited {:?}", timer.elapsed()); debug!("Waited {:?}", timer.elapsed());
info!(
"Waiting for the validator to see enough slots to cross a full snapshot interval... DONE"
);
// Get the highest full snapshot archive info for the validator, now that it has crossed the // Get the highest full snapshot archive info for the validator, now that it has crossed the
// next full snapshot interval. We are going to use this to look up the same snapshot on the // next full snapshot interval. We are going to use this to look up the same snapshot on the
@ -1147,7 +1153,7 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
// during startup. This ensures the snapshot creation process during startup is correct. // during startup. This ensures the snapshot creation process during startup is correct.
// //
// Putting this all in its own block so its clear we're only intended to keep the leader's info // Putting this all in its own block so its clear we're only intended to keep the leader's info
let leader_full_snapshot_archive_info_for_comparison = { let leader_full_snapshot_archive_for_comparison = {
let validator_full_snapshot = snapshot_utils::get_highest_full_snapshot_archive_info( let validator_full_snapshot = snapshot_utils::get_highest_full_snapshot_archive_info(
validator_snapshot_test_config.snapshot_archives_dir.path(), validator_snapshot_test_config.snapshot_archives_dir.path(),
) )
@ -1161,10 +1167,6 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
full_snapshot.slot() == validator_full_snapshot.slot() full_snapshot.slot() == validator_full_snapshot.slot()
&& full_snapshot.hash() == validator_full_snapshot.hash() && full_snapshot.hash() == validator_full_snapshot.hash()
}); });
// NOTE: If this unwrap() ever fails, it may be that the leader's old full snapshot archives
// were purged. If that happens, increase the maximum_full_snapshot_archives_to_retain
// in the leader's Snapshotconfig.
let leader_full_snapshot = leader_full_snapshots.first().unwrap(); let leader_full_snapshot = leader_full_snapshots.first().unwrap();
// And for sanity, the full snapshot from the leader and the validator MUST be the same // And for sanity, the full snapshot from the leader and the validator MUST be the same
@ -1178,15 +1180,17 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
leader_full_snapshot.clone() leader_full_snapshot.clone()
}; };
trace!("leader full snapshot archive for comparison: {leader_full_snapshot_archive_for_comparison:#?}");
trace!( info!("Delete all the snapshots on the validator and restore the originals from the backup...");
"Delete all the snapshots on the validator and restore the originals from the backup..."
);
delete_files_with_remote(validator_snapshot_test_config.snapshot_archives_dir.path()); delete_files_with_remote(validator_snapshot_test_config.snapshot_archives_dir.path());
copy_files_with_remote( copy_files_with_remote(
backup_validator_snapshot_archives_dir.path(), backup_validator_snapshot_archives_dir.path(),
validator_snapshot_test_config.snapshot_archives_dir.path(), validator_snapshot_test_config.snapshot_archives_dir.path(),
); );
info!(
"Delete all the snapshots on the validator and restore the originals from the backup... DONE"
);
// Get the highest full snapshot slot *before* restarting, as a comparison // Get the highest full snapshot slot *before* restarting, as a comparison
let validator_full_snapshot_slot_at_startup = let validator_full_snapshot_slot_at_startup =
@ -1194,6 +1198,7 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
validator_snapshot_test_config.snapshot_archives_dir.path(), validator_snapshot_test_config.snapshot_archives_dir.path(),
) )
.unwrap(); .unwrap();
trace!("validator full snapshot slot at startup: {validator_full_snapshot_slot_at_startup}");
info!("Restarting the validator..."); info!("Restarting the validator...");
let validator_info = cluster.exit_node(&validator_identity.pubkey()); let validator_info = cluster.exit_node(&validator_identity.pubkey());
@ -1202,63 +1207,64 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
validator_info, validator_info,
SocketAddrSpace::Unspecified, SocketAddrSpace::Unspecified,
); );
info!("Restarting the validator... DONE");
// Now, we want to ensure that the validator can make a new incremental snapshot based on the // Now, we want to ensure that the validator can make a new incremental snapshot based on the
// new full snapshot that was created during the restart. // new full snapshot that was created during the restart.
info!("Waiting for the validator to make new snapshots...");
let validator_next_full_snapshot_slot =
validator_full_snapshot_slot_at_startup + full_snapshot_interval;
let validator_next_incremental_snapshot_slot =
validator_next_full_snapshot_slot + incremental_snapshot_interval;
trace!("Waiting for validator next full snapshot slot: {validator_next_full_snapshot_slot}");
trace!("Waiting for validator next incremental snapshot slot: {validator_next_incremental_snapshot_slot}");
let timer = Instant::now(); let timer = Instant::now();
let ( loop {
validator_highest_full_snapshot_archive_info, if let Some(full_snapshot_slot) = snapshot_utils::get_highest_full_snapshot_archive_slot(
_validator_highest_incremental_snapshot_archive_info, validator_snapshot_test_config.snapshot_archives_dir.path(),
) = loop { ) {
if let Some(highest_full_snapshot_info) = if full_snapshot_slot >= validator_next_full_snapshot_slot {
snapshot_utils::get_highest_full_snapshot_archive_info( if let Some(incremental_snapshot_slot) =
validator_snapshot_test_config.snapshot_archives_dir.path(), snapshot_utils::get_highest_incremental_snapshot_archive_slot(
)
{
if highest_full_snapshot_info.slot() > validator_full_snapshot_slot_at_startup {
if let Some(highest_incremental_snapshot_info) =
snapshot_utils::get_highest_incremental_snapshot_archive_info(
validator_snapshot_test_config.snapshot_archives_dir.path(), validator_snapshot_test_config.snapshot_archives_dir.path(),
highest_full_snapshot_info.slot(), full_snapshot_slot,
) )
{ {
info!("Success! Made new full and incremental snapshots!"); if incremental_snapshot_slot >= validator_next_incremental_snapshot_slot {
trace!( // specific incremental snapshot is not important, just that one was created
"Full snapshot slot: {}, incremental snapshot slot: {}", debug!(
highest_full_snapshot_info.slot(), "Validator made new snapshots, full snapshot slot: {}, incremental snapshot slot: {}",
highest_incremental_snapshot_info.slot(), full_snapshot_slot,
); incremental_snapshot_slot,
break ( );
highest_full_snapshot_info, break;
highest_incremental_snapshot_info, }
);
} }
} }
} }
assert!( assert!(
timer.elapsed() < Duration::from_secs(10), timer.elapsed() < Duration::from_secs(10),
"It should not take longer than 10 seconds to cross the next incremental snapshot interval." "It should not take longer than 10 seconds to cross the next incremental snapshot interval."
); );
std::thread::yield_now(); std::thread::yield_now();
}; }
trace!("Waited {:?}", timer.elapsed()); debug!("Waited {:?}", timer.elapsed());
info!("Waiting for the validator to make new snapshots... DONE");
// Check to make sure that the full snapshot the validator created during startup is the same // Check to make sure that the full snapshot the validator created during startup is the same
// as the snapshot the leader created. // as the snapshot the leader created.
// NOTE: If the assert fires and the _slots_ don't match (specifically are off by a full let validator_full_snapshot_archives = snapshot_utils::get_full_snapshot_archives(
// snapshot interval), then that means the loop to get the validator_snapshot_test_config.snapshot_archives_dir.path(),
// `validator_highest_full_snapshot_archive_info` saw the wrong one, and that may've been due );
// to some weird scheduling/delays on the machine running the test. Run the test again. If let validator_full_snapshot_archive_for_comparison = validator_full_snapshot_archives
// this ever fails repeatedly then the test will need to be modified to handle this case. .into_iter()
.find(|validator_full_snapshot_archive| validator_full_snapshot_archive.slot() == leader_full_snapshot_archive_for_comparison.slot())
.expect("validator shall have a full snapshot with the same slot as the one that will be used for comparison with the leader");
trace!("Validator full snapshot archive for comparison: {validator_full_snapshot_archive_for_comparison:#?}");
assert_eq!( assert_eq!(
( validator_full_snapshot_archive_for_comparison.hash(),
validator_highest_full_snapshot_archive_info.slot(), leader_full_snapshot_archive_for_comparison.hash(),
validator_highest_full_snapshot_archive_info.hash()
),
(
leader_full_snapshot_archive_info_for_comparison.slot(),
leader_full_snapshot_archive_info_for_comparison.hash()
)
); );
// And lastly, startup another node with the new snapshots to ensure they work // And lastly, startup another node with the new snapshots to ensure they work
@ -1290,6 +1296,7 @@ fn test_incremental_snapshot_download_with_crossing_full_snapshot_interval_at_st
None, None,
SocketAddrSpace::Unspecified, SocketAddrSpace::Unspecified,
); );
info!("Starting final validator... DONE");
// Success! // Success!
} }
@ -2892,6 +2899,8 @@ impl SnapshotValidatorConfig {
incremental_snapshot_archive_interval_slots, incremental_snapshot_archive_interval_slots,
snapshot_archives_dir: snapshot_archives_dir.path().to_path_buf(), snapshot_archives_dir: snapshot_archives_dir.path().to_path_buf(),
bank_snapshots_dir: bank_snapshots_dir.path().to_path_buf(), bank_snapshots_dir: bank_snapshots_dir.path().to_path_buf(),
maximum_full_snapshot_archives_to_retain: usize::MAX,
maximum_incremental_snapshot_archives_to_retain: usize::MAX,
..SnapshotConfig::default() ..SnapshotConfig::default()
}; };