From db82d9e914d90c5129ede4f337984b1babc03ba9 Mon Sep 17 00:00:00 2001
From: carllin
Date: Thu, 4 Jun 2020 23:32:53 -0700
Subject: [PATCH] Enable more fine-grained control in partition tests (#10418)

Co-authored-by: Carl
---
 local-cluster/src/cluster_tests.rs   |   4 +-
 local-cluster/src/local_cluster.rs   |  26 +++++-
 local-cluster/tests/local_cluster.rs | 121 ++++++++++++---------------
 3 files changed, 80 insertions(+), 71 deletions(-)

diff --git a/local-cluster/src/cluster_tests.rs b/local-cluster/src/cluster_tests.rs
index 0b47ef48df..f0d886dda0 100644
--- a/local-cluster/src/cluster_tests.rs
+++ b/local-cluster/src/cluster_tests.rs
@@ -284,7 +284,7 @@ pub fn kill_entry_and_spend_and_verify_rest(
     }
 }
 
-pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo]) {
+pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo], test_name: &str) {
     let mut roots = vec![HashSet::new(); contact_infos.len()];
     let mut done = false;
     let mut last_print = Instant::now();
@@ -295,7 +295,7 @@ pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo])
             roots[i].insert(slot);
             let min_node = roots.iter().map(|r| r.len()).min().unwrap_or(0);
             if last_print.elapsed().as_secs() > 3 {
-                info!("PARTITION_TEST min observed roots {}/16", min_node);
+                info!("{} min observed roots {}/16", test_name, min_node);
                 last_print = Instant::now();
             }
             done = min_node >= num_new_roots;
diff --git a/local-cluster/src/local_cluster.rs b/local-cluster/src/local_cluster.rs
index 960db5dbfb..7c04f8f4c0 100644
--- a/local-cluster/src/local_cluster.rs
+++ b/local-cluster/src/local_cluster.rs
@@ -1,4 +1,7 @@
-use crate::cluster::{Cluster, ClusterValidatorInfo, ValidatorInfo};
+use crate::{
+    cluster::{Cluster, ClusterValidatorInfo, ValidatorInfo},
+    cluster_tests,
+};
 use itertools::izip;
 use log::*;
 use solana_client::thin_client::{create_client, ThinClient};
@@ -137,7 +140,7 @@ impl LocalCluster {
             OperatingMode::Stable | OperatingMode::Preview => {
                 genesis_config.native_instruction_processors =
                     solana_genesis_programs::get_programs(genesis_config.operating_mode, 0)
-                        .unwrap_or_else(|| vec![])
+                        .unwrap_or_default()
             }
             _ => (),
         }
@@ -336,6 +339,25 @@ impl LocalCluster {
         Self::transfer_with_client(&client, source_keypair, dest_pubkey, lamports)
     }
 
+    pub fn check_for_new_roots(&self, num_new_roots: usize, test_name: &str) {
+        let alive_node_contact_infos: Vec<_> = self
+            .validators
+            .values()
+            .map(|v| v.info.contact_info.clone())
+            .collect();
+        assert!(!alive_node_contact_infos.is_empty());
+        info!("{} discovering nodes", test_name);
+        let cluster_nodes = discover_cluster(
+            &alive_node_contact_infos[0].gossip,
+            alive_node_contact_infos.len(),
+        )
+        .unwrap();
+        info!("{} discovered {} nodes", test_name, cluster_nodes.len());
+        info!("{} looking for new roots on all nodes", test_name);
+        cluster_tests::check_for_new_roots(num_new_roots, &alive_node_contact_infos, test_name);
+        info!("{} done waiting for roots", test_name);
+    }
+
     fn transfer_with_client(
         client: &ThinClient,
         source_keypair: &Keypair,
diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs
index 9ef8e3f3bd..ff096da535 100644
--- a/local-cluster/tests/local_cluster.rs
+++ b/local-cluster/tests/local_cluster.rs
@@ -207,16 +207,21 @@ fn test_leader_failure_4() {
 /// * `leader_schedule` - An option that specifies whether the cluster should
 ///   run with a fixed, predetermined leader schedule
 #[allow(clippy::cognitive_complexity)]
-fn run_cluster_partition(
-    partitions: &[&[(usize, bool)]],
+fn run_cluster_partition<E, F>(
+    partitions: &[&[usize]],
     leader_schedule: Option<(LeaderSchedule, Vec<Arc<Keypair>>)>,
-) {
+    on_partition_start: E,
+    on_partition_resolved: F,
+) where
+    E: Fn(&mut LocalCluster) -> (),
+    F: Fn(&mut LocalCluster) -> (),
+{
     solana_logger::setup();
     info!("PARTITION_TEST!");
     let num_nodes = partitions.len();
     let node_stakes: Vec<_> = partitions
         .iter()
-        .flat_map(|p| p.iter().map(|(stake_weight, _)| 100 * *stake_weight as u64))
+        .flat_map(|p| p.iter().map(|stake_weight| 100 * *stake_weight as u64))
         .collect();
     assert_eq!(node_stakes.len(), num_nodes);
     let cluster_lamports = node_stakes.iter().sum::<u64>() * 2;
@@ -226,7 +231,7 @@ fn run_cluster_partition(
     validator_config.enable_partition = Some(enable_partition.clone());
 
     // Returns:
-    // 1) The keys for the validiators
+    // 1) The keys for the validators
     // 2) The amount of time it would take to iterate through one full iteration of the given
     //    leader schedule
     let (validator_keys, leader_schedule_time): (Vec<_>, u64) = {
@@ -252,7 +257,6 @@ fn run_cluster_partition(
         }
     };
 
-    let validator_pubkeys: Vec<_> = validator_keys.iter().map(|v| v.pubkey()).collect();
     let config = ClusterConfig {
         cluster_lamports,
         node_stakes,
@@ -287,7 +291,8 @@ fn run_cluster_partition(
 
         if reached_epoch {
             info!("PARTITION_TEST start partition");
-            enable_partition.clone().store(false, Ordering::Relaxed);
+            enable_partition.store(false, Ordering::Relaxed);
+            on_partition_start(&mut cluster);
             break;
         } else {
             sleep(Duration::from_millis(100));
@@ -298,56 +303,22 @@ fn run_cluster_partition(
     info!("PARTITION_TEST remove partition");
     enable_partition.store(true, Ordering::Relaxed);
 
-    let mut dead_nodes = HashSet::new();
-    let mut alive_node_contact_infos = vec![];
-    let should_exits: Vec<_> = partitions
-        .iter()
-        .flat_map(|p| p.iter().map(|(_, should_exit)| should_exit))
-        .collect();
-    assert_eq!(should_exits.len(), validator_pubkeys.len());
-    let timeout = 10;
-    if timeout > 0 {
-        // Give partitions time to propagate their blocks from during the partition
-        // after the partition resolves
-        let propagation_time = leader_schedule_time;
-        info!("PARTITION_TEST resolving partition. sleeping {}ms", timeout);
-        sleep(Duration::from_millis(10_000));
-        info!(
-            "PARTITION_TEST waiting for blocks to propagate after partition {}ms",
-            propagation_time
-        );
-        sleep(Duration::from_millis(propagation_time));
-        info!("PARTITION_TEST resuming normal operation");
-        for (pubkey, should_exit) in validator_pubkeys.iter().zip(should_exits) {
-            if *should_exit {
-                info!("Killing validator with id: {}", pubkey);
-                cluster.exit_node(pubkey);
-                dead_nodes.insert(*pubkey);
-            } else {
-                alive_node_contact_infos.push(
-                    cluster
-                        .validators
-                        .get(pubkey)
-                        .unwrap()
-                        .info
-                        .contact_info
-                        .clone(),
-                );
-            }
-        }
-    }
-
-    assert_eq!(alive_node_contact_infos.is_empty(), false);
-    info!("PARTITION_TEST discovering nodes");
-    let cluster_nodes = discover_cluster(
-        &alive_node_contact_infos[0].gossip,
-        alive_node_contact_infos.len(),
-    )
-    .unwrap();
-    info!("PARTITION_TEST discovered {} nodes", cluster_nodes.len());
-    info!("PARTITION_TEST looking for new roots on all nodes");
-    cluster_tests::check_for_new_roots(16, &alive_node_contact_infos);
-    info!("PARTITION_TEST done waiting for roots");
+    // Give partitions time to propagate their blocks from during the partition
+    // after the partition resolves
+    let timeout = 10_000;
+    let propagation_time = leader_schedule_time;
+    info!(
+        "PARTITION_TEST resolving partition. sleeping {} ms",
+        timeout
+    );
+    sleep(Duration::from_millis(timeout));
+    info!(
+        "PARTITION_TEST waiting for blocks to propagate after partition {}ms",
+        propagation_time
+    );
+    sleep(Duration::from_millis(propagation_time));
+    info!("PARTITION_TEST resuming normal operation");
+    on_partition_resolved(&mut cluster);
 }
 
 #[allow(unused_attributes)]
 #[test]
 #[serial]
 fn test_cluster_partition_1_2() {
-    run_cluster_partition(&[&[(1, false)], &[(1, false), (1, false)]], None)
+    let empty = |_: &mut LocalCluster| {};
+    let on_partition_resolved = |cluster: &mut LocalCluster| {
+        cluster.check_for_new_roots(16, &"PARTITION_TEST");
+    };
+    run_cluster_partition(&[&[1], &[1, 1]], None, empty, on_partition_resolved)
 }
 
 #[allow(unused_attributes)]
 #[test]
 #[serial]
 fn test_cluster_partition_1_1() {
-    run_cluster_partition(&[&[(1, false)], &[(1, false)]], None)
+    let empty = |_: &mut LocalCluster| {};
+    let on_partition_resolved = |cluster: &mut LocalCluster| {
+        cluster.check_for_new_roots(16, &"PARTITION_TEST");
+    };
+    run_cluster_partition(&[&[1], &[1]], None, empty, on_partition_resolved)
 }
 
 #[test]
 #[serial]
 fn test_cluster_partition_1_1_1() {
-    run_cluster_partition(&[&[(1, false)], &[(1, false)], &[(1, false)]], None)
+    let empty = |_: &mut LocalCluster| {};
+    let on_partition_resolved = |cluster: &mut LocalCluster| {
+        cluster.check_for_new_roots(16, &"PARTITION_TEST");
+    };
+    run_cluster_partition(&[&[1], &[1], &[1]], None, empty, on_partition_resolved)
 }
 
 #[test]
 #[serial]
 fn test_kill_partition() {
@@ -387,7 +370,7 @@ fn test_kill_partition() {
     // 5) Check for recovery
     let mut leader_schedule = vec![];
     let num_slots_per_validator = 8;
-    let partitions: [&[(usize, bool)]; 3] = [&[(9, true)], &[(10, false)], &[(10, false)]];
+    let partitions: [&[usize]; 3] = [&[9], &[10], &[10]];
     let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new()))
         .take(partitions.len())
         .collect();
     }
     info!("leader_schedule: {}", leader_schedule.len());
 
+    let empty = |_: &mut LocalCluster| {};
+    let validator_to_kill = validator_keys[0].pubkey();
+    let on_partition_resolved = |cluster: &mut LocalCluster| {
+        info!("Killing validator with id: {}", validator_to_kill);
+        cluster.exit_node(&validator_to_kill);
+        cluster.check_for_new_roots(16, &"PARTITION_TEST");
+    };
     run_cluster_partition(
         &partitions,
         Some((
             LeaderSchedule::new_from_schedule(leader_schedule),
             validator_keys,
         )),
+        empty,
+        on_partition_resolved,
     )
 }
 
@@ -1113,12 +1105,7 @@ fn test_faulty_node(faulty_node_type: BroadcastStageType) {
     let cluster = LocalCluster::new(&cluster_config);
 
     // Check for new roots
-    let alive_node_contact_infos: Vec<_> = cluster
-        .validators
-        .values()
-        .map(|v| v.info.contact_info.clone())
-        .collect();
-    cluster_tests::check_for_new_roots(16, &alive_node_contact_infos);
+    cluster.check_for_new_roots(16, &"test_faulty_node");
 }
 
 #[test]