Remove repairman as it is spamming the cluster with unwanted repairs (#8193)

* Remove repairman as it is spamming the cluster with unwanted repairs * Remove obsolete test
2020-02-10 17:00:00 -08:00 · 2020-02-10 17:00:00 -08:00 · 1d06aa3b31
parent 0b263f8714
commit 1d06aa3b31
4 changed files with 3 additions and 1287 deletions
--- a/core/src/cluster_info_repair_listener.rs
+++ b/core/src/cluster_info_repair_listener.rs
--- a/core/src/lib.rs
+++ b/core/src/lib.rs
@ -15,7 +15,6 @@ pub mod contact_info;
 pub mod blockstream;
 pub mod blockstream_service;
 pub mod cluster_info;
-pub mod cluster_info_repair_listener;
 pub mod consensus;
 pub mod crds;
 pub mod crds_gossip;
--- a/core/src/repair_service.rs
+++ b/core/src/repair_service.rs
@ -1,9 +1,6 @@
 //! The `repair_service` module implements the tools necessary to generate a thread which
 //! regularly finds missing shreds in the ledger and sends repair requests for those shreds
-use crate::{
-    cluster_info::ClusterInfo, cluster_info_repair_listener::ClusterInfoRepairListener,
-    result::Result,
-};
+use crate::{cluster_info::ClusterInfo, result::Result};
 use solana_ledger::{
    bank_forks::BankForks,
    blockstore::{Blockstore, CompletedSlotsReceiver, SlotMeta},
@ -66,7 +63,6 @@ impl Default for RepairSlotRange {

 pub struct RepairService {
    t_repair: JoinHandle<()>,
-    cluster_info_repair_listener: Option<ClusterInfoRepairListener>,
 }

 impl RepairService {
@ -77,19 +73,6 @@ impl RepairService {
        cluster_info: Arc<RwLock<ClusterInfo>>,
        repair_strategy: RepairStrategy,
    ) -> Self {
-        let cluster_info_repair_listener = match repair_strategy {
-            RepairStrategy::RepairAll {
-                ref epoch_schedule, ..
-            } => Some(ClusterInfoRepairListener::new(
-                &blockstore,
-                &exit,
-                cluster_info.clone(),
-                *epoch_schedule,
-            )),
-
-            _ => None,
-        };
-
        let t_repair = Builder::new()
            .name("solana-repair-service".to_string())
            .spawn(move || {
@ -103,10 +86,7 @@ impl RepairService {
            })
            .unwrap();

-        RepairService {
-            t_repair,
-            cluster_info_repair_listener,
-        }
+        RepairService { t_repair }
    }

    fn run(
@ -391,14 +371,7 @@ impl RepairService {
    }

    pub fn join(self) -> thread::Result<()> {
-        let mut results = vec![self.t_repair.join()];
-        if let Some(cluster_info_repair_listener) = self.cluster_info_repair_listener {
-            results.push(cluster_info_repair_listener.join());
-        }
-        for r in results {
-            r?;
-        }
-        Ok(())
+        self.t_repair.join()
    }
 }

--- a/local-cluster/tests/local_cluster.rs
+++ b/local-cluster/tests/local_cluster.rs
@ -943,100 +943,6 @@ fn test_no_voting() {
    }
 }

-#[test]
-fn test_repairman_catchup() {
-    solana_logger::setup();
-    error!("test_repairman_catchup");
-    run_repairman_catchup(3);
-}
-
-fn run_repairman_catchup(num_repairmen: u64) {
-    let mut validator_config = ValidatorConfig::default();
-    let num_ticks_per_second = 100;
-    let num_ticks_per_slot = 40;
-    let num_slots_per_epoch = MINIMUM_SLOTS_PER_EPOCH as u64;
-    let num_root_buffer_slots = 10;
-    // Calculate the leader schedule num_root_buffer_slots ahead. Otherwise, if stakers_slot_offset ==
-    // num_slots_per_epoch, and num_slots_per_epoch == MINIMUM_SLOTS_PER_EPOCH, then repairmen
-    // will stop sending repairs after the last slot in epoch 1 (0-indexed), because the root
-    // is at most in the first epoch.
-    //
-    // For example:
-    // Assume:
-    // 1) num_slots_per_epoch = 32
-    // 2) stakers_slot_offset = 32
-    // 3) MINIMUM_SLOTS_PER_EPOCH = 32
-    //
-    // Then the last slot in epoch 1 is slot 63. After completing slots 0 to 63, the root on the
-    // repairee is at most 31. Because, the stakers_slot_offset == 32, then the max confirmed epoch
-    // on the repairee is epoch 1.
-    // Thus the repairmen won't send any slots past epoch 1, slot 63 to this repairee until the repairee
-    // updates their root, and the repairee can't update their root until they get slot 64, so no progress
-    // is made. This is also not accounting for the fact that the repairee may not vote on every slot, so
-    // their root could actually be much less than 31. This is why we give a num_root_buffer_slots buffer.
-    let stakers_slot_offset = num_slots_per_epoch + num_root_buffer_slots;
-
-    validator_config.rpc_config.enable_validator_exit = true;
-
-    let lamports_per_repairman = 1000;
-
-    // Make the repairee_stake small relative to the repairmen stake so that the repairee doesn't
-    // get included in the leader schedule, causing slots to get skipped while it's still trying
-    // to catch up
-    let repairee_stake = 3;
-    let cluster_lamports = 2 * lamports_per_repairman * num_repairmen + repairee_stake;
-    let node_stakes: Vec<_> = (0..num_repairmen).map(|_| lamports_per_repairman).collect();
-    let mut cluster = LocalCluster::new(&ClusterConfig {
-        node_stakes,
-        cluster_lamports,
-        validator_configs: vec![validator_config.clone(); num_repairmen as usize],
-        ticks_per_slot: num_ticks_per_slot,
-        slots_per_epoch: num_slots_per_epoch,
-        stakers_slot_offset,
-        poh_config: PohConfig::new_sleep(Duration::from_millis(1000 / num_ticks_per_second)),
-        ..ClusterConfig::default()
-    });
-
-    let repairman_pubkeys: HashSet<_> = cluster.get_node_pubkeys().into_iter().collect();
-    let epoch_schedule = EpochSchedule::custom(num_slots_per_epoch, stakers_slot_offset, true);
-    let num_warmup_epochs = epoch_schedule.get_leader_schedule_epoch(0) + 1;
-
-    // Sleep for longer than the first N warmup epochs, with a one epoch buffer for timing issues
-    cluster_tests::sleep_n_epochs(
-        num_warmup_epochs as f64 + 1.0,
-        &cluster.genesis_config.poh_config,
-        num_ticks_per_slot,
-        num_slots_per_epoch,
-    );
-
-    // Start up a new node, wait for catchup. Backwards repair won't be sufficient because the
-    // leader is sending shreds past this validator's first two confirmed epochs. Thus, the repairman
-    // protocol will have to kick in for this validator to repair.
-    cluster.add_validator(&validator_config, repairee_stake, Arc::new(Keypair::new()));
-
-    let all_pubkeys = cluster.get_node_pubkeys();
-    let repairee_id = all_pubkeys
-        .into_iter()
-        .find(|x| !repairman_pubkeys.contains(x))
-        .unwrap();
-
-    // Wait for repairman protocol to catch this validator up
-    let repairee_client = cluster.get_validator_client(&repairee_id).unwrap();
-    let mut current_slot = 0;
-
-    // Make sure this validator can get repaired past the first few warmup epochs
-    let target_slot = (num_warmup_epochs) * num_slots_per_epoch + 1;
-    while current_slot <= target_slot {
-        trace!("current_slot: {}", current_slot);
-        if let Ok(slot) = repairee_client.get_slot_with_commitment(CommitmentConfig::recent()) {
-            current_slot = slot;
-        } else {
-            continue;
-        }
-        sleep(Duration::from_secs(1));
-    }
-}
-
 fn wait_for_next_snapshot<P: AsRef<Path>>(cluster: &LocalCluster, tar: P) {
    // Get slot after which this was generated
    let client = cluster