Remove repairman as it is spamming the cluster with unwanted repairs (#8193)
* Remove repairman as it is spamming the cluster with unwanted repairs * remove obsolete test
This commit is contained in:
parent
0b263f8714
commit
1d06aa3b31
File diff suppressed because it is too large
Load Diff
|
@ -15,7 +15,6 @@ pub mod contact_info;
|
|||
pub mod blockstream;
|
||||
pub mod blockstream_service;
|
||||
pub mod cluster_info;
|
||||
pub mod cluster_info_repair_listener;
|
||||
pub mod consensus;
|
||||
pub mod crds;
|
||||
pub mod crds_gossip;
|
||||
|
|
|
@ -1,9 +1,6 @@
|
|||
//! The `repair_service` module implements the tools necessary to generate a thread which
|
||||
//! regularly finds missing shreds in the ledger and sends repair requests for those shreds
|
||||
use crate::{
|
||||
cluster_info::ClusterInfo, cluster_info_repair_listener::ClusterInfoRepairListener,
|
||||
result::Result,
|
||||
};
|
||||
use crate::{cluster_info::ClusterInfo, result::Result};
|
||||
use solana_ledger::{
|
||||
bank_forks::BankForks,
|
||||
blockstore::{Blockstore, CompletedSlotsReceiver, SlotMeta},
|
||||
|
@ -66,7 +63,6 @@ impl Default for RepairSlotRange {
|
|||
|
||||
pub struct RepairService {
|
||||
t_repair: JoinHandle<()>,
|
||||
cluster_info_repair_listener: Option<ClusterInfoRepairListener>,
|
||||
}
|
||||
|
||||
impl RepairService {
|
||||
|
@ -77,19 +73,6 @@ impl RepairService {
|
|||
cluster_info: Arc<RwLock<ClusterInfo>>,
|
||||
repair_strategy: RepairStrategy,
|
||||
) -> Self {
|
||||
let cluster_info_repair_listener = match repair_strategy {
|
||||
RepairStrategy::RepairAll {
|
||||
ref epoch_schedule, ..
|
||||
} => Some(ClusterInfoRepairListener::new(
|
||||
&blockstore,
|
||||
&exit,
|
||||
cluster_info.clone(),
|
||||
*epoch_schedule,
|
||||
)),
|
||||
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let t_repair = Builder::new()
|
||||
.name("solana-repair-service".to_string())
|
||||
.spawn(move || {
|
||||
|
@ -103,10 +86,7 @@ impl RepairService {
|
|||
})
|
||||
.unwrap();
|
||||
|
||||
RepairService {
|
||||
t_repair,
|
||||
cluster_info_repair_listener,
|
||||
}
|
||||
RepairService { t_repair }
|
||||
}
|
||||
|
||||
fn run(
|
||||
|
@ -391,14 +371,7 @@ impl RepairService {
|
|||
}
|
||||
|
||||
pub fn join(self) -> thread::Result<()> {
|
||||
let mut results = vec![self.t_repair.join()];
|
||||
if let Some(cluster_info_repair_listener) = self.cluster_info_repair_listener {
|
||||
results.push(cluster_info_repair_listener.join());
|
||||
}
|
||||
for r in results {
|
||||
r?;
|
||||
}
|
||||
Ok(())
|
||||
self.t_repair.join()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -943,100 +943,6 @@ fn test_no_voting() {
|
|||
}
|
||||
}
|
||||
|
||||
// Test entry point: sets up logging and runs the repairman catch-up scenario with
// three repairman nodes.
#[test]
|
||||
fn test_repairman_catchup() {
|
||||
solana_logger::setup();
|
||||
// NOTE(review): the test name is logged at error level, presumably so it shows up under
// the default log filter — confirm.
error!("test_repairman_catchup");
|
||||
run_repairman_catchup(3);
|
||||
}
|
||||
|
||||
// Runs the repairman catch-up scenario: creates a `LocalCluster` of `num_repairmen` validators
// staked with `lamports_per_repairman` each, sleeps past the warmup epochs, adds one low-stake
// validator (the "repairee"), then polls the repairee's slot until it exceeds `target_slot`.
//
// NOTE(review): the final polling loop has no timeout; if the repairee never advances past
// `target_slot`, this function never returns.
fn run_repairman_catchup(num_repairmen: u64) {
|
||||
let mut validator_config = ValidatorConfig::default();
|
||||
let num_ticks_per_second = 100;
|
||||
let num_ticks_per_slot = 40;
|
||||
let num_slots_per_epoch = MINIMUM_SLOTS_PER_EPOCH as u64;
|
||||
let num_root_buffer_slots = 10;
|
||||
// Calculate the leader schedule num_root_buffer_slots ahead. Otherwise, if stakers_slot_offset ==
|
||||
// num_slots_per_epoch, and num_slots_per_epoch == MINIMUM_SLOTS_PER_EPOCH, then repairmen
|
||||
// will stop sending repairs after the last slot in epoch 1 (0-indexed), because the root
|
||||
// is at most in the first epoch.
|
||||
//
|
||||
// For example:
|
||||
// Assume:
|
||||
// 1) num_slots_per_epoch = 32
|
||||
// 2) stakers_slot_offset = 32
|
||||
// 3) MINIMUM_SLOTS_PER_EPOCH = 32
|
||||
//
|
||||
// Then the last slot in epoch 1 is slot 63. After completing slots 0 to 63, the root on the
|
||||
// repairee is at most 31. Because stakers_slot_offset == 32, the max confirmed epoch
|
||||
// on the repairee is epoch 1.
|
||||
// Thus the repairmen won't send any slots past epoch 1, slot 63 to this repairee until the repairee
|
||||
// updates their root, and the repairee can't update their root until they get slot 64, so no progress
|
||||
// is made. This is also not accounting for the fact that the repairee may not vote on every slot, so
|
||||
// their root could actually be much less than 31. This is why we give a num_root_buffer_slots buffer.
|
||||
let stakers_slot_offset = num_slots_per_epoch + num_root_buffer_slots;
|
||||
|
||||
validator_config.rpc_config.enable_validator_exit = true;
|
||||
|
||||
let lamports_per_repairman = 1000;
|
||||
|
||||
// Make the repairee_stake small relative to the repairmen stake so that the repairee doesn't
|
||||
// get included in the leader schedule, causing slots to get skipped while it's still trying
|
||||
// to catch up
|
||||
let repairee_stake = 3;
|
||||
let cluster_lamports = 2 * lamports_per_repairman * num_repairmen + repairee_stake;
|
||||
// Every repairman gets the same stake, `lamports_per_repairman`.
let node_stakes: Vec<_> = (0..num_repairmen).map(|_| lamports_per_repairman).collect();
|
||||
let mut cluster = LocalCluster::new(&ClusterConfig {
|
||||
node_stakes,
|
||||
cluster_lamports,
|
||||
validator_configs: vec![validator_config.clone(); num_repairmen as usize],
|
||||
ticks_per_slot: num_ticks_per_slot,
|
||||
slots_per_epoch: num_slots_per_epoch,
|
||||
stakers_slot_offset,
|
||||
// Passes a duration of 1000 / num_ticks_per_second = 10 ms to `PohConfig::new_sleep`.
poh_config: PohConfig::new_sleep(Duration::from_millis(1000 / num_ticks_per_second)),
|
||||
..ClusterConfig::default()
|
||||
});
|
||||
|
||||
// Record the pubkeys of the initial nodes so that the validator added later can be
// identified as the one pubkey that is not in this set.
let repairman_pubkeys: HashSet<_> = cluster.get_node_pubkeys().into_iter().collect();
|
||||
let epoch_schedule = EpochSchedule::custom(num_slots_per_epoch, stakers_slot_offset, true);
|
||||
let num_warmup_epochs = epoch_schedule.get_leader_schedule_epoch(0) + 1;
|
||||
|
||||
// Sleep for longer than the first N warmup epochs, with a one epoch buffer for timing issues
|
||||
cluster_tests::sleep_n_epochs(
|
||||
num_warmup_epochs as f64 + 1.0,
|
||||
&cluster.genesis_config.poh_config,
|
||||
num_ticks_per_slot,
|
||||
num_slots_per_epoch,
|
||||
);
|
||||
|
||||
// Start up a new node, wait for catchup. Backwards repair won't be sufficient because the
|
||||
// leader is sending shreds past this validator's first two confirmed epochs. Thus, the repairman
|
||||
// protocol will have to kick in for this validator to repair.
|
||||
cluster.add_validator(&validator_config, repairee_stake, Arc::new(Keypair::new()));
|
||||
|
||||
let all_pubkeys = cluster.get_node_pubkeys();
|
||||
let repairee_id = all_pubkeys
|
||||
.into_iter()
|
||||
.find(|x| !repairman_pubkeys.contains(x))
|
||||
.unwrap();
|
||||
|
||||
// Wait for repairman protocol to catch this validator up
|
||||
let repairee_client = cluster.get_validator_client(&repairee_id).unwrap();
|
||||
let mut current_slot = 0;
|
||||
|
||||
// Make sure this validator can get repaired past the first few warmup epochs
|
||||
let target_slot = (num_warmup_epochs) * num_slots_per_epoch + 1;
|
||||
// Poll the repairee's slot until it is strictly greater than `target_slot`.
while current_slot <= target_slot {
|
||||
trace!("current_slot: {}", current_slot);
|
||||
if let Ok(slot) = repairee_client.get_slot_with_commitment(CommitmentConfig::recent()) {
|
||||
current_slot = slot;
|
||||
} else {
|
||||
// NOTE(review): `continue` skips the `sleep` below, so a failing RPC call is retried
// immediately — confirm this busy-retry is intended.
continue;
|
||||
}
|
||||
sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
|
||||
fn wait_for_next_snapshot<P: AsRef<Path>>(cluster: &LocalCluster, tar: P) {
|
||||
// Get slot after which this was generated
|
||||
let client = cluster
|
||||
|
|
Loading…
Reference in New Issue