diff --git a/core/src/ancestor_hashes_service.rs b/core/src/ancestor_hashes_service.rs
index 24d132c5e2..6f134c89bc 100644
--- a/core/src/ancestor_hashes_service.rs
+++ b/core/src/ancestor_hashes_service.rs
@@ -1554,6 +1554,8 @@ mod test {
         let ReplayBlockstoreComponents {
             blockstore: requester_blockstore,
             vote_simulator,
+            my_pubkey,
+            leader_schedule_cache,
             ..
         } = setup_dead_slot(dead_slot, correct_bank_hashes);
 
@@ -1639,6 +1641,8 @@ mod test {
             None,
             &mut PurgeRepairSlotCounter::default(),
             &dumped_slots_sender,
+            &my_pubkey,
+            &leader_schedule_cache,
         );
 
         // Simulate making a request
diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs
index 46aba2c26f..c983755e7e 100644
--- a/core/src/replay_stage.rs
+++ b/core/src/replay_stage.rs
@@ -919,6 +919,8 @@ impl ReplayStage {
                     poh_bank.map(|bank| bank.slot()),
                     &mut purge_repair_slot_counter,
                     &dumped_slots_sender,
+                    &my_pubkey,
+                    &leader_schedule_cache,
                 );
                 dump_then_repair_correct_slots_time.stop();
 
@@ -1162,6 +1164,7 @@ impl ReplayStage {
         (progress, heaviest_subtree_fork_choice)
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn dump_then_repair_correct_slots(
         duplicate_slots_to_repair: &mut DuplicateSlotsToRepair,
         ancestors: &mut HashMap<Slot, HashSet<Slot>>,
@@ -1172,6 +1175,8 @@ impl ReplayStage {
         poh_bank_slot: Option<Slot>,
         purge_repair_slot_counter: &mut PurgeRepairSlotCounter,
         dumped_slots_sender: &DumpedSlotsSender,
+        my_pubkey: &Pubkey,
+        leader_schedule_cache: &LeaderScheduleCache,
     ) {
         if duplicate_slots_to_repair.is_empty() {
             return;
@@ -1180,10 +1185,10 @@ impl ReplayStage {
         let root_bank = bank_forks.read().unwrap().root_bank();
         let mut dumped = vec![];
         // TODO: handle if alternate version of descendant also got confirmed after ancestor was
-        // confirmed, what happens then? Should probably keep track of purged list and skip things
-        // in `duplicate_slots_to_repair` that have already been purged. Add test.
+        // confirmed, what happens then? Should probably keep track of dumped list and skip things
+        // in `duplicate_slots_to_repair` that have already been dumped. Add test.
         duplicate_slots_to_repair.retain(|duplicate_slot, correct_hash| {
-            // Should not purge duplicate slots if there is currently a poh bank building
+            // Should not dump duplicate slots if there is currently a poh bank building
             // on top of that slot, as BankingStage might still be referencing/touching that state
             // concurrently.
             // Luckily for us, because the fork choice rule removes duplicate slots from fork
@@ -1203,13 +1208,13 @@ impl ReplayStage {
                 })
                 .unwrap_or(false);
 
-            let did_purge_repair = {
+            let did_dump_repair = {
                 if !is_poh_building_on_duplicate_fork {
                     let frozen_hash = bank_forks.read().unwrap().bank_hash(*duplicate_slot);
                     if let Some(frozen_hash) = frozen_hash {
                         if frozen_hash == *correct_hash {
                             warn!(
-                                "Trying to purge slot {} with correct_hash {}",
+                                "Trying to dump slot {} with correct_hash {}",
                                 *duplicate_slot, *correct_hash
                             );
                             return false;
@@ -1219,19 +1224,29 @@ impl ReplayStage {
                         ) {
                             warn!(
-                                "Trying to purge unfrozen slot {} that is not dead",
+                                "Trying to dump unfrozen slot {} that is not dead",
                                 *duplicate_slot
                             );
                             return false;
                         }
                     } else {
                         warn!(
-                            "Trying to purge slot {} which does not exist in bank forks",
+                            "Trying to dump slot {} which does not exist in bank forks",
                             *duplicate_slot
                         );
                         return false;
                     }
+
+                    // Should not dump slots for which we were the leader
+                    if Some(*my_pubkey) == leader_schedule_cache.slot_leader_at(*duplicate_slot, None) {
+                        panic!("We are attempting to dump a block that we produced. \
+                            This indicates that we are producing duplicate blocks, \
+                            or that there is a bug in our runtime/replay code which \
+                            causes us to compute different bank hashes than the rest of the cluster. \
+                            We froze slot {duplicate_slot} with hash {frozen_hash:?} while the cluster hash is {correct_hash}");
+                    }
+
                     Self::purge_unconfirmed_duplicate_slot(
                         *duplicate_slot,
                         ancestors,
@@ -1249,7 +1264,11 @@ impl ReplayStage {
                         .and_modify(|x| *x += 1)
                         .or_insert(1);
                     if *attempt_no > MAX_REPAIR_RETRY_LOOP_ATTEMPTS {
-                        panic!("We have tried to repair duplicate slot: {} more than {} times and are unable to freeze a block with bankhash {}, instead we have a block with bankhash {:?}. This is most likely a bug in the runtime. At this point manual intervention is needed to make progress. Exiting", *duplicate_slot, MAX_REPAIR_RETRY_LOOP_ATTEMPTS, *correct_hash, frozen_hash);
+                        panic!("We have tried to repair duplicate slot: {duplicate_slot} more than {MAX_REPAIR_RETRY_LOOP_ATTEMPTS} times \
+                            and are unable to freeze a block with bankhash {correct_hash}, \
+                            instead we have a block with bankhash {frozen_hash:?}. \
+                            This is most likely a bug in the runtime. \
+                            At this point manual intervention is needed to make progress. Exiting");
                     }
                     warn!(
                         "Notifying repair service to repair duplicate slot: {}, attempt {}",
@@ -1266,8 +1285,8 @@ impl ReplayStage {
                 }
             };
 
-            // If we purged/repaired, then no need to keep the slot in the set of pending work
-            !did_purge_repair
+            // If we dumped/repaired, then no need to keep the slot in the set of pending work
+            !did_dump_repair
         });
 
         // Notify repair of the dumped slots along with the correct hash
@@ -3674,9 +3693,9 @@ pub(crate) mod tests {
     pub struct ReplayBlockstoreComponents {
         pub blockstore: Arc<Blockstore>,
         validator_node_to_vote_keys: HashMap<Pubkey, Pubkey>,
-        my_pubkey: Pubkey,
+        pub(crate) my_pubkey: Pubkey,
         cluster_info: ClusterInfo,
-        leader_schedule_cache: Arc<LeaderScheduleCache>,
+        pub(crate) leader_schedule_cache: Arc<LeaderScheduleCache>,
         poh_recorder: RwLock<PohRecorder>,
         tower: Tower,
         rpc_subscriptions: Arc<RpcSubscriptions>,
@@ -6037,6 +6056,7 @@ pub(crate) mod tests {
         let ReplayBlockstoreComponents {
             ref mut vote_simulator,
             ref blockstore,
+            ref leader_schedule_cache,
             ..
         } = replay_blockstore_components(Some(forks), 1, None);
 
@@ -6073,6 +6093,8 @@ pub(crate) mod tests {
             None,
             &mut purge_repair_slot_counter,
             &dumped_slots_sender,
+            &Pubkey::new_unique(),
+            leader_schedule_cache,
         );
 
         assert_eq!(should_be_dumped, dumped_slots_receiver.recv().ok().unwrap());
@@ -6128,6 +6150,7 @@ pub(crate) mod tests {
             ref mut tower,
             ref blockstore,
             ref mut vote_simulator,
+            ref leader_schedule_cache,
             ..
         } = replay_components;
 
@@ -6192,6 +6215,8 @@ pub(crate) mod tests {
             None,
             &mut PurgeRepairSlotCounter::default(),
             &dumped_slots_sender,
+            &Pubkey::new_unique(),
+            leader_schedule_cache,
         );
 
         // Check everything was purged properly
@@ -7031,6 +7056,54 @@ pub(crate) mod tests {
         assert_eq!(received_slots, vec![8, 9, 11]);
     }
 
+    #[test]
+    #[should_panic(expected = "We are attempting to dump a block that we produced")]
+    fn test_dump_own_slots_fails() {
+        // Create the tree of banks in a BankForks object
+        let forks = tr(0) / (tr(1)) / (tr(2));
+
+        let ReplayBlockstoreComponents {
+            ref mut vote_simulator,
+            ref blockstore,
+            ref my_pubkey,
+            ref leader_schedule_cache,
+            ..
+        } = replay_blockstore_components(Some(forks), 1, None);
+
+        let VoteSimulator {
+            ref mut progress,
+            ref bank_forks,
+            ..
+        } = vote_simulator;
+
+        let (mut ancestors, mut descendants) = {
+            let r_bank_forks = bank_forks.read().unwrap();
+            (r_bank_forks.ancestors(), r_bank_forks.descendants())
+        };
+
+        // Insert different versions of both 1 and 2. Although normally these slots would be dumped,
+        // because we were the leader for these slots we should panic
+        let mut duplicate_slots_to_repair = DuplicateSlotsToRepair::default();
+        duplicate_slots_to_repair.insert(1, Hash::new_unique());
+        duplicate_slots_to_repair.insert(2, Hash::new_unique());
+        let mut purge_repair_slot_counter = PurgeRepairSlotCounter::default();
+        let (dumped_slots_sender, _) = unbounded();
+
+        ReplayStage::dump_then_repair_correct_slots(
+            &mut duplicate_slots_to_repair,
+            &mut ancestors,
+            &mut descendants,
+            progress,
+            bank_forks,
+            blockstore,
+            None,
+            &mut purge_repair_slot_counter,
+            &dumped_slots_sender,
+            my_pubkey,
+            leader_schedule_cache,
+        );
+    }
+
     fn run_compute_and_select_forks(
         bank_forks: &RwLock<BankForks>,
         progress: &mut ProgressMap,