Fix repair behavior concerning our own leader slots (#30200)

panic when trying to dump & repair a block that we produced
Ashwin Sekar 2023-02-09 13:30:12 -08:00 committed by GitHub
parent 4b17acf64e
commit 67f644473b
2 changed files with 89 additions and 12 deletions
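The core of the change is a guard in `ReplayStage::dump_then_repair_correct_slots`: before dumping a duplicate slot so it can be repaired from the cluster, the validator consults the leader schedule and panics if it was itself the leader for that slot. Below is a minimal, self-contained sketch of that guard; the `Pubkey`, `Slot`, and `LeaderScheduleCache` here are simplified stand-ins for the real types (the actual lookup in the diff is `LeaderScheduleCache::slot_leader_at(slot, None)`), not the production implementation.

```rust
use std::collections::HashMap;

// Simplified stand-ins for illustration only.
type Pubkey = u64;
type Slot = u64;

struct LeaderScheduleCache {
    leaders: HashMap<Slot, Pubkey>,
}

impl LeaderScheduleCache {
    // Stand-in for `slot_leader_at(slot, None)` in the real cache.
    fn slot_leader_at(&self, slot: Slot) -> Option<Pubkey> {
        self.leaders.get(&slot).copied()
    }
}

/// Guard sketched from the diff: never dump-and-repair a slot we produced.
/// Hitting this means we made a duplicate block, or replay computed a bank
/// hash that disagrees with the rest of the cluster, so fail loudly instead
/// of silently repairing over our own block.
fn assert_not_own_slot(my_pubkey: Pubkey, slot: Slot, cache: &LeaderScheduleCache) {
    if cache.slot_leader_at(slot) == Some(my_pubkey) {
        panic!("attempting to dump slot {slot}, which this validator produced");
    }
}

fn main() {
    let cache = LeaderScheduleCache {
        leaders: HashMap::from([(1, 7), (2, 9)]),
    };
    assert_not_own_slot(7, 2, &cache); // ok: slot 2 belongs to another leader
    // assert_not_own_slot(7, 1, &cache); // would panic: we were slot 1's leader
}
```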


@@ -1554,6 +1554,8 @@ mod test {
let ReplayBlockstoreComponents {
blockstore: requester_blockstore,
vote_simulator,
my_pubkey,
leader_schedule_cache,
..
} = setup_dead_slot(dead_slot, correct_bank_hashes);
@@ -1639,6 +1641,8 @@ mod test {
None,
&mut PurgeRepairSlotCounter::default(),
&dumped_slots_sender,
&my_pubkey,
&leader_schedule_cache,
);
// Simulate making a request


@@ -919,6 +919,8 @@ impl ReplayStage {
poh_bank.map(|bank| bank.slot()),
&mut purge_repair_slot_counter,
&dumped_slots_sender,
&my_pubkey,
&leader_schedule_cache,
);
dump_then_repair_correct_slots_time.stop();
@@ -1162,6 +1164,7 @@ impl ReplayStage {
(progress, heaviest_subtree_fork_choice)
}
#[allow(clippy::too_many_arguments)]
pub fn dump_then_repair_correct_slots(
duplicate_slots_to_repair: &mut DuplicateSlotsToRepair,
ancestors: &mut HashMap<Slot, HashSet<Slot>>,
@@ -1172,6 +1175,8 @@ impl ReplayStage {
poh_bank_slot: Option<Slot>,
purge_repair_slot_counter: &mut PurgeRepairSlotCounter,
dumped_slots_sender: &DumpedSlotsSender,
my_pubkey: &Pubkey,
leader_schedule_cache: &LeaderScheduleCache,
) {
if duplicate_slots_to_repair.is_empty() {
return;
@@ -1180,10 +1185,10 @@ impl ReplayStage {
let root_bank = bank_forks.read().unwrap().root_bank();
let mut dumped = vec![];
// TODO: handle if alternate version of descendant also got confirmed after ancestor was
// confirmed, what happens then? Should probably keep track of purged list and skip things
// in `duplicate_slots_to_repair` that have already been purged. Add test.
// confirmed, what happens then? Should probably keep track of dumped list and skip things
// in `duplicate_slots_to_repair` that have already been dumped. Add test.
duplicate_slots_to_repair.retain(|duplicate_slot, correct_hash| {
// Should not purge duplicate slots if there is currently a poh bank building
// Should not dump duplicate slots if there is currently a poh bank building
// on top of that slot, as BankingStage might still be referencing/touching that state
// concurrently.
// Luckily for us, because the fork choice rule removes duplicate slots from fork
@@ -1203,13 +1208,13 @@ impl ReplayStage {
})
.unwrap_or(false);
let did_purge_repair = {
let did_dump_repair = {
if !is_poh_building_on_duplicate_fork {
let frozen_hash = bank_forks.read().unwrap().bank_hash(*duplicate_slot);
if let Some(frozen_hash) = frozen_hash {
if frozen_hash == *correct_hash {
warn!(
"Trying to purge slot {} with correct_hash {}",
"Trying to dump slot {} with correct_hash {}",
*duplicate_slot, *correct_hash
);
return false;
@@ -1219,19 +1224,29 @@ impl ReplayStage {
)
{
warn!(
"Trying to purge unfrozen slot {} that is not dead",
"Trying to dump unfrozen slot {} that is not dead",
*duplicate_slot
);
return false;
}
} else {
warn!(
"Trying to purge slot {} which does not exist in bank forks",
"Trying to dump slot {} which does not exist in bank forks",
*duplicate_slot
);
return false;
}
// Should not dump slots for which we were the leader
if Some(*my_pubkey) == leader_schedule_cache.slot_leader_at(*duplicate_slot, None) {
panic!("We are attempting to dump a block that we produced. \
This indicates that we are producing duplicate blocks, \
or that there is a bug in our runtime/replay code which \
causes us to compute different bank hashes than the rest of the cluster. \
We froze slot {duplicate_slot} with hash {frozen_hash:?} while the cluster hash is {correct_hash}");
}
Self::purge_unconfirmed_duplicate_slot(
*duplicate_slot,
ancestors,
@@ -1249,7 +1264,11 @@ impl ReplayStage {
.and_modify(|x| *x += 1)
.or_insert(1);
if *attempt_no > MAX_REPAIR_RETRY_LOOP_ATTEMPTS {
panic!("We have tried to repair duplicate slot: {} more than {} times and are unable to freeze a block with bankhash {}, instead we have a block with bankhash {:?}. This is most likely a bug in the runtime. At this point manual intervention is needed to make progress. Exiting", *duplicate_slot, MAX_REPAIR_RETRY_LOOP_ATTEMPTS, *correct_hash, frozen_hash);
panic!("We have tried to repair duplicate slot: {duplicate_slot} more than {MAX_REPAIR_RETRY_LOOP_ATTEMPTS} times \
and are unable to freeze a block with bankhash {correct_hash}, \
instead we have a block with bankhash {frozen_hash:?}. \
This is most likely a bug in the runtime. \
At this point manual intervention is needed to make progress. Exiting");
}
warn!(
"Notifying repair service to repair duplicate slot: {}, attempt {}",
@@ -1266,8 +1285,8 @@ impl ReplayStage {
}
};
// If we purged/repaired, then no need to keep the slot in the set of pending work
!did_purge_repair
// If we dumped/repaired, then no need to keep the slot in the set of pending work
!did_dump_repair
});
// Notify repair of the dumped slots along with the correct hash
@@ -3674,9 +3693,9 @@ pub(crate) mod tests {
pub struct ReplayBlockstoreComponents {
pub blockstore: Arc<Blockstore>,
validator_node_to_vote_keys: HashMap<Pubkey, Pubkey>,
my_pubkey: Pubkey,
pub(crate) my_pubkey: Pubkey,
cluster_info: ClusterInfo,
leader_schedule_cache: Arc<LeaderScheduleCache>,
pub(crate) leader_schedule_cache: Arc<LeaderScheduleCache>,
poh_recorder: RwLock<PohRecorder>,
tower: Tower,
rpc_subscriptions: Arc<RpcSubscriptions>,
@@ -6037,6 +6056,7 @@ pub(crate) mod tests {
let ReplayBlockstoreComponents {
ref mut vote_simulator,
ref blockstore,
ref leader_schedule_cache,
..
} = replay_blockstore_components(Some(forks), 1, None);
@@ -6073,6 +6093,8 @@ pub(crate) mod tests {
None,
&mut purge_repair_slot_counter,
&dumped_slots_sender,
&Pubkey::new_unique(),
leader_schedule_cache,
);
assert_eq!(should_be_dumped, dumped_slots_receiver.recv().ok().unwrap());
@@ -6128,6 +6150,7 @@ pub(crate) mod tests {
ref mut tower,
ref blockstore,
ref mut vote_simulator,
ref leader_schedule_cache,
..
} = replay_components;
@@ -6192,6 +6215,8 @@ pub(crate) mod tests {
None,
&mut PurgeRepairSlotCounter::default(),
&dumped_slots_sender,
&Pubkey::new_unique(),
leader_schedule_cache,
);
// Check everything was purged properly
@@ -7031,6 +7056,54 @@ pub(crate) mod tests {
assert_eq!(received_slots, vec![8, 9, 11]);
}
#[test]
#[should_panic(expected = "We are attempting to dump a block that we produced")]
fn test_dump_own_slots_fails() {
// Create the tree of banks in a BankForks object
let forks = tr(0) / (tr(1)) / (tr(2));
let ReplayBlockstoreComponents {
ref mut vote_simulator,
ref blockstore,
ref my_pubkey,
ref leader_schedule_cache,
..
} = replay_blockstore_components(Some(forks), 1, None);
let VoteSimulator {
ref mut progress,
ref bank_forks,
..
} = vote_simulator;
let (mut ancestors, mut descendants) = {
let r_bank_forks = bank_forks.read().unwrap();
(r_bank_forks.ancestors(), r_bank_forks.descendants())
};
// Insert different versions of both 1 and 2. Normally these slots would be dumped,
// but because we were the leader for these slots we should panic instead
let mut duplicate_slots_to_repair = DuplicateSlotsToRepair::default();
duplicate_slots_to_repair.insert(1, Hash::new_unique());
duplicate_slots_to_repair.insert(2, Hash::new_unique());
let mut purge_repair_slot_counter = PurgeRepairSlotCounter::default();
let (dumped_slots_sender, _) = unbounded();
ReplayStage::dump_then_repair_correct_slots(
&mut duplicate_slots_to_repair,
&mut ancestors,
&mut descendants,
progress,
bank_forks,
blockstore,
None,
&mut purge_repair_slot_counter,
&dumped_slots_sender,
my_pubkey,
leader_schedule_cache,
);
}
fn run_compute_and_select_forks(
bank_forks: &RwLock<BankForks>,
progress: &mut ProgressMap,