Fix repair behavior concerning our own leader slots (#30200)
Panic when trying to dump & repair a block that we produced.
parent 4b17acf64e
commit 67f644473b
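In short, `dump_then_repair_correct_slots` now takes the validator identity and the leader schedule, and a duplicate slot is never dumped if we were its leader: hitting that case means we either produced duplicate blocks or computed a bank hash that diverges from the cluster, so we panic instead. Below is a condensed sketch of the guard; the helper name and standalone signature are illustrative only, since in the patch the check is inlined inside the retain closure shown in the `impl ReplayStage` hunks.

use solana_ledger::leader_schedule_cache::LeaderScheduleCache;
use solana_sdk::{clock::Slot, hash::Hash, pubkey::Pubkey};

// Illustrative helper wrapping the guard added by this commit; in the patch the
// check is inlined in ReplayStage::dump_then_repair_correct_slots.
fn assert_not_dumping_own_block(
    my_pubkey: &Pubkey,
    leader_schedule_cache: &LeaderScheduleCache,
    duplicate_slot: Slot,
    frozen_hash: Hash,
    correct_hash: &Hash,
) {
    // Should not dump slots for which we were the leader: if our frozen hash for a
    // slot we produced disagrees with the cluster's hash, we are either producing
    // duplicate blocks or computing divergent bank hashes, so crash loudly.
    if Some(*my_pubkey) == leader_schedule_cache.slot_leader_at(duplicate_slot, None) {
        panic!(
            "We are attempting to dump a block that we produced. \
             We froze slot {duplicate_slot} with hash {frozen_hash:?} \
             while the cluster hash is {correct_hash}"
        );
    }
}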
@@ -1554,6 +1554,8 @@ mod test {
         let ReplayBlockstoreComponents {
             blockstore: requester_blockstore,
             vote_simulator,
+            my_pubkey,
+            leader_schedule_cache,
             ..
         } = setup_dead_slot(dead_slot, correct_bank_hashes);

@@ -1639,6 +1641,8 @@ mod test {
             None,
             &mut PurgeRepairSlotCounter::default(),
             &dumped_slots_sender,
+            &my_pubkey,
+            &leader_schedule_cache,
         );

         // Simulate making a request

@@ -919,6 +919,8 @@ impl ReplayStage {
                     poh_bank.map(|bank| bank.slot()),
                     &mut purge_repair_slot_counter,
                     &dumped_slots_sender,
+                    &my_pubkey,
+                    &leader_schedule_cache,
                 );
                 dump_then_repair_correct_slots_time.stop();

@@ -1162,6 +1164,7 @@ impl ReplayStage {
         (progress, heaviest_subtree_fork_choice)
     }

+    #[allow(clippy::too_many_arguments)]
     pub fn dump_then_repair_correct_slots(
         duplicate_slots_to_repair: &mut DuplicateSlotsToRepair,
         ancestors: &mut HashMap<Slot, HashSet<Slot>>,

@@ -1172,6 +1175,8 @@ impl ReplayStage {
         poh_bank_slot: Option<Slot>,
         purge_repair_slot_counter: &mut PurgeRepairSlotCounter,
         dumped_slots_sender: &DumpedSlotsSender,
+        my_pubkey: &Pubkey,
+        leader_schedule_cache: &LeaderScheduleCache,
     ) {
         if duplicate_slots_to_repair.is_empty() {
             return;

@@ -1180,10 +1185,10 @@
         let root_bank = bank_forks.read().unwrap().root_bank();
         let mut dumped = vec![];
         // TODO: handle if alternate version of descendant also got confirmed after ancestor was
-        // confirmed, what happens then? Should probably keep track of purged list and skip things
-        // in `duplicate_slots_to_repair` that have already been purged. Add test.
+        // confirmed, what happens then? Should probably keep track of dumped list and skip things
+        // in `duplicate_slots_to_repair` that have already been dumped. Add test.
         duplicate_slots_to_repair.retain(|duplicate_slot, correct_hash| {
-            // Should not purge duplicate slots if there is currently a poh bank building
+            // Should not dump duplicate slots if there is currently a poh bank building
             // on top of that slot, as BankingStage might still be referencing/touching that state
             // concurrently.
             // Luckily for us, because the fork choice rule removes duplicate slots from fork

@@ -1203,13 +1208,13 @@
                 })
                 .unwrap_or(false);

-            let did_purge_repair = {
+            let did_dump_repair = {
                 if !is_poh_building_on_duplicate_fork {
                     let frozen_hash = bank_forks.read().unwrap().bank_hash(*duplicate_slot);
                     if let Some(frozen_hash) = frozen_hash {
                         if frozen_hash == *correct_hash {
                             warn!(
-                                "Trying to purge slot {} with correct_hash {}",
+                                "Trying to dump slot {} with correct_hash {}",
                                 *duplicate_slot, *correct_hash
                             );
                             return false;

@@ -1219,19 +1224,29 @@
                         )
                         {
                             warn!(
-                                "Trying to purge unfrozen slot {} that is not dead",
+                                "Trying to dump unfrozen slot {} that is not dead",
                                 *duplicate_slot
                             );
                             return false;
                         }
                     } else {
                         warn!(
-                            "Trying to purge slot {} which does not exist in bank forks",
+                            "Trying to dump slot {} which does not exist in bank forks",
                             *duplicate_slot
                         );
                         return false;
                     }

+                    // Should not dump slots for which we were the leader
+                    if Some(*my_pubkey) == leader_schedule_cache.slot_leader_at(*duplicate_slot, None) {
+                        panic!("We are attempting to dump a block that we produced. \
+                            This indicates that we are producing duplicate blocks, \
+                            or that there is a bug in our runtime/replay code which \
+                            causes us to compute different bank hashes than the rest of the cluster. \
+                            We froze slot {duplicate_slot} with hash {frozen_hash:?} while the cluster hash is {correct_hash}");
+                    }
+
                     Self::purge_unconfirmed_duplicate_slot(
                         *duplicate_slot,
                         ancestors,

@@ -1249,7 +1264,11 @@
                         .and_modify(|x| *x += 1)
                         .or_insert(1);
                     if *attempt_no > MAX_REPAIR_RETRY_LOOP_ATTEMPTS {
-                        panic!("We have tried to repair duplicate slot: {} more than {} times and are unable to freeze a block with bankhash {}, instead we have a block with bankhash {:?}. This is most likely a bug in the runtime. At this point manual intervention is needed to make progress. Exiting", *duplicate_slot, MAX_REPAIR_RETRY_LOOP_ATTEMPTS, *correct_hash, frozen_hash);
+                        panic!("We have tried to repair duplicate slot: {duplicate_slot} more than {MAX_REPAIR_RETRY_LOOP_ATTEMPTS} times \
+                            and are unable to freeze a block with bankhash {correct_hash}, \
+                            instead we have a block with bankhash {frozen_hash:?}. \
+                            This is most likely a bug in the runtime. \
+                            At this point manual intervention is needed to make progress. Exiting");
                     }
                     warn!(
                         "Notifying repair service to repair duplicate slot: {}, attempt {}",

@@ -1266,8 +1285,8 @@
                 }
             };

-            // If we purged/repaired, then no need to keep the slot in the set of pending work
-            !did_purge_repair
+            // If we dumped/repaired, then no need to keep the slot in the set of pending work
+            !did_dump_repair
         });

         // Notify repair of the dumped slots along with the correct hash

@@ -3674,9 +3693,9 @@ pub(crate) mod tests {
     pub struct ReplayBlockstoreComponents {
         pub blockstore: Arc<Blockstore>,
         validator_node_to_vote_keys: HashMap<Pubkey, Pubkey>,
-        my_pubkey: Pubkey,
+        pub(crate) my_pubkey: Pubkey,
         cluster_info: ClusterInfo,
-        leader_schedule_cache: Arc<LeaderScheduleCache>,
+        pub(crate) leader_schedule_cache: Arc<LeaderScheduleCache>,
         poh_recorder: RwLock<PohRecorder>,
         tower: Tower,
         rpc_subscriptions: Arc<RpcSubscriptions>,

@@ -6037,6 +6056,7 @@ pub(crate) mod tests {
         let ReplayBlockstoreComponents {
             ref mut vote_simulator,
             ref blockstore,
+            ref leader_schedule_cache,
             ..
         } = replay_blockstore_components(Some(forks), 1, None);

@@ -6073,6 +6093,8 @@ pub(crate) mod tests {
             None,
             &mut purge_repair_slot_counter,
             &dumped_slots_sender,
+            &Pubkey::new_unique(),
+            leader_schedule_cache,
         );
         assert_eq!(should_be_dumped, dumped_slots_receiver.recv().ok().unwrap());

@@ -6128,6 +6150,7 @@ pub(crate) mod tests {
             ref mut tower,
             ref blockstore,
             ref mut vote_simulator,
+            ref leader_schedule_cache,
             ..
         } = replay_components;

@@ -6192,6 +6215,8 @@ pub(crate) mod tests {
             None,
             &mut PurgeRepairSlotCounter::default(),
             &dumped_slots_sender,
+            &Pubkey::new_unique(),
+            leader_schedule_cache,
         );

         // Check everything was purged properly

@@ -7031,6 +7056,54 @@ pub(crate) mod tests {
         assert_eq!(received_slots, vec![8, 9, 11]);
     }

+    #[test]
+    #[should_panic(expected = "We are attempting to dump a block that we produced")]
+    fn test_dump_own_slots_fails() {
+        // Create the tree of banks in a BankForks object
+        let forks = tr(0) / (tr(1)) / (tr(2));
+
+        let ReplayBlockstoreComponents {
+            ref mut vote_simulator,
+            ref blockstore,
+            ref my_pubkey,
+            ref leader_schedule_cache,
+            ..
+        } = replay_blockstore_components(Some(forks), 1, None);
+
+        let VoteSimulator {
+            ref mut progress,
+            ref bank_forks,
+            ..
+        } = vote_simulator;
+
+        let (mut ancestors, mut descendants) = {
+            let r_bank_forks = bank_forks.read().unwrap();
+            (r_bank_forks.ancestors(), r_bank_forks.descendants())
+        };
+
+        // Insert different versions of both 1 and 2. Although normally these slots would be dumped,
+        // because we were the leader for these slots we should panic
+        let mut duplicate_slots_to_repair = DuplicateSlotsToRepair::default();
+        duplicate_slots_to_repair.insert(1, Hash::new_unique());
+        duplicate_slots_to_repair.insert(2, Hash::new_unique());
+        let mut purge_repair_slot_counter = PurgeRepairSlotCounter::default();
+        let (dumped_slots_sender, _) = unbounded();
+
+        ReplayStage::dump_then_repair_correct_slots(
+            &mut duplicate_slots_to_repair,
+            &mut ancestors,
+            &mut descendants,
+            progress,
+            bank_forks,
+            blockstore,
+            None,
+            &mut purge_repair_slot_counter,
+            &dumped_slots_sender,
+            my_pubkey,
+            leader_schedule_cache,
+        );
+    }
+
     fn run_compute_and_select_forks(
         bank_forks: &RwLock<BankForks>,
         progress: &mut ProgressMap,