Fix repair behavior concerning our own leader slots (#30200)
panic when trying to dump & repair a block that we produced
This commit is contained in:
parent
4b17acf64e
commit
67f644473b
|
@ -1554,6 +1554,8 @@ mod test {
|
||||||
let ReplayBlockstoreComponents {
|
let ReplayBlockstoreComponents {
|
||||||
blockstore: requester_blockstore,
|
blockstore: requester_blockstore,
|
||||||
vote_simulator,
|
vote_simulator,
|
||||||
|
my_pubkey,
|
||||||
|
leader_schedule_cache,
|
||||||
..
|
..
|
||||||
} = setup_dead_slot(dead_slot, correct_bank_hashes);
|
} = setup_dead_slot(dead_slot, correct_bank_hashes);
|
||||||
|
|
||||||
|
@ -1639,6 +1641,8 @@ mod test {
|
||||||
None,
|
None,
|
||||||
&mut PurgeRepairSlotCounter::default(),
|
&mut PurgeRepairSlotCounter::default(),
|
||||||
&dumped_slots_sender,
|
&dumped_slots_sender,
|
||||||
|
&my_pubkey,
|
||||||
|
&leader_schedule_cache,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Simulate making a request
|
// Simulate making a request
|
||||||
|
|
|
@ -919,6 +919,8 @@ impl ReplayStage {
|
||||||
poh_bank.map(|bank| bank.slot()),
|
poh_bank.map(|bank| bank.slot()),
|
||||||
&mut purge_repair_slot_counter,
|
&mut purge_repair_slot_counter,
|
||||||
&dumped_slots_sender,
|
&dumped_slots_sender,
|
||||||
|
&my_pubkey,
|
||||||
|
&leader_schedule_cache,
|
||||||
);
|
);
|
||||||
dump_then_repair_correct_slots_time.stop();
|
dump_then_repair_correct_slots_time.stop();
|
||||||
|
|
||||||
|
@ -1162,6 +1164,7 @@ impl ReplayStage {
|
||||||
(progress, heaviest_subtree_fork_choice)
|
(progress, heaviest_subtree_fork_choice)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub fn dump_then_repair_correct_slots(
|
pub fn dump_then_repair_correct_slots(
|
||||||
duplicate_slots_to_repair: &mut DuplicateSlotsToRepair,
|
duplicate_slots_to_repair: &mut DuplicateSlotsToRepair,
|
||||||
ancestors: &mut HashMap<Slot, HashSet<Slot>>,
|
ancestors: &mut HashMap<Slot, HashSet<Slot>>,
|
||||||
|
@ -1172,6 +1175,8 @@ impl ReplayStage {
|
||||||
poh_bank_slot: Option<Slot>,
|
poh_bank_slot: Option<Slot>,
|
||||||
purge_repair_slot_counter: &mut PurgeRepairSlotCounter,
|
purge_repair_slot_counter: &mut PurgeRepairSlotCounter,
|
||||||
dumped_slots_sender: &DumpedSlotsSender,
|
dumped_slots_sender: &DumpedSlotsSender,
|
||||||
|
my_pubkey: &Pubkey,
|
||||||
|
leader_schedule_cache: &LeaderScheduleCache,
|
||||||
) {
|
) {
|
||||||
if duplicate_slots_to_repair.is_empty() {
|
if duplicate_slots_to_repair.is_empty() {
|
||||||
return;
|
return;
|
||||||
|
@ -1180,10 +1185,10 @@ impl ReplayStage {
|
||||||
let root_bank = bank_forks.read().unwrap().root_bank();
|
let root_bank = bank_forks.read().unwrap().root_bank();
|
||||||
let mut dumped = vec![];
|
let mut dumped = vec![];
|
||||||
// TODO: handle if alternate version of descendant also got confirmed after ancestor was
|
// TODO: handle if alternate version of descendant also got confirmed after ancestor was
|
||||||
// confirmed, what happens then? Should probably keep track of purged list and skip things
|
// confirmed, what happens then? Should probably keep track of dumped list and skip things
|
||||||
// in `duplicate_slots_to_repair` that have already been purged. Add test.
|
// in `duplicate_slots_to_repair` that have already been dumped. Add test.
|
||||||
duplicate_slots_to_repair.retain(|duplicate_slot, correct_hash| {
|
duplicate_slots_to_repair.retain(|duplicate_slot, correct_hash| {
|
||||||
// Should not purge duplicate slots if there is currently a poh bank building
|
// Should not dump duplicate slots if there is currently a poh bank building
|
||||||
// on top of that slot, as BankingStage might still be referencing/touching that state
|
// on top of that slot, as BankingStage might still be referencing/touching that state
|
||||||
// concurrently.
|
// concurrently.
|
||||||
// Luckily for us, because the fork choice rule removes duplicate slots from fork
|
// Luckily for us, because the fork choice rule removes duplicate slots from fork
|
||||||
|
@ -1203,13 +1208,13 @@ impl ReplayStage {
|
||||||
})
|
})
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
|
|
||||||
let did_purge_repair = {
|
let did_dump_repair = {
|
||||||
if !is_poh_building_on_duplicate_fork {
|
if !is_poh_building_on_duplicate_fork {
|
||||||
let frozen_hash = bank_forks.read().unwrap().bank_hash(*duplicate_slot);
|
let frozen_hash = bank_forks.read().unwrap().bank_hash(*duplicate_slot);
|
||||||
if let Some(frozen_hash) = frozen_hash {
|
if let Some(frozen_hash) = frozen_hash {
|
||||||
if frozen_hash == *correct_hash {
|
if frozen_hash == *correct_hash {
|
||||||
warn!(
|
warn!(
|
||||||
"Trying to purge slot {} with correct_hash {}",
|
"Trying to dump slot {} with correct_hash {}",
|
||||||
*duplicate_slot, *correct_hash
|
*duplicate_slot, *correct_hash
|
||||||
);
|
);
|
||||||
return false;
|
return false;
|
||||||
|
@ -1219,19 +1224,29 @@ impl ReplayStage {
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
warn!(
|
warn!(
|
||||||
"Trying to purge unfrozen slot {} that is not dead",
|
"Trying to dump unfrozen slot {} that is not dead",
|
||||||
*duplicate_slot
|
*duplicate_slot
|
||||||
);
|
);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
warn!(
|
warn!(
|
||||||
"Trying to purge slot {} which does not exist in bank forks",
|
"Trying to dump slot {} which does not exist in bank forks",
|
||||||
*duplicate_slot
|
*duplicate_slot
|
||||||
);
|
);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Should not dump slots for which we were the leader
|
||||||
|
if Some(*my_pubkey) == leader_schedule_cache.slot_leader_at(*duplicate_slot, None) {
|
||||||
|
panic!("We are attempting to dump a block that we produced. \
|
||||||
|
This indicates that we are producing duplicate blocks, \
|
||||||
|
or that there is a bug in our runtime/replay code which \
|
||||||
|
causes us to compute different bank hashes than the rest of the cluster. \
|
||||||
|
We froze slot {duplicate_slot} with hash {frozen_hash:?} while the cluster hash is {correct_hash}");
|
||||||
|
}
|
||||||
|
|
||||||
Self::purge_unconfirmed_duplicate_slot(
|
Self::purge_unconfirmed_duplicate_slot(
|
||||||
*duplicate_slot,
|
*duplicate_slot,
|
||||||
ancestors,
|
ancestors,
|
||||||
|
@ -1249,7 +1264,11 @@ impl ReplayStage {
|
||||||
.and_modify(|x| *x += 1)
|
.and_modify(|x| *x += 1)
|
||||||
.or_insert(1);
|
.or_insert(1);
|
||||||
if *attempt_no > MAX_REPAIR_RETRY_LOOP_ATTEMPTS {
|
if *attempt_no > MAX_REPAIR_RETRY_LOOP_ATTEMPTS {
|
||||||
panic!("We have tried to repair duplicate slot: {} more than {} times and are unable to freeze a block with bankhash {}, instead we have a block with bankhash {:?}. This is most likely a bug in the runtime. At this point manual intervention is needed to make progress. Exiting", *duplicate_slot, MAX_REPAIR_RETRY_LOOP_ATTEMPTS, *correct_hash, frozen_hash);
|
panic!("We have tried to repair duplicate slot: {duplicate_slot} more than {MAX_REPAIR_RETRY_LOOP_ATTEMPTS} times \
|
||||||
|
and are unable to freeze a block with bankhash {correct_hash}, \
|
||||||
|
instead we have a block with bankhash {frozen_hash:?}. \
|
||||||
|
This is most likely a bug in the runtime. \
|
||||||
|
At this point manual intervention is needed to make progress. Exiting");
|
||||||
}
|
}
|
||||||
warn!(
|
warn!(
|
||||||
"Notifying repair service to repair duplicate slot: {}, attempt {}",
|
"Notifying repair service to repair duplicate slot: {}, attempt {}",
|
||||||
|
@ -1266,8 +1285,8 @@ impl ReplayStage {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// If we purged/repaired, then no need to keep the slot in the set of pending work
|
// If we dumped/repaired, then no need to keep the slot in the set of pending work
|
||||||
!did_purge_repair
|
!did_dump_repair
|
||||||
});
|
});
|
||||||
|
|
||||||
// Notify repair of the dumped slots along with the correct hash
|
// Notify repair of the dumped slots along with the correct hash
|
||||||
|
@ -3674,9 +3693,9 @@ pub(crate) mod tests {
|
||||||
pub struct ReplayBlockstoreComponents {
|
pub struct ReplayBlockstoreComponents {
|
||||||
pub blockstore: Arc<Blockstore>,
|
pub blockstore: Arc<Blockstore>,
|
||||||
validator_node_to_vote_keys: HashMap<Pubkey, Pubkey>,
|
validator_node_to_vote_keys: HashMap<Pubkey, Pubkey>,
|
||||||
my_pubkey: Pubkey,
|
pub(crate) my_pubkey: Pubkey,
|
||||||
cluster_info: ClusterInfo,
|
cluster_info: ClusterInfo,
|
||||||
leader_schedule_cache: Arc<LeaderScheduleCache>,
|
pub(crate) leader_schedule_cache: Arc<LeaderScheduleCache>,
|
||||||
poh_recorder: RwLock<PohRecorder>,
|
poh_recorder: RwLock<PohRecorder>,
|
||||||
tower: Tower,
|
tower: Tower,
|
||||||
rpc_subscriptions: Arc<RpcSubscriptions>,
|
rpc_subscriptions: Arc<RpcSubscriptions>,
|
||||||
|
@ -6037,6 +6056,7 @@ pub(crate) mod tests {
|
||||||
let ReplayBlockstoreComponents {
|
let ReplayBlockstoreComponents {
|
||||||
ref mut vote_simulator,
|
ref mut vote_simulator,
|
||||||
ref blockstore,
|
ref blockstore,
|
||||||
|
ref leader_schedule_cache,
|
||||||
..
|
..
|
||||||
} = replay_blockstore_components(Some(forks), 1, None);
|
} = replay_blockstore_components(Some(forks), 1, None);
|
||||||
|
|
||||||
|
@ -6073,6 +6093,8 @@ pub(crate) mod tests {
|
||||||
None,
|
None,
|
||||||
&mut purge_repair_slot_counter,
|
&mut purge_repair_slot_counter,
|
||||||
&dumped_slots_sender,
|
&dumped_slots_sender,
|
||||||
|
&Pubkey::new_unique(),
|
||||||
|
leader_schedule_cache,
|
||||||
);
|
);
|
||||||
assert_eq!(should_be_dumped, dumped_slots_receiver.recv().ok().unwrap());
|
assert_eq!(should_be_dumped, dumped_slots_receiver.recv().ok().unwrap());
|
||||||
|
|
||||||
|
@ -6128,6 +6150,7 @@ pub(crate) mod tests {
|
||||||
ref mut tower,
|
ref mut tower,
|
||||||
ref blockstore,
|
ref blockstore,
|
||||||
ref mut vote_simulator,
|
ref mut vote_simulator,
|
||||||
|
ref leader_schedule_cache,
|
||||||
..
|
..
|
||||||
} = replay_components;
|
} = replay_components;
|
||||||
|
|
||||||
|
@ -6192,6 +6215,8 @@ pub(crate) mod tests {
|
||||||
None,
|
None,
|
||||||
&mut PurgeRepairSlotCounter::default(),
|
&mut PurgeRepairSlotCounter::default(),
|
||||||
&dumped_slots_sender,
|
&dumped_slots_sender,
|
||||||
|
&Pubkey::new_unique(),
|
||||||
|
leader_schedule_cache,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Check everything was purged properly
|
// Check everything was purged properly
|
||||||
|
@ -7031,6 +7056,54 @@ pub(crate) mod tests {
|
||||||
assert_eq!(received_slots, vec![8, 9, 11]);
|
assert_eq!(received_slots, vec![8, 9, 11]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic(expected = "We are attempting to dump a block that we produced")]
|
||||||
|
fn test_dump_own_slots_fails() {
|
||||||
|
// Create the tree of banks in a BankForks object
|
||||||
|
let forks = tr(0) / (tr(1)) / (tr(2));
|
||||||
|
|
||||||
|
let ReplayBlockstoreComponents {
|
||||||
|
ref mut vote_simulator,
|
||||||
|
ref blockstore,
|
||||||
|
ref my_pubkey,
|
||||||
|
ref leader_schedule_cache,
|
||||||
|
..
|
||||||
|
} = replay_blockstore_components(Some(forks), 1, None);
|
||||||
|
|
||||||
|
let VoteSimulator {
|
||||||
|
ref mut progress,
|
||||||
|
ref bank_forks,
|
||||||
|
..
|
||||||
|
} = vote_simulator;
|
||||||
|
|
||||||
|
let (mut ancestors, mut descendants) = {
|
||||||
|
let r_bank_forks = bank_forks.read().unwrap();
|
||||||
|
(r_bank_forks.ancestors(), r_bank_forks.descendants())
|
||||||
|
};
|
||||||
|
|
||||||
|
// Insert different versions of both 1 and 2. Although normally these slots would be dumped,
|
||||||
|
// because we were the leader for these slots we should panic
|
||||||
|
let mut duplicate_slots_to_repair = DuplicateSlotsToRepair::default();
|
||||||
|
duplicate_slots_to_repair.insert(1, Hash::new_unique());
|
||||||
|
duplicate_slots_to_repair.insert(2, Hash::new_unique());
|
||||||
|
let mut purge_repair_slot_counter = PurgeRepairSlotCounter::default();
|
||||||
|
let (dumped_slots_sender, _) = unbounded();
|
||||||
|
|
||||||
|
ReplayStage::dump_then_repair_correct_slots(
|
||||||
|
&mut duplicate_slots_to_repair,
|
||||||
|
&mut ancestors,
|
||||||
|
&mut descendants,
|
||||||
|
progress,
|
||||||
|
bank_forks,
|
||||||
|
blockstore,
|
||||||
|
None,
|
||||||
|
&mut purge_repair_slot_counter,
|
||||||
|
&dumped_slots_sender,
|
||||||
|
my_pubkey,
|
||||||
|
leader_schedule_cache,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
fn run_compute_and_select_forks(
|
fn run_compute_and_select_forks(
|
||||||
bank_forks: &RwLock<BankForks>,
|
bank_forks: &RwLock<BankForks>,
|
||||||
progress: &mut ProgressMap,
|
progress: &mut ProgressMap,
|
||||||
|
|
Loading…
Reference in New Issue