Fix repair behavior concerning our own leader slots (#30200)

panic when trying to dump & repair a block that we produced
Ashwin Sekar 2023-02-09 13:30:12 -08:00 committed by GitHub
parent 4b17acf64e
commit 67f644473b
2 changed files with 89 additions and 12 deletions
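The core of the change is a guard in `ReplayStage::dump_then_repair_correct_slots`: before dumping a duplicate slot so it can be repaired from the cluster, the validator consults the leader schedule and panics if it was itself the leader for that slot. Below is a minimal, self-contained sketch of that guard; the `Pubkey`, `Slot`, and `LeaderScheduleCache` here are simplified stand-ins for the real types (the actual lookup in the diff is `LeaderScheduleCache::slot_leader_at(slot, None)`), not the production implementation.

```rust
use std::collections::HashMap;

// Simplified stand-ins for illustration only.
type Pubkey = u64;
type Slot = u64;

struct LeaderScheduleCache {
    leaders: HashMap<Slot, Pubkey>,
}

impl LeaderScheduleCache {
    // Stand-in for `slot_leader_at(slot, None)` in the real cache.
    fn slot_leader_at(&self, slot: Slot) -> Option<Pubkey> {
        self.leaders.get(&slot).copied()
    }
}

/// Guard sketched from the diff: never dump-and-repair a slot we produced.
/// Hitting this means we made a duplicate block, or replay computed a bank
/// hash that disagrees with the rest of the cluster, so fail loudly instead
/// of silently repairing over our own block.
fn assert_not_own_slot(my_pubkey: Pubkey, slot: Slot, cache: &LeaderScheduleCache) {
    if cache.slot_leader_at(slot) == Some(my_pubkey) {
        panic!("attempting to dump slot {slot}, which this validator produced");
    }
}

fn main() {
    let cache = LeaderScheduleCache {
        leaders: HashMap::from([(1, 7), (2, 9)]),
    };
    assert_not_own_slot(7, 2, &cache); // ok: slot 2 belongs to another leader
    // assert_not_own_slot(7, 1, &cache); // would panic: we were slot 1's leader
}
```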


@@ -1554,6 +1554,8 @@ mod test {
let ReplayBlockstoreComponents {
blockstore: requester_blockstore,
vote_simulator,
my_pubkey,
leader_schedule_cache,
..
} = setup_dead_slot(dead_slot, correct_bank_hashes);
@@ -1639,6 +1641,8 @@ mod test {
None,
&mut PurgeRepairSlotCounter::default(),
&dumped_slots_sender,
&my_pubkey,
&leader_schedule_cache,
);
// Simulate making a request


@@ -919,6 +919,8 @@ impl ReplayStage {
poh_bank.map(|bank| bank.slot()),
&mut purge_repair_slot_counter,
&dumped_slots_sender,
&my_pubkey,
&leader_schedule_cache,
);
dump_then_repair_correct_slots_time.stop();
@@ -1162,6 +1164,7 @@ impl ReplayStage {
(progress, heaviest_subtree_fork_choice)
}
#[allow(clippy::too_many_arguments)]
pub fn dump_then_repair_correct_slots(
duplicate_slots_to_repair: &mut DuplicateSlotsToRepair,
ancestors: &mut HashMap<Slot, HashSet<Slot>>,
@@ -1172,6 +1175,8 @@ impl ReplayStage {
poh_bank_slot: Option<Slot>,
purge_repair_slot_counter: &mut PurgeRepairSlotCounter,
dumped_slots_sender: &DumpedSlotsSender,
my_pubkey: &Pubkey,
leader_schedule_cache: &LeaderScheduleCache,
) {
if duplicate_slots_to_repair.is_empty() {
return;
@@ -1180,10 +1185,10 @@ impl ReplayStage {
let root_bank = bank_forks.read().unwrap().root_bank();
let mut dumped = vec![];
// TODO: handle if alternate version of descendant also got confirmed after ancestor was
// confirmed, what happens then? Should probably keep track of purged list and skip things
// in `duplicate_slots_to_repair` that have already been purged. Add test.
// confirmed, what happens then? Should probably keep track of dumped list and skip things
// in `duplicate_slots_to_repair` that have already been dumped. Add test.
duplicate_slots_to_repair.retain(|duplicate_slot, correct_hash| {
// Should not purge duplicate slots if there is currently a poh bank building
// Should not dump duplicate slots if there is currently a poh bank building
// on top of that slot, as BankingStage might still be referencing/touching that state
// concurrently.
// Luckily for us, because the fork choice rule removes duplicate slots from fork
@@ -1203,13 +1208,13 @@ impl ReplayStage {
})
.unwrap_or(false);
let did_purge_repair = {
let did_dump_repair = {
if !is_poh_building_on_duplicate_fork {
let frozen_hash = bank_forks.read().unwrap().bank_hash(*duplicate_slot);
if let Some(frozen_hash) = frozen_hash {
if frozen_hash == *correct_hash {
warn!(
"Trying to purge slot {} with correct_hash {}",
"Trying to dump slot {} with correct_hash {}",
*duplicate_slot, *correct_hash
);
return false;
@@ -1219,19 +1224,29 @@ impl ReplayStage {
)
{
warn!(
"Trying to purge unfrozen slot {} that is not dead",
"Trying to dump unfrozen slot {} that is not dead",
*duplicate_slot
);
return false;
}
} else {
warn!(
"Trying to purge slot {} which does not exist in bank forks",
"Trying to dump slot {} which does not exist in bank forks",
*duplicate_slot
);
return false;
}
// Should not dump slots for which we were the leader
if Some(*my_pubkey) == leader_schedule_cache.slot_leader_at(*duplicate_slot, None) {
panic!("We are attempting to dump a block that we produced. \
This indicates that we are producing duplicate blocks, \
or that there is a bug in our runtime/replay code which \
causes us to compute different bank hashes than the rest of the cluster. \
We froze slot {duplicate_slot} with hash {frozen_hash:?} while the cluster hash is {correct_hash}");
}
Self::purge_unconfirmed_duplicate_slot(
*duplicate_slot,
ancestors,
@@ -1249,7 +1264,11 @@ impl ReplayStage {
.and_modify(|x| *x += 1)
.or_insert(1);
if *attempt_no > MAX_REPAIR_RETRY_LOOP_ATTEMPTS {
panic!("We have tried to repair duplicate slot: {} more than {} times and are unable to freeze a block with bankhash {}, instead we have a block with bankhash {:?}. This is most likely a bug in the runtime. At this point manual intervention is needed to make progress. Exiting", *duplicate_slot, MAX_REPAIR_RETRY_LOOP_ATTEMPTS, *correct_hash, frozen_hash);
panic!("We have tried to repair duplicate slot: {duplicate_slot} more than {MAX_REPAIR_RETRY_LOOP_ATTEMPTS} times \
and are unable to freeze a block with bankhash {correct_hash}, \
instead we have a block with bankhash {frozen_hash:?}. \
This is most likely a bug in the runtime. \
At this point manual intervention is needed to make progress. Exiting");
}
warn!(
"Notifying repair service to repair duplicate slot: {}, attempt {}",
@@ -1266,8 +1285,8 @@ impl ReplayStage {
}
};
// If we purged/repaired, then no need to keep the slot in the set of pending work
!did_purge_repair
// If we dumped/repaired, then no need to keep the slot in the set of pending work
!did_dump_repair
});
// Notify repair of the dumped slots along with the correct hash
@@ -3674,9 +3693,9 @@ pub(crate) mod tests {
pub struct ReplayBlockstoreComponents {
pub blockstore: Arc<Blockstore>,
validator_node_to_vote_keys: HashMap<Pubkey, Pubkey>,
my_pubkey: Pubkey,
pub(crate) my_pubkey: Pubkey,
cluster_info: ClusterInfo,
leader_schedule_cache: Arc<LeaderScheduleCache>,
pub(crate) leader_schedule_cache: Arc<LeaderScheduleCache>,
poh_recorder: RwLock<PohRecorder>,
tower: Tower,
rpc_subscriptions: Arc<RpcSubscriptions>,
@@ -6037,6 +6056,7 @@ pub(crate) mod tests {
let ReplayBlockstoreComponents {
ref mut vote_simulator,
ref blockstore,
ref leader_schedule_cache,
..
} = replay_blockstore_components(Some(forks), 1, None);
@@ -6073,6 +6093,8 @@ pub(crate) mod tests {
None,
&mut purge_repair_slot_counter,
&dumped_slots_sender,
&Pubkey::new_unique(),
leader_schedule_cache,
);
assert_eq!(should_be_dumped, dumped_slots_receiver.recv().ok().unwrap());
@@ -6128,6 +6150,7 @@ pub(crate) mod tests {
ref mut tower,
ref blockstore,
ref mut vote_simulator,
ref leader_schedule_cache,
..
} = replay_components;
@@ -6192,6 +6215,8 @@ pub(crate) mod tests {
None,
&mut PurgeRepairSlotCounter::default(),
&dumped_slots_sender,
&Pubkey::new_unique(),
leader_schedule_cache,
);
// Check everything was purged properly
@@ -7031,6 +7056,54 @@ pub(crate) mod tests {
assert_eq!(received_slots, vec![8, 9, 11]);
}
#[test]
#[should_panic(expected = "We are attempting to dump a block that we produced")]
fn test_dump_own_slots_fails() {
// Create the tree of banks in a BankForks object
let forks = tr(0) / (tr(1)) / (tr(2));
let ReplayBlockstoreComponents {
ref mut vote_simulator,
ref blockstore,
ref my_pubkey,
ref leader_schedule_cache,
..
} = replay_blockstore_components(Some(forks), 1, None);
let VoteSimulator {
ref mut progress,
ref bank_forks,
..
} = vote_simulator;
let (mut ancestors, mut descendants) = {
let r_bank_forks = bank_forks.read().unwrap();
(r_bank_forks.ancestors(), r_bank_forks.descendants())
};
// Insert different versions of both 1 and 2. Normally these slots would be dumped,
// but because we were the leader for these slots we should panic instead
let mut duplicate_slots_to_repair = DuplicateSlotsToRepair::default();
duplicate_slots_to_repair.insert(1, Hash::new_unique());
duplicate_slots_to_repair.insert(2, Hash::new_unique());
let mut purge_repair_slot_counter = PurgeRepairSlotCounter::default();
let (dumped_slots_sender, _) = unbounded();
ReplayStage::dump_then_repair_correct_slots(
&mut duplicate_slots_to_repair,
&mut ancestors,
&mut descendants,
progress,
bank_forks,
blockstore,
None,
&mut purge_repair_slot_counter,
&dumped_slots_sender,
my_pubkey,
leader_schedule_cache,
);
}
fn run_compute_and_select_forks(
bank_forks: &RwLock<BankForks>,
progress: &mut ProgressMap,