From 1df15d85c3b5a07b2e03236c20ce6ebcd1ea52a4 Mon Sep 17 00:00:00 2001
From: Ryo Onodera
Date: Fri, 30 Oct 2020 19:31:23 +0900
Subject: [PATCH] Fix tower/blockstore unsync due to external causes (#12671)

* Fix tower/blockstore unsync due to external causes

* Add and clean up long comments

* Clean up test

* Comment about warped_slot_history

* Run test_future_tower with master-only/master-slave

* Update comments about false leader condition
---
 core/src/consensus.rs                | 225 +++++++++++++++++++--------
 local-cluster/tests/local_cluster.rs | 129 ++++++++++++++-
 sdk/program/src/slot_history.rs      |   2 +-
 3 files changed, 287 insertions(+), 69 deletions(-)

diff --git a/core/src/consensus.rs b/core/src/consensus.rs
index 3a91419894..839459438f 100644
--- a/core/src/consensus.rs
+++ b/core/src/consensus.rs
@@ -515,6 +515,59 @@ impl Tower {
             .map(|last_voted_slot| {
                 let root = self.root();
                 let empty_ancestors = HashSet::default();
+                let empty_ancestors_due_to_minor_unsynced_ledger = || {
+                    // This condition (stale stray last vote) shouldn't occur under normal validator
+                    // operation, indicating something unusual happened.
+                    // This condition could be introduced by manual ledger mishandling,
+                    // validator SEGV, OS/HW crash, or plain No Free Space FS error.
+
+                    // However, returning empty ancestors as a fallback here shouldn't result in
+                    // slashing by itself (Note that we couldn't fully preclude any kind of slashing if
+                    // the failure was OS or HW level).
+
+                    // Firstly, lockout is ensured elsewhere.
+
+                    // Also, there is no risk of optimistic conf. violation. Although empty ancestors
+                    // could result in incorrect (= more than actual) locked_out_stake and
+                    // false-positive SwitchProof later in this function, there should be no such
+                    // heavier fork candidate in the first place if the last vote (or any of its
+                    // unavailable ancestors) was already optimistically confirmed.
+                    // The only exception is when another validator is already violating it...
+                    if self.is_first_switch_check() && switch_slot < last_voted_slot {
+                        // `switch < last` is needed so that we don't warn! this message merely
+                        // because a newer snapshot was used on validator restart
+                        let message = format!(
+                            "bank_forks doesn't have corresponding data for the stray restored \
+                             last vote({}), meaning some inconsistency between saved tower and ledger.",
+                            last_voted_slot
+                        );
+                        warn!("{}", message);
+                        datapoint_warn!("tower_warn", ("warn", message, String));
+                    }
+                    &empty_ancestors
+                };
+
+                let suspended_decision_due_to_major_unsynced_ledger = || {
+                    // This peculiar corner handling is needed mainly for a tower which is newer than
+                    // blockstore. (Yes, we tolerate this to make validator maintenance easier for
+                    // operators.)
+                    // This condition could be introduced by manual ledger mishandling,
+                    // validator SEGV, OS/HW crash, or plain No Free Space FS error.
+
+                    // When we reach this clause, it basically means the validator is badly running
+                    // with a future tower while replaying past slots; the especially problematic one
+                    // is last_voted_slot.
+                    // So, don't re-vote on it by returning a pseudo FailedSwitchThreshold; otherwise
+                    // we would be slashed for a double vote on one of last_vote_ancestors.
+                    // (Well, needless to say, re-creating the duplicate block must be handled properly
+                    // at the banking stage: https://github.com/solana-labs/solana/issues/8232)
+                    //
+                    // To be specific, the replay stage is tricked into a false perception where
+                    // last_vote_ancestors is AVAILABLE for descendant-of-`switch_slot`, stale, and
+                    // stray slots (which should always be empty_ancestors).
+                    //
+                    // This is covered by test_future_tower_* in local_cluster
+                    SwitchForkDecision::FailedSwitchThreshold(0, total_stake)
+                };

                 let last_vote_ancestors =
                     ancestors.get(&last_voted_slot).unwrap_or_else(|| {
@@ -529,35 +582,7 @@ impl Tower {
                             // all of them.
                             panic!("no ancestors found with slot: {}", last_voted_slot);
                         } else {
-                            // This condition (stale stray last vote) shouldn't occur under normal validator
-                            // operation, indicating something unusual happened.
-                            // Possible causes include: OS/HW crash, validator process crash, only saved tower
-                            // is moved over to a new setup, etc...
-
-                            // However, returning empty ancestors as a fallback here shouldn't result in
-                            // slashing by itself (Note that we couldn't fully preclude any kind of slashing if
-                            // the failure was OS or HW level).
-
-                            // Firstly, lockout is ensured elsewhere.
-
-                            // Also, there is no risk of optimistic conf. violation. Although empty ancestors
-                            // could result in incorrect (= more than actual) locked_out_stake and
-                            // false-positive SwitchProof later in this function, there should be no such a
-                            // heavier fork candidate, first of all, if the last vote (or any of its
-                            // unavailable ancestors) were already optimistically confirmed.
-                            // The only exception is that other validator is already violating it...
-                            if self.is_first_switch_check() && switch_slot < last_voted_slot {
-                                // `switch < last` is needed not to warn! this message just because of using
-                                // newer snapshots on validator restart
-                                let message = format!(
-                                    "bank_forks doesn't have corresponding data for the stray restored \
-                                     last vote({}), meaning some inconsistency between saved tower and ledger.",
-                                    last_voted_slot
-                                );
-                                warn!("{}", message);
-                                datapoint_warn!("tower_warn", ("warn", message, String));
-                            }
-                            &empty_ancestors
+                            empty_ancestors_due_to_minor_unsynced_ledger()
                         }
                     });

@@ -569,13 +594,18 @@ impl Tower {
             return SwitchForkDecision::SameFork;
         }

-        assert!(
-            !last_vote_ancestors.contains(&switch_slot),
-            "Should never consider switching to slot ({}), which is ancestors({:?}) of last vote: {}",
-            switch_slot,
-            last_vote_ancestors,
-            last_voted_slot
-        );
+        if last_vote_ancestors.contains(&switch_slot) {
+            if !self.is_stray_last_vote() {
+                panic!(
+                    "Should never consider switching to slot ({}), which is ancestors({:?}) of last vote: {}",
+                    switch_slot,
+                    last_vote_ancestors,
+                    last_voted_slot
+                );
+            } else {
+                return suspended_decision_due_to_major_unsynced_ledger();
+            }
+        }

         // By this point, we know the `switch_slot` is on a different fork
         // (is neither an ancestor nor descendant of `last_vote`), so a
@@ -846,14 +876,6 @@ impl Tower {
         );
         assert_eq!(slot_history.check(replayed_root), Check::Found);

-        // reconcile_blockstore_roots_with_tower() should already have aligned these.
-        assert!(
-            tower_root <= replayed_root,
-            format!(
-                "tower root: {:?} >= replayed root slot: {}",
-                tower_root, replayed_root
-            )
-        );
         assert!(
             self.last_vote == Vote::default() && self.lockouts.votes.is_empty()
                 || self.last_vote != Vote::default() && !self.lockouts.votes.is_empty(),
@@ -864,16 +886,59 @@ impl Tower {
         );

         if let Some(last_voted_slot) = self.last_voted_slot() {
-            if slot_history.check(last_voted_slot) == Check::TooOld {
-                // We could try hard to anchor with other older votes, but opt to simplify the
-                // following logic
-                return Err(TowerError::TooOldTower(
+            if tower_root <= replayed_root {
+                // Normally, we go into this clause, possibly with the help of
+                // reconcile_blockstore_roots_with_tower()
+                if slot_history.check(last_voted_slot) == Check::TooOld {
+                    // We could try hard to anchor with other older votes, but opt to simplify the
+                    // following logic
+                    return Err(TowerError::TooOldTower(
+                        last_voted_slot,
+                        slot_history.oldest(),
+                    ));
+                }
+
+                self.adjust_lockouts_with_slot_history(slot_history)?;
+                self.initialize_root(replayed_root);
+            } else {
+                // This should never occur under normal operation.
+                // While this validator's voting is suspended this way,
+                // suspended_decision_due_to_major_unsynced_ledger() will also be hit.
+                let message = format!(
+                    "For some reason, we're REPROCESSING slots which have already been \
+                     voted on and ROOTED by us; \
+                     VOTING will be SUSPENDED UNTIL {}!",
                     last_voted_slot,
-                slot_history.oldest(),
-            ));
+                );
+                error!("{}", message);
+                datapoint_error!("tower_error", ("error", message, String));
+
+                // Let's pass through adjust_lockouts_with_slot_history just for sanitization,
+                // using a synthesized SlotHistory.
+
+                let mut warped_slot_history = (*slot_history).clone();
+                // Blockstore doesn't have the tower_root slot because of
+                // (replayed_root < tower_root) in this else clause, meaning the tower is from
+                // the future from the view of blockstore.
+                // Pretend the blockstore has the future tower_root to anchor exactly with that
+                // slot by adding tower_root to a slot history. The added slot will be newer
+                // than all slots in the slot history (remember tower_root > replayed_root),
+                // satisfying the slot history invariant.
+                // Thus, the whole process will be safe as well because tower_root exists
+                // within both tower and slot history, guaranteeing the success of adjustment
+                // and retaining all future votes correctly while sanitizing.
+                warped_slot_history.add(tower_root);
+
+                self.adjust_lockouts_with_slot_history(&warped_slot_history)?;
+                // don't update root; the future tower's root should be kept across validator
+                // restarts to continue to show the scary messages at restarts until the next
+                // voting.
             }
-        self.adjust_lockouts_with_slot_history(slot_history)?;
-        self.initialize_root(replayed_root);
+        } else {
+            // This else clause is for a newly created tower.
+            // initialize_lockouts_from_bank() should ensure the following invariant,
+            // otherwise we're screwing something up.
+            assert_eq!(tower_root, replayed_root);
         }

         Ok(self)
@@ -1152,8 +1217,11 @@ impl SavedTower {
     }
 }

-// Given an untimely crash, tower may have roots that are not reflected in blockstore because
-// `ReplayState::handle_votable_bank()` saves tower before setting blockstore roots
+// Given an untimely crash, tower may have roots that are not reflected in blockstore,
+// or the reverse of this.
+// That's because we don't impose any ordering guarantee or any kind of write barriers
+// between tower (plain old POSIX fs calls) and blockstore (through RocksDB), when
+// `ReplayState::handle_votable_bank()` saves tower before setting blockstore roots.
 pub fn reconcile_blockstore_roots_with_tower(
     tower: &Tower,
     blockstore: &Blockstore,
@@ -1173,12 +1241,23 @@ pub fn reconcile_blockstore_roots_with_tower(
                 ),
             })
             .collect();
-        assert!(
-            !new_roots.is_empty(),
-            "at least 1 parent slot must be found"
-        );
-
-        blockstore.set_roots(&new_roots)?;
+        if !new_roots.is_empty() {
+            info!(
+                "Reconciling slots as root based on tower root: {:?} ({}..{}) ",
+                new_roots, tower_root, last_blockstore_root
+            );
+            blockstore.set_roots(&new_roots)?;
+        } else {
+            // This indicates we're in a bad state, but we still don't panic here.
+            // That's because we might still have a chance of recovering properly
+            // with a newer snapshot.
+            warn!(
+                "Couldn't find any ancestor slots from tower root ({}) \
+                 towards blockstore root ({}); blockstore pruned or only \
+                 tower moved into new ledger?",
+                tower_root, last_blockstore_root,
+            );
+        }
     }
     Ok(())
 }
@@ -2700,8 +2779,7 @@ pub mod test {
     }

     #[test]
-    #[should_panic(expected = "at least 1 parent slot must be found")]
-    fn test_reconcile_blockstore_roots_with_tower_panic_no_parent() {
+    fn test_reconcile_blockstore_roots_with_tower_nop_no_parent() {
         solana_logger::setup();
         let blockstore_path = get_tmp_ledger_path!();
         {
@@ -2717,7 +2795,9 @@ pub mod test {

             let mut tower = Tower::new_with_key(&Pubkey::default());
             tower.lockouts.root_slot = Some(4);
+            assert_eq!(blockstore.last_root(), 0);
             reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
+            assert_eq!(blockstore.last_root(), 0);
         }
         Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction");
     }
@@ -3068,4 +3148,25 @@ pub mod test {

         assert!(tower.adjust_lockouts_after_replay(0, &slot_history).is_ok());
     }
+
+    #[test]
+    fn test_adjust_lockouts_after_replay_future_tower() {
+        let mut tower = Tower::new_for_tests(10, 0.9);
+        tower.lockouts.votes.push_back(Lockout::new(13));
+        tower.lockouts.votes.push_back(Lockout::new(14));
+        let vote = Vote::new(vec![14], Hash::default());
+        tower.last_vote = vote;
+        tower.initialize_root(12);
+
+        let mut slot_history = SlotHistory::default();
+        slot_history.add(0);
+        slot_history.add(2);
+
+        let tower = tower
+            .adjust_lockouts_after_replay(2, &slot_history)
+            .unwrap();
+        assert_eq!(tower.root(), 12);
+        assert_eq!(tower.voted_slots(), vec![13, 14]);
+        assert_eq!(tower.stray_restored_slot, Some(14));
+    }
 }
diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs
index 3a4dc8d50b..df2789e0be 100644
--- a/local-cluster/tests/local_cluster.rs
+++ b/local-cluster/tests/local_cluster.rs
@@ -43,6 +43,7 @@ use solana_sdk::{
     signature::{Keypair, Signer},
     system_transaction,
 };
+use solana_vote_program::vote_state::MAX_LOCKOUT_HISTORY;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::{
     collections::{HashMap, HashSet},
@@ -1630,7 +1631,7 @@ fn purge_slots(blockstore: &Blockstore, start_slot: Slot, slot_count: Slot) {
     blockstore.purge_slots(start_slot, start_slot + slot_count, PurgeType::Exact);
 }

-fn last_vote_in_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Slot> {
+fn restore_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Tower> {
     let tower = Tower::restore(&ledger_path, &node_pubkey);
     if let Err(tower_err) = tower {
         if tower_err.is_file_missing() {
@@ -1640,11 +1641,15 @@ fn last_vote_in_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Slot>
         }
     }
     // actually saved tower must have at least one vote.
-    let last_vote = Tower::restore(&ledger_path, &node_pubkey)
-        .unwrap()
-        .last_voted_slot()
-        .unwrap();
-    Some(last_vote)
+    Tower::restore(&ledger_path, &node_pubkey).ok()
+}
+
+fn last_vote_in_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Slot> {
+    restore_tower(ledger_path, node_pubkey).map(|tower| tower.last_voted_slot().unwrap())
+}
+
+fn root_in_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Slot> {
+    restore_tower(ledger_path, node_pubkey).map(|tower| tower.root())
+}

 fn remove_tower(ledger_path: &Path, node_pubkey: &Pubkey) {
@@ -1865,6 +1870,118 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
     }
 }

+enum ClusterMode {
+    MasterOnly,
+    MasterSlave,
+}
+
+fn do_test_future_tower(cluster_mode: ClusterMode) {
+    solana_logger::setup();
+
+    // First set up the cluster with one staked node (plus an unstaked node in MasterSlave mode)
+    let slots_per_epoch = 2048;
+    let node_stakes = match cluster_mode {
+        ClusterMode::MasterOnly => vec![100],
+        ClusterMode::MasterSlave => vec![100, 0],
+    };
+
+    let validator_keys = vec![
+        "28bN3xyvrP4E8LwEgtLjhnkb7cY4amQb6DrYAbAYjgRV4GAGgkVM2K7wnxnAS7WDneuavza7x21MiafLu1HkwQt4",
+        "2saHBBoTkLMmttmPQP8KfBkcCw45S5cwtV3wTdGCscRC8uxdgvHxpHiWXKx4LvJjNJtnNcbSv5NdheokFFqnNDt8",
+    ]
+    .iter()
+    .map(|s| (Arc::new(Keypair::from_base58_string(s)), true))
+    .take(node_stakes.len())
+    .collect::<Vec<_>>();
+    let validators = validator_keys
+        .iter()
+        .map(|(kp, _)| kp.pubkey())
+        .collect::<Vec<_>>();
+    let validator_a_pubkey = match cluster_mode {
+        ClusterMode::MasterOnly => validators[0],
+        ClusterMode::MasterSlave => validators[1],
+    };
+
+    let config = ClusterConfig {
+        cluster_lamports: 100_000,
+        node_stakes: node_stakes.clone(),
+        validator_configs: vec![ValidatorConfig::default(); node_stakes.len()],
+        validator_keys: Some(validator_keys),
+        slots_per_epoch,
+        stakers_slot_offset: slots_per_epoch,
+        skip_warmup_slots: true,
+        ..ClusterConfig::default()
+    };
+    let mut cluster = LocalCluster::new(&config);
+
+    let val_a_ledger_path = cluster.ledger_path(&validator_a_pubkey);
+
+    loop {
+        sleep(Duration::from_millis(100));
+
+        if let Some(root) = root_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
+            if root >= 15 {
+                break;
+            }
+        }
+    }
+    let purged_slot_before_restart = 10;
+    let validator_a_info = cluster.exit_node(&validator_a_pubkey);
+    {
+        // create a warped future tower without mangling the tower itself
+        info!(
+            "Revert blockstore before slot {} and effectively create a future tower",
+            purged_slot_before_restart,
+        );
+        let blockstore = open_blockstore(&val_a_ledger_path);
+        purge_slots(&blockstore, purged_slot_before_restart, 100);
+    }
+
+    cluster.restart_node(&validator_a_pubkey, validator_a_info);
+
+    let mut newly_rooted = false;
+    let some_root_after_restart = purged_slot_before_restart + 25; // 25 is arbitrary; just wait a bit
+    for _ in 0..600 {
+        sleep(Duration::from_millis(100));
+
+        if let Some(root) = root_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
+            if root >= some_root_after_restart {
+                newly_rooted = true;
+                break;
+            }
+        }
+    }
+    let _validator_a_info = cluster.exit_node(&validator_a_pubkey);
+    if newly_rooted {
+        // there should be no forks; i.e. a monotonically increasing ancestor chain
+        let last_vote = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey).unwrap();
+        let blockstore = open_blockstore(&val_a_ledger_path);
+        let actual_block_ancestors = AncestorIterator::new_inclusive(last_vote, &blockstore)
+            .take_while(|a| *a >= some_root_after_restart)
+            .collect::<Vec<_>>();
+        let expected_continuous_no_fork_votes = (some_root_after_restart..=last_vote)
+            .rev()
+            .collect::<Vec<_>>();
+        assert_eq!(actual_block_ancestors, expected_continuous_no_fork_votes);
+        assert!(actual_block_ancestors.len() > MAX_LOCKOUT_HISTORY);
+        info!("validator managed to handle future tower!");
+    } else {
+        panic!("no root detected");
+    }
+}
+
+#[test]
+#[serial]
+fn test_future_tower_master_only() {
+    do_test_future_tower(ClusterMode::MasterOnly);
+}
+
+#[test]
+#[serial]
+fn test_future_tower_master_slave() {
+    do_test_future_tower(ClusterMode::MasterSlave);
+}
+
 #[test]
 #[serial]
 fn test_no_optimistic_confirmation_violation_with_tower() {
diff --git a/sdk/program/src/slot_history.rs b/sdk/program/src/slot_history.rs
index 293ae38e1b..3d6208cdc9 100644
--- a/sdk/program/src/slot_history.rs
+++ b/sdk/program/src/slot_history.rs
@@ -6,7 +6,7 @@ use bv::BitVec;
 use bv::BitsMut;

 #[repr(C)]
-#[derive(Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Serialize, Deserialize, PartialEq)]
 pub struct SlotHistory {
     pub bits: BitVec,
     pub next_slot: Slot,
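
To see the warped_slot_history idea from adjust_lockouts_after_replay() in isolation, here is a minimal, self-contained Rust sketch. MiniSlotHistory and warp_for_future_tower are names invented for this sketch only; the real type is solana_sdk::slot_history::SlotHistory and the real adjustment happens inside Tower. The sketch shows just the shape of the fix: when the restored tower root is newer than the replayed root, clone the replayed history and add tower_root so the adjustment has an anchor, while the tower's own root is deliberately left untouched.

use std::collections::BTreeSet;

type Slot = u64;

// Simplified stand-in for SlotHistory: just a set of replayed slots.
#[derive(Clone, Debug, Default)]
struct MiniSlotHistory {
    slots: BTreeSet<Slot>,
}

impl MiniSlotHistory {
    fn add(&mut self, slot: Slot) {
        self.slots.insert(slot);
    }

    fn contains(&self, slot: Slot) -> bool {
        self.slots.contains(&slot)
    }

    fn newest(&self) -> Option<Slot> {
        self.slots.iter().next_back().copied()
    }
}

// Clone what blockstore replayed and pretend it also contains tower_root.
// Only meaningful in the branch where tower_root > replayed_root, so the
// added slot is newer than everything already in the history, keeping the
// ordering invariant the patch's comment relies on.
fn warp_for_future_tower(replayed: &MiniSlotHistory, tower_root: Slot) -> MiniSlotHistory {
    let mut warped = replayed.clone();
    warped.add(tower_root);
    warped
}

fn main() {
    // Blockstore only replayed slots 0 and 2, but the saved tower claims root 12
    // (mirrors test_adjust_lockouts_after_replay_future_tower above).
    let mut replayed = MiniSlotHistory::default();
    replayed.add(0);
    replayed.add(2);

    let tower_root: Slot = 12;
    let warped = warp_for_future_tower(&replayed, tower_root);

    // Lockout adjustment can now anchor on tower_root, while the real root is
    // left alone (the patch skips initialize_root() in this branch).
    assert!(warped.contains(tower_root));
    assert_eq!(warped.newest(), Some(tower_root));
    println!("warped history anchors on future tower root {}", tower_root);
}

The key property is that tower_root is strictly greater than every slot already in the replayed history, so appending it cannot reorder or invalidate anything that was actually replayed; it only gives the sanitization pass a slot that exists in both the tower and the (synthesized) history.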