Fix tower/blockstore unsync due to external causes (#12671)

* Fix tower/blockstore unsync due to external causes

* Add and clean up long comments

* Clean up test

* Comment about warped_slot_history

* Run test_future_tower with master-only/master-slave

* Update comments about false leader condition
Ryo Onodera 2020-10-30 19:31:23 +09:00 committed by GitHub
parent 9263ae1c60
commit 1df15d85c3
3 changed files with 287 additions and 69 deletions


@@ -515,6 +515,59 @@ impl Tower {
.map(|last_voted_slot| {
let root = self.root();
let empty_ancestors = HashSet::default();
let empty_ancestors_due_to_minor_unsynced_ledger = || {
// This condition (stale stray last vote) shouldn't occur under normal validator
// operation; it indicates something unusual happened.
// It could be introduced by manual ledger mishandling, a validator SEGV, an
// OS/HW crash, or a plain "no free space" FS error.
// However, returning empty ancestors as a fallback here shouldn't result in
// slashing by itself (note that we can't fully preclude every kind of slashing if
// the failure was at the OS or HW level).
// Firstly, lockout is ensured elsewhere.
// Also, there is no risk of optimistic confirmation violation: although empty
// ancestors could result in an overestimated locked_out_stake and a
// false-positive SwitchProof later in this function, no such heavier fork
// candidate should exist in the first place if the last vote (or any of its
// unavailable ancestors) was already optimistically confirmed.
// The only exception is when another validator is already violating it...
if self.is_first_switch_check() && switch_slot < last_voted_slot {
// The `switch_slot < last_voted_slot` check is needed so we don't warn! with this
// message merely because a newer snapshot was used at validator restart
let message = format!(
"bank_forks doesn't have corresponding data for the stray restored \
last vote({}), meaning some inconsistency between saved tower and ledger.",
last_voted_slot
);
warn!("{}", message);
datapoint_warn!("tower_warn", ("warn", message, String));
}
&empty_ancestors
};
let suspended_decision_due_to_major_unsynced_ledger = || {
// This peculiar corner-case handling is needed mainly for a tower which is newer
// than the blockstore. (Yes, we tolerate this for ease of validator maintenance
// by operators.)
// It could be introduced by manual ledger mishandling, a validator SEGV, an
// OS/HW crash, or a plain "no free space" FS error.
// Reaching this clause basically means the validator is wrongly running with a
// future tower while replaying past slots; the last_voted_slot is especially
// problematic.
// So, don't re-vote on it by returning a pseudo FailedSwitchThreshold; otherwise
// we could be slashed for a double vote on one of last_vote_ancestors.
// (Needless to say, re-creating the duplicate block must be handled properly
// at the banking stage: https://github.com/solana-labs/solana/issues/8232)
//
// To be specific, the replay stage is tricked into the false perception that
// last_vote_ancestors is AVAILABLE for descendant-of-`switch_slot`, stale, and
// stray slots (which should always be empty_ancestors).
//
// This is covered by test_future_tower_* in local_cluster
SwitchForkDecision::FailedSwitchThreshold(0, total_stake)
};
let last_vote_ancestors =
ancestors.get(&last_voted_slot).unwrap_or_else(|| {
@@ -529,35 +582,7 @@ impl Tower {
// all of them.
panic!("no ancestors found with slot: {}", last_voted_slot);
} else {
empty_ancestors_due_to_minor_unsynced_ledger()
}
});
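Note: to make the two new fallbacks concrete, here is a toy distillation (hypothetical names and simplified types, not the actual replay-stage code). A stray tower with missing ancestry degrades to an empty ancestor set and voting continues, while a switch target that sits among the last vote's ancestors forces a pseudo FailedSwitchThreshold so the validator cannot double-vote:

use std::collections::HashSet;

type Slot = u64;

#[derive(Debug, PartialEq)]
enum StrayDecision {
    // pretend the switch threshold failed: (observed switch stake, total stake)
    FailedSwitchThreshold(u64, u64),
    // proceed with the normal switch check, possibly against empty ancestors
    ContinueSwitchCheck,
}

fn stray_switch_decision(
    switch_slot: Slot,
    last_vote_ancestors: &HashSet<Slot>,
    total_stake: u64,
) -> StrayDecision {
    if last_vote_ancestors.contains(&switch_slot) {
        // major unsync: never re-vote on a slot we already voted past
        StrayDecision::FailedSwitchThreshold(0, total_stake)
    } else {
        // minor unsync: lockouts enforced elsewhere still protect us
        StrayDecision::ContinueSwitchCheck
    }
}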
@@ -569,13 +594,18 @@ impl Tower {
return SwitchForkDecision::SameFork;
}
if last_vote_ancestors.contains(&switch_slot) {
if !self.is_stray_last_vote() {
panic!(
"Should never consider switching to slot ({}), which is in the ancestor set ({:?}) of last vote: {}",
switch_slot,
last_vote_ancestors,
last_voted_slot
);
} else {
return suspended_decision_due_to_major_unsynced_ledger();
}
}
// By this point, we know the `switch_slot` is on a different fork
// (is neither an ancestor nor descendant of `last_vote`), so a
@@ -846,14 +876,6 @@ impl Tower {
);
assert_eq!(slot_history.check(replayed_root), Check::Found);
assert!(
self.last_vote == Vote::default() && self.lockouts.votes.is_empty()
|| self.last_vote != Vote::default() && !self.lockouts.votes.is_empty(),
@@ -864,16 +886,59 @@ impl Tower {
);
if let Some(last_voted_slot) = self.last_voted_slot() {
if tower_root <= replayed_root {
// Normally, we enter this clause, possibly with the help of
// reconcile_blockstore_roots_with_tower()
if slot_history.check(last_voted_slot) == Check::TooOld {
// We could try hard to anchor with other older votes, but opt to simplify the
// following logic
return Err(TowerError::TooOldTower(
last_voted_slot,
slot_history.oldest(),
));
}
self.adjust_lockouts_with_slot_history(slot_history)?;
self.initialize_root(replayed_root);
} else {
// This should never occur under normal operation.
// While this validator's voting is suspended this way,
// suspended_decision_due_to_major_unsynced_ledger() will also be hit.
let message = format!(
"For some reason, we're REPROCESSING slots which have already been \
voted on and ROOTED by us; \
VOTING will be SUSPENDED UNTIL {}!",
last_voted_slot,
);
error!("{}", message);
datapoint_error!("tower_error", ("error", message, String));
// Let's still pass through adjust_lockouts_with_slot_history() just for
// sanitization, using a synthesized SlotHistory.
let mut warped_slot_history = (*slot_history).clone();
// Blockstore doesn't have the tower_root slot because replayed_root < tower_root
// in this else clause, meaning the tower is from the future from the
// blockstore's point of view.
// Pretend the blockstore has the future tower_root to anchor exactly at that
// slot by adding tower_root to the slot history. The added slot will be newer
// than all slots in the slot history (remember tower_root > replayed_root),
// satisfying the slot history invariant.
// Thus, the whole process remains safe: tower_root exists in both the tower
// and the slot history, guaranteeing the success of the adjustment and
// correctly retaining all future votes during sanitization.
warped_slot_history.add(tower_root);
self.adjust_lockouts_with_slot_history(&warped_slot_history)?;
// Don't update the root here; a future tower's root should be kept across
// validator restarts so the scary messages above reappear at every restart
// until the next voting.
}
} else {
// This else clause is for a newly created tower.
// initialize_lockouts_from_bank() should ensure the following invariant;
// otherwise we've messed something up.
assert_eq!(tower_root, replayed_root);
}
Ok(self)
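Note: the "warp" above is the crux of the future-tower handling. A minimal sketch of the idea, assuming this commit's new Clone derive on SlotHistory and the sysvar's existing add()/check() API:

use solana_sdk::slot_history::{Check, SlotHistory};

fn warp_for_sanitization(slot_history: &SlotHistory, tower_root: u64) -> SlotHistory {
    // clone so the real (replayed) history stays untouched
    let mut warped = slot_history.clone();
    // pretend the blockstore already rooted the future tower_root; it is newer
    // than everything recorded, so the SlotHistory invariant still holds
    warped.add(tower_root);
    debug_assert_eq!(warped.check(tower_root), Check::Found);
    warped
}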
@@ -1152,8 +1217,11 @@ impl SavedTower {
}
}
// Given an untimely crash, the tower may have roots that are not reflected in the
// blockstore, or vice versa.
// That's because we don't impose any ordering guarantee or write barrier between
// the tower (plain old POSIX fs calls) and the blockstore (through RocksDB):
// `ReplayState::handle_votable_bank()` saves the tower before setting blockstore roots.
pub fn reconcile_blockstore_roots_with_tower(
tower: &Tower,
blockstore: &Blockstore,
@@ -1173,12 +1241,23 @@ pub fn reconcile_blockstore_roots_with_tower(
),
})
.collect();
if !new_roots.is_empty() {
info!(
"Reconciling slots as root based on tower root: {:?} ({}..{}) ",
new_roots, tower_root, last_blockstore_root
);
blockstore.set_roots(&new_roots)?;
} else {
// This indicates we're in a bad state, but don't panic here:
// we might still have a chance of recovering properly with a
// newer snapshot.
warn!(
"Couldn't find any ancestor slots from tower root ({}) \
towards blockstore root ({}); blockstore pruned or only \
tower moved into new ledger?",
tower_root, last_blockstore_root,
);
}
}
Ok(())
}
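Note: for orientation, the overall shape of the reconciliation is roughly the following (an abridged sketch pieced together from the hunks above; AncestorIterator is the same helper the new test below uses, and the real code additionally panics if the walk drops below the blockstore root):

use solana_ledger::{ancestor_iterator::AncestorIterator, blockstore::Blockstore};
use solana_sdk::clock::Slot;

fn reconcile_sketch(tower_root: Slot, blockstore: &Blockstore) {
    let last_blockstore_root = blockstore.last_root();
    if last_blockstore_root < tower_root {
        // walk back from the tower root until reaching slots the blockstore already roots
        let new_roots: Vec<Slot> = AncestorIterator::new_inclusive(tower_root, blockstore)
            .take_while(|slot| *slot > last_blockstore_root)
            .collect();
        if !new_roots.is_empty() {
            blockstore.set_roots(&new_roots).expect("set_roots");
        }
        // else: warn and hope a newer snapshot recovers us, as above
    }
}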
@@ -2700,8 +2779,7 @@ pub mod test {
}
#[test]
fn test_reconcile_blockstore_roots_with_tower_nop_no_parent() {
solana_logger::setup();
let blockstore_path = get_tmp_ledger_path!();
{
@@ -2717,7 +2795,9 @@ pub mod test {
let mut tower = Tower::new_with_key(&Pubkey::default());
tower.lockouts.root_slot = Some(4);
assert_eq!(blockstore.last_root(), 0);
reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
assert_eq!(blockstore.last_root(), 0);
}
Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction");
}
@@ -3068,4 +3148,25 @@ pub mod test {
assert!(tower.adjust_lockouts_after_replay(0, &slot_history).is_ok());
}
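// Walkthrough of the scenario below: the restored tower claims root 12 with votes
// on 13 and 14, while replay only reached root 2, so tower_root (12) >
// replayed_root (2). adjust_lockouts_after_replay() must take the future-tower
// path: warp the slot history by adding slot 12, keep the votes on 13 and 14,
// leave the root at 12, and mark the restored last vote (14) as stray.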
#[test]
fn test_adjust_lockouts_after_replay_future_tower() {
let mut tower = Tower::new_for_tests(10, 0.9);
tower.lockouts.votes.push_back(Lockout::new(13));
tower.lockouts.votes.push_back(Lockout::new(14));
let vote = Vote::new(vec![14], Hash::default());
tower.last_vote = vote;
tower.initialize_root(12);
let mut slot_history = SlotHistory::default();
slot_history.add(0);
slot_history.add(2);
let tower = tower
.adjust_lockouts_after_replay(2, &slot_history)
.unwrap();
assert_eq!(tower.root(), 12);
assert_eq!(tower.voted_slots(), vec![13, 14]);
assert_eq!(tower.stray_restored_slot, Some(14));
}
}


@@ -43,6 +43,7 @@ use solana_sdk::{
signature::{Keypair, Signer},
system_transaction,
};
use solana_vote_program::vote_state::MAX_LOCKOUT_HISTORY;
use std::sync::atomic::{AtomicBool, Ordering};
use std::{
collections::{HashMap, HashSet},
@@ -1630,7 +1631,7 @@ fn purge_slots(blockstore: &Blockstore, start_slot: Slot, slot_count: Slot) {
blockstore.purge_slots(start_slot, start_slot + slot_count, PurgeType::Exact);
}
fn restore_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Tower> {
let tower = Tower::restore(&ledger_path, &node_pubkey);
if let Err(tower_err) = tower {
if tower_err.is_file_missing() {
@@ -1640,11 +1641,15 @@ fn last_vote_in_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Slot>
}
}
// an actually saved tower must have at least one vote
Tower::restore(&ledger_path, &node_pubkey).ok()
}
fn last_vote_in_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Slot> {
restore_tower(ledger_path, node_pubkey).map(|tower| tower.last_voted_slot().unwrap())
}
fn root_in_tower(ledger_path: &Path, node_pubkey: &Pubkey) -> Option<Slot> {
restore_tower(ledger_path, node_pubkey).map(|tower| tower.root())
}
fn remove_tower(ledger_path: &Path, node_pubkey: &Pubkey) {
@@ -1865,6 +1870,118 @@ fn do_test_optimistic_confirmation_violation_with_or_without_tower(with_tower: b
}
}
enum ClusterMode {
MasterOnly,
MasterSlave,
}
fn do_test_future_tower(cluster_mode: ClusterMode) {
solana_logger::setup();
// First set up the cluster (a single staked node; MasterSlave mode adds an unstaked one)
let slots_per_epoch = 2048;
let node_stakes = match cluster_mode {
ClusterMode::MasterOnly => vec![100],
ClusterMode::MasterSlave => vec![100, 0],
};
let validator_keys = vec![
"28bN3xyvrP4E8LwEgtLjhnkb7cY4amQb6DrYAbAYjgRV4GAGgkVM2K7wnxnAS7WDneuavza7x21MiafLu1HkwQt4",
"2saHBBoTkLMmttmPQP8KfBkcCw45S5cwtV3wTdGCscRC8uxdgvHxpHiWXKx4LvJjNJtnNcbSv5NdheokFFqnNDt8",
]
.iter()
.map(|s| (Arc::new(Keypair::from_base58_string(s)), true))
.take(node_stakes.len())
.collect::<Vec<_>>();
let validators = validator_keys
.iter()
.map(|(kp, _)| kp.pubkey())
.collect::<Vec<_>>();
let validator_a_pubkey = match cluster_mode {
ClusterMode::MasterOnly => validators[0],
ClusterMode::MasterSlave => validators[1],
};
let config = ClusterConfig {
cluster_lamports: 100_000,
node_stakes: node_stakes.clone(),
validator_configs: vec![ValidatorConfig::default(); node_stakes.len()],
validator_keys: Some(validator_keys),
slots_per_epoch,
stakers_slot_offset: slots_per_epoch,
skip_warmup_slots: true,
..ClusterConfig::default()
};
let mut cluster = LocalCluster::new(&config);
let val_a_ledger_path = cluster.ledger_path(&validator_a_pubkey);
loop {
sleep(Duration::from_millis(100));
if let Some(root) = root_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
if root >= 15 {
break;
}
}
}
let purged_slot_before_restart = 10;
let validator_a_info = cluster.exit_node(&validator_a_pubkey);
{
// create a warped future tower without mangling the tower itself
info!(
"Revert blockstore before slot {} and effectively create a future tower",
purged_slot_before_restart,
);
let blockstore = open_blockstore(&val_a_ledger_path);
purge_slots(&blockstore, purged_slot_before_restart, 100);
}
cluster.restart_node(&validator_a_pubkey, validator_a_info);
let mut newly_rooted = false;
let some_root_after_restart = purged_slot_before_restart + 25; // 25 is arbitrary; just wait a bit
for _ in 0..600 {
sleep(Duration::from_millis(100));
if let Some(root) = root_in_tower(&val_a_ledger_path, &validator_a_pubkey) {
if root >= some_root_after_restart {
newly_rooted = true;
break;
}
}
}
let _validator_a_info = cluster.exit_node(&validator_a_pubkey);
if newly_rooted {
// there should be no forks; i.e. monotonically increasing ancestor chain
let last_vote = last_vote_in_tower(&val_a_ledger_path, &validator_a_pubkey).unwrap();
let blockstore = open_blockstore(&val_a_ledger_path);
let actual_block_ancestors = AncestorIterator::new_inclusive(last_vote, &blockstore)
.take_while(|a| *a >= some_root_after_restart)
.collect::<Vec<_>>();
let expected_continuous_no_fork_votes = (some_root_after_restart..=last_vote)
.rev()
.collect::<Vec<_>>();
assert_eq!(actual_block_ancestors, expected_continuous_no_fork_votes);
assert!(actual_block_ancestors.len() > MAX_LOCKOUT_HISTORY);
info!("validator managed to handle future tower!");
} else {
panic!("no root detected");
}
}
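// Note: the two wrappers below exercise the fix both when the restarted validator
// is the only staked node (MasterOnly) and when it is an unstaked observer
// following a staked master (MasterSlave); see the node_stakes and
// validator_a_pubkey selection above.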
#[test]
#[serial]
fn test_future_tower_master_only() {
do_test_future_tower(ClusterMode::MasterOnly);
}
#[test]
#[serial]
fn test_future_tower_master_slave() {
do_test_future_tower(ClusterMode::MasterSlave);
}
#[test]
#[serial]
fn test_no_optimistic_confirmation_violation_with_tower() {


@@ -6,7 +6,7 @@ use bv::BitVec;
use bv::BitsMut;
#[repr(C)]
#[derive(Clone, Serialize, Deserialize, PartialEq)]
pub struct SlotHistory {
pub bits: BitVec<u64>,
pub next_slot: Slot,
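Note: the new Clone derive is the only functional change here; it lets the future-tower path take a private, warpable copy of the sysvar. A small sketch of the (pre-existing) add()/check() semantics that the tower code above relies on:

use solana_sdk::slot_history::{Check, SlotHistory};

fn slot_history_demo() {
    let mut history = SlotHistory::default(); // a fresh history already contains slot 0
    history.add(12); // slots 1..=11 are implicitly marked as skipped
    assert_eq!(history.check(0), Check::Found);
    assert_eq!(history.check(5), Check::NotFound); // inside the window, never added
    assert_eq!(history.check(12), Check::Found);
    assert_eq!(history.check(13), Check::Future); // beyond the newest recorded slot
    let warped = history.clone(); // possible only with the new Clone derive
    assert_eq!(warped.check(12), Check::Found);
}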