Various clean-ups before assert adjustment (#13006)

* Various clean-ups before assert adjustment

* oops
This commit is contained in:
Ryo Onodera 2020-10-21 10:26:20 +09:00 committed by GitHub
parent 2bb27cdb25
commit efdb560e97
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 98 additions and 47 deletions

View File

@ -107,6 +107,8 @@ pub struct Tower {
// (This is a special field for slashing-free validator restart with edge cases). // (This is a special field for slashing-free validator restart with edge cases).
// This could be emptied after some time; but left intact indefinitely for easier // This could be emptied after some time; but left intact indefinitely for easier
// implementation // implementation
// Further, stray slot can be stale or not. `Stale` here means whether given
// bank_forks (=~ ledger) lacks the slot or not.
stray_restored_slot: Option<Slot>, stray_restored_slot: Option<Slot>,
#[serde(skip)] #[serde(skip)]
pub last_switch_threshold_check: Option<(Slot, SwitchForkDecision)>, pub last_switch_threshold_check: Option<(Slot, SwitchForkDecision)>,
@ -438,7 +440,7 @@ impl Tower {
// root may be forcibly set by arbitrary replay root slot, for example from a root // root may be forcibly set by arbitrary replay root slot, for example from a root
// after replaying a snapshot. // after replaying a snapshot.
// Also, tower.root() couldn't be None; do_initialize_lockouts() ensures that. // Also, tower.root() couldn't be None; initialize_lockouts() ensures that.
// Conceptually, every tower must have been constructed from a concrete starting point, // Conceptually, every tower must have been constructed from a concrete starting point,
// which establishes the origin of trust (i.e. root) whether booting from genesis (slot 0) or // which establishes the origin of trust (i.e. root) whether booting from genesis (slot 0) or
// snapshot (slot N). In other words, there should be no possibility a Tower doesn't have // snapshot (slot N). In other words, there should be no possibility a Tower doesn't have
@ -517,7 +519,7 @@ impl Tower {
let last_vote_ancestors = let last_vote_ancestors =
ancestors.get(&last_voted_slot).unwrap_or_else(|| { ancestors.get(&last_voted_slot).unwrap_or_else(|| {
if !self.is_stray_last_vote() { if !self.is_stray_last_vote() {
// Unless last vote is stray, ancestors.get(last_voted_slot) must // Unless last vote is stray and stale, ancestors.get(last_voted_slot) must
// return Some(_), justifying to panic! here. // return Some(_), justifying to panic! here.
// Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None, // Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None,
// if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be // if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be
@ -527,8 +529,8 @@ impl Tower {
// all of them. // all of them.
panic!("no ancestors found with slot: {}", last_voted_slot); panic!("no ancestors found with slot: {}", last_voted_slot);
} else { } else {
// This condition shouldn't occur under normal validator operation, indicating // This condition (stale stray last vote) shouldn't occur under normal validator
// something unusual happened. // operation, indicating something unusual happened.
// Possible causes include: OS/HW crash, validator process crash, only saved tower // Possible causes include: OS/HW crash, validator process crash, only saved tower
// is moved over to a new setup, etc... // is moved over to a new setup, etc...
@ -829,17 +831,18 @@ impl Tower {
// The tower root can be older/newer if the validator booted from a newer/older snapshot, so // The tower root can be older/newer if the validator booted from a newer/older snapshot, so
// tower lockouts may need adjustment // tower lockouts may need adjustment
pub fn adjust_lockouts_after_replay( pub fn adjust_lockouts_after_replay(
self, mut self,
replayed_root: Slot, replayed_root: Slot,
slot_history: &SlotHistory, slot_history: &SlotHistory,
) -> Result<Self> { ) -> Result<Self> {
// sanity assertions for roots // sanity assertions for roots
let tower_root = self.root(); let tower_root = self.root();
info!( info!(
"adjusting lockouts (after replay up to {}): {:?} tower root: {}", "adjusting lockouts (after replay up to {}): {:?} tower root: {} replayed root: {}",
replayed_root, replayed_root,
self.voted_slots(), self.voted_slots(),
tower_root, tower_root,
replayed_root,
); );
assert_eq!(slot_history.check(replayed_root), Check::Found); assert_eq!(slot_history.check(replayed_root), Check::Found);
@ -860,30 +863,24 @@ impl Tower {
) )
); );
// return immediately if votes are empty... if let Some(last_voted_slot) = self.last_voted_slot() {
if self.lockouts.votes.is_empty() { if slot_history.check(last_voted_slot) == Check::TooOld {
return Ok(self); // We could try hard to anchor with other older votes, but opt to simplify the
// following logic
return Err(TowerError::TooOldTower(
last_voted_slot,
slot_history.oldest(),
));
}
self.adjust_lockouts_with_slot_history(slot_history)?;
self.initialize_root(replayed_root);
} }
let last_voted_slot = self.last_voted_slot().unwrap(); Ok(self)
if slot_history.check(last_voted_slot) == Check::TooOld {
// We could try hard to anchor with other older votes, but opt to simplify the
// following logic
return Err(TowerError::TooOldTower(
last_voted_slot,
slot_history.oldest(),
));
}
self.do_adjust_lockouts_after_replay(tower_root, replayed_root, slot_history)
} }
fn do_adjust_lockouts_after_replay( fn adjust_lockouts_with_slot_history(&mut self, slot_history: &SlotHistory) -> Result<()> {
mut self, let tower_root = self.root();
tower_root: Slot,
replayed_root: Slot,
slot_history: &SlotHistory,
) -> Result<Self> {
// retained slots will be consisted only from divergent slots // retained slots will be consisted only from divergent slots
let mut retain_flags_for_each_vote_in_reverse: Vec<_> = let mut retain_flags_for_each_vote_in_reverse: Vec<_> =
Vec::with_capacity(self.lockouts.votes.len()); Vec::with_capacity(self.lockouts.votes.len());
@ -931,7 +928,12 @@ impl Tower {
if !voting_from_genesis { if !voting_from_genesis {
// Unless we're voting since genesis, slots_in_tower must always be older than last checked_slot // Unless we're voting since genesis, slots_in_tower must always be older than last checked_slot
// including all vote slot and the root slot. // including all vote slot and the root slot.
assert!(*slot_in_tower < checked_slot) assert!(
*slot_in_tower < checked_slot,
"slot_in_tower({}) < checked_slot({})",
*slot_in_tower,
checked_slot
);
} }
} }
@ -959,15 +961,10 @@ impl Tower {
retain_flags_for_each_vote_in_reverse.into_iter().rev(); retain_flags_for_each_vote_in_reverse.into_iter().rev();
let original_votes_len = self.lockouts.votes.len(); let original_votes_len = self.lockouts.votes.len();
self.do_initialize_lockouts(replayed_root, move |_| { self.initialize_lockouts(move |_| retain_flags_for_each_vote.next().unwrap());
retain_flags_for_each_vote.next().unwrap()
});
if self.lockouts.votes.is_empty() { if self.lockouts.votes.is_empty() {
info!( info!("All restored votes were behind; resetting root_slot and last_vote in tower!");
"All restored votes were behind replayed_root({}); resetting root_slot and last_vote in tower!",
replayed_root
);
// we might not have banks for those votes so just reset. // we might not have banks for those votes so just reset.
// That's because the votes may well past replayed_root // That's because the votes may well past replayed_root
self.last_vote = Vote::default(); self.last_vote = Vote::default();
@ -986,7 +983,7 @@ impl Tower {
self.stray_restored_slot = Some(self.last_vote.last_voted_slot().unwrap()); self.stray_restored_slot = Some(self.last_vote.last_voted_slot().unwrap());
} }
Ok(self) Ok(())
} }
fn initialize_lockouts_from_bank( fn initialize_lockouts_from_bank(
@ -999,7 +996,8 @@ impl Tower {
let vote_state = VoteState::deserialize(&vote_account.data) let vote_state = VoteState::deserialize(&vote_account.data)
.expect("vote_account isn't a VoteState?"); .expect("vote_account isn't a VoteState?");
self.lockouts = vote_state; self.lockouts = vote_state;
self.do_initialize_lockouts(root, |v| v.slot > root); self.initialize_root(root);
self.initialize_lockouts(|v| v.slot > root);
trace!( trace!(
"Lockouts in tower for {} is initialized using bank {}", "Lockouts in tower for {} is initialized using bank {}",
self.node_pubkey, self.node_pubkey,
@ -1010,7 +1008,7 @@ impl Tower {
"vote account's node_pubkey doesn't match", "vote account's node_pubkey doesn't match",
); );
} else { } else {
self.do_initialize_lockouts(root, |_| true); self.initialize_root(root);
info!( info!(
"vote account({}) not found in bank (slot={})", "vote account({}) not found in bank (slot={})",
vote_account_pubkey, vote_account_pubkey,
@ -1019,13 +1017,16 @@ impl Tower {
} }
} }
fn do_initialize_lockouts<F: FnMut(&Lockout) -> bool>(&mut self, root: Slot, should_retain: F) { fn initialize_lockouts<F: FnMut(&Lockout) -> bool>(&mut self, should_retain: F) {
// Updating root is needed to correctly restore from newly-saved tower for the next
// boot
self.lockouts.root_slot = Some(root);
self.lockouts.votes.retain(should_retain); self.lockouts.votes.retain(should_retain);
} }
// Updating root is needed to correctly restore from newly-saved tower for the next
// boot
fn initialize_root(&mut self, root: Slot) {
self.lockouts.root_slot = Some(root);
}
pub fn get_filename(path: &Path, node_pubkey: &Pubkey) -> PathBuf { pub fn get_filename(path: &Path, node_pubkey: &Pubkey) -> PathBuf {
path.join(format!("tower-{}", node_pubkey)) path.join(format!("tower-{}", node_pubkey))
.with_extension("bin") .with_extension("bin")
@ -1176,7 +1177,7 @@ pub fn reconcile_blockstore_roots_with_tower(
"at least 1 parent slot must be found" "at least 1 parent slot must be found"
); );
blockstore.set_roots(&new_roots)? blockstore.set_roots(&new_roots)?;
} }
Ok(()) Ok(())
} }
@ -2796,7 +2797,7 @@ pub mod test {
} }
#[test] #[test]
fn test_adjust_lockouts_after_relay_all_rooted_with_too_old() { fn test_adjust_lockouts_after_replay_all_rooted_with_too_old() {
use solana_sdk::slot_history::MAX_ENTRIES; use solana_sdk::slot_history::MAX_ENTRIES;
let mut tower = Tower::new_for_tests(10, 0.9); let mut tower = Tower::new_for_tests(10, 0.9);
@ -2999,4 +3000,54 @@ pub mod test {
"The tower is fatally inconsistent with blockstore: not too old once after got too old?" "The tower is fatally inconsistent with blockstore: not too old once after got too old?"
); );
} }
#[test]
#[should_panic(expected = "slot_in_tower(2) < checked_slot(1)")]
fn test_adjust_lockouts_after_replay_reversed_votes() {
let mut tower = Tower::new_for_tests(10, 0.9);
tower.lockouts.votes.push_back(Lockout::new(2));
tower.lockouts.votes.push_back(Lockout::new(1));
let vote = Vote::new(vec![1], Hash::default());
tower.last_vote = vote;
let mut slot_history = SlotHistory::default();
slot_history.add(0);
slot_history.add(2);
tower
.adjust_lockouts_after_replay(2, &slot_history)
.unwrap();
}
#[test]
#[should_panic(expected = "slot_in_tower(3) < checked_slot(3)")]
fn test_adjust_lockouts_after_replay_repeated_non_root_votes() {
let mut tower = Tower::new_for_tests(10, 0.9);
tower.lockouts.votes.push_back(Lockout::new(2));
tower.lockouts.votes.push_back(Lockout::new(3));
tower.lockouts.votes.push_back(Lockout::new(3));
let vote = Vote::new(vec![3], Hash::default());
tower.last_vote = vote;
let mut slot_history = SlotHistory::default();
slot_history.add(0);
slot_history.add(2);
tower
.adjust_lockouts_after_replay(2, &slot_history)
.unwrap();
}
#[test]
fn test_adjust_lockouts_after_replay_vote_on_genesis() {
let mut tower = Tower::new_for_tests(10, 0.9);
tower.lockouts.votes.push_back(Lockout::new(0));
let vote = Vote::new(vec![0], Hash::default());
tower.last_vote = vote;
let mut slot_history = SlotHistory::default();
slot_history.add(0);
assert!(tower.adjust_lockouts_after_replay(0, &slot_history).is_ok());
}
} }

View File

@ -498,7 +498,7 @@ impl HeaviestSubtreeForkChoice {
let heaviest_slot_on_same_voted_fork = self.best_slot(last_voted_slot); let heaviest_slot_on_same_voted_fork = self.best_slot(last_voted_slot);
if heaviest_slot_on_same_voted_fork.is_none() { if heaviest_slot_on_same_voted_fork.is_none() {
if !tower.is_stray_last_vote() { if !tower.is_stray_last_vote() {
// Unless last vote is stray, self.bast_slot(last_voted_slot) must return // Unless last vote is stray and stale, self.bast_slot(last_voted_slot) must return
// Some(_), justifying to panic! here. // Some(_), justifying to panic! here.
// Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None, // Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None,
// if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be // if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be
@ -507,12 +507,12 @@ impl HeaviestSubtreeForkChoice {
// validator has been running, so we must be able to fetch best_slots for all of // validator has been running, so we must be able to fetch best_slots for all of
// them. // them.
panic!( panic!(
"a bank at last_voted_slot({}) is a frozen bank so must have been\ "a bank at last_voted_slot({}) is a frozen bank so must have been \
added to heaviest_subtree_fork_choice at time of freezing", added to heaviest_subtree_fork_choice at time of freezing",
last_voted_slot, last_voted_slot,
) )
} else { } else {
// fork_infos doesn't have corresponding data for the stray restored last vote, // fork_infos doesn't have corresponding data for the stale stray last vote,
// meaning some inconsistency between saved tower and ledger. // meaning some inconsistency between saved tower and ledger.
// (newer snapshot, or only a saved tower is moved over to new setup?) // (newer snapshot, or only a saved tower is moved over to new setup?)
return None; return None;

View File

@ -744,7 +744,7 @@ fn post_process_restored_tower(
error!("Requested mandatory tower restore failed: {}", err); error!("Requested mandatory tower restore failed: {}", err);
error!( error!(
"And there is an existing vote_account containing actual votes. \ "And there is an existing vote_account containing actual votes. \
Aborting due to possible conflicting duplicate votes" Aborting due to possible conflicting duplicate votes",
); );
process::exit(1); process::exit(1);
} }