Various clean-ups before assert adjustment (#13006)
* Various clean-ups before assert adjustment * oops
This commit is contained in:
parent
2bb27cdb25
commit
efdb560e97
|
@ -107,6 +107,8 @@ pub struct Tower {
|
||||||
// (This is a special field for slashing-free validator restart with edge cases).
|
// (This is a special field for slashing-free validator restart with edge cases).
|
||||||
// This could be emptied after some time; but left intact indefinitely for easier
|
// This could be emptied after some time; but left intact indefinitely for easier
|
||||||
// implementation
|
// implementation
|
||||||
|
// Further, stray slot can be stale or not. `Stale` here means whether given
|
||||||
|
// bank_forks (=~ ledger) lacks the slot or not.
|
||||||
stray_restored_slot: Option<Slot>,
|
stray_restored_slot: Option<Slot>,
|
||||||
#[serde(skip)]
|
#[serde(skip)]
|
||||||
pub last_switch_threshold_check: Option<(Slot, SwitchForkDecision)>,
|
pub last_switch_threshold_check: Option<(Slot, SwitchForkDecision)>,
|
||||||
|
@ -438,7 +440,7 @@ impl Tower {
|
||||||
|
|
||||||
// root may be forcibly set by arbitrary replay root slot, for example from a root
|
// root may be forcibly set by arbitrary replay root slot, for example from a root
|
||||||
// after replaying a snapshot.
|
// after replaying a snapshot.
|
||||||
// Also, tower.root() couldn't be None; do_initialize_lockouts() ensures that.
|
// Also, tower.root() couldn't be None; initialize_lockouts() ensures that.
|
||||||
// Conceptually, every tower must have been constructed from a concrete starting point,
|
// Conceptually, every tower must have been constructed from a concrete starting point,
|
||||||
// which establishes the origin of trust (i.e. root) whether booting from genesis (slot 0) or
|
// which establishes the origin of trust (i.e. root) whether booting from genesis (slot 0) or
|
||||||
// snapshot (slot N). In other words, there should be no possibility a Tower doesn't have
|
// snapshot (slot N). In other words, there should be no possibility a Tower doesn't have
|
||||||
|
@ -517,7 +519,7 @@ impl Tower {
|
||||||
let last_vote_ancestors =
|
let last_vote_ancestors =
|
||||||
ancestors.get(&last_voted_slot).unwrap_or_else(|| {
|
ancestors.get(&last_voted_slot).unwrap_or_else(|| {
|
||||||
if !self.is_stray_last_vote() {
|
if !self.is_stray_last_vote() {
|
||||||
// Unless last vote is stray, ancestors.get(last_voted_slot) must
|
// Unless last vote is stray and stale, ancestors.get(last_voted_slot) must
|
||||||
// return Some(_), justifying to panic! here.
|
// return Some(_), justifying to panic! here.
|
||||||
// Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None,
|
// Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None,
|
||||||
// if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be
|
// if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be
|
||||||
|
@ -527,8 +529,8 @@ impl Tower {
|
||||||
// all of them.
|
// all of them.
|
||||||
panic!("no ancestors found with slot: {}", last_voted_slot);
|
panic!("no ancestors found with slot: {}", last_voted_slot);
|
||||||
} else {
|
} else {
|
||||||
// This condition shouldn't occur under normal validator operation, indicating
|
// This condition (stale stray last vote) shouldn't occur under normal validator
|
||||||
// something unusual happened.
|
// operation, indicating something unusual happened.
|
||||||
// Possible causes include: OS/HW crash, validator process crash, only saved tower
|
// Possible causes include: OS/HW crash, validator process crash, only saved tower
|
||||||
// is moved over to a new setup, etc...
|
// is moved over to a new setup, etc...
|
||||||
|
|
||||||
|
@ -829,17 +831,18 @@ impl Tower {
|
||||||
// The tower root can be older/newer if the validator booted from a newer/older snapshot, so
|
// The tower root can be older/newer if the validator booted from a newer/older snapshot, so
|
||||||
// tower lockouts may need adjustment
|
// tower lockouts may need adjustment
|
||||||
pub fn adjust_lockouts_after_replay(
|
pub fn adjust_lockouts_after_replay(
|
||||||
self,
|
mut self,
|
||||||
replayed_root: Slot,
|
replayed_root: Slot,
|
||||||
slot_history: &SlotHistory,
|
slot_history: &SlotHistory,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
// sanity assertions for roots
|
// sanity assertions for roots
|
||||||
let tower_root = self.root();
|
let tower_root = self.root();
|
||||||
info!(
|
info!(
|
||||||
"adjusting lockouts (after replay up to {}): {:?} tower root: {}",
|
"adjusting lockouts (after replay up to {}): {:?} tower root: {} replayed root: {}",
|
||||||
replayed_root,
|
replayed_root,
|
||||||
self.voted_slots(),
|
self.voted_slots(),
|
||||||
tower_root,
|
tower_root,
|
||||||
|
replayed_root,
|
||||||
);
|
);
|
||||||
assert_eq!(slot_history.check(replayed_root), Check::Found);
|
assert_eq!(slot_history.check(replayed_root), Check::Found);
|
||||||
|
|
||||||
|
@ -860,30 +863,24 @@ impl Tower {
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
// return immediately if votes are empty...
|
if let Some(last_voted_slot) = self.last_voted_slot() {
|
||||||
if self.lockouts.votes.is_empty() {
|
if slot_history.check(last_voted_slot) == Check::TooOld {
|
||||||
return Ok(self);
|
// We could try hard to anchor with other older votes, but opt to simplify the
|
||||||
|
// following logic
|
||||||
|
return Err(TowerError::TooOldTower(
|
||||||
|
last_voted_slot,
|
||||||
|
slot_history.oldest(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
self.adjust_lockouts_with_slot_history(slot_history)?;
|
||||||
|
self.initialize_root(replayed_root);
|
||||||
}
|
}
|
||||||
|
|
||||||
let last_voted_slot = self.last_voted_slot().unwrap();
|
Ok(self)
|
||||||
if slot_history.check(last_voted_slot) == Check::TooOld {
|
|
||||||
// We could try hard to anchor with other older votes, but opt to simplify the
|
|
||||||
// following logic
|
|
||||||
return Err(TowerError::TooOldTower(
|
|
||||||
last_voted_slot,
|
|
||||||
slot_history.oldest(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
self.do_adjust_lockouts_after_replay(tower_root, replayed_root, slot_history)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn do_adjust_lockouts_after_replay(
|
fn adjust_lockouts_with_slot_history(&mut self, slot_history: &SlotHistory) -> Result<()> {
|
||||||
mut self,
|
let tower_root = self.root();
|
||||||
tower_root: Slot,
|
|
||||||
replayed_root: Slot,
|
|
||||||
slot_history: &SlotHistory,
|
|
||||||
) -> Result<Self> {
|
|
||||||
// retained slots will be consisted only from divergent slots
|
// retained slots will be consisted only from divergent slots
|
||||||
let mut retain_flags_for_each_vote_in_reverse: Vec<_> =
|
let mut retain_flags_for_each_vote_in_reverse: Vec<_> =
|
||||||
Vec::with_capacity(self.lockouts.votes.len());
|
Vec::with_capacity(self.lockouts.votes.len());
|
||||||
|
@ -931,7 +928,12 @@ impl Tower {
|
||||||
if !voting_from_genesis {
|
if !voting_from_genesis {
|
||||||
// Unless we're voting since genesis, slots_in_tower must always be older than last checked_slot
|
// Unless we're voting since genesis, slots_in_tower must always be older than last checked_slot
|
||||||
// including all vote slot and the root slot.
|
// including all vote slot and the root slot.
|
||||||
assert!(*slot_in_tower < checked_slot)
|
assert!(
|
||||||
|
*slot_in_tower < checked_slot,
|
||||||
|
"slot_in_tower({}) < checked_slot({})",
|
||||||
|
*slot_in_tower,
|
||||||
|
checked_slot
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -959,15 +961,10 @@ impl Tower {
|
||||||
retain_flags_for_each_vote_in_reverse.into_iter().rev();
|
retain_flags_for_each_vote_in_reverse.into_iter().rev();
|
||||||
|
|
||||||
let original_votes_len = self.lockouts.votes.len();
|
let original_votes_len = self.lockouts.votes.len();
|
||||||
self.do_initialize_lockouts(replayed_root, move |_| {
|
self.initialize_lockouts(move |_| retain_flags_for_each_vote.next().unwrap());
|
||||||
retain_flags_for_each_vote.next().unwrap()
|
|
||||||
});
|
|
||||||
|
|
||||||
if self.lockouts.votes.is_empty() {
|
if self.lockouts.votes.is_empty() {
|
||||||
info!(
|
info!("All restored votes were behind; resetting root_slot and last_vote in tower!");
|
||||||
"All restored votes were behind replayed_root({}); resetting root_slot and last_vote in tower!",
|
|
||||||
replayed_root
|
|
||||||
);
|
|
||||||
// we might not have banks for those votes so just reset.
|
// we might not have banks for those votes so just reset.
|
||||||
// That's because the votes may well past replayed_root
|
// That's because the votes may well past replayed_root
|
||||||
self.last_vote = Vote::default();
|
self.last_vote = Vote::default();
|
||||||
|
@ -986,7 +983,7 @@ impl Tower {
|
||||||
self.stray_restored_slot = Some(self.last_vote.last_voted_slot().unwrap());
|
self.stray_restored_slot = Some(self.last_vote.last_voted_slot().unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(self)
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn initialize_lockouts_from_bank(
|
fn initialize_lockouts_from_bank(
|
||||||
|
@ -999,7 +996,8 @@ impl Tower {
|
||||||
let vote_state = VoteState::deserialize(&vote_account.data)
|
let vote_state = VoteState::deserialize(&vote_account.data)
|
||||||
.expect("vote_account isn't a VoteState?");
|
.expect("vote_account isn't a VoteState?");
|
||||||
self.lockouts = vote_state;
|
self.lockouts = vote_state;
|
||||||
self.do_initialize_lockouts(root, |v| v.slot > root);
|
self.initialize_root(root);
|
||||||
|
self.initialize_lockouts(|v| v.slot > root);
|
||||||
trace!(
|
trace!(
|
||||||
"Lockouts in tower for {} is initialized using bank {}",
|
"Lockouts in tower for {} is initialized using bank {}",
|
||||||
self.node_pubkey,
|
self.node_pubkey,
|
||||||
|
@ -1010,7 +1008,7 @@ impl Tower {
|
||||||
"vote account's node_pubkey doesn't match",
|
"vote account's node_pubkey doesn't match",
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
self.do_initialize_lockouts(root, |_| true);
|
self.initialize_root(root);
|
||||||
info!(
|
info!(
|
||||||
"vote account({}) not found in bank (slot={})",
|
"vote account({}) not found in bank (slot={})",
|
||||||
vote_account_pubkey,
|
vote_account_pubkey,
|
||||||
|
@ -1019,13 +1017,16 @@ impl Tower {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn do_initialize_lockouts<F: FnMut(&Lockout) -> bool>(&mut self, root: Slot, should_retain: F) {
|
fn initialize_lockouts<F: FnMut(&Lockout) -> bool>(&mut self, should_retain: F) {
|
||||||
// Updating root is needed to correctly restore from newly-saved tower for the next
|
|
||||||
// boot
|
|
||||||
self.lockouts.root_slot = Some(root);
|
|
||||||
self.lockouts.votes.retain(should_retain);
|
self.lockouts.votes.retain(should_retain);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Updating root is needed to correctly restore from newly-saved tower for the next
|
||||||
|
// boot
|
||||||
|
fn initialize_root(&mut self, root: Slot) {
|
||||||
|
self.lockouts.root_slot = Some(root);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_filename(path: &Path, node_pubkey: &Pubkey) -> PathBuf {
|
pub fn get_filename(path: &Path, node_pubkey: &Pubkey) -> PathBuf {
|
||||||
path.join(format!("tower-{}", node_pubkey))
|
path.join(format!("tower-{}", node_pubkey))
|
||||||
.with_extension("bin")
|
.with_extension("bin")
|
||||||
|
@ -1176,7 +1177,7 @@ pub fn reconcile_blockstore_roots_with_tower(
|
||||||
"at least 1 parent slot must be found"
|
"at least 1 parent slot must be found"
|
||||||
);
|
);
|
||||||
|
|
||||||
blockstore.set_roots(&new_roots)?
|
blockstore.set_roots(&new_roots)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -2796,7 +2797,7 @@ pub mod test {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_adjust_lockouts_after_relay_all_rooted_with_too_old() {
|
fn test_adjust_lockouts_after_replay_all_rooted_with_too_old() {
|
||||||
use solana_sdk::slot_history::MAX_ENTRIES;
|
use solana_sdk::slot_history::MAX_ENTRIES;
|
||||||
|
|
||||||
let mut tower = Tower::new_for_tests(10, 0.9);
|
let mut tower = Tower::new_for_tests(10, 0.9);
|
||||||
|
@ -2999,4 +3000,54 @@ pub mod test {
|
||||||
"The tower is fatally inconsistent with blockstore: not too old once after got too old?"
|
"The tower is fatally inconsistent with blockstore: not too old once after got too old?"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic(expected = "slot_in_tower(2) < checked_slot(1)")]
|
||||||
|
fn test_adjust_lockouts_after_replay_reversed_votes() {
|
||||||
|
let mut tower = Tower::new_for_tests(10, 0.9);
|
||||||
|
tower.lockouts.votes.push_back(Lockout::new(2));
|
||||||
|
tower.lockouts.votes.push_back(Lockout::new(1));
|
||||||
|
let vote = Vote::new(vec![1], Hash::default());
|
||||||
|
tower.last_vote = vote;
|
||||||
|
|
||||||
|
let mut slot_history = SlotHistory::default();
|
||||||
|
slot_history.add(0);
|
||||||
|
slot_history.add(2);
|
||||||
|
|
||||||
|
tower
|
||||||
|
.adjust_lockouts_after_replay(2, &slot_history)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic(expected = "slot_in_tower(3) < checked_slot(3)")]
|
||||||
|
fn test_adjust_lockouts_after_replay_repeated_non_root_votes() {
|
||||||
|
let mut tower = Tower::new_for_tests(10, 0.9);
|
||||||
|
tower.lockouts.votes.push_back(Lockout::new(2));
|
||||||
|
tower.lockouts.votes.push_back(Lockout::new(3));
|
||||||
|
tower.lockouts.votes.push_back(Lockout::new(3));
|
||||||
|
let vote = Vote::new(vec![3], Hash::default());
|
||||||
|
tower.last_vote = vote;
|
||||||
|
|
||||||
|
let mut slot_history = SlotHistory::default();
|
||||||
|
slot_history.add(0);
|
||||||
|
slot_history.add(2);
|
||||||
|
|
||||||
|
tower
|
||||||
|
.adjust_lockouts_after_replay(2, &slot_history)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_adjust_lockouts_after_replay_vote_on_genesis() {
|
||||||
|
let mut tower = Tower::new_for_tests(10, 0.9);
|
||||||
|
tower.lockouts.votes.push_back(Lockout::new(0));
|
||||||
|
let vote = Vote::new(vec![0], Hash::default());
|
||||||
|
tower.last_vote = vote;
|
||||||
|
|
||||||
|
let mut slot_history = SlotHistory::default();
|
||||||
|
slot_history.add(0);
|
||||||
|
|
||||||
|
assert!(tower.adjust_lockouts_after_replay(0, &slot_history).is_ok());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -498,7 +498,7 @@ impl HeaviestSubtreeForkChoice {
|
||||||
let heaviest_slot_on_same_voted_fork = self.best_slot(last_voted_slot);
|
let heaviest_slot_on_same_voted_fork = self.best_slot(last_voted_slot);
|
||||||
if heaviest_slot_on_same_voted_fork.is_none() {
|
if heaviest_slot_on_same_voted_fork.is_none() {
|
||||||
if !tower.is_stray_last_vote() {
|
if !tower.is_stray_last_vote() {
|
||||||
// Unless last vote is stray, self.bast_slot(last_voted_slot) must return
|
// Unless last vote is stray and stale, self.bast_slot(last_voted_slot) must return
|
||||||
// Some(_), justifying to panic! here.
|
// Some(_), justifying to panic! here.
|
||||||
// Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None,
|
// Also, adjust_lockouts_after_replay() correctly makes last_voted_slot None,
|
||||||
// if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be
|
// if all saved votes are ancestors of replayed_root_slot. So this code shouldn't be
|
||||||
|
@ -507,12 +507,12 @@ impl HeaviestSubtreeForkChoice {
|
||||||
// validator has been running, so we must be able to fetch best_slots for all of
|
// validator has been running, so we must be able to fetch best_slots for all of
|
||||||
// them.
|
// them.
|
||||||
panic!(
|
panic!(
|
||||||
"a bank at last_voted_slot({}) is a frozen bank so must have been\
|
"a bank at last_voted_slot({}) is a frozen bank so must have been \
|
||||||
added to heaviest_subtree_fork_choice at time of freezing",
|
added to heaviest_subtree_fork_choice at time of freezing",
|
||||||
last_voted_slot,
|
last_voted_slot,
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
// fork_infos doesn't have corresponding data for the stray restored last vote,
|
// fork_infos doesn't have corresponding data for the stale stray last vote,
|
||||||
// meaning some inconsistency between saved tower and ledger.
|
// meaning some inconsistency between saved tower and ledger.
|
||||||
// (newer snapshot, or only a saved tower is moved over to new setup?)
|
// (newer snapshot, or only a saved tower is moved over to new setup?)
|
||||||
return None;
|
return None;
|
||||||
|
|
|
@ -744,7 +744,7 @@ fn post_process_restored_tower(
|
||||||
error!("Requested mandatory tower restore failed: {}", err);
|
error!("Requested mandatory tower restore failed: {}", err);
|
||||||
error!(
|
error!(
|
||||||
"And there is an existing vote_account containing actual votes. \
|
"And there is an existing vote_account containing actual votes. \
|
||||||
Aborting due to possible conflicting duplicate votes"
|
Aborting due to possible conflicting duplicate votes",
|
||||||
);
|
);
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue