Add stake breakdown to metrics for HeaviestForkFailures (#31067)

This commit is contained in:
Ashwin Sekar 2023-04-05 20:35:12 -06:00 committed by GitHub
parent 0ff8a09041
commit 85dbd3d94d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 103 additions and 37 deletions

View File

@ -39,11 +39,27 @@ use {
thiserror::Error, thiserror::Error,
}; };
/// Result of a stake threshold check.
///
/// The default value is [`ThresholdDecision::PassedThreshold`].
#[derive(PartialEq, Eq, Clone, Copy, Debug, Default)]
pub enum ThresholdDecision {
    /// The check passed.
    #[default]
    PassedThreshold,
    /// The check failed; the payload is the observed stake.
    FailedThreshold(u64),
}

impl ThresholdDecision {
    /// Returns `true` only when this decision is
    /// [`ThresholdDecision::PassedThreshold`].
    pub fn passed(&self) -> bool {
        // Exhaustive on purpose: adding a variant forces this to be revisited.
        match self {
            Self::PassedThreshold => true,
            Self::FailedThreshold(_) => false,
        }
    }
}
#[derive(PartialEq, Eq, Clone, Debug, AbiExample)] #[derive(PartialEq, Eq, Clone, Debug, AbiExample)]
pub enum SwitchForkDecision { pub enum SwitchForkDecision {
SwitchProof(Hash), SwitchProof(Hash),
SameFork, SameFork,
FailedSwitchThreshold(u64, u64), FailedSwitchThreshold(
/* Switch proof stake */ u64,
/* Total stake */ u64,
),
FailedSwitchDuplicateRollback(Slot), FailedSwitchDuplicateRollback(Slot),
} }
@ -978,12 +994,15 @@ impl Tower {
self.last_switch_threshold_check.is_none() self.last_switch_threshold_check.is_none()
} }
/// Performs threshold check for `slot`
///
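/// Returns `ThresholdDecision::PassedThreshold` if the check passes, otherwise `ThresholdDecision::FailedThreshold(fork_stake)` with the stake observed on the fork (0 if no votes have been seen on it yet)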
pub fn check_vote_stake_threshold( pub fn check_vote_stake_threshold(
&self, &self,
slot: Slot, slot: Slot,
voted_stakes: &VotedStakes, voted_stakes: &VotedStakes,
total_stake: Stake, total_stake: Stake,
) -> bool { ) -> ThresholdDecision {
let mut vote_state = self.vote_state.clone(); let mut vote_state = self.vote_state.clone();
process_slot_vote_unchecked(&mut vote_state, slot); process_slot_vote_unchecked(&mut vote_state, slot);
let vote = vote_state.nth_recent_vote(self.threshold_depth); let vote = vote_state.nth_recent_vote(self.threshold_depth);
@ -999,16 +1018,20 @@ impl Tower {
if old_vote.slot() == vote.slot() if old_vote.slot() == vote.slot()
&& old_vote.confirmation_count() == vote.confirmation_count() && old_vote.confirmation_count() == vote.confirmation_count()
{ {
return true; return ThresholdDecision::PassedThreshold;
} }
} }
} }
lockout > self.threshold_size if lockout > self.threshold_size {
return ThresholdDecision::PassedThreshold;
}
ThresholdDecision::FailedThreshold(*fork_stake)
} else { } else {
false // We haven't seen any votes on this fork yet, so no stake
ThresholdDecision::FailedThreshold(0)
} }
} else { } else {
true ThresholdDecision::PassedThreshold
} }
} }
@ -2009,16 +2032,17 @@ pub mod test {
&node_pubkey, &node_pubkey,
&mut tower, &mut tower,
); );
for slot in 46..=48 { assert_eq!(
if slot == 48 { *results.get(&46).unwrap(),
assert!(results.get(&slot).unwrap().is_empty()); vec![HeaviestForkFailures::FailedSwitchThreshold(46, 0, 40000)]
} else { );
assert_eq!( assert_eq!(
*results.get(&slot).unwrap(), *results.get(&47).unwrap(),
vec![HeaviestForkFailures::FailedSwitchThreshold(slot)] vec![HeaviestForkFailures::FailedSwitchThreshold(
); 47, 10000, 40000
} )]
} );
assert!(results.get(&48).unwrap().is_empty());
} }
#[test] #[test]
@ -2202,7 +2226,7 @@ pub mod test {
fn test_check_vote_threshold_without_votes() { fn test_check_vote_threshold_without_votes() {
let tower = Tower::new_for_tests(1, 0.67); let tower = Tower::new_for_tests(1, 0.67);
let stakes = vec![(0, 1)].into_iter().collect(); let stakes = vec![(0, 1)].into_iter().collect();
assert!(tower.check_vote_stake_threshold(0, &stakes, 2)); assert!(tower.check_vote_stake_threshold(0, &stakes, 2).passed());
} }
#[test] #[test]
@ -2214,7 +2238,9 @@ pub mod test {
stakes.insert(i, 1); stakes.insert(i, 1);
tower.record_vote(i, Hash::default()); tower.record_vote(i, Hash::default());
} }
assert!(!tower.check_vote_stake_threshold(MAX_LOCKOUT_HISTORY as u64 + 1, &stakes, 2,)); assert!(!tower
.check_vote_stake_threshold(MAX_LOCKOUT_HISTORY as u64 + 1, &stakes, 2,)
.passed());
} }
#[test] #[test]
@ -2329,14 +2355,14 @@ pub mod test {
let mut tower = Tower::new_for_tests(1, 0.67); let mut tower = Tower::new_for_tests(1, 0.67);
let stakes = vec![(0, 1)].into_iter().collect(); let stakes = vec![(0, 1)].into_iter().collect();
tower.record_vote(0, Hash::default()); tower.record_vote(0, Hash::default());
assert!(!tower.check_vote_stake_threshold(1, &stakes, 2)); assert!(!tower.check_vote_stake_threshold(1, &stakes, 2).passed());
} }
#[test] #[test]
fn test_check_vote_threshold_above_threshold() { fn test_check_vote_threshold_above_threshold() {
let mut tower = Tower::new_for_tests(1, 0.67); let mut tower = Tower::new_for_tests(1, 0.67);
let stakes = vec![(0, 2)].into_iter().collect(); let stakes = vec![(0, 2)].into_iter().collect();
tower.record_vote(0, Hash::default()); tower.record_vote(0, Hash::default());
assert!(tower.check_vote_stake_threshold(1, &stakes, 2)); assert!(tower.check_vote_stake_threshold(1, &stakes, 2).passed());
} }
#[test] #[test]
@ -2346,7 +2372,7 @@ pub mod test {
tower.record_vote(0, Hash::default()); tower.record_vote(0, Hash::default());
tower.record_vote(1, Hash::default()); tower.record_vote(1, Hash::default());
tower.record_vote(2, Hash::default()); tower.record_vote(2, Hash::default());
assert!(tower.check_vote_stake_threshold(6, &stakes, 2)); assert!(tower.check_vote_stake_threshold(6, &stakes, 2).passed());
} }
#[test] #[test]
@ -2354,7 +2380,7 @@ pub mod test {
let mut tower = Tower::new_for_tests(1, 0.67); let mut tower = Tower::new_for_tests(1, 0.67);
let stakes = HashMap::new(); let stakes = HashMap::new();
tower.record_vote(0, Hash::default()); tower.record_vote(0, Hash::default());
assert!(!tower.check_vote_stake_threshold(1, &stakes, 2)); assert!(!tower.check_vote_stake_threshold(1, &stakes, 2).passed());
} }
#[test] #[test]
@ -2365,7 +2391,7 @@ pub mod test {
tower.record_vote(0, Hash::default()); tower.record_vote(0, Hash::default());
tower.record_vote(1, Hash::default()); tower.record_vote(1, Hash::default());
tower.record_vote(2, Hash::default()); tower.record_vote(2, Hash::default());
assert!(tower.check_vote_stake_threshold(6, &stakes, 2,)); assert!(tower.check_vote_stake_threshold(6, &stakes, 2,).passed());
} }
#[test] #[test]
@ -2483,7 +2509,9 @@ pub mod test {
|_| None, |_| None,
&mut LatestValidatorVotesForFrozenBanks::default(), &mut LatestValidatorVotesForFrozenBanks::default(),
); );
assert!(tower.check_vote_stake_threshold(vote_to_evaluate, &voted_stakes, total_stake,)); assert!(tower
.check_vote_stake_threshold(vote_to_evaluate, &voted_stakes, total_stake,)
.passed());
// CASE 2: Now we want to evaluate a vote for slot VOTE_THRESHOLD_DEPTH + 1. This slot // CASE 2: Now we want to evaluate a vote for slot VOTE_THRESHOLD_DEPTH + 1. This slot
// will expire the vote in one of the vote accounts, so we should have insufficient // will expire the vote in one of the vote accounts, so we should have insufficient
@ -2501,7 +2529,9 @@ pub mod test {
|_| None, |_| None,
&mut LatestValidatorVotesForFrozenBanks::default(), &mut LatestValidatorVotesForFrozenBanks::default(),
); );
assert!(!tower.check_vote_stake_threshold(vote_to_evaluate, &voted_stakes, total_stake,)); assert!(!tower
.check_vote_stake_threshold(vote_to_evaluate, &voted_stakes, total_stake,)
.passed());
} }
fn vote_and_check_recent(num_votes: usize) { fn vote_and_check_recent(num_votes: usize) {

View File

@ -2,7 +2,7 @@ use {
crate::{ crate::{
cluster_info_vote_listener::SlotVoteTracker, cluster_info_vote_listener::SlotVoteTracker,
cluster_slots::SlotPubkeys, cluster_slots::SlotPubkeys,
consensus::{Stake, VotedStakes}, consensus::{Stake, ThresholdDecision, VotedStakes},
replay_stage::SUPERMINORITY_THRESHOLD, replay_stage::SUPERMINORITY_THRESHOLD,
}, },
solana_ledger::blockstore_processor::{ConfirmationProgress, ConfirmationTiming}, solana_ledger::blockstore_processor::{ConfirmationProgress, ConfirmationTiming},
@ -299,7 +299,7 @@ pub struct ForkStats {
pub has_voted: bool, pub has_voted: bool,
pub is_recent: bool, pub is_recent: bool,
pub is_empty: bool, pub is_empty: bool,
pub vote_threshold: bool, pub vote_threshold: ThresholdDecision,
pub is_locked_out: bool, pub is_locked_out: bool,
pub voted_stakes: VotedStakes, pub voted_stakes: VotedStakes,
pub is_supermajority_confirmed: bool, pub is_supermajority_confirmed: bool,

View File

@ -14,7 +14,8 @@ use {
cluster_slots_service::ClusterSlotsUpdateSender, cluster_slots_service::ClusterSlotsUpdateSender,
commitment_service::{AggregateCommitmentService, CommitmentAggregationData}, commitment_service::{AggregateCommitmentService, CommitmentAggregationData},
consensus::{ consensus::{
ComputedBankState, Stake, SwitchForkDecision, Tower, VotedStakes, SWITCH_FORK_THRESHOLD, ComputedBankState, Stake, SwitchForkDecision, ThresholdDecision, Tower, VotedStakes,
SWITCH_FORK_THRESHOLD,
}, },
cost_update_service::CostUpdate, cost_update_service::CostUpdate,
fork_choice::{ForkChoice, SelectVoteAndResetForkResult}, fork_choice::{ForkChoice, SelectVoteAndResetForkResult},
@ -108,9 +109,21 @@ lazy_static! {
#[derive(PartialEq, Eq, Debug)] #[derive(PartialEq, Eq, Debug)]
pub enum HeaviestForkFailures { pub enum HeaviestForkFailures {
LockedOut(u64), LockedOut(u64),
FailedThreshold(u64), FailedThreshold(
FailedSwitchThreshold(u64), Slot,
NoPropagatedConfirmation(u64), /* Observed stake */ u64,
/* Total stake */ u64,
),
FailedSwitchThreshold(
Slot,
/* Observed stake */ u64,
/* Total stake */ u64,
),
NoPropagatedConfirmation(
Slot,
/* Observed stake */ u64,
/* Total stake */ u64,
),
} }
// Implement a destructor for the ReplayStage thread to signal it exited // Implement a destructor for the ReplayStage thread to signal it exited
@ -800,7 +813,7 @@ impl ReplayStage {
); );
for r in &heaviest_fork_failures { for r in &heaviest_fork_failures {
if let HeaviestForkFailures::NoPropagatedConfirmation(slot) = r { if let HeaviestForkFailures::NoPropagatedConfirmation(slot, ..) = r {
if let Some(latest_leader_slot) = if let Some(latest_leader_slot) =
progress.get_latest_leader_slot_must_exist(*slot) progress.get_latest_leader_slot_must_exist(*slot)
{ {
@ -3150,6 +3163,8 @@ impl ReplayStage {
); );
failure_reasons.push(HeaviestForkFailures::FailedSwitchThreshold( failure_reasons.push(HeaviestForkFailures::FailedSwitchThreshold(
heaviest_bank.slot(), heaviest_bank.slot(),
switch_proof_stake,
total_stake,
)); ));
reset_bank.map(|b| (b, switch_fork_decision)) reset_bank.map(|b| (b, switch_fork_decision))
} }
@ -3198,6 +3213,8 @@ impl ReplayStage {
); );
failure_reasons.push(HeaviestForkFailures::FailedSwitchThreshold( failure_reasons.push(HeaviestForkFailures::FailedSwitchThreshold(
heaviest_bank.slot(), heaviest_bank.slot(),
0, // In this case we never actually performed the switch check, 0 for now
0,
)); ));
reset_bank.map(|b| (b, switch_fork_decision)) reset_bank.map(|b| (b, switch_fork_decision))
} }
@ -3206,14 +3223,25 @@ impl ReplayStage {
}; };
if let Some((bank, switch_fork_decision)) = selected_fork { if let Some((bank, switch_fork_decision)) = selected_fork {
let (is_locked_out, vote_threshold, is_leader_slot, fork_weight) = { let (
is_locked_out,
vote_threshold,
propagated_stake,
is_leader_slot,
fork_weight,
total_threshold_stake,
total_epoch_stake,
) = {
let fork_stats = progress.get_fork_stats(bank.slot()).unwrap(); let fork_stats = progress.get_fork_stats(bank.slot()).unwrap();
let propagated_stats = &progress.get_propagated_stats(bank.slot()).unwrap(); let propagated_stats = &progress.get_propagated_stats(bank.slot()).unwrap();
( (
fork_stats.is_locked_out, fork_stats.is_locked_out,
fork_stats.vote_threshold, fork_stats.vote_threshold,
propagated_stats.propagated_validators_stake,
propagated_stats.is_leader_slot, propagated_stats.is_leader_slot,
fork_stats.weight, fork_stats.weight,
fork_stats.total_stake,
propagated_stats.total_epoch_stake,
) )
}; };
@ -3225,15 +3253,23 @@ impl ReplayStage {
if is_locked_out { if is_locked_out {
failure_reasons.push(HeaviestForkFailures::LockedOut(bank.slot())); failure_reasons.push(HeaviestForkFailures::LockedOut(bank.slot()));
} }
if !vote_threshold { if let ThresholdDecision::FailedThreshold(fork_stake) = vote_threshold {
failure_reasons.push(HeaviestForkFailures::FailedThreshold(bank.slot())); failure_reasons.push(HeaviestForkFailures::FailedThreshold(
bank.slot(),
fork_stake,
total_threshold_stake,
));
} }
if !propagation_confirmed { if !propagation_confirmed {
failure_reasons.push(HeaviestForkFailures::NoPropagatedConfirmation(bank.slot())); failure_reasons.push(HeaviestForkFailures::NoPropagatedConfirmation(
bank.slot(),
propagated_stake,
total_epoch_stake,
));
} }
if !is_locked_out if !is_locked_out
&& vote_threshold && vote_threshold.passed()
&& propagation_confirmed && propagation_confirmed
&& switch_fork_decision.can_vote() && switch_fork_decision.can_vote()
{ {